diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 803a814793c..bf9c0a785b6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -330,7 +330,7 @@ jobs: if: matrix.os == 'windows-x64' run: | cargo nextest run --cargo-profile ci --locked --workspace --all-features --no-fail-fast ` - --exclude vortex-bench --exclude vortex-bench-server ` + --exclude vortex-bench ` --exclude vortex-python --exclude vortex-duckdb ` --exclude vortex-fuzz --exclude vortex-cuda --exclude vortex-cuda-ffi ` --exclude vortex-nvcomp --exclude vortex-cub --exclude vortex-test-e2e-cuda ` @@ -342,19 +342,6 @@ jobs: if: matrix.os != 'windows-x64' run: | cargo nextest run --cargo-profile ci --locked --workspace --all-features --no-fail-fast --exclude vortex-bench --exclude xtask --exclude vortex-sqllogictest - - name: vortex-bench-server admin snapshot tests (Linux only - network-dependent) - # The /api/admin/snapshot tests INSTALL+LOAD the vortex DuckDB - # core extension from extensions.duckdb.org on first call. They - # are #[ignore]'d by default so `cargo test` works in offline - # environments (sandboxed CI, local dev without network). This - # step runs them explicitly on the Linux runners, which DO have - # outbound network, so the entire backup contract is covered - # in CI before merge. macOS/arm64 also exercises them - same - # extension, same network. Windows skipped (bench-server is - # excluded from Windows test matrix above). 
- if: matrix.os == 'linux-x64' || matrix.os == 'linux-arm64' || matrix.os == 'macos-arm64' - run: | - cargo nextest run --cargo-profile ci --locked -p vortex-bench-server --test admin --run-ignored only - uses: ./.github/actions/check-rebuild if: matrix.os != 'windows-x64' with: diff --git a/.github/workflows/rust-instrumented.yml b/.github/workflows/rust-instrumented.yml index f53169e23ba..d58463d53d2 100644 --- a/.github/workflows/rust-instrumented.yml +++ b/.github/workflows/rust-instrumented.yml @@ -144,8 +144,8 @@ jobs: - name: Run tests with sanitizer run: | RUSTFLAGS="${RUSTFLAGS} ${{ matrix.sanitizer_flags }}" \ - cargo +$NIGHTLY_TOOLCHAIN nextest run --locked --all-features \ - --target x86_64-unknown-linux-gnu --no-fail-fast -Zbuild-std \ + cargo +$NIGHTLY_TOOLCHAIN nextest run --locked --all-features --no-fail-fast \ + --target x86_64-unknown-linux-gnu -Zbuild-std \ -p vortex-buffer -p vortex-fastlanes -p vortex-fsst -p vortex-alp -p vortex-array # vortex-ffi requires --no-default-features as otherwise we pull in diff --git a/Cargo.lock b/Cargo.lock index c45406a8c6c..e8182ac67f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -168,9 +168,6 @@ name = "arbitrary" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" -dependencies = [ - "derive_arbitrary", -] [[package]] name = "arc-swap" @@ -510,7 +507,7 @@ dependencies = [ "futures-lite", "parking", "polling", - "rustix 1.1.4", + "rustix", "slab", "windows-sys 0.61.2", ] @@ -552,7 +549,7 @@ dependencies = [ "cfg-if", "event-listener", "futures-lite", - "rustix 1.1.4", + "rustix", ] [[package]] @@ -578,7 +575,7 @@ dependencies = [ "cfg-if", "futures-core", "futures-io", - "rustix 1.1.4", + "rustix", "signal-hook-registry", "slab", "windows-sys 0.61.2", @@ -675,58 +672,6 @@ dependencies = [ "fs_extra", ] -[[package]] -name = "axum" -version = "0.8.9" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" -dependencies = [ - "axum-core", - "bytes", - "form_urlencoded", - "futures-util", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-util", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "serde_core", - "serde_json", - "serde_path_to_error", - "serde_urlencoded", - "sync_wrapper", - "tokio", - "tower", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "axum-core" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" -dependencies = [ - "bytes", - "futures-core", - "http", - "http-body", - "http-body-util", - "mime", - "pin-project-lite", - "sync_wrapper", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "base16ct" version = "1.0.0" @@ -1414,7 +1359,6 @@ version = "7.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" dependencies = [ - "crossterm 0.28.1", "unicode-segmentation", "unicode-width 0.2.2", ] @@ -1461,7 +1405,6 @@ version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" dependencies = [ - "brotli", "bzip2", "compression-core", "flate2", @@ -1743,19 +1686,6 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" -[[package]] -name = "crossterm" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" -dependencies = [ - "bitflags", - "crossterm_winapi", - "parking_lot", - "rustix 0.38.44", - 
"winapi", -] - [[package]] name = "crossterm" version = "0.29.0" @@ -1769,7 +1699,7 @@ dependencies = [ "futures-core", "mio", "parking_lot", - "rustix 1.1.4", + "rustix", "signal-hook", "signal-hook-mio", "winapi", @@ -3380,17 +3310,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "derive_arbitrary" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "derive_more" version = "2.1.1" @@ -3495,25 +3414,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab23e69df104e2fd85ee63a533a22d2132ef5975dc6b36f9f3e5a7305e4a8ed7" -[[package]] -name = "duckdb" -version = "1.10502.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fdc796383b176dd5a45353fbb5e64583c0ee4da12cb62c9e510b785324b2488" -dependencies = [ - "arrow", - "cast", - "comfy-table", - "fallible-iterator", - "fallible-streaming-iterator", - "hashlink", - "libduckdb-sys", - "num", - "num-integer", - "rust_decimal", - "strum 0.27.2", -] - [[package]] name = "duckdb-bench" version = "0.1.0" @@ -3715,18 +3615,6 @@ dependencies = [ "ext-trait", ] -[[package]] -name = "fallible-iterator" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" - -[[package]] -name = "fallible-streaming-iterator" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" - [[package]] name = "fast-float2" version = "0.2.3" @@ -4263,15 +4151,6 @@ dependencies = [ "foldhash 0.2.0", ] -[[package]] -name = "hashlink" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" -dependencies = [ - "hashbrown 0.15.5", -] - [[package]] name = "heck" version = "0.5.0" @@ -4338,12 +4217,6 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" -[[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - [[package]] name = "humansize" version = "2.1.3" @@ -4382,7 +4255,6 @@ dependencies = [ "http", "http-body", "httparse", - "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -4404,7 +4276,6 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots", ] [[package]] @@ -4663,7 +4534,6 @@ checksum = "7b4a6248eb93a4401ed2f37dfe8ea592d3cf05b7cf4f8efa867b6895af7e094e" dependencies = [ "console 0.16.3", "once_cell", - "regex", "similar 2.7.0", "tempfile", ] @@ -5557,23 +5427,6 @@ version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" -[[package]] -name = "libduckdb-sys" -version = "1.10502.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d7401630ae2abcff642f7156294289e50f2d222e061c026ad797b01bf20c215" -dependencies = [ - "cc", - "flate2", - "pkg-config", - "reqwest 0.12.28", - "serde", - "serde_json", - "tar", - "vcpkg", - "zip 6.0.0", -] - [[package]] name = "libfuzzer-sys" version = "0.4.13" @@ -5678,12 +5531,6 @@ dependencies = [ "cc", ] -[[package]] -name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -5807,12 +5654,6 @@ dependencies = [ "regex-automata", ] -[[package]] -name = 
"matchit" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" - [[package]] name = "matrixmultiply" version = "0.3.10" @@ -5826,30 +5667,6 @@ dependencies = [ "thread-tree", ] -[[package]] -name = "maud" -version = "0.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8156733e27020ea5c684db5beac5d1d611e1272ab17901a49466294b84fc217e" -dependencies = [ - "axum-core", - "http", - "itoa", - "maud_macros", -] - -[[package]] -name = "maud_macros" -version = "0.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7261b00f3952f617899bc012e3dbd56e4f0110a038175929fa5d18e5a19913ca" -dependencies = [ - "proc-macro2", - "proc-macro2-diagnostics", - "quote", - "syn 2.0.117", -] - [[package]] name = "md-5" version = "0.10.6" @@ -5970,28 +5787,6 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" -[[package]] -name = "multiversion" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edb7f0ff51249dfda9ab96b5823695e15a052dc15074c9dbf3d118afaf2c201" -dependencies = [ - "multiversion-macros", - "target-features", -] - -[[package]] -name = "multiversion-macros" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b093064383341eb3271f42e381cb8f10a01459478446953953c75d24bd339fc0" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", - "target-features", -] - [[package]] name = "ndarray" version = "0.16.1" @@ -6155,20 +5950,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - 
"num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - [[package]] name = "num-bigint" version = "0.4.6" @@ -6203,28 +5984,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -6860,7 +6619,7 @@ dependencies = [ "concurrent-queue", "hermit-abi", "pin-project-lite", - "rustix 1.1.4", + "rustix", "windows-sys 0.61.2", ] @@ -6976,18 +6735,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "proc-macro2-diagnostics" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", - "version_check", -] - [[package]] name = "prost" version = "0.12.6" @@ -7494,7 +7241,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b2867bedcbd6a690ca4f8672a687b730ec07660c79844517b084311b529980c" dependencies = [ "cfg-if", - "crossterm 0.29.0", + "crossterm", "instability", "ratatui-core", ] @@ -7680,7 +7427,6 @@ dependencies = [ "base64", "bytes", "encoding_rs", - "futures-channel", "futures-core", "futures-util", "h2", @@ -7715,7 +7461,6 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.4.2", "web-sys", - "webpki-roots", ] [[package]] @@ -7746,8 +7491,6 @@ dependencies = [ "rustls", "rustls-pki-types", "rustls-platform-verifier", - "serde", - "serde_json", "sync_wrapper", 
"tokio", "tokio-rustls", @@ -7903,19 +7646,6 @@ dependencies = [ "semver", ] -[[package]] -name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] - [[package]] name = "rustix" version = "1.1.4" @@ -7925,7 +7655,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.12.1", + "linux-raw-sys", "windows-sys 0.61.2", ] @@ -8165,17 +7895,6 @@ dependencies = [ "zmij", ] -[[package]] -name = "serde_path_to_error" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" -dependencies = [ - "itoa", - "serde", - "serde_core", -] - [[package]] name = "serde_repr" version = "0.1.20" @@ -8616,15 +8335,6 @@ dependencies = [ "strum_macros 0.26.4", ] -[[package]] -name = "strum" -version = "0.27.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" -dependencies = [ - "strum_macros 0.27.2", -] - [[package]] name = "strum" version = "0.28.0" @@ -8647,18 +8357,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "strum_macros" -version = "0.27.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "strum_macros" version = "0.28.0" @@ -8815,12 +8513,6 @@ dependencies = [ "xattr", ] -[[package]] -name = "target-features" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1bbb9f3c5c463a01705937a24fdabc5047929ac764b2d5b9cf681c1f5041ed5" - [[package]] name = "target-lexicon" version = "0.13.5" @@ -8845,7 
+8537,7 @@ dependencies = [ "fastrand", "getrandom 0.4.2", "once_cell", - "rustix 1.1.4", + "rustix", "windows-sys 0.61.2", ] @@ -8864,7 +8556,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ - "rustix 1.1.4", + "rustix", "windows-sys 0.61.2", ] @@ -9214,7 +8906,6 @@ dependencies = [ "tokio", "tower-layer", "tower-service", - "tracing", ] [[package]] @@ -9237,7 +8928,6 @@ dependencies = [ "tower", "tower-layer", "tower-service", - "tracing", "url", ] @@ -9530,12 +9220,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.5" @@ -9643,7 +9327,6 @@ dependencies = [ "inventory", "itertools 0.14.0", "jiff", - "multiversion", "num-traits", "num_enum", "parking_lot", @@ -9696,7 +9379,6 @@ dependencies = [ "arrow-schema", "arrow-select", "async-trait", - "bytes", "bzip2", "clap", "futures", @@ -9733,60 +9415,6 @@ dependencies = [ "vortex-tensor", ] -[[package]] -name = "vortex-bench-migrate" -version = "0.1.0-alpha.0" -dependencies = [ - "anyhow", - "arrow-array", - "arrow-buffer", - "arrow-schema", - "clap", - "duckdb", - "flate2", - "reqwest 0.13.4", - "rstest", - "serde", - "serde_json", - "tempfile", - "tokio", - "tracing", - "tracing-subscriber", - "vortex-bench-server", - "vortex-utils", -] - -[[package]] -name = "vortex-bench-server" -version = "0.1.0-alpha.0" -dependencies = [ - "anyhow", - "axum", - "base64", - "brotli", - "bytes", - "dashmap", - "duckdb", - "flate2", - "futures", - "insta", - "maud", - "parking_lot", - "reqwest 0.13.4", - "serde", - "serde_json", - "subtle", - 
"tempfile", - "thiserror 2.0.18", - "tokio", - "tower", - "tower-http", - "tracing", - "tracing-subscriber", - "twox-hash", - "vortex-utils", -] - [[package]] name = "vortex-btrblocks" version = "0.1.0" @@ -9797,9 +9425,7 @@ dependencies = [ "pco", "rand 0.10.1", "rstest", - "rustc-hash", "test-with", - "tracing", "vortex-alp", "vortex-array", "vortex-buffer", @@ -9809,7 +9435,6 @@ dependencies = [ "vortex-error", "vortex-fastlanes", "vortex-fsst", - "vortex-mask", "vortex-onpair", "vortex-pco", "vortex-runend", @@ -10075,7 +9700,7 @@ dependencies = [ "vortex-sequence", "vortex-utils", "wkb", - "zip 8.6.0", + "zip", ] [[package]] @@ -10376,7 +10001,6 @@ dependencies = [ name = "vortex-mask" version = "0.1.0" dependencies = [ - "arrow-buffer", "codspeed-divan-compat", "itertools 0.14.0", "rstest", @@ -10632,7 +10256,6 @@ dependencies = [ "vortex-buffer", "vortex-compressor", "vortex-error", - "vortex-fastlanes", "vortex-session", "vortex-utils", ] @@ -10657,7 +10280,7 @@ dependencies = [ "arrow-schema", "clap", "console_error_panic_hook", - "crossterm 0.29.0", + "crossterm", "datafusion 54.0.0", "env_logger", "flatbuffers", @@ -10687,7 +10310,6 @@ version = "0.1.0" dependencies = [ "dashmap", "hashbrown 0.17.1", - "parking_lot", "vortex-error", ] @@ -10790,7 +10412,6 @@ dependencies = [ "cfg-if", "once_cell", "rustversion", - "serde", "wasm-bindgen-macro", "wasm-bindgen-shared", ] @@ -10926,15 +10547,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "webpki-roots" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "which" version = "8.0.3" @@ -11417,7 +11029,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix 1.1.4", + "rustix", ] [[package]] @@ 
-11570,20 +11182,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "zip" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2a05c7c36fde6c09b08576c9f7fb4cda705990f73b58fe011abf7dfb24168b" -dependencies = [ - "arbitrary", - "crc32fast", - "flate2", - "indexmap 2.14.0", - "memchr", - "zopfli", -] - [[package]] name = "zip" version = "8.6.0" diff --git a/Cargo.toml b/Cargo.toml index 0fea9f6b125..6fecf067db4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,9 +64,6 @@ members = [ "benchmarks/datafusion-bench", "benchmarks/duckdb-bench", "benchmarks/random-access-bench", - # Benchmarks website v3 (alpha) - leaf binary, not part of vortex-* API - "benchmarks-website/server", - "benchmarks-website/migrate", "vortex-geo", ] exclude = ["java/testfiles", "wasm-test"] diff --git a/benchmarks-website/AGENTS.md b/benchmarks-website/AGENTS.md deleted file mode 100644 index 5d1ffde4006..00000000000 --- a/benchmarks-website/AGENTS.md +++ /dev/null @@ -1,116 +0,0 @@ - - -# AGENTS.md - `benchmarks-website/` - -Read [`README.md`](README.md) first for the architecture and the v2/v3 -side-by-side situation. Then this file. The root [`CLAUDE.md`](../CLAUDE.md) -covers Rust style, test layout, commit conventions. - -## Don't touch the v2 site - -Until the cutover PR lands, the top-level v2 files -(`server.js`, `src/`, `index.html`, `vite.config.js`, `package.json`, -`package-lock.json`, `public/`, the top-level `Dockerfile`, -`docker-compose.yml`) and the `publish-benchmarks-website.yml` workflow -are production. Don't edit them as part of unrelated work. - -The v3 deploy lives entirely under `server/`, `migrate/`, and `ops/`. -The operator runbook is [`ops/README.md`](ops/README.md). 
- -## v3 specifics - -- **Wire shapes are a coordinated change.** [`server/src/records.rs`](server/src/records.rs), - [`vortex-bench/src/v3.rs`](../vortex-bench/src/v3.rs), and (until cutover) - [`migrate/src/classifier.rs`](migrate/src/classifier.rs) must agree. - Bumping a shape means changing all three plus the snapshot fixtures in - one commit. `SCHEMA_VERSION` is the version literal coupled across two - named sites: [`server/src/schema.rs`](server/src/schema.rs) (source of - truth) and [`scripts/post-ingest.py`](../scripts/post-ingest.py) (the - CI ingest wrapper, which hardcodes it as a Python literal). Bump in - lockstep or every CI ingest run 400s. The server-side validation in - `records.rs` + `ingest.rs` and the echo in `/health` all consume the - constant through `crate::schema`. -- **Numeric `?n=` is clamped to 1000; `?n=all` is the uncapped escape - hatch.** HTML routes hydrate from the materialized latest-100 shard - artifact by default; `?n=all` is an explicit opt-in - (chart-init.js's full-history zoom-out hop uses it once, and curl - power users can request it). The numeric `?n=` path is bounded by - `MAX_NUMERIC_COMMIT_WINDOW` in [`server/src/api/window.rs`](server/src/api/window.rs) - as a DoS-protection floor against `curl ...?n=99999999`. If you need - full history, use `?n=all`. Do NOT raise the numeric cap or remove it - without thinking about the DoS surface. -- **`measurement_id` is server-internal.** Never put it on the wire. It is - a deterministic hash over `commit_sha` plus the dim tuple, computed in - [`server/src/db.rs`](server/src/db.rs) and reused by the migrator via - the same crate. -- **Don't write a server-side classifier for live ingest.** The emitter - produces v3-shape records directly; the migrator's classifier only - exists to translate v2 records once and goes away after cutover. 
-- **Don't reach for WASM.** SSR + a thin hydration script in - [`server/static/chart-init.js`](server/static/chart-init.js) is the - whole client. -- **v3 ingest is no longer best-effort in CI.** The `Ingest results to v3 - server` step in [`bench.yml`](../.github/workflows/bench.yml), - [`sql-benchmarks.yml`](../.github/workflows/sql-benchmarks.yml), and - [`v3-commit-metadata.yml`](../.github/workflows/v3-commit-metadata.yml) - no longer carries `continue-on-error: true`. A v3-server outage on a - develop push now fails the bench workflow and triggers the existing - `incident.io` alert. The gate is `vars.V3_INGEST_URL != ''` so forks - and unconfigured environments are unaffected. -- **Don't re-introduce a server-side commit cap on `?n=all`.** `?n=all` - is the uncapped escape hatch (chart-init.js fetches it once for the - zoom-out path); visual downsampling happens client-side via LTTB on - the visible commit range only. Numeric `?n=` is clamped per the bullet - above. Default fetches from chart-init.js use the materialized - latest-100 shard artifact, not `?n=all`. -- **Don't refetch on every scope change.** Once a chart's payload is in - memory, pan/zoom/slider/range-strip all rebuild in place via the - in-memory LTTB pass on the cached payload. The single exception is the - latest-100 to full-history zoom-out path: charts initially hydrate from - the materialized latest-100 group shard artifact (served from - `/api/artifacts/{generation}/groups/{slug}/shards/{i}`); when the user - zooms past that window for the first time, `chart-init.js` lazy-fetches - `?n=all` once and replaces the latest-100 payload in place. - -## Footguns we have already hit - -- **Reverse predecessor walk in the tooltip.** `payload.commits[]` is - sorted oldest-first by SQL - `commits[0]` is the oldest, `commits[N-1]` - is the newest. For per-row delta the predecessor of `commits[idx]` is - at `idx - 1`. 
We caught a regression where a "fix" flipped this to - `idx + 1`; the original walk-backward direction is right. -- **`pointer-events: auto` on the tooltip host.** The tooltip is - positioned at the cursor; making it pointer-interactive causes a - flicker loop. Keep it `pointer-events: none` and offset via - `transform: translate(12px, 12px)`. -- **`change` events on the slider.** Use `input` events with a small - throttle; `change` only fires on release and feels broken. - -## Local dev - -```bash -# Public-only run (read API + ingest only, admin routes 404): -INGEST_BEARER_TOKEN=dev cargo run -p vortex-bench-server - -# With admin endpoints mounted on a separate loopback listener: -INGEST_BEARER_TOKEN=dev ADMIN_BEARER_TOKEN=dev \ - cargo run -p vortex-bench-server - -cargo nextest run -p vortex-bench-server -p vortex-bench-migrate -INSTA_UPDATE=auto cargo nextest run -p vortex-bench-server # update snapshots -``` - -For the full env-var contract (admin bind, snapshot dir, extension dir, -logging spec, PaaS `PORT` fallback) see [`ops/config/vortex-bench.env.example`](ops/config/vortex-bench.env.example) -and the lib-level `//!` doc on [`server/src/main.rs`](server/src/main.rs). 
- -For the migrator end-to-end against the real S3 dump: - -```bash -cargo run -p vortex-bench-migrate -- run --output ./bench.duckdb -VORTEX_BENCH_DB=./bench.duckdb INGEST_BEARER_TOKEN=dev \ - cargo run -p vortex-bench-server -``` diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml deleted file mode 100644 index ab448c90e7a..00000000000 --- a/benchmarks-website/migrate/Cargo.toml +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -[package] -name = "vortex-bench-migrate" -version = "0.1.0-alpha.0" -edition = "2024" -rust-version = "1.91.0" -license = "Apache-2.0" -description = "One-shot historical migrator from the v2 benchmarks S3 dataset to a v3 DuckDB file" -publish = false - -[[bin]] -name = "vortex-bench-migrate" -path = "src/main.rs" - -# Throwaway binary, not part of the vortex-* public API surface. -# Errors use anyhow, and the crate is intentionally outside the -# workspace. 
- -[dependencies] -anyhow = { workspace = true } -arrow-array = { workspace = true } -arrow-buffer = { workspace = true } -arrow-schema = { workspace = true } -clap = { workspace = true, features = ["derive"] } -# track vortex-duckdb's bundled engine version (build.rs) -duckdb = { version = "1.10502", features = ["bundled", "appender-arrow"] } -flate2 = "1.1" -reqwest = { workspace = true, features = ["json"] } -serde = { workspace = true, features = ["derive"] } -serde_json = { workspace = true } -tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } -tracing = { workspace = true, features = ["std"] } -tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] } -vortex-bench-server = { path = "../server" } -vortex-utils = { workspace = true } - -[dev-dependencies] -rstest = { workspace = true } -tempfile = { workspace = true } diff --git a/benchmarks-website/migrate/build.rs b/benchmarks-website/migrate/build.rs deleted file mode 100644 index 37bb34d013a..00000000000 --- a/benchmarks-website/migrate/build.rs +++ /dev/null @@ -1,8 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -fn main() { - if std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows") { - println!("cargo:rustc-link-lib=dylib=rstrtmgr"); - } -} diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs deleted file mode 100644 index e473c2a6a02..00000000000 --- a/benchmarks-website/migrate/src/classifier.rs +++ /dev/null @@ -1,867 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Bug-for-bug port of v2's `getGroup`, `formatQuery`, and -//! `normalizeChartName` from `benchmarks-website/server.js`, plus the -//! mapping from v2 group + name pattern to a v3 fact-table bin. -//! -//! The v2 classifier was the source of truth for what historical -//! records mean. 
It groups records by name prefix into one of: -//! "Random Access", "Compression", "Compression Size", or one of the -//! SQL query suites (with optional fan-out by storage and scale -//! factor for TPC-H/TPC-DS). This module reproduces that logic and -//! then hops to a v3 fact-table bin, since v3 stores dim values as -//! columns instead of name fragments. -//! -//! Engine and format strings stored in v3 columns are pulled from the -//! raw, pre-rename v2 record name. v2's `ENGINE_RENAMES` was a v2 -//! read-time UI concern (e.g. `vortex-file-compressed` rendered as -//! `vortex` and `parquet-tokio-local-disk` rendered as `parquet-nvme`). -//! v3 stores canonical `Format::name()` strings to match what the v3 -//! live emitter writes, so historical and live records share series. - -use crate::v2::V2Record; -use crate::v2::dataset_scale_factor; - -/// Static port of v2's `QUERY_SUITES`. -pub const QUERY_SUITES: &[QuerySuite] = &[ - QuerySuite { - prefix: "clickbench", - display_name: "Clickbench", - query_prefix: "CLICKBENCH", - dataset_key: None, - fan_out: false, - skip: false, - }, - QuerySuite { - prefix: "statpopgen", - display_name: "Statistical and Population Genetics", - query_prefix: "STATPOPGEN", - dataset_key: None, - fan_out: false, - skip: false, - }, - QuerySuite { - prefix: "polarsignals", - display_name: "PolarSignals Profiling", - query_prefix: "POLARSIGNALS", - dataset_key: None, - fan_out: false, - skip: false, - }, - QuerySuite { - prefix: "gharchive", - display_name: "GhArchive", - query_prefix: "GHARCHIVE", - dataset_key: None, - fan_out: false, - skip: false, - }, - QuerySuite { - prefix: "tpch", - display_name: "TPC-H", - query_prefix: "TPC-H", - dataset_key: Some("tpch"), - fan_out: true, - skip: false, - }, - QuerySuite { - prefix: "tpcds", - display_name: "TPC-DS", - query_prefix: "TPC-DS", - dataset_key: Some("tpcds"), - fan_out: true, - skip: false, - }, - QuerySuite { - prefix: "fineweb", - display_name: "Fineweb", - query_prefix: 
"FINEWEB", - dataset_key: None, - fan_out: false, - skip: false, - }, -]; - -/// Static port of v2's `ENGINE_RENAMES`. Applied to the "series" half -/// of a benchmark name (the part after the first `/`) before splitting -/// on `:` into engine/format. Order doesn't matter — keys are unique. -const ENGINE_RENAMES: &[(&str, &str)] = &[ - ("datafusion:vortex-file-compressed", "datafusion:vortex"), - ("datafusion:parquet", "datafusion:parquet"), - ("datafusion:arrow", "datafusion:in-memory-arrow"), - ("datafusion:lance", "datafusion:lance"), - ("datafusion:vortex-compact", "datafusion:vortex-compact"), - ("duckdb:vortex-file-compressed", "duckdb:vortex"), - ("duckdb:parquet", "duckdb:parquet"), - ("duckdb:duckdb", "duckdb:duckdb"), - ("duckdb:vortex-compact", "duckdb:vortex-compact"), - ("vortex-tokio-local-disk", "vortex-nvme"), - ("vortex-compact-tokio-local-disk", "vortex-compact-nvme"), - ("lance-tokio-local-disk", "lance-nvme"), - ("parquet-tokio-local-disk", "parquet-nvme"), - ("lance", "lance"), -]; - -/// One entry of [`QUERY_SUITES`]. -#[derive(Debug, Clone, Copy)] -pub struct QuerySuite { - /// Lowercase suite prefix used to match v2 record names (e.g. `tpch`). - pub prefix: &'static str, - /// Human-readable suite name as v2 served it from `/api/metadata`. - pub display_name: &'static str, - /// Uppercase prefix v2's `formatQuery` produced (e.g. `TPC-H`). - pub query_prefix: &'static str, - /// Override for the dataset key v2 records use inside their `dataset` - /// object. Falls back to `prefix` when `None`. - pub dataset_key: Option<&'static str>, - /// True if the suite's group name fans out by `(storage, scale_factor)` - /// (e.g. `TPC-H (NVMe) (SF=1)`); false collapses to a single group. - pub fan_out: bool, - /// True if v2 deliberately ignored this suite (no live group is rendered). - pub skip: bool, -} - -/// Group a v2 record falls into. Mirrors `getGroup` in `server.js`, -/// including the fan-out group naming for TPC-H/TPC-DS. 
-#[derive(Debug, Clone, PartialEq, Eq)] -pub enum V2Group { - RandomAccess, - Compression, - CompressionSize, - Query { - suite_index: usize, - /// `Some` for fan-out suites only. - storage: Option, - /// `Some` for fan-out suites only. - scale_factor: Option, - }, -} - -impl V2Group { - /// Display name as v2 served it from `/api/metadata`. - pub fn display_name(&self) -> String { - match self { - V2Group::RandomAccess => "Random Access".into(), - V2Group::Compression => "Compression".into(), - V2Group::CompressionSize => "Compression Size".into(), - V2Group::Query { - suite_index, - storage, - scale_factor, - } => { - let suite = &QUERY_SUITES[*suite_index]; - if let (Some(storage), Some(sf)) = (storage, scale_factor) { - format!("{} ({}) (SF={})", suite.display_name, storage, sf) - } else { - suite.display_name.to_string() - } - } - } - } -} - -/// Apply v2's `ENGINE_RENAMES`. Reproduces the JS `rename`: -/// `RENAMES[s.toLowerCase()] || RENAMES[s] || s`. -pub fn rename_engine(s: &str) -> String { - let lower = s.to_lowercase(); - for (k, v) in ENGINE_RENAMES { - if *k == lower { - return (*v).to_string(); - } - } - for (k, v) in ENGINE_RENAMES { - if *k == s { - return (*v).to_string(); - } - } - s.to_string() -} - -/// Faithful port of v2's `formatQuery`: maps `clickbench_q07` → -/// `"CLICKBENCH Q7"`. Returns the original (uppercased, -/// `-` and `_` replaced with spaces) when no suite matches. -pub fn format_query(q: &str) -> String { - let lower = q.to_lowercase(); - for suite in QUERY_SUITES { - if suite.skip { - continue; - } - let prefix = suite.prefix; - if let Some(rest) = lower.strip_prefix(prefix) - && let Some(idx) = parse_query_index(rest) - { - return format!("{} Q{}", suite.query_prefix, idx); - } - } - let mut out = q.to_uppercase(); - out = out.replace(['_', '-'], " "); - out -} - -/// Parse the `_q07` / ` q7` / `q42` tail used by `format_query`. -/// Returns the integer query index if the tail matches the v2 regex -/// `^[_ ]?q(\d+)`. 
-fn parse_query_index(rest: &str) -> Option { - let after_sep = rest - .strip_prefix('_') - .or_else(|| rest.strip_prefix(' ')) - .unwrap_or(rest); - let after_q = after_sep - .strip_prefix('q') - .or_else(|| after_sep.strip_prefix('Q'))?; - let digits: String = after_q.chars().take_while(|c| c.is_ascii_digit()).collect(); - if digits.is_empty() { - return None; - } - digits.parse().ok() -} - -/// Faithful port of v2's `normalizeChartName`. -pub fn normalize_chart_name(group: &V2Group, chart_name: &str) -> String { - if matches!(group, V2Group::CompressionSize) && chart_name == "VORTEX FILE COMPRESSED SIZE" { - return "VORTEX SIZE".into(); - } - chart_name.to_string() -} - -/// Port of v2's `getGroup`. Returns `None` for skipped suites -/// (e.g. `fineweb`) or names that match nothing. -pub fn get_group(record: &V2Record) -> Option { - let lower = record.name.to_lowercase(); - - if lower.starts_with("random-access/") || lower.starts_with("random access/") { - return Some(V2Group::RandomAccess); - } - - if lower.starts_with("vortex size/") - || lower.starts_with("vortex-file-compressed size/") - || lower.starts_with("parquet size/") - || lower.starts_with("parquet-zstd size/") - || lower.starts_with("lance size/") - || lower.contains(":raw size/") - || lower.contains(":parquet-zstd size/") - || lower.contains(":lance size/") - { - return Some(V2Group::CompressionSize); - } - - if lower.starts_with("compress time/") - || lower.starts_with("decompress time/") - || lower.starts_with("parquet_rs-zstd compress") - || lower.starts_with("parquet_rs-zstd decompress") - || lower.starts_with("lance compress") - || lower.starts_with("lance decompress") - || lower.starts_with("vortex:lance ratio") - || lower.starts_with("vortex:parquet-zstd ratio") - // Typo'd v2 emitter wrote `parquet-zst` (no `d`) for some - // ratio records; match both spellings so they classify as - // derived ratios instead of falling through to Unknown. 
- || lower.starts_with("vortex:parquet-zst ratio") - || lower.starts_with("vortex:raw ratio") - { - return Some(V2Group::Compression); - } - - for (i, suite) in QUERY_SUITES.iter().enumerate() { - let prefix_q = format!("{}_q", suite.prefix); - let prefix_slash = format!("{}/", suite.prefix); - if !lower.starts_with(&prefix_q) && !lower.starts_with(&prefix_slash) { - continue; - } - if suite.skip { - return None; - } - if !suite.fan_out { - return Some(V2Group::Query { - suite_index: i, - storage: None, - scale_factor: None, - }); - } - let storage = match record.storage.as_deref().map(str::to_uppercase).as_deref() { - Some("S3") => "S3", - _ => "NVMe", - }; - let dataset_key = suite.dataset_key.unwrap_or(suite.prefix); - let raw_sf = record - .dataset - .as_ref() - .and_then(|d| dataset_scale_factor(d, dataset_key)); - let sf = raw_sf - .as_deref() - .and_then(|s| s.parse::().ok()) - .map(|f| f.round() as i64) - .unwrap_or(1); - return Some(V2Group::Query { - suite_index: i, - storage: Some(storage.into()), - scale_factor: Some(sf.to_string()), - }); - } - - None -} - -/// Group + chart + series breakdown for a v2 record, using the same -/// rules `server.js` applies in `refresh()`. Equivalent to v2's -/// `(group, chartName, seriesName)` triple after rename / skip rules. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct V2Classification { - /// Group the v2 server would place this record in. - pub group: V2Group, - /// Chart name v2 displayed for this record (uppercase, separators - /// normalized). - pub chart: String, - /// Series name after v2's `ENGINE_RENAMES` was applied. - pub series: String, -} - -/// Apply the same chart / series naming v2's `refresh()` does, plus -/// the throughput / `PARQUET-UNC` skip rules. 
-pub fn classify_v2(record: &V2Record) -> Option { - if record.name.contains(" throughput") { - return None; - } - let group = get_group(record)?; - let parts: Vec<&str> = record.name.split('/').collect(); - let (chart, series) = match (&group, parts.len()) { - (V2Group::RandomAccess, 4) => { - let chart = format!("{}/{}", parts[1], parts[2]) - .to_uppercase() - .replace(['_', '-'], " "); - let series = rename_engine(if parts[3].is_empty() { - "default" - } else { - parts[3] - }); - (chart, series) - } - (V2Group::RandomAccess, 2) => ( - "RANDOM ACCESS".to_string(), - rename_engine(if parts[1].is_empty() { - "default" - } else { - parts[1] - }), - ), - (V2Group::RandomAccess, _) => return None, - _ => { - let series_raw = if parts.len() >= 2 && !parts[1].is_empty() { - parts[1] - } else { - "default" - }; - let series = rename_engine(series_raw); - let chart = format_query(parts[0]); - (chart, series) - } - }; - let chart = normalize_chart_name(&group, &chart); - if chart.contains("PARQUET-UNC") { - return None; - } - Some(V2Classification { - group, - chart, - series, - }) -} - -/// Mapping target: which v3 fact table a v2 record lands in, plus the -/// dim values that table needs. -#[derive(Debug, Clone, PartialEq)] -pub enum V3Bin { - Query { - dataset: String, - dataset_variant: Option, - scale_factor: Option, - query_idx: i32, - storage: String, - engine: String, - format: String, - }, - CompressionTime { - dataset: String, - dataset_variant: Option, - format: String, - op: String, - }, - CompressionSize { - dataset: String, - dataset_variant: Option, - format: String, - }, - RandomAccess { - dataset: String, - format: String, - }, -} - -/// Top-level entry point. Combines `classify_v2` with the v3 fact-table -/// mapping. Returns `None` for records that: -/// -/// - Don't match any v2 group (uncategorized prefix). -/// - Are explicitly skipped by v2 (throughput, PARQUET-UNC, fineweb). 
-/// - Are computed-at-read-time ratios that v3 derives from -/// `compression_sizes` (`vortex:parquet-zstd ratio …`, -/// `vortex:lance ratio …`, `vortex:raw ratio …`, -/// `vortex:* size/…`). -pub fn classify(record: &V2Record) -> Option { - let cls = classify_v2(record)?; - match &cls.group { - V2Group::RandomAccess => bin_random_access(record), - V2Group::Compression => bin_compression_time(&cls, record), - V2Group::CompressionSize => bin_compression_size(&cls, record), - V2Group::Query { .. } => bin_query(&cls, record), - } -} - -/// Reason the classifier dropped a record. Intentional skips (v2 -/// patterns v3 deliberately doesn't store) are NOT errors; they don't -/// count against the uncategorized gate. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Skip { - /// `vortex:* ratio …` and `vortex:* size` — derived in v3 from - /// `compression_sizes` joined to itself. - DerivedRatio, - /// `throughput` records — v2 derived these from latencies. - Throughput, - /// A v2 query suite marked `skip: true` in QUERY_SUITES. - SkippedSuite, - /// random-access record with an unsupported part count. - UnsupportedShape, - /// Record had no `value` field. - NoValue, - /// Dim outside the v3 emitter's allowlist (e.g. `parquet-zstd`, - /// historical-only suites no longer in CI). - Deprecated, - /// v2 memory measurements (`*_memory/*` records). Carry top-level - /// `peak_physical_memory` / `peak_virtual_memory` / - /// `physical_memory_delta` / `virtual_memory_delta` fields that - /// `V2Record` doesn't deserialize. Not migrated for alpha; merging - /// into the corresponding QueryMeasurement row is future work. - HistoricalMemory, -} - -/// Engines the v3 emitter produces today. Mirrors -/// `vortex-bench/src/lib.rs::Engine`. Anything else is historical and gets -/// bucketed as `Skip::Deprecated`. 
-const V3_ENGINES: &[&str] = &["datafusion", "duckdb", "vortex", "arrow"]; - -/// Formats the v3 emitter produces today (`Format::name()` values from -/// `vortex-bench/src/lib.rs`). -const V3_FORMATS: &[&str] = &[ - "vortex-file-compressed", - "vortex-compact", - "parquet", - "lance", - "csv", - "arrow", - "duckdb", -]; - -/// Query suites the v3 CI runs today. Suites outside this list still -/// classify (so historical analyses stay coherent) but get bucketed -/// as `Skip::Deprecated` so they don't render as orphan charts in v3. -/// -/// `fineweb` is included because `.github/workflows/sql-benchmarks.yml` -/// still has `fineweb` and `fineweb-s3` matrix entries. `gharchive` -/// stays excluded — it's defined in `vortex-bench` but no current -/// workflow runs it. -const V3_QUERY_SUITES: &[&str] = &[ - "clickbench", - "tpch", - "tpcds", - "statpopgen", - "polarsignals", - "fineweb", -]; - -/// Returns true if every dim that v3 stores as a column is on the -/// emitter's current allowlist. Dim values outside the allowlist mean -/// historical-only formats / engines that the v3 UI has nothing to -/// render against. -fn is_v3_dim(bin: &V3Bin) -> bool { - match bin { - V3Bin::Query { engine, format, .. } => { - V3_ENGINES.contains(&engine.as_str()) && V3_FORMATS.contains(&format.as_str()) - } - V3Bin::CompressionTime { format, .. } - | V3Bin::CompressionSize { format, .. } - | V3Bin::RandomAccess { format, .. } => V3_FORMATS.contains(&format.as_str()), - } -} - -/// Outcome of running the classifier on a v2 record. Distinguishes -/// "we know we don't want this" (`Skip`) from "we don't recognize this" -/// (`Unknown`); the migrator's 5% gate fires only on the latter. -#[derive(Debug, Clone)] -pub enum Outcome { - Bin(V3Bin), - Skip(Skip), - Unknown, -} - -/// Like [`classify`], but reports *why* a record was dropped. 
Intended -/// for the migrator so the 5% uncategorized gate doesn't trip on -/// records v2 deliberately doesn't render (ratios, throughput, -/// skipped suites). -pub fn classify_outcome(record: &V2Record) -> Outcome { - if record.name.contains(" throughput") { - return Outcome::Skip(Skip::Throughput); - } - // v2 memory records: e.g. "clickbench_q07_memory/datafusion:parquet". - // Match the `_memory/` infix BEFORE the engine/format split, so they - // route to a known Skip variant instead of slipping through to - // Outcome::Unknown and tripping the 5% gate. - let lower = record.name.to_lowercase(); - if let Some((head, _)) = lower.split_once('/') - && head.ends_with("_memory") - { - return Outcome::Skip(Skip::HistoricalMemory); - } - let Some(group) = get_group(record) else { - return Outcome::Unknown; - }; - if let V2Group::Query { suite_index, .. } = &group - && QUERY_SUITES[*suite_index].skip - { - return Outcome::Skip(Skip::SkippedSuite); - } - let Some(cls) = classify_v2(record) else { - // get_group succeeded but classify_v2 didn't — shape mismatch. - return Outcome::Skip(Skip::UnsupportedShape); - }; - let derived = match &cls.group { - V2Group::Compression => { - let lc = cls.chart.to_lowercase(); - lc.contains("ratio") || lc.contains(':') - } - V2Group::CompressionSize => cls.chart.to_lowercase().contains(':'), - _ => false, - }; - if derived { - return Outcome::Skip(Skip::DerivedRatio); - } - let bin = match &cls.group { - V2Group::RandomAccess => match bin_random_access(record) { - Some(b) => Some(b), - // `bin_random_access` only returns None for malformed - // shapes (empty dataset/pattern segment, empty/`default` - // format). Route them to Skip so the `Outcome::Unknown` - // arm below — and the 5% uncategorized gate in - // `migrate::run` — don't trip on them. 
- None => return Outcome::Skip(Skip::UnsupportedShape), - }, - V2Group::Compression => bin_compression_time(&cls, record), - V2Group::CompressionSize => bin_compression_size(&cls, record), - V2Group::Query { .. } => bin_query(&cls, record), - }; - let Some(bin) = bin else { - return Outcome::Unknown; - }; - if !is_v3_dim(&bin) { - return Outcome::Skip(Skip::Deprecated); - } - if let V2Group::Query { suite_index, .. } = &group - && !V3_QUERY_SUITES.contains(&QUERY_SUITES[*suite_index].prefix) - { - return Outcome::Skip(Skip::Deprecated); - } - Outcome::Bin(bin) -} - -fn bin_random_access(record: &V2Record) -> Option { - // Pull dataset and format from the raw, pre-rename v2 name so v3 - // stores meaningful values. Two raw shapes are supported: - // - // - 4-part `random-access///-tokio-local-disk` - // - 2-part legacy `random-access/-tokio-local-disk` - // - // The 2-part shape is what `random-access-bench`'s `measurement_name` - // emits when called without an `AccessPattern`, and per its source - // comment that path is only taken for the legacy taxi run - // (`if dataset.name() == "taxi"` in `benchmarks/random-access-bench/ - // src/main.rs`). The live v3 emitter `random_access_record` writes - // `dataset="taxi"` for those same measurements, so the historical - // 2-part records are taxi too — assigning `dataset="taxi"` here - // recovers the time series instead of letting it disappear under - // v2's "RANDOM ACCESS" placeholder. Deriving from the raw name - // (rather than `cls.chart`) keeps this independent of v2's - // `normalizeChartName`. - // - // After stripping the `-tokio-local-disk` suffix, map the v2 - // random-access ext label (`vortex`, from `Format::ext()`) to the - // canonical name (`vortex-file-compressed`, from `Format::name()`). - // `parquet` and `lance` match between ext and name. 
The `vortex` - // ext is shared by both `OnDiskVortex` (name - // `vortex-file-compressed`) and `VortexCompact` (name - // `vortex-compact`), but v2's random-access bench only emitted - // `OnDiskVortex`, so mapping to `vortex-file-compressed` is - // correct for all historical data. - // - // Records whose `` segment ends in `-footer` (the bench's - // reopen-mode variant, e.g. `parquet-tokio-local-disk-footer`) - // intentionally do not strip clean to a v3-allowlisted format; the - // outer `is_v3_dim` filter then routes them to `Skip::Deprecated`. - // The live v3 emitter doesn't distinguish reopen vs cached either - // (`random_access_record` uses `format.name()` for both), so - // dropping `-footer` here keeps migration consistent with what - // v3 ingests live. - let parts: Vec<&str> = record.name.split('/').collect(); - let (dataset, raw_format) = match parts.as_slice() { - [_, ds, pat, format] => { - if ds.is_empty() || pat.is_empty() { - return None; - } - (format!("{ds}/{pat}").to_lowercase(), *format) - } - [_, format] => ("taxi".to_string(), *format), - _ => return None, - }; - if raw_format.is_empty() || raw_format == "default" { - return None; - } - let stripped = raw_format - .strip_suffix("-tokio-local-disk") - .unwrap_or(raw_format); - let format = match stripped { - "vortex" => "vortex-file-compressed".to_string(), - other => other.to_lowercase(), - }; - Some(V3Bin::RandomAccess { dataset, format }) -} - -fn bin_compression_time(cls: &V2Classification, _record: &V2Record) -> Option { - // v2 compression chart names look like (after format_query): - // "COMPRESS TIME" [vortex/encode] - // "DECOMPRESS TIME" [vortex/decode] - // "PARQUET RS ZSTD COMPRESS TIME" [parquet/encode] - // "PARQUET RS ZSTD DECOMPRESS TIME" [parquet/decode] - // "LANCE COMPRESS TIME" [lance/encode] - // "LANCE DECOMPRESS TIME" [lance/decode] - // "VORTEX:LANCE RATIO COMPRESS TIME" [drop] - // "VORTEX:PARQUET-ZSTD RATIO COMPRESS TIME" [drop] - // "VORTEX:RAW RATIO COMPRESS 
TIME" [drop] - let lc = cls.chart.to_lowercase(); - if lc.contains("ratio") || lc.contains(':') { - // Ratios are computed at read time from compression_sizes. - return None; - } - let (format, op) = if lc.starts_with("compress time") { - ("vortex-file-compressed", "encode") - } else if lc.starts_with("decompress time") { - ("vortex-file-compressed", "decode") - } else if lc.starts_with("parquet rs zstd compress time") { - ("parquet", "encode") - } else if lc.starts_with("parquet rs zstd decompress time") { - ("parquet", "decode") - } else if lc.starts_with("lance compress time") { - ("lance", "encode") - } else if lc.starts_with("lance decompress time") { - ("lance", "decode") - } else { - return None; - }; - let dataset = cls.series.to_lowercase(); - if dataset.is_empty() || dataset == "default" { - return None; - } - Some(V3Bin::CompressionTime { - dataset, - dataset_variant: None, - format: format.to_string(), - op: op.to_string(), - }) -} - -fn bin_compression_size(cls: &V2Classification, record: &V2Record) -> Option { - let lc = cls.chart.to_lowercase(); - // Ratios like "VORTEX:PARQUET ZSTD SIZE" / "VORTEX:LANCE SIZE" / - // "VORTEX:RAW SIZE" are derived from compression_sizes at read - // time, not stored. - if lc.contains(':') { - return None; - } - // `parquet-zstd size` shares a leading "parquet" with `parquet size`, - // so check the more specific prefix first. `format_query` upper-cases - // and replaces `-`/`_` with spaces, so the chart we match against is - // `"PARQUET ZSTD SIZE"` (no hyphen) — same convention as the existing - // `"parquet rs zstd compress time"` branches above. 
- let format = if lc.starts_with("vortex size") { - "vortex-file-compressed" - } else if lc.starts_with("parquet zstd size") { - "parquet-zstd" - } else if lc.starts_with("parquet size") { - "parquet" - } else if lc.starts_with("lance size") { - "lance" - } else { - return None; - }; - let dataset = cls.series.to_lowercase(); - if dataset.is_empty() || dataset == "default" { - return None; - } - // Mirror the file-sizes ingest path's dataset_variant derivation - // (see `migrate::migrate_file_sizes`): pull the SF out of the v2 - // record's `dataset` object when present and run it through - // `canonical_scale_factor` so `"1"`, `"1.0"`, `"10"` and `"10.0"` - // collapse to one canonical form. Without this both code paths - // produce the same `mid` only by accident, so SF=10 file-sizes - // rows wouldn't merge with the matching data.json.gz - // "vortex size/tpch" rows when one side wrote `"10"` and the - // other wrote `"10.0"`. - let dataset_variant = crate::v2::canonical_scale_factor( - record - .dataset - .as_ref() - .and_then(|d| crate::v2::dataset_scale_factor(d, dataset.as_str())) - .as_deref(), - ); - Some(V3Bin::CompressionSize { - dataset, - dataset_variant, - format: format.to_string(), - }) -} - -fn bin_query(cls: &V2Classification, record: &V2Record) -> Option { - let V2Group::Query { - suite_index, - storage, - scale_factor, - } = &cls.group - else { - return None; - }; - let suite = &QUERY_SUITES[*suite_index]; - - // Pull the query index from the *raw* name's first part instead of - // the formatted chart, so we don't have to round-trip "Q07". - let raw_first = record.name.split('/').next().unwrap_or(""); - let query_idx = parse_query_index_from_first(raw_first)?; - - // Pull engine:format from the raw, pre-rename second segment so v3 - // stores canonical `Format::name()` strings (e.g. - // `vortex-file-compressed`) that match what the v3 live emitter - // writes. 
`cls.series` has been through v2's `ENGINE_RENAMES` for - // UI display and is not appropriate for v3 columns. - // - // Older v2 records emitted display-case engines (e.g. `DataFusion`, - // `DuckDB`); newer ones emit lowercase. Lowercase here so dedup - // collapses both spellings into a single canonical row. - let raw_series = record.name.split('/').nth(1)?; - let (engine, format) = split_engine_format(raw_series)?; - let engine = engine.to_lowercase(); - let format = format.to_lowercase(); - - let storage_v3 = match storage.as_deref() { - Some("S3") => "s3".to_string(), - Some("NVMe") => "nvme".to_string(), - _ => "nvme".to_string(), - }; - - // ClickBench's "flavor" lives in `dataset_variant`, but v2 record names - // never encoded it — leave it `None` so historical and live rows merge - // (the live emitter does the same; see `vortex-bench/src/v3.rs`'s - // `benchmark_dataset_dims` for the matching shape). - Some(V3Bin::Query { - dataset: suite.prefix.to_string(), - dataset_variant: None, - scale_factor: scale_factor.clone(), - query_idx, - storage: storage_v3, - engine, - format, - }) -} - -/// Pull the integer query index out of the leading name part, which is -/// always `_q` or ` q` for SQL query records. -fn parse_query_index_from_first(first: &str) -> Option { - let lower = first.to_lowercase(); - for suite in QUERY_SUITES { - if let Some(rest) = lower.strip_prefix(suite.prefix) - && let Some(idx) = parse_query_index(rest) - { - return Some(idx as i32); - } - } - None -} - -/// Split a renamed series like `datafusion:parquet` into -/// `(engine, format)`. Returns `None` for series with no `:` since -/// v3 requires both columns. 
-fn split_engine_format(series: &str) -> Option<(String, String)> { - let mut split = series.splitn(2, ':'); - let engine = split.next()?.trim().to_string(); - let format = split.next()?.trim().to_string(); - if engine.is_empty() || format.is_empty() { - return None; - } - Some((engine, format)) -} - -#[cfg(test)] -mod tests { - use anyhow::Context as _; - - use super::*; - - fn record(name: &str) -> V2Record { - V2Record { - name: name.to_string(), - commit_id: Some("deadbeef".into()), - unit: None, - value: None, - storage: None, - dataset: None, - all_runtimes: None, - env_triple: None, - } - } - - #[test] - fn format_query_round_trips() { - assert_eq!(format_query("clickbench_q07"), "CLICKBENCH Q7"); - assert_eq!(format_query("tpch_q01"), "TPC-H Q1"); - assert_eq!(format_query("tpcds_q42"), "TPC-DS Q42"); - assert_eq!(format_query("statpopgen_q3"), "STATPOPGEN Q3"); - assert_eq!(format_query("foo bar"), "FOO BAR"); - } - - #[test] - fn rename_engine_canonicalizes_disk_names() { - assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme"); - assert_eq!( - rename_engine("datafusion:vortex-file-compressed"), - "datafusion:vortex" - ); - assert_eq!(rename_engine("unknown-engine"), "unknown-engine"); - } - - #[test] - fn parse_query_index_handles_separators() { - assert_eq!(parse_query_index("_q07"), Some(7)); - assert_eq!(parse_query_index(" q7"), Some(7)); - assert_eq!(parse_query_index("q42"), Some(42)); - assert_eq!(parse_query_index("xq7"), None); - } - - #[test] - fn random_access_bins_dataset_pattern() -> anyhow::Result<()> { - let bin = classify(&record("random-access/taxi/take/parquet")) - .context("classify returned None for a known-good 4-part random-access name")?; - assert_eq!( - bin, - V3Bin::RandomAccess { - dataset: "taxi/take".into(), - format: "parquet".into(), - } - ); - Ok(()) - } -} diff --git a/benchmarks-website/migrate/src/commits.rs b/benchmarks-website/migrate/src/commits.rs deleted file mode 100644 index a9c5f056cd7..00000000000 
--- a/benchmarks-website/migrate/src/commits.rs +++ /dev/null @@ -1,97 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Commit upserts. Adapts a [`crate::v2::V2Commit`] into the v3 -//! `commits` row shape (a [`vortex_bench_server::records::CommitInfo`]). - -use anyhow::Context as _; -use anyhow::Result; -use duckdb::Transaction; -use duckdb::params; - -use crate::v2::V2Commit; - -/// Insert a v3 `commits` row for one v2 commit. `tree_sha` and `url` -/// remain required and use a warning-bearing empty-string fallback; -/// the human-input fields (message, author/committer name and email) -/// are nullable in the v3 schema, so empty / missing values map to -/// SQL `NULL` instead of an empty string the UI would render as a -/// blank cell. -pub fn upsert_commit(tx: &Transaction<'_>, commit: &V2Commit) -> Result { - let mut warnings = Vec::new(); - let timestamp = require_field(&commit.timestamp, "timestamp", &commit.id, &mut warnings); - let message = optional_field(&commit.message); - let author_name = optional_field(&commit.author.as_ref().and_then(|p| p.name.clone())); - let author_email = optional_field(&commit.author.as_ref().and_then(|p| p.email.clone())); - let committer_name = optional_field(&commit.committer.as_ref().and_then(|p| p.name.clone())); - let committer_email = optional_field(&commit.committer.as_ref().and_then(|p| p.email.clone())); - let tree_sha = require_field(&commit.tree_id, "tree_id", &commit.id, &mut warnings); - let url = require_field(&commit.url, "url", &commit.id, &mut warnings); - - tx.execute( - r#" - INSERT INTO commits ( - commit_sha, timestamp, message, author_name, author_email, - committer_name, committer_email, tree_sha, url - ) VALUES (?, CAST(? AS TIMESTAMPTZ), ?, ?, ?, ?, ?, ?, ?) 
- ON CONFLICT (commit_sha) DO UPDATE SET - timestamp = excluded.timestamp, - message = excluded.message, - author_name = excluded.author_name, - author_email = excluded.author_email, - committer_name = excluded.committer_name, - committer_email = excluded.committer_email, - tree_sha = excluded.tree_sha, - url = excluded.url - "#, - params![ - commit.id, - timestamp, - message, - author_name, - author_email, - committer_name, - committer_email, - tree_sha, - url, - ], - ) - .with_context(|| format!("upserting commit {}", commit.id))?; - Ok(UpsertOutcome { warnings }) -} - -fn require_field( - field: &Option, - name: &str, - sha: &str, - warnings: &mut Vec, -) -> String { - match field { - Some(s) => s.clone(), - None => { - warnings.push(format!("commit {sha} missing {name}")); - String::new() - } - } -} - -/// Coerce a v2-supplied `Option` into a SQL-bindable -/// `Option`, treating an empty / whitespace-only value as -/// missing. v2 sometimes wrote `""` for blank author / committer / -/// message fields; storing those as actual `NULL` lets the UI -/// distinguish "missing metadata" from "deliberately blank". -fn optional_field(field: &Option) -> Option { - field - .as_deref() - .map(str::trim) - .filter(|s| !s.is_empty()) - .map(str::to_string) -} - -/// Per-call warning bag returned to the caller for logging. -#[derive(Debug, Default)] -pub struct UpsertOutcome { - /// Human-readable warnings — typically one per missing required field on - /// the v2 commit (timestamp, tree_id, url). - pub warnings: Vec, -} diff --git a/benchmarks-website/migrate/src/lib.rs b/benchmarks-website/migrate/src/lib.rs deleted file mode 100644 index b5aa72bc97d..00000000000 --- a/benchmarks-website/migrate/src/lib.rs +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! One-shot historical migrator from v2's S3-hosted benchmark dataset -//! to a v3 DuckDB file. -//! -//! 
The v2 dataset is JSONL of bare benchmark records keyed by name string. -//! v3 uses five typed fact tables with explicit dim columns. This crate -//! ports v2's `getGroup` classifier (in `benchmarks-website/server.js`) -//! bug-for-bug so that historical rows survive the migration with the -//! same group / chart / series structure as the live v2 server. -//! -//! The migrator is throwaway: once v3 cuts over, both the binary and -//! the classifier go away. - -/// Routing v2 records into v3 fact tables, ported from v2's `getGroup`. -pub mod classifier; -/// V2 commit -> v3 `commits` row upserts. -pub mod commits; -/// End-to-end migration of v2 dumps into a v3 DuckDB. -pub mod migrate; -/// Streaming readers for the v2 S3 bucket and local dumps. -pub mod source; -/// Wire shapes of the v2 benchmark dataset. -pub mod v2; -/// Structural diff between a migrated v3 DuckDB and v2's `/api/metadata`. -pub mod verify; diff --git a/benchmarks-website/migrate/src/main.rs b/benchmarks-website/migrate/src/main.rs deleted file mode 100644 index abd3a4a8a83..00000000000 --- a/benchmarks-website/migrate/src/main.rs +++ /dev/null @@ -1,131 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! `vortex-bench-migrate` CLI: a one-shot historical migrator from -//! v2's S3 dataset into a v3 DuckDB file, plus a structural diff -//! against the live v2 `/api/metadata` endpoint for spotting -//! classifier regressions. - -use std::path::PathBuf; -use std::process::ExitCode; - -use anyhow::Context as _; -use anyhow::Result; -use clap::Parser; -use clap::Subcommand; -use clap::ValueEnum; -use tracing_subscriber::EnvFilter; -use vortex_bench_migrate::migrate; -use vortex_bench_migrate::source::Source; -use vortex_bench_migrate::verify; - -/// One-shot historical migrator from v2's S3 dataset to v3 DuckDB. 
-#[derive(Debug, Parser)] -#[command(name = "vortex-bench-migrate", version, about)] -struct Cli { - #[command(subcommand)] - command: Command, -} - -#[derive(Debug, Subcommand)] -enum Command { - /// Read v2's data.json.gz / commits.json / file-sizes-*.json.gz - /// and write a fully populated v3 DuckDB at `--output`. - Run { - /// Path to write the v3 DuckDB to. Created if absent. - #[arg(long)] - output: PathBuf, - /// Where to fetch v2 dumps from. - #[arg(long, value_enum, default_value_t = SourceKind::PublicS3)] - source: SourceKind, - /// For `--source=local`, the directory containing - /// `data.json.gz`, `commits.json`, and `file-sizes-*.json.gz`. - #[arg(long, required_if_eq("source", "local"))] - source_dir: Option, - /// Continue past per-`file-sizes-*.json.gz` failures rather than - /// failing the migration. By default a single failed - /// `file-sizes-*` source is an error, because a "successful" - /// migrated DB with missing compression-size history is a worse - /// outcome than a loud failure that the operator can retry. Pass - /// this flag when you genuinely want partial coverage (e.g. one - /// known-bad source file you want to skip). - #[arg(long, default_value_t = false)] - allow_missing_file_sizes: bool, - }, - /// Diff a migrated DuckDB against the live v2 `/api/metadata` - /// endpoint. Exits 0 if every v2 group is present in v3, 1 - /// otherwise so this can gate a CI step. - Verify { - /// HTTPS root of a running v2 server (e.g. `https://bench.vortex.dev`). - #[arg(long)] - against: String, - /// Path to the migrated v3 DuckDB. 
- #[arg(long)] - duckdb: PathBuf, - }, -} - -#[derive(Debug, Clone, Copy, ValueEnum)] -enum SourceKind { - PublicS3, - Local, -} - -fn main() -> ExitCode { - if let Err(err) = run() { - eprintln!("error: {err:#}"); - return ExitCode::from(2); - } - ExitCode::SUCCESS -} - -fn run() -> Result<()> { - tracing_subscriber::fmt() - .with_env_filter( - EnvFilter::try_from_env("VORTEX_BENCH_LOG").unwrap_or_else(|_| EnvFilter::new("info")), - ) - .init(); - - let cli = Cli::parse(); - match cli.command { - Command::Run { - output, - source, - source_dir, - allow_missing_file_sizes, - } => { - let source = match source { - SourceKind::PublicS3 => Source::PublicS3, - SourceKind::Local => { - Source::Local(source_dir.context("--source=local requires --source-dir")?) - } - }; - let summary = migrate::run(&source, &output)?; - print!("{summary}"); - if summary.uncategorized_fraction() > 0.05 { - anyhow::bail!( - "uncategorized records ({:.2}%) exceed the 5% gate; \ - stop and report unmatched prefixes (see summary above) \ - before proceeding", - 100.0 * summary.uncategorized_fraction() - ); - } - if summary.file_sizes_failed > 0 && !allow_missing_file_sizes { - anyhow::bail!( - "{} file-sizes-*.json.gz source file(s) failed (see warnings above); \ - re-run with --allow-missing-file-sizes if partial coverage is intended", - summary.file_sizes_failed - ); - } - Ok(()) - } - Command::Verify { against, duckdb } => { - let report = verify::run(&against, &duckdb)?; - print!("{report}"); - if !report.v2_groups_covered() { - std::process::exit(1); - } - Ok(()) - } - } -} diff --git a/benchmarks-website/migrate/src/migrate/accum.rs b/benchmarks-website/migrate/src/migrate/accum.rs deleted file mode 100644 index 69f16fe4b93..00000000000 --- a/benchmarks-website/migrate/src/migrate/accum.rs +++ /dev/null @@ -1,357 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! 
Per-fact-table row accumulators + their `RecordBatch` builders. -//! -//! Each `*Accum` collects classified records during the streaming JSONL -//! pass and then materialises one Arrow `RecordBatch` per fact table at -//! flush time. Three of the four use parallel column vectors with a -//! `seen` map keyed by `measurement_id`; `CompressionSizeAccum` is a -//! `HashMap` because it has two collision semantics -//! (replace from `data.json.gz`, sum from `file-sizes-*.json.gz`). - -use std::sync::Arc; - -use anyhow::Result; -use arrow_array::ArrayRef; -use arrow_array::Int32Array; -use arrow_array::Int64Array; -use arrow_array::ListArray; -use arrow_array::RecordBatch; -use arrow_array::StringArray; -use arrow_buffer::OffsetBuffer; -use arrow_schema::DataType; -use arrow_schema::Field; -use arrow_schema::Schema; -use vortex_bench_server::records::CompressionSize; -use vortex_bench_server::records::CompressionTime; -use vortex_bench_server::records::QueryMeasurement; -use vortex_bench_server::records::RandomAccessTime; -use vortex_utils::aliases::hash_map::HashMap; - -use super::MigrationSummary; - -/// `query_measurements` accumulator. Parallel column vectors plus a -/// `measurement_id`-keyed seen map; first-write wins on collision. -#[derive(Default)] -pub(super) struct QueryAccum { - pub(super) measurement_id: Vec, - pub(super) commit_sha: Vec, - pub(super) dataset: Vec, - pub(super) dataset_variant: Vec>, - pub(super) scale_factor: Vec>, - pub(super) query_idx: Vec, - pub(super) storage: Vec, - pub(super) engine: Vec, - pub(super) format: Vec, - pub(super) value_ns: Vec, - pub(super) all_runtimes_ns: Vec>, - pub(super) peak_physical: Vec>, - pub(super) peak_virtual: Vec>, - pub(super) physical_delta: Vec>, - pub(super) virtual_delta: Vec>, - pub(super) env_triple: Vec>, - /// `mid` -> index in the parallel column vecs. Lets us look up the - /// kept row's `value_ns` on collision so we can flag conflicts. 
- pub(super) seen: HashMap, -} - -impl QueryAccum { - pub(super) fn push(&mut self, mid: i64, r: QueryMeasurement, summary: &mut MigrationSummary) { - if let Some(&idx) = self.seen.get(&mid) { - summary.deduped += 1; - if self.value_ns[idx] != r.value_ns { - summary.deduped_with_conflict += 1; - } - return; - } - let idx = self.measurement_id.len(); - self.seen.insert(mid, idx); - self.measurement_id.push(mid); - self.commit_sha.push(r.commit_sha); - self.dataset.push(r.dataset); - self.dataset_variant.push(r.dataset_variant); - self.scale_factor.push(r.scale_factor); - self.query_idx.push(r.query_idx); - self.storage.push(r.storage); - self.engine.push(r.engine); - self.format.push(r.format); - self.value_ns.push(r.value_ns); - self.all_runtimes_ns.push(r.all_runtimes_ns); - self.peak_physical.push(r.peak_physical); - self.peak_virtual.push(r.peak_virtual); - self.physical_delta.push(r.physical_delta); - self.virtual_delta.push(r.virtual_delta); - self.env_triple.push(r.env_triple); - } -} - -/// `compression_times` accumulator. Same shape as [`QueryAccum`] minus the -/// query-only columns. 
-#[derive(Default)] -pub(super) struct CompressionTimeAccum { - pub(super) measurement_id: Vec, - pub(super) commit_sha: Vec, - pub(super) dataset: Vec, - pub(super) dataset_variant: Vec>, - pub(super) format: Vec, - pub(super) op: Vec, - pub(super) value_ns: Vec, - pub(super) all_runtimes_ns: Vec>, - pub(super) env_triple: Vec>, - pub(super) seen: HashMap, -} - -impl CompressionTimeAccum { - pub(super) fn push(&mut self, mid: i64, r: CompressionTime, summary: &mut MigrationSummary) { - if let Some(&idx) = self.seen.get(&mid) { - summary.deduped += 1; - if self.value_ns[idx] != r.value_ns { - summary.deduped_with_conflict += 1; - } - return; - } - let idx = self.measurement_id.len(); - self.seen.insert(mid, idx); - self.measurement_id.push(mid); - self.commit_sha.push(r.commit_sha); - self.dataset.push(r.dataset); - self.dataset_variant.push(r.dataset_variant); - self.format.push(r.format); - self.op.push(r.op); - self.value_ns.push(r.value_ns); - self.all_runtimes_ns.push(r.all_runtimes_ns); - self.env_triple.push(r.env_triple); - } -} - -/// `random_access_times` accumulator. Smallest of the three parallel-vec -/// accumulators. 
-#[derive(Default)] -pub(super) struct RandomAccessAccum { - pub(super) measurement_id: Vec, - pub(super) commit_sha: Vec, - pub(super) dataset: Vec, - pub(super) format: Vec, - pub(super) value_ns: Vec, - pub(super) all_runtimes_ns: Vec>, - pub(super) env_triple: Vec>, - pub(super) seen: HashMap, -} - -impl RandomAccessAccum { - pub(super) fn push(&mut self, mid: i64, r: RandomAccessTime, summary: &mut MigrationSummary) { - if let Some(&idx) = self.seen.get(&mid) { - summary.deduped += 1; - if self.value_ns[idx] != r.value_ns { - summary.deduped_with_conflict += 1; - } - return; - } - let idx = self.measurement_id.len(); - self.seen.insert(mid, idx); - self.measurement_id.push(mid); - self.commit_sha.push(r.commit_sha); - self.dataset.push(r.dataset); - self.format.push(r.format); - self.value_ns.push(r.value_ns); - self.all_runtimes_ns.push(r.all_runtimes_ns); - self.env_triple.push(r.env_triple); - } -} - -/// `compression_sizes` is fed by both `data.json.gz` (replace-on-collision) -/// and `file-sizes-*.json.gz` (sum-on-collision). Stored as a map; converted -/// to a `RecordBatch` at flush time. -#[derive(Default)] -pub(super) struct CompressionSizeAccum { - pub(super) rows: HashMap, -} - -impl CompressionSizeAccum { - /// data.json.gz path: latest write wins, mirroring the prior - /// `ON CONFLICT DO UPDATE SET value_bytes = excluded.value_bytes`. - /// Bumps `deduped_with_conflict` when an existing row's - /// `value_bytes` differs from the incoming row's, so silent - /// value-corruption is observable. 
- pub(super) fn push_replace( - &mut self, - mid: i64, - r: CompressionSize, - summary: &mut MigrationSummary, - ) { - if let Some(existing) = self.rows.get(&mid) - && existing.value_bytes != r.value_bytes - { - summary.deduped_with_conflict += 1; - } - self.rows.insert(mid, r); - } - - /// file-sizes-*.json.gz path: per-file rows aggregate into one - /// `(commit, dataset, dataset_variant, format)` row by summing, - /// mirroring the prior `value_bytes = compression_sizes.value_bytes - /// + excluded.value_bytes`. - pub(super) fn push_sum(&mut self, mid: i64, r: CompressionSize) { - let add = r.value_bytes; - self.rows - .entry(mid) - .and_modify(|x| x.value_bytes += add) - .or_insert(r); - } -} - -pub(super) fn build_query_batch(a: QueryAccum) -> Result { - let schema = Arc::new(Schema::new(vec![ - Field::new("measurement_id", DataType::Int64, false), - Field::new("commit_sha", DataType::Utf8, false), - Field::new("dataset", DataType::Utf8, false), - Field::new("dataset_variant", DataType::Utf8, true), - Field::new("scale_factor", DataType::Utf8, true), - Field::new("query_idx", DataType::Int32, false), - Field::new("storage", DataType::Utf8, false), - Field::new("engine", DataType::Utf8, false), - Field::new("format", DataType::Utf8, false), - Field::new("value_ns", DataType::Int64, false), - Field::new( - "all_runtimes_ns", - DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), - false, - ), - Field::new("peak_physical", DataType::Int64, true), - Field::new("peak_virtual", DataType::Int64, true), - Field::new("physical_delta", DataType::Int64, true), - Field::new("virtual_delta", DataType::Int64, true), - Field::new("env_triple", DataType::Utf8, true), - ])); - let cols: Vec = vec![ - Arc::new(Int64Array::from(a.measurement_id)), - Arc::new(StringArray::from(a.commit_sha)), - Arc::new(StringArray::from(a.dataset)), - Arc::new(StringArray::from(a.dataset_variant)), - Arc::new(StringArray::from(a.scale_factor)), - 
Arc::new(Int32Array::from(a.query_idx)), - Arc::new(StringArray::from(a.storage)), - Arc::new(StringArray::from(a.engine)), - Arc::new(StringArray::from(a.format)), - Arc::new(Int64Array::from(a.value_ns)), - Arc::new(build_list_int64(a.all_runtimes_ns)), - Arc::new(Int64Array::from(a.peak_physical)), - Arc::new(Int64Array::from(a.peak_virtual)), - Arc::new(Int64Array::from(a.physical_delta)), - Arc::new(Int64Array::from(a.virtual_delta)), - Arc::new(StringArray::from(a.env_triple)), - ]; - Ok(RecordBatch::try_new(schema, cols)?) -} - -pub(super) fn build_compression_time_batch(a: CompressionTimeAccum) -> Result { - let schema = Arc::new(Schema::new(vec![ - Field::new("measurement_id", DataType::Int64, false), - Field::new("commit_sha", DataType::Utf8, false), - Field::new("dataset", DataType::Utf8, false), - Field::new("dataset_variant", DataType::Utf8, true), - Field::new("format", DataType::Utf8, false), - Field::new("op", DataType::Utf8, false), - Field::new("value_ns", DataType::Int64, false), - Field::new( - "all_runtimes_ns", - DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), - false, - ), - Field::new("env_triple", DataType::Utf8, true), - ])); - let cols: Vec = vec![ - Arc::new(Int64Array::from(a.measurement_id)), - Arc::new(StringArray::from(a.commit_sha)), - Arc::new(StringArray::from(a.dataset)), - Arc::new(StringArray::from(a.dataset_variant)), - Arc::new(StringArray::from(a.format)), - Arc::new(StringArray::from(a.op)), - Arc::new(Int64Array::from(a.value_ns)), - Arc::new(build_list_int64(a.all_runtimes_ns)), - Arc::new(StringArray::from(a.env_triple)), - ]; - Ok(RecordBatch::try_new(schema, cols)?) 
-} - -pub(super) fn build_random_access_batch(a: RandomAccessAccum) -> Result { - let schema = Arc::new(Schema::new(vec![ - Field::new("measurement_id", DataType::Int64, false), - Field::new("commit_sha", DataType::Utf8, false), - Field::new("dataset", DataType::Utf8, false), - Field::new("format", DataType::Utf8, false), - Field::new("value_ns", DataType::Int64, false), - Field::new( - "all_runtimes_ns", - DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), - false, - ), - Field::new("env_triple", DataType::Utf8, true), - ])); - let cols: Vec = vec![ - Arc::new(Int64Array::from(a.measurement_id)), - Arc::new(StringArray::from(a.commit_sha)), - Arc::new(StringArray::from(a.dataset)), - Arc::new(StringArray::from(a.format)), - Arc::new(Int64Array::from(a.value_ns)), - Arc::new(build_list_int64(a.all_runtimes_ns)), - Arc::new(StringArray::from(a.env_triple)), - ]; - Ok(RecordBatch::try_new(schema, cols)?) -} - -pub(super) fn build_compression_size_batch(a: CompressionSizeAccum) -> Result { - let n = a.rows.len(); - let mut measurement_id = Vec::with_capacity(n); - let mut commit_sha = Vec::with_capacity(n); - let mut dataset = Vec::with_capacity(n); - let mut dataset_variant = Vec::with_capacity(n); - let mut format = Vec::with_capacity(n); - let mut value_bytes = Vec::with_capacity(n); - for (mid, cs) in a.rows { - measurement_id.push(mid); - commit_sha.push(cs.commit_sha); - dataset.push(cs.dataset); - dataset_variant.push(cs.dataset_variant); - format.push(cs.format); - value_bytes.push(cs.value_bytes); - } - let schema = Arc::new(Schema::new(vec![ - Field::new("measurement_id", DataType::Int64, false), - Field::new("commit_sha", DataType::Utf8, false), - Field::new("dataset", DataType::Utf8, false), - Field::new("dataset_variant", DataType::Utf8, true), - Field::new("format", DataType::Utf8, false), - Field::new("value_bytes", DataType::Int64, false), - ])); - let cols: Vec = vec![ - Arc::new(Int64Array::from(measurement_id)), - 
Arc::new(StringArray::from(commit_sha)), - Arc::new(StringArray::from(dataset)), - Arc::new(StringArray::from(dataset_variant)), - Arc::new(StringArray::from(format)), - Arc::new(Int64Array::from(value_bytes)), - ]; - Ok(RecordBatch::try_new(schema, cols)?) -} - -/// Build a non-nullable `List` Arrow array from one inner Vec -/// per row. The outer list is non-null; inner i64 values are non-null. -fn build_list_int64(values: Vec>) -> ListArray { - let mut offsets: Vec = Vec::with_capacity(values.len() + 1); - offsets.push(0); - let mut flat: Vec = Vec::new(); - for inner in values { - flat.extend_from_slice(&inner); - offsets.push(flat.len() as i32); - } - let values_arr = Int64Array::from(flat); - let field = Arc::new(Field::new("item", DataType::Int64, false)); - ListArray::new( - field, - OffsetBuffer::new(offsets.into()), - Arc::new(values_arr), - None, - ) -} diff --git a/benchmarks-website/migrate/src/migrate/mod.rs b/benchmarks-website/migrate/src/migrate/mod.rs deleted file mode 100644 index 6c58fe459ed..00000000000 --- a/benchmarks-website/migrate/src/migrate/mod.rs +++ /dev/null @@ -1,652 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! End-to-end migration of one v2 dataset into a v3 DuckDB file. -//! -//! Streams `data.json.gz` line-by-line, runs each record through the -//! [`crate::classifier`], and writes one row per record into the appropriate -//! v3 fact table. Every row's `measurement_id` is computed via the server's -//! `measurement_id_*` functions so the result is byte-compatible with what -//! fresh `/api/ingest` would have produced. -//! -//! Bulk-load shape: rows are accumulated in memory as parallel column -//! vectors, deduplicated by `measurement_id`, then flushed to DuckDB -//! via `Appender::append_record_batch` as one Arrow `RecordBatch` per -//! fact table. 
- -mod accum; - -use std::collections::BTreeMap; -use std::io::BufRead; -use std::path::Path; -use std::time::Duration; -use std::time::Instant; - -use anyhow::Context as _; -use anyhow::Result; -use arrow_array::RecordBatch; -use duckdb::Connection; -use tracing::info; -use tracing::warn; -use vortex_bench_server::db::measurement_id_compression_size; -use vortex_bench_server::db::measurement_id_compression_time; -use vortex_bench_server::db::measurement_id_query; -use vortex_bench_server::db::measurement_id_random_access; -use vortex_bench_server::family; -use vortex_bench_server::records::CompressionSize; -use vortex_bench_server::records::CompressionTime; -use vortex_bench_server::records::QueryMeasurement; -use vortex_bench_server::records::RandomAccessTime; -use vortex_bench_server::schema::COMMITS_DDL; - -use self::accum::CompressionSizeAccum; -use self::accum::CompressionTimeAccum; -use self::accum::QueryAccum; -use self::accum::RandomAccessAccum; -use self::accum::build_compression_size_batch; -use self::accum::build_compression_time_batch; -use self::accum::build_query_batch; -use self::accum::build_random_access_batch; -use crate::classifier; -use crate::classifier::V3Bin; -use crate::commits::upsert_commit; -use crate::source::KNOWN_FILE_SIZES_SUITES; -use crate::source::Source; -use crate::v2::V2Commit; -use crate::v2::V2FileSize; -use crate::v2::V2Record; -use crate::v2::canonical_scale_factor; -use crate::v2::index_commits; -use crate::v2::runtime_as_i64; -use crate::v2::value_as_f64; - -/// Per-table insert counts, plus skip / missing counts. -#[derive(Debug, Default, Clone)] -pub struct MigrationSummary { - /// Lines read from `data.json.gz`. - pub records_read: u64, - /// Rows successfully inserted into `query_measurements`. - pub query_inserted: u64, - /// Rows successfully inserted into `compression_times`. - pub compression_time_inserted: u64, - /// Rows successfully inserted into `compression_sizes`. 
- pub compression_size_inserted: u64, - /// Rows successfully inserted into `random_access_times`. - pub random_access_inserted: u64, - /// `file-sizes-*.json.gz` lines folded into `compression_sizes`. - pub file_size_inserted: u64, - /// Records the classifier returned `Unknown` for. - pub uncategorized: u64, - /// Top-level prefix histogram of uncategorised records, for triage. - pub uncategorized_prefixes: BTreeMap, - /// Records whose `commit_id` doesn't match any commit in `commits.jsonl`. - pub missing_commit: u64, - /// Warnings emitted while upserting commits (e.g. missing tree SHA). - pub commit_warnings: u64, - /// Records dropped because their `value` was missing or non-numeric. - pub skipped_no_value: u64, - /// Records the classifier returned `Skip(reason)` for. - pub skipped_intentional: u64, - /// Commits upserted into the `commits` dim table. - pub commits_inserted: u64, - /// Records dropped by dedup because their `measurement_id` collided - /// with a previously kept row. - pub deduped: u64, - /// Number of records dropped by dedup whose `value_ns` (or - /// `value_bytes` for compression_sizes' replace path) differed - /// from the kept row's. Non-zero is a smell worth investigating. - pub deduped_with_conflict: u64, - /// `file-sizes-*.json.gz` source files that failed to download / - /// decode / parse. Non-zero means the migrated DB has missing - /// compression-size history from at least one v2 source file; the - /// CLI fails by default in that case unless - /// `--allow-missing-file-sizes` is passed. - pub file_sizes_failed: u64, -} - -impl MigrationSummary { - /// Total `data.json.gz` records that landed in some v3 fact table. - pub fn total_inserted(&self) -> u64 { - self.query_inserted - + self.compression_time_inserted - + self.compression_size_inserted - + self.random_access_inserted - } - - /// Fraction of records that were uncategorized. The orchestrator - /// stops if this exceeds the documented 5% threshold. 
- pub fn uncategorized_fraction(&self) -> f64 { - if self.records_read == 0 { - return 0.0; - } - self.uncategorized as f64 / self.records_read as f64 - } -} - -/// Open or create a DuckDB at `path` and apply the v3 schema. The -/// migrator is a one-shot fresh load; the bulk-append flush is pure -/// insert (no `ON CONFLICT`), so any stale rows in `path` would clash -/// with the next run on the same primary keys. Delete both the -/// database file and its WAL companion up front so every run starts -/// from a known-empty state. -pub fn open_target_db(path: &Path) -> Result { - remove_if_exists(path)?; - let wal = wal_path(path); - remove_if_exists(&wal)?; - let conn = - Connection::open(path).with_context(|| format!("opening DuckDB at {}", path.display()))?; - // Apply the v3 schema. Drives off the per-fact-table `family::Family` - // registry the same way `vortex_bench_server::db::open` does - adding - // a sixth fact table only needs a new const there, not an edit here. - conn.execute_batch(COMMITS_DDL) - .context("applying commits dim DDL")?; - for fam in family::FAMILIES { - conn.execute_batch(fam.schema_ddl) - .with_context(|| format!("applying {} DDL", fam.table_name))?; - } - Ok(conn) -} - -fn remove_if_exists(path: &Path) -> Result<()> { - match std::fs::remove_file(path) { - Ok(()) => { - info!(path = %path.display(), "removed pre-existing target file"); - Ok(()) - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()), - Err(e) => Err(e).with_context(|| format!("removing {}", path.display())), - } -} - -/// DuckDB writes its write-ahead log next to the database file with a -/// `.wal` suffix appended (e.g. `v3.duckdb` -> `v3.duckdb.wal`). -fn wal_path(path: &Path) -> std::path::PathBuf { - let mut name = path.as_os_str().to_owned(); - name.push(".wal"); - std::path::PathBuf::from(name) -} - -/// Run the whole migration: commits, data.json.gz, and every -/// file-sizes-*.json.gz under the source. 
-pub fn run(source: &Source, target: &Path) -> Result { - let mut conn = open_target_db(target)?; - let mut summary = MigrationSummary::default(); - - info!(source = %source.describe(), "Reading commits.json"); - let commits = read_commits(source)?; - info!(commits = commits.len(), "Loaded commits"); - summary.commits_inserted = upsert_all_commits(&mut conn, &commits, &mut summary)?; - - let mut q = QueryAccum::default(); - let mut ct = CompressionTimeAccum::default(); - let mut cs = CompressionSizeAccum::default(); - let mut ra = RandomAccessAccum::default(); - - info!("Migrating data.json.gz"); - migrate_data_jsonl( - source, - &commits, - &mut summary, - &mut q, - &mut ct, - &mut cs, - &mut ra, - )?; - info!(records = summary.records_read, "data.json.gz done"); - - for name in source.list_file_sizes()? { - info!(name = %name, "Migrating file-sizes"); - if let Err(e) = migrate_file_sizes(source, &name, &commits, &mut summary, &mut cs) { - warn!("file-sizes file {name} failed: {e:#}"); - summary.file_sizes_failed += 1; - } - } - - info!("Flushing accumulators to DuckDB"); - flush_all(&conn, q, ct, ra, cs, &mut summary)?; - - Ok(summary) -} - -/// Flush each accumulator's batch and bump the matching per-fact -/// summary counter only AFTER the flush succeeds. This way a flush -/// failure leaves the counter at zero (or its previous value) rather -/// than reporting rows that never landed in DuckDB. 
-fn flush_all( - conn: &Connection, - q: QueryAccum, - ct: CompressionTimeAccum, - ra: RandomAccessAccum, - cs: CompressionSizeAccum, - summary: &mut MigrationSummary, -) -> Result<()> { - let batch = build_query_batch(q)?; - let n = batch.num_rows() as u64; - flush(conn, "query_measurements", batch)?; - summary.query_inserted = n; - - let batch = build_compression_time_batch(ct)?; - let n = batch.num_rows() as u64; - flush(conn, "compression_times", batch)?; - summary.compression_time_inserted = n; - - let batch = build_random_access_batch(ra)?; - let n = batch.num_rows() as u64; - flush(conn, "random_access_times", batch)?; - summary.random_access_inserted = n; - - let batch = build_compression_size_batch(cs)?; - let n = batch.num_rows() as u64; - flush(conn, "compression_sizes", batch)?; - summary.compression_size_inserted = n; - - Ok(()) -} - -fn read_commits(source: &Source) -> Result> { - let reader = source.open_commits_jsonl()?; - let mut commits: Vec = Vec::new(); - for line in reader.lines() { - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - match serde_json::from_str::(trimmed) { - Ok(c) => commits.push(c), - Err(e) => warn!("skipping malformed commits.json line: {e}"), - } - } - Ok(index_commits(commits)) -} - -fn upsert_all_commits( - conn: &mut Connection, - commits: &BTreeMap, - summary: &mut MigrationSummary, -) -> Result { - let tx = conn.transaction().context("begin commits transaction")?; - let mut count = 0u64; - for commit in commits.values() { - let outcome = upsert_commit(&tx, commit)?; - for w in outcome.warnings { - warn!("{w}"); - summary.commit_warnings += 1; - } - count += 1; - } - tx.commit().context("commit commits transaction")?; - Ok(count) -} - -/// Stream `data.json.gz` and push classified records into the -/// per-table accumulators. Dedup happens inside each accumulator's -/// `push` method by `measurement_id`. 
-fn migrate_data_jsonl( - source: &Source, - commits: &BTreeMap, - summary: &mut MigrationSummary, - q: &mut QueryAccum, - ct: &mut CompressionTimeAccum, - cs: &mut CompressionSizeAccum, - ra: &mut RandomAccessAccum, -) -> Result<()> { - let reader = source.open_data_jsonl()?; - let started = Instant::now(); - let mut last_log = Instant::now(); - for line in reader.lines() { - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - summary.records_read += 1; - let record: V2Record = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed data.json line: {e}"); - continue; - } - }; - apply_v2_record(&record, commits, summary, q, ct, cs, ra); - if last_log.elapsed() >= Duration::from_secs(5) { - let elapsed = started.elapsed().as_secs_f64(); - let rate = summary.records_read as f64 / elapsed.max(0.001); - info!( - records = summary.records_read, - rate = format!("{rate:.0}/s"), - query = q.measurement_id.len(), - compression_time = ct.measurement_id.len(), - compression_size = cs.rows.len(), - random_access = ra.measurement_id.len(), - "migration progress", - ); - last_log = Instant::now(); - } - } - Ok(()) -} - -fn apply_v2_record( - record: &V2Record, - commits: &BTreeMap, - summary: &mut MigrationSummary, - q: &mut QueryAccum, - ct: &mut CompressionTimeAccum, - cs: &mut CompressionSizeAccum, - ra: &mut RandomAccessAccum, -) { - let Some(sha) = record.commit_id.clone() else { - summary.missing_commit += 1; - return; - }; - if !commits.contains_key(&sha) { - summary.missing_commit += 1; - return; - } - - let bin = match classifier::classify_outcome(record) { - classifier::Outcome::Bin(b) => b, - classifier::Outcome::Skip(_) => { - summary.skipped_intentional += 1; - return; - } - classifier::Outcome::Unknown => { - summary.uncategorized += 1; - let prefix = record.name.split('/').next().unwrap_or("").to_string(); - *summary.uncategorized_prefixes.entry(prefix).or_insert(0) += 1; - return; - } - 
}; - - let env_triple = record.env_triple.as_ref().and_then(|t| t.to_triple()); - let runtimes = record - .all_runtimes - .as_ref() - .map(|v| v.iter().filter_map(runtime_as_i64).collect::>()) - .unwrap_or_default(); - let value_f64 = match record.value.as_ref().and_then(value_as_f64) { - Some(v) => v, - None => { - summary.skipped_no_value += 1; - return; - } - }; - - match bin { - V3Bin::Query { - dataset, - dataset_variant, - scale_factor, - query_idx, - storage, - engine, - format, - } => { - let qm = QueryMeasurement { - commit_sha: sha, - dataset, - dataset_variant, - scale_factor, - query_idx, - storage, - engine, - format, - value_ns: value_f64 as i64, - all_runtimes_ns: runtimes, - peak_physical: None, - peak_virtual: None, - physical_delta: None, - virtual_delta: None, - env_triple, - }; - let mid = measurement_id_query(&qm); - q.push(mid, qm, summary); - } - V3Bin::CompressionTime { - dataset, - dataset_variant, - format, - op, - } => { - let ctr = CompressionTime { - commit_sha: sha, - dataset, - dataset_variant, - format, - op, - value_ns: value_f64 as i64, - all_runtimes_ns: runtimes, - env_triple, - }; - let mid = measurement_id_compression_time(&ctr); - ct.push(mid, ctr, summary); - } - V3Bin::CompressionSize { - dataset, - dataset_variant, - format, - } => { - let csr = CompressionSize { - commit_sha: sha, - dataset, - dataset_variant, - format, - value_bytes: value_f64 as i64, - }; - let mid = measurement_id_compression_size(&csr); - cs.push_replace(mid, csr, summary); - } - V3Bin::RandomAccess { dataset, format } => { - let rar = RandomAccessTime { - commit_sha: sha, - dataset, - format, - value_ns: value_f64 as i64, - all_runtimes_ns: runtimes, - env_triple, - }; - let mid = measurement_id_random_access(&rar); - ra.push(mid, rar, summary); - } - } -} - -fn migrate_file_sizes( - source: &Source, - name: &str, - commits: &BTreeMap, - summary: &mut MigrationSummary, - cs: &mut CompressionSizeAccum, -) -> Result<()> { - let reader = 
source.open_file_sizes(name)?; - // Prefix unknown-id fallbacks with `unknown:` so they're clearly - // labeled in the UI rather than masquerading as a dataset name. - let dataset_fallback = { - let stripped = name - .strip_prefix("file-sizes-") - .and_then(|s| s.strip_suffix(".json.gz")) - .unwrap_or(name); - if KNOWN_FILE_SIZES_SUITES.contains(&stripped) { - stripped.to_string() - } else { - format!("unknown:{stripped}") - } - }; - let started = Instant::now(); - let mut last_log = Instant::now(); - for line in reader.lines() { - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - let sz: V2FileSize = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed {name} line: {e}"); - continue; - } - }; - if !commits.contains_key(&sz.commit_id) { - summary.missing_commit += 1; - continue; - } - let dataset = if sz.benchmark.is_empty() { - dataset_fallback.clone() - } else { - sz.benchmark.clone() - }; - // Run SF through canonical_scale_factor so `"1"`, `"1.0"`, `"10"` - // and `"10.0"` collapse to one form, matching what - // `bin_compression_size` writes for the data.json.gz path. - let dataset_variant = canonical_scale_factor(sz.scale_factor.as_deref()); - let csr = CompressionSize { - commit_sha: sz.commit_id.clone(), - dataset, - dataset_variant, - format: sz.format.clone(), - value_bytes: sz.size_bytes, - }; - let mid = measurement_id_compression_size(&csr); - cs.push_sum(mid, csr); - summary.file_size_inserted += 1; - if last_log.elapsed() >= Duration::from_secs(5) { - let elapsed = started.elapsed().as_secs_f64(); - let rate = summary.file_size_inserted as f64 / elapsed.max(0.001); - info!( - name = %name, - file_sizes = summary.file_size_inserted, - rate = format!("{rate:.0}/s"), - "file-sizes progress", - ); - last_log = Instant::now(); - } - } - Ok(()) -} - -/// Append an Arrow `RecordBatch` to a DuckDB table via `Appender`. 
-fn flush(conn: &Connection, table: &str, batch: RecordBatch) -> Result<()> { - let mut app = conn - .appender(table) - .with_context(|| format!("opening appender for {table}"))?; - app.append_record_batch(batch) - .with_context(|| format!("appending record batch to {table}"))?; - drop(app); - Ok(()) -} - -/// Print the summary in a human-readable form. Returned by the CLI. -impl std::fmt::Display for MigrationSummary { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "Records read: {}", self.records_read)?; - writeln!(f, "Commits upserted: {}", self.commits_inserted)?; - writeln!(f, "Commit warnings: {}", self.commit_warnings)?; - writeln!(f, "Inserted (query): {}", self.query_inserted)?; - writeln!( - f, - "Inserted (compress t): {}", - self.compression_time_inserted - )?; - writeln!( - f, - "Inserted (compress s): {}", - self.compression_size_inserted - )?; - writeln!(f, "Inserted (random acc): {}", self.random_access_inserted)?; - writeln!(f, "Inserted (file sizes): {}", self.file_size_inserted)?; - writeln!(f, "Missing commit: {}", self.missing_commit)?; - writeln!(f, "Skipped (no value): {}", self.skipped_no_value)?; - writeln!(f, "Skipped (intentional): {}", self.skipped_intentional)?; - writeln!(f, "Deduplicated: {}", self.deduped)?; - writeln!(f, "Dedup w/ value diff: {}", self.deduped_with_conflict)?; - writeln!( - f, - "Uncategorized: {} ({:.2}%)", - self.uncategorized, - 100.0 * self.uncategorized_fraction() - )?; - if !self.uncategorized_prefixes.is_empty() { - let mut top: Vec<_> = self.uncategorized_prefixes.iter().collect(); - top.sort_by(|a, b| b.1.cmp(a.1)); - writeln!(f, "Top uncategorized prefixes:")?; - for (prefix, n) in top.iter().take(20) { - writeln!(f, " {prefix:>32} : {n}")?; - } - } - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use vortex_bench_server::records::QueryMeasurement; - - use super::*; - - fn open_db_without(table: &str) -> Result<(tempfile::TempDir, Connection)> { - let dir = 
tempfile::TempDir::new()?; - let path = dir.path().join("v3.duckdb"); - let conn = open_target_db(&path)?; - conn.execute_batch(&format!("DROP TABLE {table}"))?; - Ok((dir, conn)) - } - - fn one_query_row() -> QueryMeasurement { - QueryMeasurement { - commit_sha: "deadbeef".into(), - dataset: "clickbench".into(), - dataset_variant: None, - scale_factor: None, - query_idx: 7, - storage: "nvme".into(), - engine: "datafusion".into(), - format: "parquet".into(), - value_ns: 100, - all_runtimes_ns: vec![100], - peak_physical: None, - peak_virtual: None, - physical_delta: None, - virtual_delta: None, - env_triple: None, - } - } - - #[test] - fn flush_all_does_not_overcount_on_failure() -> Result<()> { - // Drop `compression_times` before flushing so the second - // flush in `flush_all` fails. The first (queries) succeeded, - // so its counter must be set; the failed table's counter and - // every later table's counter must stay at zero. - let (_dir, conn) = open_db_without("compression_times")?; - - let mut summary = MigrationSummary::default(); - let mut q = QueryAccum::default(); - let qm = one_query_row(); - let mid = vortex_bench_server::db::measurement_id_query(&qm); - q.push(mid, qm, &mut summary); - - let ct = CompressionTimeAccum::default(); - let ra = RandomAccessAccum::default(); - let cs = CompressionSizeAccum::default(); - - let result = flush_all(&conn, q, ct, ra, cs, &mut summary); - assert!(result.is_err(), "expected flush to fail on missing table"); - - assert_eq!( - summary.query_inserted, 1, - "query flushed before the failure must be counted" - ); - assert_eq!( - summary.compression_time_inserted, 0, - "failed flush must not bump the counter" - ); - assert_eq!(summary.random_access_inserted, 0, "later flushes never ran"); - assert_eq!( - summary.compression_size_inserted, 0, - "later flushes never ran" - ); - Ok(()) - } -} diff --git a/benchmarks-website/migrate/src/source.rs b/benchmarks-website/migrate/src/source.rs deleted file mode 100644 index 
acd53c7626e..00000000000 --- a/benchmarks-website/migrate/src/source.rs +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Streaming readers for v2's public S3 bucket. -//! -//! The bucket is `--no-sign-request`, so we fetch the underlying -//! HTTPS URL directly and stream-decompress with `flate2`. The -//! downloads are wrapped in [`reqwest::blocking`] to keep the read -//! path synchronous; the binary's hot path is single-threaded -//! per-source already (DuckDB is a single-writer). -//! -//! For tests and offline runs, [`Source::Local`](crate::source::Source::Local) accepts a local -//! directory of dumps; the migrator's `--source` flag picks the -//! variant. - -use std::fs::File; -use std::io::BufRead; -use std::io::BufReader; -use std::io::Read; -use std::path::Path; -use std::path::PathBuf; - -use anyhow::Context as _; -use anyhow::Result; -use flate2::read::GzDecoder; -use tracing::info; - -/// Public S3 bucket the live v2 server reads from. -pub const PUBLIC_BUCKET_BASE: &str = "https://vortex-ci-benchmark-results.s3.amazonaws.com"; - -/// Where to read the v2 dataset from. Either the public S3 bucket -/// (the live deployment) or a local directory of dumps. -#[derive(Debug, Clone)] -pub enum Source { - /// HTTPS GETs against `s3.amazonaws.com`. - PublicS3, - /// A directory containing `data.json.gz`, `commits.json`, and - /// `file-sizes-*.json.gz` files. - Local(PathBuf), -} - -impl Source { - /// Short human-readable description for log messages. - pub fn describe(&self) -> String { - match self { - Source::PublicS3 => "public S3 bucket".to_string(), - Source::Local(p) => format!("local dir {}", p.display()), - } - } - - /// Open `data.json.gz` for streaming, decompressing on the fly. 
- pub fn open_data_jsonl(&self) -> Result> { - let stream = self.open_raw("data.json.gz")?; - Ok(Box::new(BufReader::new(GzDecoder::new(stream)))) - } - - /// Open `commits.json` (uncompressed). - pub fn open_commits_jsonl(&self) -> Result> { - let stream = self.open_raw("commits.json")?; - Ok(Box::new(BufReader::new(stream))) - } - - /// Enumerate `file-sizes-*.json.gz` files. For local sources this - /// is a directory glob; for the public bucket we hit the documented - /// suite ids. - pub fn list_file_sizes(&self) -> Result> { - match self { - Source::Local(dir) => { - let mut out = Vec::new(); - for entry in std::fs::read_dir(dir)? { - let entry = entry?; - let name = entry.file_name(); - let s = name.to_string_lossy(); - if s.starts_with("file-sizes-") && s.ends_with(".json.gz") { - out.push(s.into_owned()); - } - } - out.sort(); - Ok(out) - } - Source::PublicS3 => { - // The S3 bucket's ListObjects is denied for unsigned - // requests, so we hit the documented per-suite keys - // emitted by `.github/workflows/sql-benchmarks.yml`. - Ok(KNOWN_FILE_SIZES_SUITES - .iter() - .map(|id| format!("file-sizes-{id}.json.gz")) - .collect()) - } - } - } - - /// Open one `file-sizes-*.json.gz` for streaming. 
- pub fn open_file_sizes(&self, name: &str) -> Result> { - let stream = self.open_raw(name)?; - Ok(Box::new(BufReader::new(GzDecoder::new(stream)))) - } - - fn open_raw(&self, name: &str) -> Result> { - match self { - Source::Local(dir) => open_local(&dir.join(name)), - Source::PublicS3 => open_s3(name), - } - } -} - -fn open_local(path: &Path) -> Result> { - let f = File::open(path).with_context(|| format!("opening {}", path.display()))?; - Ok(Box::new(f)) -} - -fn open_s3(name: &str) -> Result> { - let url = format!("{PUBLIC_BUCKET_BASE}/{name}"); - info!(url = %url, "GET"); - let resp = reqwest::blocking::get(&url).with_context(|| format!("GET {url}"))?; - if !resp.status().is_success() { - anyhow::bail!("GET {url} returned {}", resp.status()); - } - Ok(Box::new(resp)) -} - -/// Suite IDs we know publish a `file-sizes-{id}.json.gz` to S3. -/// -/// Source of truth: the `matrix.id` values in -/// `.github/workflows/sql-benchmarks.yml`'s `benchmark_matrix` default. -/// The post-bench `file-sizes` step uploads `file-sizes-${{ matrix.id -/// }}.json.gz`, so this list must match those IDs verbatim. Adding a -/// new matrix entry to that workflow means adding the same ID here. -pub(crate) const KNOWN_FILE_SIZES_SUITES: &[&str] = &[ - "clickbench-nvme", - "tpch-nvme", - "tpch-s3", - "tpch-nvme-10", - "tpch-s3-10", - "tpcds-nvme", - "statpopgen", - "fineweb", - "fineweb-s3", - "polarsignals", -]; diff --git a/benchmarks-website/migrate/src/v2.rs b/benchmarks-website/migrate/src/v2.rs deleted file mode 100644 index 79785ba1fd4..00000000000 --- a/benchmarks-website/migrate/src/v2.rs +++ /dev/null @@ -1,197 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Wire shapes of the v2 benchmark dataset on S3. -//! -//! These types capture only the fields the migrator reads. v2 records -//! are serialized by `vortex-bench` (see `vortex-bench/src/measurements.rs`) -//! 
and by older non-Rust scripts; the union of fields is loose, so we -//! deserialize permissively (`serde(default)`, untyped `serde_json::Value` -//! for the polymorphic `dataset` field). - -use std::collections::BTreeMap; - -use serde::Deserialize; - -/// One JSONL line of `data.json.gz`. -/// -/// The shape is the union of every emitter's output. Most fields are -/// optional because different benches emit different subsets. -#[derive(Debug, Clone, Deserialize)] -pub struct V2Record { - /// Slash-separated benchmark identifier (e.g. `tpch_q01/datafusion:vortex-file-compressed`). - /// The classifier parses this string to recover dim values. - pub name: String, - /// 40-hex commit SHA. Present on every well-formed v2 record. - #[serde(default)] - pub commit_id: Option, - /// v2 unit string (`ns`, `bytes`, `ratio`, ...). Not used for routing — - /// the classifier picks the v3 fact table from the `name` prefix instead. - #[serde(default)] - pub unit: Option, - /// Polymorphic value — emitters wrote both numbers and stringified - /// numbers. Use [`value_as_f64`] to normalize. - #[serde(default)] - pub value: Option, - /// Storage backend the run targeted (`S3` or `NVMe`, mixed case in v2). - #[serde(default)] - pub storage: Option, - /// Polymorphic dataset block — sometimes a string, sometimes an object - /// keyed by suite name with a `scale_factor` inside (use - /// [`dataset_scale_factor`]). - #[serde(default)] - pub dataset: Option, - /// Per-iteration runtimes; same numeric polymorphism as `value`. - #[serde(default)] - pub all_runtimes: Option>, - /// Host environment triple block. - #[serde(default)] - pub env_triple: Option, -} - -/// `dataset` in v2 records is sometimes a string, sometimes an object -/// keyed by suite name (`{ "tpch": { "scale_factor": "10" } }`). -/// This helper looks up the scale factor for a given suite without -/// assuming a particular shape. 
-pub fn dataset_scale_factor(dataset: &serde_json::Value, key: &str) -> Option { - let obj = dataset.as_object()?; - let entry = obj.get(key)?; - let sf = entry.get("scale_factor")?; - match sf { - serde_json::Value::String(s) => Some(s.clone()), - serde_json::Value::Number(n) => Some(n.to_string()), - _ => None, - } -} - -/// Canonicalize a v2 scale-factor string for use in `dataset_variant`. -/// -/// v2 emitters wrote scale factors as either `"1"`, `"1.0"`, `"10"`, or -/// `"10.0"` for the same logical SF, so the data.json.gz path -/// (`bin_compression_size`) and the file-sizes-*.json.gz path -/// (`migrate_file_sizes`) would otherwise produce different -/// `dataset_variant` strings and never collapse onto the same -/// `measurement_id`. Parse to f64 and format with no trailing zeros so -/// every shape collapses to one canonical form (`"1"`, `"10"`, `"0.1"`). -/// SF=1 is the implicit default and folds to `None`. -pub fn canonical_scale_factor(raw: Option<&str>) -> Option { - let s = raw?.trim(); - if s.is_empty() { - return None; - } - let value: f64 = s.parse().ok()?; - if value == 1.0 { - return None; - } - Some(format!("{value}")) -} - -/// Best-effort numeric coercion for the polymorphic `value` field. -pub fn value_as_f64(value: &serde_json::Value) -> Option { - match value { - serde_json::Value::Number(n) => n.as_f64(), - serde_json::Value::String(s) => s.parse().ok(), - _ => None, - } -} - -/// Best-effort coercion of a runtime entry to nanoseconds. -pub fn runtime_as_i64(value: &serde_json::Value) -> Option { - match value { - serde_json::Value::Number(n) => { - if let Some(i) = n.as_i64() { - Some(i) - } else { - n.as_f64().map(|f| f as i64) - } - } - serde_json::Value::String(s) => s.parse().ok(), - _ => None, - } -} - -/// Triple block as emitted by `vortex-bench`'s `--gh-json` path. v2 -/// stored it as an object; we serialize it back out as `arch-os-env`. 
-#[derive(Debug, Clone, Deserialize)] -pub struct V2EnvTriple { - /// Host CPU architecture (e.g. `x86_64`). - #[serde(default)] - pub architecture: Option, - /// Operating system name (e.g. `linux`). - #[serde(default)] - pub operating_system: Option, - /// Host environment label (e.g. `gnu`). - #[serde(default)] - pub environment: Option, -} - -impl V2EnvTriple { - /// Format as the `arch-os-env` triple used by v3's `env_triple` column. - pub fn to_triple(&self) -> Option { - let arch = self.architecture.as_deref()?; - let os = self.operating_system.as_deref()?; - let env = self.environment.as_deref()?; - Some(format!("{arch}-{os}-{env}")) - } -} - -/// One JSONL line of `commits.json`. -#[derive(Debug, Clone, Deserialize)] -pub struct V2Commit { - /// 40-hex commit SHA (the v2 schema named this `id`, not `commit_sha`). - pub id: String, - /// RFC 3339 commit timestamp; required for the v3 row but tolerated as - /// missing in the source dump. - #[serde(default)] - pub timestamp: Option, - /// Full commit message. - #[serde(default)] - pub message: Option, - /// Author block. - #[serde(default)] - pub author: Option, - /// Committer block. - #[serde(default)] - pub committer: Option, - /// Git tree SHA. - #[serde(default)] - pub tree_id: Option, - /// GitHub commit URL. - #[serde(default)] - pub url: Option, -} - -/// Author or committer block on a v2 commit record. -#[derive(Debug, Clone, Deserialize)] -pub struct V2Person { - /// Display name. - #[serde(default)] - pub name: Option, - /// Email address. - #[serde(default)] - pub email: Option, -} - -/// One JSONL line of `file-sizes-*.json.gz` produced by -/// `scripts/capture-file-sizes.py`. -#[derive(Debug, Clone, Deserialize)] -pub struct V2FileSize { - /// 40-hex commit SHA. - pub commit_id: String, - /// Compression dataset name (`benchmark` is the v2 field name). - pub benchmark: String, - /// TPC SF as a string when relevant. 
- #[serde(default)] - pub scale_factor: Option, - /// Format the file was produced in. - pub format: String, - /// Path of the underlying file (e.g. `lineitem.parquet`); informational. - pub file: String, - /// Size in bytes; summed across files in the same `(commit, dataset, format)`. - pub size_bytes: i64, -} - -/// Build a sha-keyed map of commits. -pub fn index_commits(commits: Vec) -> BTreeMap { - commits.into_iter().map(|c| (c.id.clone(), c)).collect() -} diff --git a/benchmarks-website/migrate/src/verify.rs b/benchmarks-website/migrate/src/verify.rs deleted file mode 100644 index c855904a5a5..00000000000 --- a/benchmarks-website/migrate/src/verify.rs +++ /dev/null @@ -1,360 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Structural diff between a migrated v3 DuckDB and the live v2 -//! `/api/metadata` endpoint. -//! -//! Compares group / chart structure only; values aren't compared -//! because v2 converts ns → ms and bytes → MiB on read while v3 -//! stores raw and the chart query divides. Group/chart structural -//! equivalence is enough to spot classifier regressions before -//! cutover. - -use std::collections::BTreeMap; -use std::collections::BTreeSet; -use std::path::Path; - -use anyhow::Context as _; -use anyhow::Result; -use duckdb::Connection; -use serde::Deserialize; - -use crate::classifier::QUERY_SUITES; - -/// Result of one `verify` run. -#[derive(Debug, Default)] -pub struct VerifyReport { - /// Group display names present in both v2 and v3. - pub matched_groups: Vec, - /// Group display names that exist in v3 but not v2. - pub only_in_v3: Vec, - /// Group display names that exist in v2 but not v3 — these gate the CLI's - /// non-zero exit. - pub only_in_v2: Vec, - /// Per-group chart-count diffs for groups present on both sides. 
- pub chart_diffs: Vec, -} - -/// One group's chart-count divergence between v2 and v3, captured when the -/// group is structurally present on both sides but the counts differ. -#[derive(Debug, Clone)] -pub struct ChartDiff { - /// Group display name. - pub group: String, - /// Number of charts v2 reported for this group. - pub v2_count: usize, - /// Number of charts the migrated v3 DuckDB has for this group. - pub v3_count: usize, -} - -impl VerifyReport { - /// True if every v2 group is represented in v3. The CLI's exit - /// code reflects this. - pub fn v2_groups_covered(&self) -> bool { - self.only_in_v2.is_empty() - } -} - -impl std::fmt::Display for VerifyReport { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "Groups in both v2 and v3:")?; - for g in &self.matched_groups { - writeln!(f, " + {g}")?; - } - if !self.only_in_v2.is_empty() { - writeln!(f, "Groups only in v2 (regression candidates):")?; - for g in &self.only_in_v2 { - writeln!(f, " - {g}")?; - } - } - if !self.only_in_v3.is_empty() { - writeln!(f, "Groups only in v3:")?; - for g in &self.only_in_v3 { - writeln!(f, " + {g}")?; - } - } - if !self.chart_diffs.is_empty() { - writeln!(f, "Chart count diffs:")?; - for d in &self.chart_diffs { - writeln!( - f, - " {} : v2={} v3={} (delta={})", - d.group, - d.v2_count, - d.v3_count, - d.v3_count as i64 - d.v2_count as i64, - )?; - } - } - Ok(()) - } -} - -/// v2's `/api/metadata` reply — only the fields we need. -#[derive(Debug, Deserialize)] -struct V2Metadata { - groups: BTreeMap, -} - -#[derive(Debug, Deserialize)] -struct V2GroupMeta { - #[serde(default)] - charts: Vec, -} - -#[derive(Debug, Deserialize)] -struct V2ChartMeta { - #[serde(default)] - name: String, -} - -/// Open the migrated DuckDB at `duckdb_path`, fetch `/api/metadata`, -/// and produce a structural diff. 
-pub fn run(v2_server: &str, duckdb_path: &Path) -> Result { - let v3 = collect_v3_groups(duckdb_path)?; - let v2 = fetch_v2_metadata(v2_server)?; - Ok(diff(&v2, &v3)) -} - -fn collect_v3_groups(duckdb_path: &Path) -> Result>> { - let conn = Connection::open(duckdb_path) - .with_context(|| format!("opening DuckDB at {}", duckdb_path.display()))?; - let mut groups: BTreeMap> = BTreeMap::new(); - - // query_measurements: chart per (dataset, query_idx); group per - // (dataset, dataset_variant, scale_factor, storage). We want v2 - // group display names so the verifier can compare apples to - // apples, so we re-format them here using the same suite table. - let mut stmt = conn.prepare( - r#" - SELECT dataset, dataset_variant, scale_factor, storage, query_idx - FROM query_measurements - GROUP BY dataset, dataset_variant, scale_factor, storage, query_idx - "#, - )?; - let rows = stmt.query_map([], |row| { - Ok(( - row.get::<_, String>(0)?, - row.get::<_, Option>(1)?, - row.get::<_, Option>(2)?, - row.get::<_, String>(3)?, - row.get::<_, i32>(4)?, - )) - })?; - for row in rows { - let (dataset, _variant, sf, storage, query_idx) = row?; - let group_name = display_query_group(&dataset, sf.as_deref(), &storage); - let chart_name = chart_name_query(&dataset, query_idx); - groups - .entry(group_name) - .or_default() - .insert(normalize_chart(&chart_name)); - } - - // compression_times: group "Compression", charts per dataset. 
- let mut stmt = conn.prepare( - r#" - SELECT dataset, format, op - FROM compression_times - GROUP BY dataset, format, op - "#, - )?; - let rows = stmt.query_map([], |row| { - Ok(( - row.get::<_, String>(0)?, - row.get::<_, String>(1)?, - row.get::<_, String>(2)?, - )) - })?; - for row in rows { - let (dataset, format, op) = row?; - let chart = chart_name_compression_time(&format, &op, &dataset); - groups - .entry("Compression".to_string()) - .or_default() - .insert(normalize_chart(&chart)); - } - - let mut stmt = conn.prepare( - r#" - SELECT dataset, format - FROM compression_sizes - GROUP BY dataset, format - "#, - )?; - let rows = stmt.query_map([], |row| { - Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) - })?; - for row in rows { - let (_dataset, format) = row?; - let chart = chart_name_compression_size(&format); - groups - .entry("Compression Size".to_string()) - .or_default() - .insert(normalize_chart(&chart)); - } - - let mut stmt = conn.prepare( - r#" - SELECT DISTINCT dataset - FROM random_access_times - "#, - )?; - let rows = stmt.query_map([], |row| row.get::<_, String>(0))?; - for row in rows { - let dataset = row?; - groups - .entry("Random Access".to_string()) - .or_default() - .insert(normalize_chart(&dataset)); - } - - Ok(groups) -} - -fn fetch_v2_metadata(server: &str) -> Result>> { - let url = format!("{}/api/metadata", server.trim_end_matches('/')); - let body = reqwest::blocking::get(&url) - .with_context(|| format!("GET {url}"))? - .error_for_status() - .with_context(|| format!("non-2xx from {url}"))? 
- .json::() - .with_context(|| format!("parsing {url} as v2 /api/metadata"))?; - let mut out: BTreeMap> = BTreeMap::new(); - for (name, group) in body.groups { - let charts = group - .charts - .into_iter() - .map(|c| normalize_chart(&c.name)) - .collect(); - out.insert(name, charts); - } - Ok(out) -} - -fn diff( - v2: &BTreeMap>, - v3: &BTreeMap>, -) -> VerifyReport { - let mut report = VerifyReport::default(); - let v2_keys: BTreeSet<&String> = v2.keys().collect(); - let v3_keys: BTreeSet<&String> = v3.keys().collect(); - for g in v2_keys.intersection(&v3_keys) { - report.matched_groups.push((**g).clone()); - let v2_charts = &v2[*g]; - let v3_charts = &v3[*g]; - if v2_charts.len() != v3_charts.len() { - report.chart_diffs.push(ChartDiff { - group: (**g).clone(), - v2_count: v2_charts.len(), - v3_count: v3_charts.len(), - }); - } - } - for g in v3_keys.difference(&v2_keys) { - report.only_in_v3.push((**g).clone()); - } - for g in v2_keys.difference(&v3_keys) { - report.only_in_v2.push((**g).clone()); - } - report.matched_groups.sort(); - report.only_in_v3.sort(); - report.only_in_v2.sort(); - report -} - -fn display_query_group(dataset: &str, scale_factor: Option<&str>, storage: &str) -> String { - let suite = QUERY_SUITES - .iter() - .find(|s| s.prefix.eq_ignore_ascii_case(dataset)) - .copied(); - match suite { - Some(suite) if suite.fan_out => { - let storage_disp = match storage { - "s3" | "S3" => "S3", - _ => "NVMe", - }; - let sf = scale_factor.unwrap_or("1"); - format!("{} ({}) (SF={})", suite.display_name, storage_disp, sf) - } - Some(suite) => suite.display_name.to_string(), - None => format!("{dataset} ({storage})"), - } -} - -fn chart_name_query(dataset: &str, query_idx: i32) -> String { - let suite = QUERY_SUITES - .iter() - .find(|s| s.prefix.eq_ignore_ascii_case(dataset)) - .copied(); - match suite { - Some(suite) => format!("{} Q{}", suite.query_prefix, query_idx), - None => format!("{} Q{}", dataset.to_uppercase(), query_idx), - } -} - -fn 
chart_name_compression_time(format: &str, op: &str, _dataset: &str) -> String { - // Re-derive the v2 chart name (the metric, not the dataset) so we - // can compare. v2's chart axis is the metric; series is the - // dataset. v3 inverts that. For structural comparison, we project - // back to v2's per-chart key. - match (format, op) { - ("vortex-file-compressed", "encode") => "COMPRESS TIME".into(), - ("vortex-file-compressed", "decode") => "DECOMPRESS TIME".into(), - ("parquet", "encode") => "PARQUET RS ZSTD COMPRESS TIME".into(), - ("parquet", "decode") => "PARQUET RS ZSTD DECOMPRESS TIME".into(), - ("lance", "encode") => "LANCE COMPRESS TIME".into(), - ("lance", "decode") => "LANCE DECOMPRESS TIME".into(), - _ => format!("{} {} TIME", format.to_uppercase(), op.to_uppercase()), - } -} - -fn chart_name_compression_size(format: &str) -> String { - match format { - "vortex-file-compressed" => "VORTEX SIZE".into(), - "parquet" => "PARQUET SIZE".into(), - "lance" => "LANCE SIZE".into(), - _ => format!("{} SIZE", format.to_uppercase()), - } -} - -/// Strip casing and `_-` differences between v2 and v3 chart names. -/// v2 displays uppercase; v3 stores raw values. Comparing in this -/// canonical form is enough for structural verification. 
-fn normalize_chart(s: &str) -> String { - s.trim() - .to_uppercase() - .replace(['_', '-'], " ") - .split_whitespace() - .collect::>() - .join(" ") -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn normalize_chart_canonicalizes() { - assert_eq!(normalize_chart("taxi/take"), "TAXI/TAKE"); - assert_eq!(normalize_chart("TAXI/TAKE"), "TAXI/TAKE"); - assert_eq!(normalize_chart("tpc-h q1"), "TPC H Q1"); - assert_eq!(normalize_chart("tpc h q1"), "TPC H Q1"); - } - - #[test] - fn display_query_group_handles_fan_out() { - assert_eq!( - display_query_group("tpch", Some("10"), "s3"), - "TPC-H (S3) (SF=10)" - ); - assert_eq!( - display_query_group("tpch", Some("100"), "nvme"), - "TPC-H (NVMe) (SF=100)" - ); - assert_eq!( - display_query_group("clickbench", None, "nvme"), - "Clickbench" - ); - } -} diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs deleted file mode 100644 index 71e97cb6c9a..00000000000 --- a/benchmarks-website/migrate/tests/classifier.rs +++ /dev/null @@ -1,531 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Classifier behavior pinned by representative v2 names from each -//! group in `benchmarks-website/server.js`'s `getGroup`. 
- -use rstest::rstest; -use serde_json::json; -use vortex_bench_migrate::classifier::Outcome; -use vortex_bench_migrate::classifier::Skip; -use vortex_bench_migrate::classifier::V3Bin; -use vortex_bench_migrate::classifier::classify; -use vortex_bench_migrate::classifier::classify_outcome; -use vortex_bench_migrate::classifier::format_query; -use vortex_bench_migrate::classifier::rename_engine; -use vortex_bench_migrate::v2::V2Record; - -fn record(name: &str) -> V2Record { - V2Record { - name: name.to_string(), - commit_id: Some("deadbeef".into()), - unit: Some("ns".into()), - value: Some(json!(123)), - storage: None, - dataset: None, - all_runtimes: None, - env_triple: None, - } -} - -fn record_with_storage_and_sf(name: &str, storage: &str, suite: &str, sf: &str) -> V2Record { - let mut r = record(name); - r.storage = Some(storage.into()); - r.dataset = Some(json!({ suite: { "scale_factor": sf } })); - r -} - -#[rstest] -#[case::clickbench( - "clickbench_q07/datafusion:parquet", - V3Bin::Query { - dataset: "clickbench".into(), - dataset_variant: None, - scale_factor: None, - query_idx: 7, - storage: "nvme".into(), - engine: "datafusion".into(), - format: "parquet".into(), - }, -)] -#[case::clickbench_vortex_renamed( - "clickbench_q12/datafusion:vortex-file-compressed", - V3Bin::Query { - dataset: "clickbench".into(), - dataset_variant: None, - scale_factor: None, - query_idx: 12, - storage: "nvme".into(), - engine: "datafusion".into(), - format: "vortex-file-compressed".into(), - }, -)] -#[case::statpopgen( - "statpopgen_q3/datafusion:parquet", - V3Bin::Query { - dataset: "statpopgen".into(), - dataset_variant: None, - scale_factor: None, - query_idx: 3, - storage: "nvme".into(), - engine: "datafusion".into(), - format: "parquet".into(), - }, -)] -#[case::polarsignals( - "polarsignals_q1/duckdb:parquet", - V3Bin::Query { - dataset: "polarsignals".into(), - dataset_variant: None, - scale_factor: None, - query_idx: 1, - storage: "nvme".into(), - engine: 
"duckdb".into(), - format: "parquet".into(), - }, -)] -fn non_fan_out_query_records(#[case] name: &str, #[case] expected: V3Bin) { - let r = record(name); - assert_eq!(classify(&r), Some(expected)); -} - -#[rstest] -#[case::tpch_s3_sf100( - "tpch_q01/datafusion:parquet", - "S3", - "tpch", - "100", - V3Bin::Query { - dataset: "tpch".into(), - dataset_variant: None, - scale_factor: Some("100".into()), - query_idx: 1, - storage: "s3".into(), - engine: "datafusion".into(), - format: "parquet".into(), - }, -)] -#[case::tpch_nvme_sf1( - "tpch_q22/duckdb:vortex-file-compressed", - "NVMe", - "tpch", - "1", - V3Bin::Query { - dataset: "tpch".into(), - dataset_variant: None, - scale_factor: Some("1".into()), - query_idx: 22, - storage: "nvme".into(), - engine: "duckdb".into(), - format: "vortex-file-compressed".into(), - }, -)] -#[case::tpcds_nvme_sf10( - "tpcds_q05/datafusion:vortex-file-compressed", - "NVMe", - "tpcds", - "10", - V3Bin::Query { - dataset: "tpcds".into(), - dataset_variant: None, - scale_factor: Some("10".into()), - query_idx: 5, - storage: "nvme".into(), - engine: "datafusion".into(), - format: "vortex-file-compressed".into(), - }, -)] -fn fan_out_query_records( - #[case] name: &str, - #[case] storage: &str, - #[case] suite: &str, - #[case] sf: &str, - #[case] expected: V3Bin, -) { - let r = record_with_storage_and_sf(name, storage, suite, sf); - assert_eq!(classify(&r), Some(expected)); -} - -#[rstest] -#[case::random_access_4_part( - "random-access/taxi/take/parquet-tokio-local-disk", - V3Bin::RandomAccess { - dataset: "taxi/take".into(), - format: "parquet".into(), - }, -)] -#[case::random_access_4_part_vortex( - "random-access/chimp/take/vortex-tokio-local-disk", - V3Bin::RandomAccess { - dataset: "chimp/take".into(), - format: "vortex-file-compressed".into(), - }, -)] -#[case::random_access_4_part_lance( - "random-access/taxi/take/lance-tokio-local-disk", - V3Bin::RandomAccess { - dataset: "taxi/take".into(), - format: "lance".into(), - }, -)] -fn 
random_access_records(#[case] name: &str, #[case] expected: V3Bin) { - let r = record(name); - assert_eq!(classify(&r), Some(expected)); -} - -#[rstest] -#[case::compress_time_vortex( - "compress time/clickbench", - V3Bin::CompressionTime { - dataset: "clickbench".into(), - dataset_variant: None, - format: "vortex-file-compressed".into(), - op: "encode".into(), - }, -)] -#[case::decompress_time_vortex( - "decompress time/tpch_lineitem", - V3Bin::CompressionTime { - dataset: "tpch_lineitem".into(), - dataset_variant: None, - format: "vortex-file-compressed".into(), - op: "decode".into(), - }, -)] -#[case::parquet_compress( - "parquet_rs-zstd compress time/clickbench", - V3Bin::CompressionTime { - dataset: "clickbench".into(), - dataset_variant: None, - format: "parquet".into(), - op: "encode".into(), - }, -)] -#[case::lance_decompress( - "lance decompress time/clickbench", - V3Bin::CompressionTime { - dataset: "clickbench".into(), - dataset_variant: None, - format: "lance".into(), - op: "decode".into(), - }, -)] -fn compression_time_records(#[case] name: &str, #[case] expected: V3Bin) { - let r = record(name); - assert_eq!(classify(&r), Some(expected)); -} - -#[rstest] -#[case::vortex_size( - "vortex size/clickbench", - V3Bin::CompressionSize { - dataset: "clickbench".into(), - dataset_variant: None, - format: "vortex-file-compressed".into(), - }, -)] -#[case::vortex_file_compressed_size_normalizes( - "vortex-file-compressed size/clickbench", - V3Bin::CompressionSize { - dataset: "clickbench".into(), - dataset_variant: None, - format: "vortex-file-compressed".into(), - }, -)] -#[case::parquet_size( - "parquet size/clickbench", - V3Bin::CompressionSize { - dataset: "clickbench".into(), - dataset_variant: None, - format: "parquet".into(), - }, -)] -#[case::lance_size( - "lance size/tpch_lineitem", - V3Bin::CompressionSize { - dataset: "tpch_lineitem".into(), - dataset_variant: None, - format: "lance".into(), - }, -)] -fn compression_size_records(#[case] name: &str, 
#[case] expected: V3Bin) { - let r = record(name); - assert_eq!(classify(&r), Some(expected)); -} - -#[rstest] -#[case::ratio_vortex_parquet("vortex:parquet-zstd ratio compress time/clickbench")] -#[case::ratio_vortex_lance("vortex:lance ratio decompress time/clickbench")] -#[case::ratio_size_vortex_parquet("vortex:parquet-zstd size/clickbench")] -#[case::ratio_size_vortex_raw("vortex:raw size/clickbench")] -#[case::throughput("compress throughput/clickbench")] -#[case::nonsense_prefix("not-a-known-bench/series")] -#[case::random_access_3_part("random-access/taxi/parquet-tokio-local-disk")] -fn unmapped_records_yield_none(#[case] name: &str) { - let r = record(name); - assert_eq!( - classify(&r), - None, - "expected {name:?} to classify as None (drop)", - ); -} - -#[rstest] -#[case::parquet_2_part( - "random-access/parquet-tokio-local-disk", - V3Bin::RandomAccess { - dataset: "taxi".into(), - format: "parquet".into(), - }, -)] -#[case::vortex_2_part( - "random-access/vortex-tokio-local-disk", - V3Bin::RandomAccess { - dataset: "taxi".into(), - format: "vortex-file-compressed".into(), - }, -)] -#[case::lance_2_part( - "random-access/lance-tokio-local-disk", - V3Bin::RandomAccess { - dataset: "taxi".into(), - format: "lance".into(), - }, -)] -fn random_access_2_part_legacy_recovered_as_taxi(#[case] name: &str, #[case] expected: V3Bin) { - // The 2-part shape `random-access/-tokio-local-disk` is - // emitted by `random-access-bench`'s legacy taxi run (no - // `AccessPattern`, see `measurement_name` in - // `benchmarks/random-access-bench/src/main.rs`). The live v3 - // emitter writes `dataset="taxi"` for those measurements, so the - // historical 2-part records on S3 must land in the same v3 - // chart instead of being dropped as `UnsupportedShape`. 
- let r = record(name); - assert_eq!( - classify(&r), - Some(expected), - "2-part legacy random-access must recover as dataset=taxi" - ); -} - -#[rstest] -#[case::parquet_footer("random-access/parquet-tokio-local-disk-footer")] -#[case::vortex_footer("random-access/vortex-tokio-local-disk-footer")] -#[case::lance_footer("random-access/lance-tokio-local-disk-footer")] -fn random_access_2_part_footer_is_deprecated(#[case] name: &str) { - // The reopen-mode `-footer` variant is a different access pattern - // (file is reopened per take). The live v3 emitter passes the - // bare `format.name()` for both reopen and cached, so it can't - // distinguish them on the wire. Keep migration consistent with - // that by routing `-footer` 2-part records to Skip::Deprecated - // (they don't strip clean to a v3-allowlisted format). - let r = record(name); - assert!( - matches!(classify_outcome(&r), Outcome::Skip(Skip::Deprecated)), - "2-part `-footer` random-access must be Skip::Deprecated" - ); -} - -#[rstest] -#[case::parquet_footer("random-access/taxi/correlated/parquet-tokio-local-disk-footer")] -#[case::vortex_footer("random-access/feature-vectors/uniform/vortex-tokio-local-disk-footer")] -#[case::lance_footer("random-access/nested-structs/correlated/lance-tokio-local-disk-footer")] -fn random_access_4_part_footer_is_deprecated(#[case] name: &str) { - // Same reasoning as 2-part `-footer`: the format string ends in - // `-tokio-local-disk-footer`, the strip_suffix doesn't match, and - // the unstripped value fails the V3_FORMATS allowlist. - let r = record(name); - assert!( - matches!(classify_outcome(&r), Outcome::Skip(Skip::Deprecated)), - "4-part `-footer` random-access must be Skip::Deprecated" - ); -} - -#[test] -fn parquet_zstd_size_is_deprecated() { - // `parquet-zstd` is not on the v3 emitter's format allowlist, so - // historical `parquet-zstd size/...` records bucket under - // Skip::Deprecated and don't render as orphan charts in v3. 
- let r = record("parquet-zstd size/clickbench"); - assert!(matches!( - classify_outcome(&r), - Outcome::Skip(Skip::Deprecated) - )); -} - -#[test] -fn vortex_parquet_zstd_ratio_is_intentional_skip() { - let r = record("vortex:parquet-zstd ratio compress time/clickbench"); - assert!(matches!( - classify_outcome(&r), - Outcome::Skip(Skip::DerivedRatio) - )); -} - -#[test] -fn vortex_parquet_zst_typo_ratio_is_intentional_skip() { - // `parquet-zst` (no trailing `d`) was emitted by some v2 runs. - // Both spellings should classify as derived ratios. - for name in [ - "vortex:parquet-zst ratio compress time/clickbench", - "vortex:parquet-zst ratio decompress time/clickbench", - ] { - let r = record(name); - assert!( - matches!(classify_outcome(&r), Outcome::Skip(Skip::DerivedRatio)), - "{name:?} should be DerivedRatio", - ); - } -} - -#[test] -fn throughput_is_intentional_skip() { - let r = record("compress throughput/clickbench"); - assert!(matches!( - classify_outcome(&r), - Outcome::Skip(Skip::Throughput) - )); -} - -#[test] -fn unknown_prefix_is_unknown() { - let r = record("not-a-known-bench/series"); - assert!(matches!(classify_outcome(&r), Outcome::Unknown)); -} - -#[test] -fn gharchive_q00_is_deprecated() { - // gharchive isn't on the v3 query-suite allowlist, so historical - // gharchive query records bucket as Skip::Deprecated. - let r = record("gharchive_q00/datafusion:parquet"); - assert!(matches!( - classify_outcome(&r), - Outcome::Skip(Skip::Deprecated) - )); -} - -#[test] -fn fineweb_q00_classifies() { - // fineweb is on V3_QUERY_SUITES (still emitted by v3 CI per - // .github/workflows/sql-benchmarks.yml's `fineweb` matrix entry), - // so historical fineweb records ingest like any other suite. - let r = record("fineweb_q00/datafusion:parquet"); - assert!(matches!( - classify_outcome(&r), - Outcome::Bin(V3Bin::Query { .. 
}) - )); -} - -#[test] -fn memory_record_is_historical_memory_skip() { - // v2 emitted `_q_memory/:` records that - // carry top-level memory fields V2Record doesn't deserialize. - // Skip them with a known variant so they don't trip the 5% gate. - let r = record("clickbench_q07_memory/datafusion:parquet"); - assert!(matches!( - classify_outcome(&r), - Outcome::Skip(Skip::HistoricalMemory) - )); -} - -#[test] -fn tpch_compression_size_carries_scale_factor() { - // The data.json.gz "vortex size/tpch" path needs to derive - // dataset_variant from the v2 record's `dataset` object, the same - // way the file-sizes path does. Otherwise SF=10 rows from the two - // sources never collide on `mid` and produce duplicate rows. - let mut r = record("vortex size/tpch"); - r.dataset = Some(serde_json::json!({ "tpch": { "scale_factor": "10" } })); - let outcome = classify_outcome(&r); - let Outcome::Bin(V3Bin::CompressionSize { - dataset, - dataset_variant, - format, - }) = outcome - else { - panic!("expected Bin(CompressionSize), got {outcome:?}"); - }; - assert_eq!(dataset, "tpch"); - assert_eq!(dataset_variant, Some("10".into())); - assert_eq!(format, "vortex-file-compressed"); -} - -#[test] -fn tpch_compression_size_drops_default_scale_factor() { - // SF "1.0" matches the file-sizes path's filter and collapses to - // dataset_variant: None. - let mut r = record("vortex size/tpch"); - r.dataset = Some(serde_json::json!({ "tpch": { "scale_factor": "1.0" } })); - let outcome = classify_outcome(&r); - let Outcome::Bin(V3Bin::CompressionSize { - dataset_variant, .. - }) = outcome - else { - panic!("expected Bin(CompressionSize), got {outcome:?}"); - }; - assert_eq!(dataset_variant, None); -} - -#[rstest] -// SF=1 is the implicit default; both spellings must drop to None so -// `bin_compression_size` and `migrate_file_sizes` agree. -#[case::int_one("1", None)] -#[case::float_one("1.0", None)] -// SF=10 must produce the same canonical string regardless of spelling. 
-#[case::int_ten("10", Some("10".into()))] -#[case::float_ten("10.0", Some("10".into()))] -#[case::float_fractional("0.1", Some("0.1".into()))] -#[case::whitespace(" 10 ", Some("10".into()))] -#[case::empty("", None)] -fn compression_size_scale_factor_canonicalizes( - #[case] raw_sf: &str, - #[case] expected: Option, -) { - let mut r = record("vortex size/tpch"); - r.dataset = Some(serde_json::json!({ "tpch": { "scale_factor": raw_sf } })); - let outcome = classify_outcome(&r); - let Outcome::Bin(V3Bin::CompressionSize { - dataset_variant, .. - }) = outcome - else { - panic!("expected Bin(CompressionSize) for sf={raw_sf:?}, got {outcome:?}"); - }; - assert_eq!(dataset_variant, expected, "sf={raw_sf:?}"); -} - -#[test] -fn engine_casing_lowercased() { - // Older v2 records emitted display-case engines like `DataFusion` - // and `DuckDB`. The classifier lowercases at push time so dedup - // collapses display-case rows into the canonical lowercase ones. - let r = record("clickbench_q07/DataFusion:parquet"); - let outcome = classify_outcome(&r); - let Outcome::Bin(V3Bin::Query { engine, format, .. }) = outcome else { - panic!("expected Bin(Query), got {outcome:?}"); - }; - assert_eq!(engine, "datafusion"); - assert_eq!(format, "parquet"); -} - -#[test] -fn rename_engine_pins_canonical_outputs() { - assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme"); - assert_eq!( - rename_engine("datafusion:vortex-file-compressed"), - "datafusion:vortex" - ); - assert_eq!(rename_engine("LANCE"), "lance"); -} - -#[test] -fn format_query_pins_v2_display() { - assert_eq!(format_query("clickbench_q00"), "CLICKBENCH Q0"); - assert_eq!(format_query("tpch_q22"), "TPC-H Q22"); - assert_eq!(format_query("tpcds_q42"), "TPC-DS Q42"); - assert_eq!(format_query("polarsignals_q1"), "POLARSIGNALS Q1"); - // Names that don't match a suite fall back to upper + " " replace. 
- assert_eq!( - format_query("vortex-file-compressed size"), - "VORTEX FILE COMPRESSED SIZE" - ); -} diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs deleted file mode 100644 index 83d71f5f156..00000000000 --- a/benchmarks-website/migrate/tests/end_to_end.rs +++ /dev/null @@ -1,452 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Inline JSONL fixtures driven through the full migration into a -//! tempdir DuckDB. No live S3. - -use std::fs::File; -use std::io::Write; -use std::path::Path; - -use duckdb::Connection; -use flate2::Compression; -use flate2::write::GzEncoder; -use tempfile::TempDir; -use vortex_bench_migrate::migrate; -use vortex_bench_migrate::source::Source; - -const COMMITS_JSONL: &str = r#"{"id":"deadbeef","timestamp":"2026-04-25T00:00:00Z","message":"fixture commit","author":{"name":"A","email":"a@example.com"},"committer":{"name":"C","email":"c@example.com"},"tree_id":"abcd0001","url":"https://example.com/commit/deadbeef"} -"#; - -const DATA_JSONL: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":42000,"all_runtimes":[41000,42000,43000]} -{"name":"compress time/clickbench","commit_id":"deadbeef","unit":"ns","value":99} -{"name":"vortex size/clickbench","commit_id":"deadbeef","unit":"bytes","value":1024} -{"name":"random-access/taxi/take/parquet-tokio-local-disk","commit_id":"deadbeef","unit":"ns","value":777,"all_runtimes":[700,777,800]} -"#; - -/// Build a local-source fixture directory. Caller supplies the contents -/// of `commits.json`, `data.json.gz`, and any number of -/// `file-sizes-*.json.gz` files (name → contents). 
-fn build_fixture(commits: &str, data: &str, file_sizes: &[(&str, &str)]) -> TempDir { - let dir = TempDir::new().expect("tempdir"); - write_text(&dir.path().join("commits.json"), commits); - write_gz(&dir.path().join("data.json.gz"), data); - for (name, body) in file_sizes { - write_gz(&dir.path().join(name), body); - } - dir -} - -fn write_text(path: &Path, body: &str) { - let mut f = File::create(path).unwrap(); - f.write_all(body.as_bytes()).unwrap(); -} - -fn write_gz(path: &Path, body: &str) { - let f = File::create(path).unwrap(); - let mut gz = GzEncoder::new(f, Compression::default()); - gz.write_all(body.as_bytes()).unwrap(); - gz.finish().unwrap(); -} - -#[test] -fn migrate_inline_fixture_populates_each_table() { - let src_dir = build_fixture(COMMITS_JSONL, DATA_JSONL, &[]); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - assert_eq!(summary.records_read, 4, "summary={summary}"); - assert_eq!(summary.uncategorized, 0, "summary={summary}"); - assert_eq!(summary.commits_inserted, 1); - assert_eq!(summary.query_inserted, 1); - assert_eq!(summary.compression_time_inserted, 1); - assert_eq!(summary.compression_size_inserted, 1); - assert_eq!(summary.random_access_inserted, 1); - - let conn = Connection::open(&target).unwrap(); - let count = |table: &str| -> i64 { - conn.query_row(&format!("SELECT COUNT(*) FROM {table}"), [], |r| r.get(0)) - .unwrap() - }; - assert_eq!(count("commits"), 1); - assert_eq!(count("query_measurements"), 1); - assert_eq!(count("compression_times"), 1); - assert_eq!(count("compression_sizes"), 1); - assert_eq!(count("random_access_times"), 1); - - // Spot-check the v3 column values for each kind. 
- let (engine, format, query_idx, value_ns): (String, String, i32, i64) = conn - .query_row( - "SELECT engine, format, query_idx, value_ns FROM query_measurements", - [], - |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)), - ) - .unwrap(); - assert_eq!(engine, "datafusion"); - assert_eq!(format, "parquet"); - assert_eq!(query_idx, 7); - assert_eq!(value_ns, 42000); - - let (dataset, format, op): (String, String, String) = conn - .query_row( - "SELECT dataset, format, op FROM compression_times", - [], - |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)), - ) - .unwrap(); - assert_eq!(dataset, "clickbench"); - assert_eq!(format, "vortex-file-compressed"); - assert_eq!(op, "encode"); - - let (dataset, format, value_bytes): (String, String, i64) = conn - .query_row( - "SELECT dataset, format, value_bytes FROM compression_sizes", - [], - |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)), - ) - .unwrap(); - assert_eq!(dataset, "clickbench"); - assert_eq!(format, "vortex-file-compressed"); - assert_eq!(value_bytes, 1024); - - let (dataset, format): (String, String) = conn - .query_row("SELECT dataset, format FROM random_access_times", [], |r| { - Ok((r.get(0)?, r.get(1)?)) - }) - .unwrap(); - assert_eq!(dataset, "taxi/take"); - assert_eq!(format, "parquet"); -} - -#[test] -fn dedup_collision_keeps_one_row() { - // Two data.json.gz lines whose query-measurement dim columns are - // identical (same commit / dataset / engine / format / query_idx, - // and `storage` collapses to "nvme" since `storage` is unset). - // Different `value`s. The accumulator's HashSet - // should drop the second one and bump `summary.deduped`. 
- const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} -{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":222} -"#; - - let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - assert_eq!(summary.records_read, 2, "summary={summary}"); - assert_eq!(summary.query_inserted, 1, "summary={summary}"); - assert_eq!(summary.deduped, 1, "summary={summary}"); - - let conn = Connection::open(&target).unwrap(); - let n: i64 = conn - .query_row("SELECT COUNT(*) FROM query_measurements", [], |r| r.get(0)) - .unwrap(); - assert_eq!(n, 1); -} - -#[test] -fn dedup_with_conflicting_value_ns_is_counted() { - // Same dim columns, different `value`s. Dedup keeps the first - // and bumps `deduped_with_conflict` because the dropped row's - // value_ns differed from the kept row's. This is the signal we - // care about when watching for silent value-corruption across - // duplicated v2 emissions. - const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} -{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":222} -"#; - - let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - assert_eq!(summary.deduped, 1, "summary={summary}"); - assert_eq!(summary.deduped_with_conflict, 1, "summary={summary}"); -} - -#[test] -fn dedup_with_matching_value_ns_does_not_count_conflict() { - // Same dim columns AND identical `value`s. Dedup still drops the - // duplicate, but `deduped_with_conflict` stays 0. 
- const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} -{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} -"#; - - let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - assert_eq!(summary.deduped, 1, "summary={summary}"); - assert_eq!(summary.deduped_with_conflict, 0, "summary={summary}"); -} - -#[test] -fn compression_size_data_and_file_sizes_merge() { - // A `vortex size/tpch` record from data.json.gz and a - // file-sizes-tpch-nvme.json.gz row covering the same (commit, - // dataset, format, SF) tuple should produce the *same* - // measurement_id so the in-memory accumulator merges them into - // one row instead of two. - // - // Both sources use scale_factor "1.0", which both code paths - // filter out → dataset_variant: None on both sides → matching mid. 
- const DATA: &str = r#"{"name":"vortex size/tpch","commit_id":"deadbeef","unit":"bytes","value":200,"dataset":{"tpch":{"scale_factor":"1.0"}}} -"#; - const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"tpch","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":100} -"#; - - let src_dir = build_fixture( - COMMITS_JSONL, - DATA, - &[("file-sizes-tpch-nvme.json.gz", FILE_SIZES)], - ); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - assert_eq!(summary.compression_size_inserted, 1, "summary={summary}"); - - let conn = Connection::open(&target).unwrap(); - let (n, value_bytes): (i64, i64) = conn - .query_row( - "SELECT COUNT(*), SUM(value_bytes) FROM compression_sizes", - [], - |r| Ok((r.get(0)?, r.get(1)?)), - ) - .unwrap(); - assert_eq!(n, 1); - // data.json.gz seeds value_bytes=200, file-sizes adds 100. - assert_eq!(value_bytes, 300); -} - -#[test] -fn empty_author_email_stored_as_null() { - // v2 sometimes wrote `""` for blank author/email/message. The - // migrator normalizes those to None so DuckDB stores SQL NULL, - // letting the UI distinguish "missing metadata" from "empty - // string". Here author.email is "" — verify the column is NULL, - // not the empty string. 
- const COMMITS: &str = r#"{"id":"deadbeef","timestamp":"2026-04-25T00:00:00Z","message":"fixture","author":{"name":"A","email":""},"committer":{"name":"C","email":"c@example.com"},"tree_id":"abcd0001","url":"https://example.com/commit/deadbeef"} -"#; - - let src_dir = build_fixture(COMMITS, "", &[]); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - let conn = Connection::open(&target).unwrap(); - let is_null: bool = conn - .query_row( - "SELECT author_email IS NULL FROM commits WHERE commit_sha = 'deadbeef'", - [], - |r| r.get(0), - ) - .unwrap(); - assert!(is_null, "empty author.email must store as SQL NULL"); - - // Non-empty fields still round-trip as strings. - let committer_email: String = conn - .query_row( - "SELECT committer_email FROM commits WHERE commit_sha = 'deadbeef'", - [], - |r| r.get(0), - ) - .unwrap(); - assert_eq!(committer_email, "c@example.com"); -} - -#[test] -fn open_target_db_removes_orphan_wal() { - // A `.wal` left from a previous crash with no main file present - // must still be removed so the next run starts from a known-empty - // state. Otherwise DuckDB can replay stale WAL into the fresh DB - // and corrupt subsequent inserts. - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - let wal = target_dir.path().join("v3.duckdb.wal"); - std::fs::write(&wal, b"orphan-wal-bytes").unwrap(); - assert!(wal.exists(), "precondition: orphan wal staged"); - assert!(!target.exists(), "precondition: no main db file"); - - { - let _conn = migrate::open_target_db(&target).unwrap(); - } - - // The migrator opens the DB after sweeping the WAL; DuckDB may - // recreate its own wal under load, but our pre-existing orphan - // bytes must not survive the sweep. We assert by content: either - // the path is missing, or its contents differ from the orphan we - // staged. 
- if wal.exists() { - let now = std::fs::read(&wal).unwrap(); - assert_ne!( - now, b"orphan-wal-bytes", - "orphan wal bytes must not survive open_target_db" - ); - } -} - -#[test] -fn file_sizes_unknown_id_falls_back_to_unknown_prefix() { - // A file-sizes-*.json.gz whose id isn't in - // `KNOWN_FILE_SIZES_SUITES`, with an empty `benchmark` field, used - // to surface as a bare id like `mystery-suite` and render as a - // dataset name. The migrator now prefixes those with `unknown:` - // so the UI can flag them. - const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"","scale_factor":"","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":1000} -"#; - - let src_dir = build_fixture( - COMMITS_JSONL, - "", - &[("file-sizes-mystery-suite.json.gz", FILE_SIZES)], - ); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - let conn = Connection::open(&target).unwrap(); - let dataset: String = conn - .query_row("SELECT dataset FROM compression_sizes", [], |r| r.get(0)) - .unwrap(); - assert_eq!(dataset, "unknown:mystery-suite"); -} - -#[test] -fn file_sizes_known_id_uses_id_directly() { - // For a KNOWN_FILE_SIZES_SUITES id, the fallback path keeps the - // raw id (no `unknown:` prefix). `clickbench-nvme` is on the list. 
- const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"","scale_factor":"","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":1000} -"#; - - let src_dir = build_fixture( - COMMITS_JSONL, - "", - &[("file-sizes-clickbench-nvme.json.gz", FILE_SIZES)], - ); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - let conn = Connection::open(&target).unwrap(); - let dataset: String = conn - .query_row("SELECT dataset FROM compression_sizes", [], |r| r.get(0)) - .unwrap(); - assert_eq!(dataset, "clickbench-nvme"); -} - -#[test] -fn compression_size_data_and_file_sizes_merge_with_canonical_sf() { - // Same logical SF written as `"10"` on the data.json.gz side and - // `"10.0"` on the file-sizes side. Both paths must canonicalize - // to `"10"` so the rows share a `measurement_id` and merge into - // one compression_sizes row. - const DATA: &str = r#"{"name":"vortex size/tpch","commit_id":"deadbeef","unit":"bytes","value":200,"dataset":{"tpch":{"scale_factor":"10"}}} -"#; - const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"tpch","scale_factor":"10.0","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":100} -"#; - - let src_dir = build_fixture( - COMMITS_JSONL, - DATA, - &[("file-sizes-tpch-nvme-10.json.gz", FILE_SIZES)], - ); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - assert_eq!(summary.compression_size_inserted, 1, "summary={summary}"); - let conn = Connection::open(&target).unwrap(); - let (n, value_bytes, dataset_variant): (i64, i64, String) = conn - .query_row( - "SELECT COUNT(*), SUM(value_bytes), MAX(dataset_variant) FROM compression_sizes", - [], - |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)), - ) - .unwrap(); - assert_eq!(n, 1); - // 
data.json.gz seeds 200, file-sizes adds 100. - assert_eq!(value_bytes, 300); - assert_eq!(dataset_variant, "10"); -} - -#[test] -fn summary_counts_match_actual_rows_on_success() { - // Sister test to migrate::tests::flush_all_does_not_overcount_on_failure. - // On a fully successful run, the post-flush summary counters must - // equal `SELECT COUNT(*)` from each fact table. This is the - // invariant the flush-after-count refactor preserves. - let src_dir = build_fixture(COMMITS_JSONL, DATA_JSONL, &[]); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - let conn = Connection::open(&target).unwrap(); - let actual = |table: &str| -> u64 { - let n: i64 = conn - .query_row(&format!("SELECT COUNT(*) FROM {table}"), [], |r| r.get(0)) - .unwrap(); - n as u64 - }; - assert_eq!(summary.query_inserted, actual("query_measurements")); - assert_eq!( - summary.compression_time_inserted, - actual("compression_times") - ); - assert_eq!( - summary.compression_size_inserted, - actual("compression_sizes") - ); - assert_eq!( - summary.random_access_inserted, - actual("random_access_times") - ); -} - -#[test] -fn file_sizes_sum_into_one_row() { - // Two file-sizes rows sharing (commit, benchmark, format, - // scale_factor) and value_bytes 100 + 200 must collapse to a - // single compression_sizes row with 300. 
- const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"clickbench","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":100} -{"commit_id":"deadbeef","benchmark":"clickbench","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-1.vortex","size_bytes":200} -"#; - - let src_dir = build_fixture( - COMMITS_JSONL, - "", - &[("file-sizes-clickbench.json.gz", FILE_SIZES)], - ); - let target_dir = TempDir::new().unwrap(); - let target = target_dir.path().join("v3.duckdb"); - - let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); - - assert_eq!(summary.file_size_inserted, 2, "summary={summary}"); - assert_eq!(summary.compression_size_inserted, 1, "summary={summary}"); - - let conn = Connection::open(&target).unwrap(); - let n: i64 = conn - .query_row("SELECT COUNT(*) FROM compression_sizes", [], |r| r.get(0)) - .unwrap(); - assert_eq!(n, 1); - let value_bytes: i64 = conn - .query_row("SELECT value_bytes FROM compression_sizes", [], |r| { - r.get(0) - }) - .unwrap(); - assert_eq!(value_bytes, 300); -} diff --git a/benchmarks-website/ops/BOOTSTRAP.md b/benchmarks-website/ops/BOOTSTRAP.md deleted file mode 100644 index 391f8717980..00000000000 --- a/benchmarks-website/ops/BOOTSTRAP.md +++ /dev/null @@ -1,741 +0,0 @@ - - -# vortex-bench-server bootstrap and recovery walkthrough - -A linear, copy-paste runbook for two scenarios: - -1. **Fresh install**: empty EC2 host, no DB, no S3 backups yet. Phases 1 through 7. -2. **Disaster recovery**: rebuild the site from S3 backups onto a new host (the old host is gone or - its DB is unrecoverable). Phases 1 through 6, then phase 8. - -[`README.md`](README.md) is the topic-organized reference manual; this file is the recipe you follow -top-to-bottom. Every step has a verification command so you can confirm it landed before moving on. -If a verification fails, the troubleshooting note below it points at the most likely cause. 
- -## Conventions - -- `$` lines are shell commands. Lines without `$` are example output. -- Run everything as `ec2-user` on the EC2 host unless a step says otherwise. `sudo` is called - explicitly where needed. -- The deploy timer cannot fetch over SSH. The repo's `origin` remote MUST be the HTTPS URL - `https://github.com/vortex-data/vortex.git`. If you already cloned over SSH, fix it in place: - `git -C ~/vortex remote set-url origin https://github.com/vortex-data/vortex.git`. -- `/var/lib/vortex-bench/ops/` is a directory symlink that `install.sh` creates pointing at - `/benchmarks-website/ops/`. Every script under it lives in the repo; the symlink is the - source-of-truth pointer. Deleting `~/vortex` breaks all five systemd units atomically. - -## Phase 1: AWS prerequisites (one-time, from the AWS console) - -Skip this entire phase if you are rebuilding into an EC2 instance that already has the -`VortexBenchServerRole` IAM role attached and the bucket lifecycle rule in place. Both survive -instance termination. - -### 1.1 Create the IAM policy - -In **IAM → Policies → Create policy**, paste: - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "ListBucket", - "Effect": "Allow", - "Action": "s3:ListBucket", - "Resource": "arn:aws:s3:::vortex-benchmark-results-database" - }, - { - "Sid": "ReadWriteV3Backups", - "Effect": "Allow", - "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"], - "Resource": "arn:aws:s3:::vortex-benchmark-results-database/v3-backups/*" - } - ] -} -``` - -Name it `VortexBenchV3Backups`. - -### 1.2 Create the role and attach it - -1. **IAM → Roles → Create role → AWS service → EC2**, attach `VortexBenchV3Backups`, name it - `VortexBenchServerRole`. -2. **EC2 → Instances → bench instance → Actions → Security → Modify IAM role**, pick - `VortexBenchServerRole`, click Update. -3. Wait about 15 seconds for the instance metadata service to refresh. 
- -### 1.3 Create the S3 lifecycle rule - -**S3 → Buckets → vortex-benchmark-results-database → Management → Lifecycle rules → Create lifecycle -rule**: - -| Field | Value | -| ------------ | ---------------------------------------------- | -| Name | `v3-backups-7d` | -| Status | Enabled | -| Filter scope | Prefix `v3-backups/` | -| Action | Expire current versions, 7 days after creation | - -7 days at one snapshot per hour is 168 tarballs. Tune up or down to taste. - -### 1.4 Verify - -```bash -$ aws sts get-caller-identity -# Arn should end in /VortexBenchServerRole/ - -$ echo probe > /tmp/probe.txt -$ aws s3 cp /tmp/probe.txt s3://vortex-benchmark-results-database/v3-backups/_probe.txt -$ aws s3 ls s3://vortex-benchmark-results-database/v3-backups/ | grep probe -$ aws s3 rm s3://vortex-benchmark-results-database/v3-backups/_probe.txt -$ rm /tmp/probe.txt -``` - -All four operations must succeed. If any fails with `AccessDenied`, check (1) the policy is actually -attached to `VortexBenchServerRole`, (2) the instance is using that role per -`aws sts get-caller-identity`, (3) there is no bucket policy denying access. - -## Phase 2: Host packages (Amazon Linux 2023) - -```bash -$ sudo dnf install -y \ - git curl jq \ - gcc gcc-c++ make cmake pkgconfig \ - util-linux openssl tar gzip -``` - -`util-linux` provides `flock`, which `deploy.sh` uses as a serialization guard. `gcc`, `gcc-c++`, -`cmake`, and `pkgconfig` are required by the `duckdb-sys` build. - -### 2.1 Install the Rust toolchain for `ec2-user` - -```bash -$ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -$ source $HOME/.cargo/env -$ rustc --version -``` - -`rustc --version` must succeed. The deploy timer runs `cargo build --release`, which needs this -exact toolchain installed for the user systemd runs the service as (`ec2-user`). 
- -## Phase 3: Clone the repo - -```bash -$ cd ~ && git clone https://github.com/vortex-data/vortex.git -$ cd vortex -$ git remote -v -origin https://github.com/vortex-data/vortex.git (fetch) -origin https://github.com/vortex-data/vortex.git (push) -``` - -The HTTPS URL is mandatory. If your `git config` defaults rewrite to SSH, undo that for this -checkout: - -```bash -$ git -C ~/vortex remote set-url origin https://github.com/vortex-data/vortex.git -``` - -## Phase 4: Run the installer - -```bash -$ ./benchmarks-website/ops/install.sh -``` - -This is idempotent. It creates `/var/lib/vortex-bench/` owned by `ec2-user` (logs go to -journalctl, not to a file), drops a sudoers fragment at `/etc/sudoers.d/vortex-bench`, copies -`/etc/vortex-bench.env` from the template (mode 0600), symlinks `/var/lib/vortex-bench/ops` to the -repo's `ops/`, and installs the systemd units. - -Expected tail of output: - -``` -[install] install complete. Next steps: -[install] 1. Edit /etc/vortex-bench.env (chmod 0600, owned by ec2-user) -[install] - INGEST_BEARER_TOKEN=... -[install] - ADMIN_BEARER_TOKEN=... -``` - -If the installer warns about an SSH `origin` remote, fix it now (see Phase 3) before starting the -timers. The deploy timer will silently fail every minute otherwise. - -### 4.1 Verify - -```bash -$ ls -ld /var/lib/vortex-bench -drwxr-xr-x. 7 ec2-user ec2-user 4096 ... /var/lib/vortex-bench - -$ sudo ls -l /etc/vortex-bench.env --rw-------. 1 ec2-user ec2-user ... /etc/vortex-bench.env - -$ systemctl list-unit-files 'vortex-bench-*' --no-pager -vortex-bench-backup.service static -vortex-bench-backup.timer enabled -vortex-bench-deploy.service static -vortex-bench-deploy.timer enabled -vortex-bench-server.service enabled -``` - -`enabled` for the two timers and the server unit is the expected state. The deploy and backup -service units are `static` because they have no `[Install]` section and are fired by their -respective timers, not enabled directly. 
- -## Phase 5: Fill in the env file and start the timers - -### 5.1 Generate the two bearer tokens - -```bash -$ openssl rand -hex 32 # this becomes INGEST_BEARER_TOKEN -$ openssl rand -hex 32 # this becomes ADMIN_BEARER_TOKEN -``` - -Save the `INGEST_BEARER_TOKEN` to the GitHub Actions Environment that the bench CI workflow reads. -The `ADMIN_BEARER_TOKEN` never leaves the box. - -### 5.2 Edit `/etc/vortex-bench.env` - -```bash -$ sudo $EDITOR /etc/vortex-bench.env -``` - -Required fields (defaults are correct for the canonical layout): - -``` -INGEST_BEARER_TOKEN= -ADMIN_BEARER_TOKEN= -REPO_DIR=/home/ec2-user/vortex -DEPLOY_BRANCH=develop -S3_BACKUP_PREFIX=s3://vortex-benchmark-results-database/v3-backups -``` - -The remaining keys (`VORTEX_BENCH_DB`, `VORTEX_BENCH_BIND`, `VORTEX_BENCH_ADMIN_BIND`, `SERVER_URL`, -`ADMIN_URL`, `VORTEX_BENCH_SNAPSHOT_DIR`) already point at the canonical paths. - -### 5.3 Start the timers - -```bash -$ sudo systemctl start vortex-bench-deploy.timer -$ sudo systemctl start vortex-bench-backup.timer -``` - -The server unit starts itself once the deploy timer's first fire produces a binary. Do not -`start vortex-bench-server` directly yet, there is nothing for it to exec. - -### 5.4 Watch the first deploy build the binary - -```bash -$ journalctl -fu vortex-bench-deploy.service -``` - -The first fire takes about 60 to 90 seconds for a cold `cargo build --release`. Every log line is -prefixed with `[deploy ]`. Look for these milestones (paraphrased; the literal substrings -to grep are bolded): - -- **`building <7-char-sha> (was )`** -- cargo build starts. -- **`swapped symlink →`** -- atomic binary swap landed; the next /health probe is imminent. - (The arrow is the literal Unicode `→` deploy.sh emits, not ASCII `->`.) -- **`deploy ok: <7-char-sha> → live (binary )`** -- /health passed and the deploy committed - the stamp. This is the success line. - -`Ctrl-C` out of `journalctl` once the `deploy ok:` line appears. 
If a deploy fails it will exit -with one of the exit codes documented at the head of [`deploy.sh`](deploy.sh): 1 lock contention, -2 config error, 3 git fetch or checkout, 4 cargo build, 5 systemctl restart (with rollback -re-probed), 6 /health failed but rolled back OK, 7 /health failed AND rollback also broken -- -server is down. The four exit codes a bootstrap operator typically hits (4 through 7) have rows -in [the symptom table](#what-to-do-if-a-step-fails) below; exit codes 1-3 are self-explanatory -in the journal output. - -## Phase 6: Verify the server is up - -### 6.1 Public listener - -```bash -$ curl -fsS http://127.0.0.1:3000/health | jq -{ - "status": "ok", - "build_sha": "abc123...", - "schema_version": "...", - "db_path": "/var/lib/vortex-bench/bench.duckdb", - "row_counts": { - "commits": 0, - "query_measurements": 0, - ... - } -} -``` - -Empty row counts are expected. The DB is created with an empty schema on first server boot. - -### 6.2 Admin listener - -```bash -$ /var/lib/vortex-bench/ops/inspect.sh "SELECT COUNT(*) FROM commits;" -``` - -A `0` is correct (DB is empty). A connection refused means `ADMIN_BEARER_TOKEN` was empty when the -server started: re-check `/etc/vortex-bench.env` and restart the server with -`/var/lib/vortex-bench/ops/restart.sh`. - -### 6.3 Build SHA - -```bash -$ readlink /var/lib/vortex-bench/bin/vortex-bench-server -/var/lib/vortex-bench/bin/vortex-bench-server.. - -$ cat /var/lib/vortex-bench/last-deployed-sha -abc123... -``` - -The SHA in `last-deployed-sha` must match the `build_sha` in the `/health` JSON. - -## Phase 7: Populate the database (fresh install only) - -Pick **one** of 7.A or 7.B. Skip this entire phase if you are rebuilding from a backup (Phase 8 -supplies the data). - -### 7.A Run the v2 to v3 migration - -This is the canonical path for a brand-new install. The migrator reads the v2 source (the public -S3 bucket of v2 result JSONs) and writes into the v3 DuckDB file. 
- -```bash -$ source /etc/vortex-bench.env -# Check the migrator's own CLI for the up-to-date flag set. The wrapper passes args verbatim to -# `cargo run -p vortex-bench-migrate -- "$@"`, so the v2 source flag lives in that crate: -$ /var/lib/vortex-bench/ops/migrate.sh run --help -# Typical invocation (substitute the v2 source flag the --help output names): -$ /var/lib/vortex-bench/ops/migrate.sh run --output "$VORTEX_BENCH_DB" -``` - -`migrate.sh` stops the server, snapshots the current DB to `bench.prev-.duckdb` for rollback, -runs the migrator, and starts the server back up. The deploy and backup timers are paused for the -duration; they restart automatically on success. - -If the migrator fails, the script leaves the server stopped and the timers paused, and prints the -exact rollback command. Follow it. Do not retry the migration without rolling back first or you will -pile new state on top of partially-migrated state. - -### 7.B Promote an existing DuckDB file - -If you already have a `bench.duckdb` from a previous host or a manual export: - -```bash -$ sudo systemctl stop vortex-bench-server -$ cp /path/to/your/bench.duckdb /var/lib/vortex-bench/bench.duckdb -$ sudo systemctl start vortex-bench-server -$ curl -fsS http://127.0.0.1:3000/health | jq '.row_counts' -``` - -Row counts in `/health` should match the source DB. - -### 7.C Verify the data landed - -```bash -$ /var/lib/vortex-bench/ops/inspect.sh " - SELECT 'commits' AS table_name, COUNT(*) AS n FROM commits - UNION ALL SELECT 'query_measurements', COUNT(*) FROM query_measurements - UNION ALL SELECT 'compression_times', COUNT(*) FROM compression_times - UNION ALL SELECT 'compression_sizes', COUNT(*) FROM compression_sizes - UNION ALL SELECT 'random_access_times', COUNT(*) FROM random_access_times - UNION ALL SELECT 'vector_search_runs', COUNT(*) FROM vector_search_runs; -" -``` - -All six tables should have non-zero row counts that match what you expect from the source. 
- -### 7.D Verify the backup loop end-to-end - -Fire one snapshot by hand to prove the IAM role, the admin token, and the tarball pipeline all work: - -```bash -$ sudo systemctl start vortex-bench-backup.service -$ journalctl -u vortex-bench-backup.service --since '2 min ago' --no-pager -[backup ...] triggering /api/admin/snapshot?ts=20260520T... -[backup ...] compressing /var/lib/vortex-bench/snapshots/... → ....tar.gz -[backup ...] compressed N → M bytes (Kx) -[backup ...] uploading ....tar.gz → s3://vortex-benchmark-results-database/v3-backups/....tar.gz -[backup ...] deleting local copies ... -[backup ...] snapshot 20260520T... ok → ... - -$ aws s3 ls s3://vortex-benchmark-results-database/v3-backups/ | tail -3 -``` - -The tarball must appear in the listing. If `aws s3 cp` fails with `AccessDenied`, redo Phase 1.4 to -debug the IAM role. - -**At this point the system is fully self-driving.** The deploy timer picks up a develop merge within -60 seconds, snapshots upload every hour, and the lifecycle rule expires old ones. You do not need to -SSH back in for routine operations. - -## Phase 8: Disaster recovery: restore the DB from an S3 backup - -Use this phase to rebuild onto a fresh host when the old host or DB is unrecoverable. **Do not run -it on a healthy host with a populated DB**: it overwrites the live `bench.duckdb`. - -You must have completed Phases 1 through 6 first (the server runs against an empty schema). Skip -Phase 7. - -### 8.1 Pick the snapshot you want to restore - -```bash -$ aws s3 ls s3://vortex-benchmark-results-database/v3-backups/ | tail -20 -2026-05-20 01:00:14 12345678 20260520T010000Z.tar.gz -2026-05-20 02:00:11 12345678 20260520T020000Z.tar.gz -... -``` - -The lifecycle rule keeps about 168 hourly snapshots (7 days). Pick the most recent known-good one, -or use `tail -1` to grab the latest. 
- -### 8.2 Download and extract - -```bash -$ ts=20260520T020000Z # replace with the timestamp from 8.1 -$ cd /tmp -$ aws s3 cp "s3://vortex-benchmark-results-database/v3-backups/${ts}.tar.gz" . -$ tar xzf "${ts}.tar.gz" -$ ls /tmp/${ts}/ -schema.sql -commits.vortex -query_measurements.vortex -compression_times.vortex -compression_sizes.vortex -random_access_times.vortex -vector_search_runs.vortex -``` - -Six `.vortex` files plus `schema.sql`. If any file is missing, the snapshot is incomplete; pick an -earlier one. - -### 8.3 Stop the server and clear the empty DB - -```bash -$ sudo systemctl stop vortex-bench-server -$ rm -f \ - /var/lib/vortex-bench/bench.duckdb \ - /var/lib/vortex-bench/bench.duckdb.wal -``` - -(`bench.duckdb` is owned by `ec2-user` per the install layout; deleting it does not need sudo.) - -### 8.4 Install the duckdb CLI matching the bundled engine - -```bash -$ duckdb --version -v1.5.x ... -``` - -The CLI version must be at least as new as the engine the server bundles (currently `1.5.x`). If -`duckdb` is missing or older: - -```bash -$ curl -L https://github.com/duckdb/duckdb/releases/latest/download/duckdb_cli-linux-amd64.zip -o /tmp/duckdb.zip -$ unzip -j /tmp/duckdb.zip duckdb -d ~/bin -$ export PATH="$HOME/bin:$PATH" -$ duckdb --version -``` - -### 8.5 Rehydrate the DB from the snapshot - -The block below uses `${ts}` from Phase 8.2. The guard does not recover the value: it aborts with an -explicit message if a fresh shell lost the variable or if `/tmp/${ts}` is missing, so set `ts` again -(and redo 8.2 if needed) before pasting this step into a new terminal. - -```bash -$ : "${ts:?ts is not set; \`ts=\` or re-source /etc/vortex-bench.env}" -$ [ -d "/tmp/${ts}" ] || { echo "missing /tmp/${ts}; redo 8.2" >&2; exit 1; } -$ duckdb /var/lib/vortex-bench/bench.duckdb <.duckdb`, runs the -migrator, and starts the server back up. The autopilot timers are paused for the duration and -restart on success. On failure they intentionally stay stopped (see Phase 9 for the rollback -recipe). 
- -```bash -$ source /etc/vortex-bench.env -$ /var/lib/vortex-bench/ops/migrate.sh run --output "${VORTEX_BENCH_DB:-/var/lib/vortex-bench/bench.duckdb}" -``` - -`migrate.sh`'s positional args pass straight through to `cargo run -p vortex-bench-migrate --`, so -the migrator's CLI is whatever the current branch says it is. As of writing it is -`run --output `. - -### Token rotation - -`INGEST_BEARER_TOKEN`: - -```bash -$ openssl rand -hex 32 # generate new value -# 1. Update the GitHub Actions Environment secret so CI uses the new value. -# 2. Edit /etc/vortex-bench.env with the new value. -$ sudo $EDITOR /etc/vortex-bench.env -$ /var/lib/vortex-bench/ops/restart.sh # picks up the new env -``` - -`ADMIN_BEARER_TOKEN`: - -```bash -$ openssl rand -hex 32 # generate new value -$ sudo $EDITOR /etc/vortex-bench.env -$ /var/lib/vortex-bench/ops/restart.sh -# The next backup timer fire reads the env file fresh, so it picks up -# the new value automatically. -``` - -The two tokens are independent. Rotating one does not affect the other. - -### Adding or removing an admin - -Being an admin is three independent grants, not a single role: - -```bash -# (1) SSH access to the EC2 box. -# Append the new admin's public key to authorized_keys. They log in -# as ec2-user (which is also the service identity). -$ sudo -u ec2-user $EDITOR /home/ec2-user/.ssh/authorized_keys -# Or use AWS SSM Session Manager: enable on the instance and add the -# admin's IAM principal to the instance's SSM connect IAM policy. - -# (2) AWS console access for IAM/lifecycle/bucket-policy changes -# (the runtime role intentionally cannot do these). -# Grant via IAM user or SSO role with read/write on IAM and the -# vortex-benchmark-results-database bucket. - -# (3) Bearer-token knowledge, if they need to hit /api/admin/* from -# their laptop. /etc/vortex-bench.env is mode 0600 owned by ec2-user; -# anyone with SSH access can read it. 
-``` - -To revoke an admin: delete their key from `authorized_keys`, revoke their AWS console role, and -rotate `ADMIN_BEARER_TOKEN`. Rotating the admin token does not change CI's `INGEST_BEARER_TOKEN`, -but that token is also stored in `/etc/vortex-bench.env`, so rotate it too (see Token rotation -above) if the departing admin could read the env file. - -### Disk pressure - -`/var/lib/vortex-bench/` filling up has four typical causes (run -`du -sh /var/lib/vortex-bench/* | sort -h` to identify which): - -```bash -# `bin/vortex-bench-server.<ts>.<pid>` accumulation - deploy.sh keeps the -# last $KEEP_BINARIES (default 3). To prune harder: -$ sudo $EDITOR /etc/vortex-bench.env # add KEEP_BINARIES=1 -$ /var/lib/vortex-bench/ops/force-rebuild.sh # next deploy enforces the new cap - -# `snapshots/<ts>/` not deleted - backup.sh removes after a successful -# S3 upload, so leftover dirs imply the upload failed. Check: -$ journalctl -u vortex-bench-backup.service --since '4 hours ago' - -# `bench.prev-<ts>.duckdb` accumulation from old migrations. These are -# kept on purpose for rollback. Delete by hand once verified: -$ ls -lt /var/lib/vortex-bench/bench.prev-*.duckdb -$ rm /var/lib/vortex-bench/bench.prev-<ts>.duckdb{,.wal} - -# `bench.duckdb` itself growing - expected, hundreds of MB is normal. -``` - -## What to do if a step fails - -| Symptom | Likely cause | Fix | -| ------------------------------------------------------------------------- | ------------------------------------------------------------------------------ | --------------------------------------------------------------------------------- | -| `install.sh` exits with `ERROR: not found. 
Set REPO_DIR=.` | Running from outside the repo root or with a non-default `REPO_DIR` | `cd ~/vortex && ./benchmarks-website/ops/install.sh` | -| `journalctl -u vortex-bench-deploy` shows `Permission denied (publickey)` | `origin` is the SSH remote | `git -C ~/vortex remote set-url origin https://github.com/vortex-data/vortex.git` | -| `journalctl -u vortex-bench-deploy` shows `cargo: command not found` | Rust toolchain not installed for `ec2-user` | Re-run Phase 2.1; the timer runs as `ec2-user`, not as you | -| First `curl /health` returns connection refused | Deploy timer has not produced a binary yet, or the build failed | `journalctl -fu vortex-bench-deploy.service` and read the most recent failure | -| `inspect.sh` returns 401 or 503 | `ADMIN_BEARER_TOKEN` was empty at server start, the admin listener never bound | Edit `/etc/vortex-bench.env`, `restart.sh` | -| `backup.sh` logs `/api/admin/snapshot returned 000` | The server is not running, or the admin port is wrong | `systemctl status vortex-bench-server`, check `$ADMIN_URL` in the env file | -| `backup.sh` logs `aws s3 cp failed` | IAM role missing or wrong | Re-run Phase 1.4 to debug | -| `migrate.sh` exits with the rollback instructions | The migrator itself errored, the prev DB is intact | Follow the printed `mv` lines literally | -| Phase 8.5 `INSTALL vortex` fails | DuckDB CLI is older than the bundled engine | Upgrade the CLI per Phase 8.4 | -| `deploy.sh` exits 4 (`cargo build failed`) | Source-tree compile error | Read the build log in `journalctl -u vortex-bench-deploy.service`; fix and push | -| `deploy.sh` exits 5 (`systemctl restart failed`) | systemd or sudoers issue | `systemctl status vortex-bench-server`; check the sudoers fragment at `/etc/sudoers.d/vortex-bench` | -| `deploy.sh` exits 6 (`/health failed, rolled back to prior binary`) | New binary broken; prior binary healthy | Fix the source and push the next commit; the live binary is the prior good one | -| `deploy.sh` exits 7 
(`/health failed AND rollback also broken`) | Server is DOWN; both new and prior binaries fail /health | Pick a known-good binary from `/var/lib/vortex-bench/bin/`, `sudo ln -snT /var/lib/vortex-bench/bin/vortex-bench-server`, `sudo systemctl restart vortex-bench-server` | - -See [`README.md`](README.md#failure-modes-conceptual) "Failure modes (conceptual)" for the full -reference list. This file covers only the failure modes a bootstrap operator actually hits. diff --git a/benchmarks-website/ops/README.md b/benchmarks-website/ops/README.md deleted file mode 100644 index 80c7651b051..00000000000 --- a/benchmarks-website/ops/README.md +++ /dev/null @@ -1,323 +0,0 @@ - - -# vortex-bench-server - operations runbook - -This is the canonical guide for deploying and operating the v3 benchmarks site (`bench.vortex.dev`) -on EC2. It targets a fresh admin who has SSH access to the box and never seen the system before. - -The contents of this directory are everything the EC2 host needs to build, run, deploy, back up, and -inspect the server. There is no out-of-tree state - every script and unit lives in -`benchmarks-website/ops/` and gets installed onto the host by [`install.sh`](install.sh). - -## TL;DR - -- One Rust binary (`vortex-bench-server`), one DuckDB file (`/var/lib/vortex-bench/bench.duckdb`). -- A systemd timer polls `origin/develop` every 60s. If commits in the range touch website-relevant - paths it builds, atomically swaps the binary, and restarts the server. Otherwise it fast-forwards - the working tree and exits. -- A second timer fires hourly, asks the server to write a per-table Vortex snapshot (`schema.sql` + - one `.vortex` per table), `tar czf`s it, and uploads to - `s3://vortex-benchmark-results-database/v3-backups/.tar.gz`. The vortex DuckDB extension - is auto-installed from the DuckDB core extension repo on first call. 
Vortex compresses the - BIGINT[] runtime arrays and string columns roughly an order of magnitude better than gzipped CSV - - and dogfoods the project's own format. -- For ad-hoc reads, `inspect.sh` calls a bearer-gated `/api/admin/sql` endpoint instead of stopping - the server. -- For DB-replacing operations (re-running the v2→v3 migration), `migrate.sh` stops the server, - snapshots the current DB to `bench.prev-.duckdb`, runs the migration, and starts back up. - -## Architecture - -``` -┌──────────────────────────────────────────────────────────────────────┐ -│ EC2 host (Amazon Linux 2023, ec2-user) │ -│ │ -│ /home/ec2-user/vortex/ ← git checkout (build context only) │ -│ │ -│ /var/lib/vortex-bench/ │ -│ bench.duckdb ← live DB │ -│ bench.duckdb.wal │ -│ bench.prev-.duckdb ← pre-migration backup, never pruned │ -│ bin/ │ -│ vortex-bench-server ← symlink → versioned binary │ -│ vortex-bench-server.. │ -│ ← versioned (PID suffix breaks │ -│ same-second collisions), last │ -│ $KEEP_BINARIES (3) │ -│ snapshots// ← transient vortex-snapshot landing │ -│ last-deployed-sha ← stamp file for the deploy timer │ -│ .deploy.lock ← flock guard │ -│ ops -> /home/ec2-user/vortex/benchmarks-website/ops │ -│ │ -│ /etc/vortex-bench.env ← secrets, mode 0600 │ -│ /etc/sudoers.d/vortex-bench ← lets ec2-user systemctl restart │ -│ the server with no password │ -│ /etc/systemd/system/ │ -│ vortex-bench-server.service ← serves :3000 │ -│ vortex-bench-deploy.service ← oneshot, runs deploy.sh │ -│ vortex-bench-deploy.timer ← every 60s │ -│ vortex-bench-backup.service ← oneshot, runs backup.sh │ -│ vortex-bench-backup.timer ← hourly │ -│ │ -│ Logs: journalctl -u vortex-bench-{server,deploy,backup} │ -└──────────────────────────────────────────────────────────────────────┘ - │ - │ aws s3 cp .tar.gz - ▼ - ┌───────────────────────────────────────┐ - │ s3://vortex-benchmark-results-database/│ - │ v3-backups/ │ - │ .tar.gz │ - │ / │ - │ schema.sql │ - │
.vortex │ - └───────────────────────────────────────┘ -``` - -## Files in this directory - -| Path | Role | -| -------------------------------------------------------------------- | --------------------------------------------------------------------------- | -| [`install.sh`](install.sh) | One-time bootstrap on a fresh host. Idempotent. | -| [`deploy.sh`](deploy.sh) | Pull → build (if needed) → atomic restart. Called by timer. | -| [`migrate.sh`](migrate.sh) | Manual: stop, snapshot prev DB, run migrate, restart. | -| [`backup.sh`](backup.sh) | Hourly: trigger `/api/admin/snapshot`, sync to S3, prune local. | -| [`inspect.sh`](inspect.sh) | Read-only SQL via `/api/admin/sql`, no server stop. | -| [`force-rebuild.sh`](force-rebuild.sh) | Re-run a deploy of `$DEPLOY_BRANCH` even when origin hasn't moved. | -| [`restart.sh`](restart.sh) | Restart the binary in place with visible before/after state. | -| [`config/vortex-bench.env.example`](config/vortex-bench.env.example) | Template for `/etc/vortex-bench.env`. | -| [`systemd/`](systemd/) | Unit files installed into `/etc/systemd/system/`. | -| [`BOOTSTRAP.md`](BOOTSTRAP.md) | Step-by-step bootstrap and recovery runbook (copy-paste, verify-as-you-go). | - -**Every runnable command lives in [`BOOTSTRAP.md`](BOOTSTRAP.md).** This file explains *what* the -system is and *why* the moving parts are shaped the way they are. Operators run commands out of -`BOOTSTRAP.md`; the sections below are the conceptual companion you read before or after. - -## How the system runs - -### The deploy autopilot - -`vortex-bench-deploy.timer` fires every 60s. The service it triggers fetches `origin/$DEPLOY_BRANCH`, -compares the tip SHA against `/var/lib/vortex-bench/last-deployed-sha`, and exits early if nothing -moved. 
When the SHA has moved, the script inspects the diff against the old SHA: it only rebuilds -when the change touches `benchmarks-website/server/`, `benchmarks-website/migrate/`, -`benchmarks-website/Cargo.toml`, the workspace `Cargo.toml`, or `Cargo.lock`. Everything else (e.g. a -vortex-array PR) fast-forwards the working tree so the next website change builds against fresh -dependencies, but the running binary is left alone. - -When a rebuild is needed: `cargo build --release` produces a versioned binary at -`bin/vortex-bench-server..`, the symlink at `bin/vortex-bench-server` swings to it -atomically, the server unit restarts, and `deploy.sh` probes `/health` for up to 30s. On any failure -(build, restart, health check) the symlink rolls back to the previous binary and the server restarts -on the old version. The stamp file is **not** written on a failed deploy, so the next timer fire -retries the same SHA. Fix the bug and push again. - -The flock at `/var/lib/vortex-bench/.deploy.lock` serializes deploy / force-rebuild / manual-deploy -attempts. The `force-rebuild.sh` sentinel (`.force-rebuild` under `STATE_DIR`) bypasses the path -filter and the stamp comparison once, then deletes itself. - -### Identifying the running build - -Three identifiers should always agree on a healthy host: - -| Source | What it represents | -| -------------------------------------------------------- | ---------------------------------------------------------------------------------------- | -| `cat /var/lib/vortex-bench/last-deployed-sha` | What the deploy timer last successfully rolled out. | -| `readlink /var/lib/vortex-bench/bin/vortex-bench-server` | Which versioned binary the symlink points at (its filename embeds the build timestamp). | -| `curl /health` → `.build_sha` | What `cargo build` saw at compile time, baked into the running process. | - -`build_sha` is the source of truth. 
Disagreement means the running process is stale: a manual binary -swap, a `restart.sh` with no rebuild, or systemd holding an older pid open. - -### The three "restart" semantics - -Pick the smallest hammer: - -- **`restart.sh`** restarts the existing binary. Cheapest. Useful after editing - `/etc/vortex-bench.env` or recovering from a stuck connection. `build_sha` does not change; `pid` - and `started` do. -- **Triggering the deploy service** runs the timer's normal flow now instead of waiting up to 60s. - No-op if `origin/$DEPLOY_BRANCH` has not moved. -- **`force-rebuild.sh`** ignores both the stamp file and the path filter, so it rebuilds whatever's - on `$DEPLOY_BRANCH` even when origin has not moved. Use this when you flipped `DEPLOY_BRANCH`, are - recovering from wedged build artifacts, or want to redeploy "whatever is on the branch I'm - tracking." - -There is no "build whatever I have locally checked out" mode. The deploy script always builds -origin's tip, so to test a branch you push it first. - -### Migration semantics - -The v2 to v3 migration is destructive: it overwrites `/var/lib/vortex-bench/bench.duckdb` from the v2 -source. `migrate.sh` treats this as the most dangerous operation in the system: - -1. Pause both autopilot timers AND interrupt any in-flight service so a deploy or backup cannot race - the migrator's exclusive DB access. -2. Stop the server. -3. Copy the current `bench.duckdb` (and any `.wal`) to `bench.prev-.duckdb` for instant - rollback. -4. Run the migrator (positional args pass straight through to the `vortex-bench-migrate` CLI). -5. Start the server, poll `/health` for up to 30s. -6. On success, restart the autopilot timers. On **failure**, intentionally leave the autopilot - timers stopped and print the rollback command. The prev DB is never auto-deleted; the operator - removes it once they've verified the migration. - -This means "deploy the latest commit" and "rerun the migration" are deliberately distinct actions. 
-The autopilot never touches the DB. - -### The backup loop - -`vortex-bench-backup.timer` fires hourly and `vortex-bench-backup.service` runs `backup.sh`. The -script POSTs to the loopback-only `/api/admin/snapshot` endpoint, which writes a per-table Vortex -snapshot (`schema.sql` plus one `
.vortex` per table) into -`$VORTEX_BENCH_SNAPSHOT_DIR//`. The script then tars and gzips that directory, uploads to -`$S3_BACKUP_PREFIX/.tar.gz`, and deletes the local copies. The bucket lifecycle rule expires old -objects (default 7 days, 168 hourly snapshots). - -Vortex compresses our shape (BIGINT[] runtime arrays + short strings) about an order of magnitude -better than gzipped CSV, and dogfoods the project's own format. The gzip on the tarball mostly -catches `schema.sql` and tar metadata, not the data columns themselves. - -`/api/admin/snapshot` requires `ts` to match `[A-Za-z0-9_-]{1,64}` and refuses to overwrite an -existing snapshot directory (409). The same endpoint is used out-of-band before risky operations; -just pick a label the timer can't collide with (e.g. `manual-`). - -### Why two listeners? - -The server binds two ports: a public listener (`VORTEX_BENCH_BIND`, typically `0.0.0.0:3000`) carries -`/`, `/api/ingest`, and `/health`. A separate admin listener (`VORTEX_BENCH_ADMIN_BIND`, mandatorily -loopback-only, `127.0.0.1:3001`) carries `/api/admin/*`. The admin listener fails to start on a -non-loopback bind, so `/api/admin/*` cannot reach the public network even when the public bind opens -`0.0.0.0`. Hitting `/api/admin/*` on the public listener always 404s. - -The admin router is mounted only when `ADMIN_BEARER_TOKEN` is set. With the env unset (e.g. local -dev) no admin listener is bound at all, and `backup.sh` / `inspect.sh` fail fast. There is no silent -"backups disabled" mode. - -### The "three grants" admin model - -There is no admin database. Being an admin is three independent things: - -1. **SSH access to the EC2 box** as `ec2-user` (the same identity systemd runs the service as). - Granted by adding the admin's public key to `authorized_keys`, or via AWS Systems Manager Session - Manager. -2. **AWS console access** for the metadata the runtime role intentionally cannot reach (IAM, bucket - policy, lifecycle rules). 
Granted via IAM user or SSO role. -3. **Bearer-token knowledge** for hitting `/api/admin/*` directly. Anyone with SSH access can read - `/etc/vortex-bench.env`, so this grant follows from grant 1. - -Revoking an admin therefore means revoking all three: drop the SSH key, revoke the AWS role, rotate -`ADMIN_BEARER_TOKEN`. Rotating the admin token leaves CI's `INGEST_BEARER_TOKEN` untouched, but that -token is also stored in `/etc/vortex-bench.env`, so rotate it as well if the departing admin had SSH -access. - -## State on disk - -Every persistent file the system owns lives under `/var/lib/vortex-bench/` (state) or `/etc/` -(config): - -| Path | Owner | Lifetime | -| ----------------------------------------------------------- | ------------------------------- | --------------------------------------------------------------------------------------- | -| `/var/lib/vortex-bench/bench.duckdb` (+ `.wal`) | server | Live data; replaced by migrate, restored from S3. | -| `/var/lib/vortex-bench/bench.prev-.duckdb` | `migrate.sh` | Kept until operator deletes; rollback target. | -| `/var/lib/vortex-bench/bin/vortex-bench-server` | `deploy.sh` | Symlink to current versioned binary. | -| `/var/lib/vortex-bench/bin/vortex-bench-server..` | `deploy.sh` | Versioned binaries; last `$KEEP_BINARIES` (default 3) kept. | -| `/var/lib/vortex-bench/snapshots//` | `/api/admin/snapshot` | Transient; `backup.sh` deletes after S3 upload. | -| `/var/lib/vortex-bench/last-deployed-sha` | `deploy.sh` | Stamp file; only written on success. | -| `/var/lib/vortex-bench/.deploy.lock` | `deploy.sh` | flock serialization guard. | -| `/var/lib/vortex-bench/duckdb-extensions/` | DuckDB | Writable extension install dir (`ProtectHome` blocks the default DuckDB path). | -| `/var/lib/vortex-bench/ops` | `install.sh` | Symlink to `/benchmarks-website/ops/`. | -| `/etc/vortex-bench.env` | `install.sh` then operator | Mode 0600 owned by ec2-user; both server and timers read it. 
| -| `/etc/sudoers.d/vortex-bench` | `install.sh` | Grants the run user `systemctl restart`/`start`/`stop` on the v3 units only. | -| `/etc/systemd/system/vortex-bench-*.{service,timer}` | `install.sh` | The five units. | - -## First-time install and disaster recovery - -Removed from this file. Both flows are now in [`BOOTSTRAP.md`](BOOTSTRAP.md): Phases 1 through 7 for -a fresh install, Phases 1 through 6 then 8 for a backup-restore rebuild, Phase 9 for rolling back a -botched migration. Each phase has a verification command so you find out immediately if a step did -not land. Edit `BOOTSTRAP.md` (not this file) when the procedure changes. - -## Wire APIs the ops scripts depend on - -These are the only server endpoints the operator scripts touch. They also constitute the public -admin contract for any future tooling. - -The server exposes two listeners. The public listener carries everything operator-facing and -CI-facing; the admin listener stays loopback-only so `/api/admin/*` cannot reach the public network -even when the public bind opens `0.0.0.0`. - -| Method + path | Bearer | Listener (env var) | Used by | -| ---------------------------------------------------------------- | ------ | ----------------------------------------------- | ------------------------------ | -| `GET /health` | none | public (`$SERVER_URL`, `VORTEX_BENCH_BIND`) | `deploy.sh` post-restart probe | -| `POST /api/ingest` | ingest | public | CI dual-write | -| `POST /api/admin/snapshot?ts=` | admin | admin (`$ADMIN_URL`, `VORTEX_BENCH_ADMIN_BIND`) | `backup.sh` | -| `POST /api/admin/sql` (body `{"sql": …}`, `?format=json\|table`) | admin | admin | `inspect.sh` | - -`POST /api/admin/snapshot` writes `schema.sql` + per-table `.vortex` files; `ts` must match -`[A-Za-z0-9_-]{1,64}` and the directory must not exist (409 otherwise). `POST /api/admin/sql` allows -only `SELECT`/`WITH`/`PRAGMA`/`SHOW`/`DESCRIBE`/`EXPLAIN` and runs each statement inside -`BEGIN TRANSACTION READ ONLY`. 
- -The admin router is mounted only when `ADMIN_BEARER_TOKEN` is set. With the env unset (e.g. in local -dev) no admin listener is bound at all - `backup.sh` and `inspect.sh` fail fast against -`$ADMIN_URL`, so there's no silent "backups disabled" mode. Hitting `/api/admin/*` on the **public** -listener always 404s, regardless of whether admin is configured. - -See [`server/src/admin.rs`](../server/src/admin.rs) for the full contract and the validation rules. - -## Failure modes (conceptual) - -When something breaks, the symptom usually points at exactly one of these. For the actual diagnostic -and repair commands, see the symptom table in -[`BOOTSTRAP.md`](BOOTSTRAP.md#what-to-do-if-a-step-fails). - -- **Deploys retry the same broken SHA forever.** The stamp file is only written on success, so a - failing deploy attempts the same SHA on every 60s tick. Fix the bug and push, or pause the timer - while you investigate. -- **`/health` is slow.** It runs six `SELECT COUNT(*)`s under the connection mutex. Over 1s during a - benchmark ingest window is normal; over 30s means the mutex is stuck. -- **Disk filling under `/var/lib/vortex-bench/`.** Four culprits in order of likelihood: piled-up - `bench.prev-*.duckdb` from old migrations, leftover `snapshots//` directories (backup uploads - failing), accumulated versioned binaries (`KEEP_BINARIES` too high), the live `bench.duckdb` itself - (expected to grow over time). -- **Backups not running.** Either the timer is stopped, the IAM role is broken, or the server - started without `ADMIN_BEARER_TOKEN` so the admin listener never bound and `curl` to it returns - `000`. -- **Migrate failed partway.** `migrate.sh` leaves the server and the autopilot timers stopped on - failure and prints the rollback commands on stderr. The prev DB is on local disk and complete; - restore it before doing anything else. 
-- **Migrating to a new EC2 host.** Stand the new host up, take a final snapshot on the old host, - restore from S3 on the new host, cut DNS. Total RPO is bounded by the backup timer interval (one - hour by default). - -## Local development - -You don't need any of this to run the server locally: - -```bash -INGEST_BEARER_TOKEN=dev \ -ADMIN_BEARER_TOKEN=dev \ -VORTEX_BENCH_DB=/tmp/bench.duckdb \ -cargo run -p vortex-bench-server -``` - -The admin endpoints work the same as in production. The hourly timer and the deploy timer are -systemd-only - they have no local equivalent and don't need one. - -## What's intentionally not here - -- **Docker.** A previous iteration ran the server under `docker compose` with `watchtower` polling - GHCR. We removed it: the binary is small enough that a build-on-host model is simpler, and systemd - gives us atomic restarts and rollback for free. The v2 React site retains its image-based deploy - (separate `Dockerfile` and CI workflow); v3 does not. -- **A push-based deploy.** A GitHub Actions workflow could push via SSM or SSH on every merge. We - chose polling because (a) zero inbound surface on the EC2 box, (b) no shared secret to manage in - CI, and (c) 60s is well under any reasonable expectation for a benchmarks site. If the polling - becomes unworkable, swap `vortex-bench-deploy.timer` for an SSM-triggered ExecStart and the rest - of `deploy.sh` doesn't change. -- **A dedicated SQL endpoint user.** `/api/admin/sql` is gated by the same admin token as - `/api/admin/snapshot`. If you want per-operator audit, run a reverse proxy that adds a header and - log it on the way through. 
diff --git a/benchmarks-website/ops/backup.sh b/benchmarks-website/ops/backup.sh deleted file mode 100755 index dd26372f8d0..00000000000 --- a/benchmarks-website/ops/backup.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Hourly snapshot to S3, called by vortex-bench-backup.timer. -# -# Asks the running server to write a per-table Vortex snapshot via -# /api/admin/snapshot (so the writer uses the same DuckDB process -# that owns the file - no stop required), `tar czf`s the resulting -# directory into a single archive, uploads it to -# $S3_BACKUP_PREFIX/.tar.gz, and deletes the local copies. -# -# Vortex compresses our shape (mostly BIGINT[] runtime arrays + short -# strings) far better than gzipped CSV; the additional gzip on the -# tarball is largely catching schema.sql and tar metadata, not the -# data files themselves. -# -# The instance IAM role must already permit s3:PutObject under -# $S3_BACKUP_PREFIX. The v3 bucket is vortex-benchmark-results-database -# (distinct from v2's vortex-ci-benchmark-results). - -set -euo pipefail - -ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}" -if [ ! -f "$ENV_FILE" ]; then - echo "ERROR: missing ${ENV_FILE}" >&2 - exit 2 -fi -set -a -# shellcheck disable=SC1090 -source "$ENV_FILE" -set +a -: "${ADMIN_BEARER_TOKEN:?ADMIN_BEARER_TOKEN must be set in ${ENV_FILE}}" -: "${VORTEX_BENCH_SNAPSHOT_DIR:?VORTEX_BENCH_SNAPSHOT_DIR must be set}" -: "${S3_BACKUP_PREFIX:?S3_BACKUP_PREFIX must be set in ${ENV_FILE}}" -# `ADMIN_URL` points at the loopback-only admin listener; `SERVER_URL` -# stays for /health checks on the public listener. 
-: "${ADMIN_URL:=http://127.0.0.1:3001}" -: "${STATE_DIR:=/var/lib/vortex-bench}" -: "${BACKUP_LOCK_FILE:=${STATE_DIR}/.backup.lock}" - -log() { printf '[backup %s] %s\n' "$(date -u +%H:%M:%SZ)" "$*"; } - -# Serialise against ourselves: a manual `bash backup.sh` racing the timer -# fires would otherwise both hit /api/admin/snapshot at the same ts (the -# server returns 409 to the loser), then both race on `rm -rf "$local_dir"` -# while the survivor is mid-tar. Quiet bail on contention so the timer -# journal stays clean. -mkdir -p "$(dirname "$BACKUP_LOCK_FILE")" -exec 200>"$BACKUP_LOCK_FILE" -if ! flock -n 200; then - log "another backup is in progress; bailing" - exit 0 -fi - -ts="$(date -u +%Y%m%dT%H%M%SZ)" -local_dir="${VORTEX_BENCH_SNAPSHOT_DIR}/${ts}" -archive="${VORTEX_BENCH_SNAPSHOT_DIR}/${ts}.tar.gz" -remote="${S3_BACKUP_PREFIX}/${ts}.tar.gz" - -# Per-PID scratch files so a manual `bash backup.sh` invocation running -# alongside the timer-driven invocation does not clobber each other's -# response capture or the curl auth header. Cleaned up on exit/trap. -scratch="$(mktemp -d "${TMPDIR:-/tmp}/vortex-bench-backup.XXXXXX")" -trap 'rm -rf "$scratch"' EXIT -response="${scratch}/snapshot.out" -auth_header="${scratch}/auth.hdr" - -# Write the Authorization header to a 0600 file and pass via `-H @path` -# so the bearer token never appears in argv (visible to anyone reading -# `ps aux`). Same pattern in inspect.sh. Wrap in `set +x; ...; set -x` -# guard so an operator running `bash -x backup.sh` does not see the -# bearer in the trace output. 
-umask 077 -{ _xtrace="$(set +o | grep xtrace)"; set +x; } 2>/dev/null -printf 'Authorization: Bearer %s\n' "${ADMIN_BEARER_TOKEN}" > "$auth_header" -eval "$_xtrace" 2>/dev/null || true - -log "triggering /api/admin/snapshot?ts=${ts}" -http_status=$(curl -sS -o "$response" -w '%{http_code}' \ - -X POST \ - -H "@${auth_header}" \ - "${ADMIN_URL}/api/admin/snapshot?ts=${ts}" || echo "000") -if [ "$http_status" != "200" ]; then - echo "ERROR: /api/admin/snapshot returned ${http_status}" >&2 - cat "$response" >&2 || true - exit 3 -fi - -if [ ! -d "$local_dir" ]; then - echo "ERROR: server reported success but ${local_dir} does not exist" >&2 - exit 4 -fi - -# Completeness check: the server writes schema.sql plus one .vortex file -# per fact + dim table. If a deploy-timer restart interrupted the snapshot -# write mid-stream, the directory may be partially populated; the only -# completeness signal otherwise would be the presence of the dir, which -# tar+s3 cp would happily pack and upload as a "valid" archive that -# fails restore (`INSERT INTO ... silently no-op'd` per BOOTSTRAP 8.5). -required_files=( - schema.sql - commits.vortex - query_measurements.vortex - compression_times.vortex - compression_sizes.vortex - random_access_times.vortex - vector_search_runs.vortex -) -missing=() -for f in "${required_files[@]}"; do - [ -e "${local_dir}/${f}" ] || missing+=("$f") -done -if [ "${#missing[@]}" -gt 0 ]; then - echo "ERROR: snapshot ${local_dir} is incomplete; missing: ${missing[*]}" >&2 - echo " Most common cause: vortex-bench-server was restarted mid-snapshot." >&2 - echo " Leaving the partial directory in place for inspection." >&2 - exit 4 -fi - -# Compress the snapshot directory into a single tar.gz. `tar -C` so paths -# inside the archive are relative to the snapshot id (i.e. `/schema.sql` -# and `/
.vortex`), which matches the layout expected by the -# restore docs. -log "compressing ${local_dir} → ${archive}" -if ! tar -C "$VORTEX_BENCH_SNAPSHOT_DIR" -czf "$archive" "$ts"; then - echo "ERROR: tar czf failed" >&2 - rm -f "$archive" - exit 5 -fi - -orig_bytes=$(du -sb "$local_dir" | awk '{print $1}') -gz_bytes=$(stat -c %s "$archive") -log "compressed ${orig_bytes} → ${gz_bytes} bytes ($(( orig_bytes / (gz_bytes > 0 ? gz_bytes : 1) ))x)" - -log "uploading ${archive} → s3://${remote#s3://}" -# Retry transient `aws s3 cp` failures (rate limit / ELB blip / IAM -# role refresh hiccup) before giving up. Backoff 2s, 8s, 30s. -upload_ok=0 -for delay in 0 2 8 30; do - [ "$delay" -gt 0 ] && sleep "$delay" - if aws s3 cp --quiet "${archive}" "${remote}"; then - upload_ok=1 - break - fi - log "aws s3 cp failed; retrying after ${delay:-0}s (next attempt)" -done -if [ "$upload_ok" != "1" ]; then - echo "ERROR: aws s3 cp failed after retries; keeping ${archive} and ${local_dir} for manual recovery" >&2 - exit 6 -fi - -log "deleting local copies (${archive}, ${local_dir})" -rm -f "$archive" -rm -rf "$local_dir" - -log "snapshot ${ts} ok → ${remote}" diff --git a/benchmarks-website/ops/config/vortex-bench.env.example b/benchmarks-website/ops/config/vortex-bench.env.example deleted file mode 100644 index ee73097dd72..00000000000 --- a/benchmarks-website/ops/config/vortex-bench.env.example +++ /dev/null @@ -1,77 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Template for /etc/vortex-bench.env on the EC2 host. -# -# install.sh copies this template into place if /etc/vortex-bench.env is -# missing; the operator then fills in the secrets. The file must be -# mode 0600 owned by the user systemd runs the service as (ec2-user by -# default). systemd's EnvironmentFile= reads this for both the server -# unit and the deploy/backup timers. - -# Bearer token CI presents on POST /api/ingest. Constant-time compared. 
-# Generate with: openssl rand -hex 32 -INGEST_BEARER_TOKEN= - -# Bearer token operators present on /api/admin/snapshot and /api/admin/sql. -# Independent of INGEST_BEARER_TOKEN so the two can rotate separately. -# Generate with: openssl rand -hex 32 -ADMIN_BEARER_TOKEN= - -# DuckDB file the server opens. Lives outside any git checkout so a -# `git pull` never touches it. -VORTEX_BENCH_DB=/var/lib/vortex-bench/bench.duckdb - -# Where /api/admin/snapshot writes per-table Vortex snapshots (schema.sql -# plus one
.vortex file per table). backup.sh uploads the contents -# to S3 then deletes them, so this dir is transient. -VORTEX_BENCH_SNAPSHOT_DIR=/var/lib/vortex-bench/snapshots - -# Where DuckDB installs extensions. Defaults to -# `/duckdb-extensions`. The systemd unit's -# `ProtectHome=read-only` blocks DuckDB's default `~/.duckdb/extensions/...` -# install path, so this needs to live under a writable STATE_DIR subdir. -# Override only if you keep state outside /var/lib/vortex-bench. -#VORTEX_BENCH_EXTENSION_DIR=/var/lib/vortex-bench/duckdb-extensions - -# `host:port` the *public* listener binds to (ingest, read API, HTML, -# /health). Behind a reverse proxy (or just exposed directly on the -# EC2 SG), 0.0.0.0 is correct. -VORTEX_BENCH_BIND=0.0.0.0:3000 - -# `host:port` the *admin* listener binds to. Must remain loopback-only -# so `/api/admin/*` never reaches the public network even when -# VORTEX_BENCH_BIND opens 0.0.0.0. backup.sh / inspect.sh talk to this -# address via $ADMIN_URL. -VORTEX_BENCH_ADMIN_BIND=127.0.0.1:3001 - -# tracing-subscriber env filter spec. -VORTEX_BENCH_LOG=info,vortex_bench_server=info - -# --- ops scripts only (not consumed by the server itself) -------------- - -# Repo checkout the deploy timer pulls and builds from. Owned by the -# same user as the systemd services so `git pull` and `cargo build` don't -# need sudo. -REPO_DIR=/home/ec2-user/vortex - -# Branch the deploy timer tracks. -DEPLOY_BRANCH=develop - -# S3 prefix backup.sh syncs hourly snapshots to. The instance IAM role -# must already permit s3:PutObject under this prefix. -S3_BACKUP_PREFIX=s3://vortex-benchmark-results-database/v3-backups - -# URL of the *public* listener, used by deploy.sh's /health check. -SERVER_URL=http://127.0.0.1:3000 - -# URL of the *admin* listener, used by backup.sh (/api/admin/snapshot) -# and inspect.sh (/api/admin/sql). Defaults to http://127.0.0.1:3001 -# when unset; override only if you changed VORTEX_BENCH_ADMIN_BIND. 
-ADMIN_URL=http://127.0.0.1:3001 - -# Number of versioned binaries deploy.sh keeps under $BIN_DIR after a -# successful deploy. Defaults to 3 (current + previous + one buffer). -# Drop to 1 during a disk-pressure incident per the ops/README.md -# "Disk full" runbook, then bounce the deploy timer. -#KEEP_BINARIES=3 diff --git a/benchmarks-website/ops/deploy.sh b/benchmarks-website/ops/deploy.sh deleted file mode 100755 index 9dfd8d30442..00000000000 --- a/benchmarks-website/ops/deploy.sh +++ /dev/null @@ -1,332 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Idempotent rebuild + restart, called by vortex-bench-deploy.timer -# every 60s. Cheap and silent on the common path (no new commits). -# -# Flow: -# 1. flock on a state file (concurrent runs bail). -# 2. git fetch origin $DEPLOY_BRANCH. -# 3. If origin SHA == last-deployed SHA → exit 0. -# 4. Else: git diff against a path filter. If nothing in the filter -# changed, sync the working tree (destructive checkout) to the -# new SHA, update the stamp, exit 0. (Skips a build for monorepo -# changes that don't touch the server.) -# 5. Else: sync working tree + cargo build --release -p vortex-bench-server. -# 6. Compare new binary's sha256 to the currently-running symlink target. -# If unchanged (cargo did no real work), update stamp + exit 0. -# 7. Else: copy to bin/vortex-bench-server.. (PID suffix -# breaks same-second deploy collisions), atomically swap the -# symlink (staging symlink + `mv -Tf` so the swap is rename(2)), -# sudo systemctl restart vortex-bench-server. -# 8. Wait for /health. On failure: revert symlink, restart the prior -# binary, re-probe /health (so a rollback to an also-broken -# binary is loud), do NOT update the stamp - next tick retries. -# 9. On success: update stamp, prune binary versions older than $KEEP_BINARIES. 
-# -# The working-tree sync is `git checkout --force --detach `, not -# `git pull --ff-only`, so the script survives force-pushes on the -# tracked branch. -# -# Exit codes: -# 0 success (either a real deploy or a clean no-op) -# 1 another deploy is in progress (lock held) -# 2 config error (missing env file, REPO_DIR, etc.) -# 3 git fetch failed -# 4 cargo build failed -# 5 systemctl restart failed -# 6 /health check failed; rolled back to previous binary successfully -# 7 /health check failed AND rollback to the previous binary ALSO -# failed /health - server is down, manual intervention required. -# This is the worst-case path; the timer will retry on next tick -# but the prior binary is itself broken, so the retry will not -# heal the host. - -set -euo pipefail - -ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}" -STATE_DIR="${STATE_DIR:-/var/lib/vortex-bench}" -LOCK_FILE="${LOCK_FILE:-${STATE_DIR}/.deploy.lock}" -STAMP_FILE="${STAMP_FILE:-${STATE_DIR}/last-deployed-sha}" -BIN_DIR="${BIN_DIR:-${STATE_DIR}/bin}" -BIN_SYMLINK="${BIN_DIR}/vortex-bench-server" -KEEP_BINARIES="${KEEP_BINARIES:-3}" - -log() { printf '[deploy %s] %s\n' "$(date -u +%H:%M:%SZ)" "$*"; } -err() { printf '[deploy %s] ERROR: %s\n' "$(date -u +%H:%M:%SZ)" "$*" >&2; } - -# --- Load env --- -if [ ! -f "$ENV_FILE" ]; then - err "missing ${ENV_FILE}" - exit 2 -fi -set -a -# shellcheck disable=SC1090 -source "$ENV_FILE" -set +a -: "${REPO_DIR:?REPO_DIR must be set in ${ENV_FILE}}" -: "${DEPLOY_BRANCH:=develop}" -: "${SERVER_URL:=http://127.0.0.1:3000}" - -if [ ! -d "${REPO_DIR}/.git" ]; then - err "${REPO_DIR} is not a git checkout" - exit 2 -fi - -# --- Lock --- -mkdir -p "$(dirname "$LOCK_FILE")" -exec 200>"$LOCK_FILE" -if ! flock -n 200; then - log "another deploy is in progress; bailing" - exit 1 -fi - -# Pick up cargo from the user's profile if not on PATH already. -# shellcheck disable=SC1091 -. 
"$HOME/.cargo/env" 2>/dev/null || true - -cd "$REPO_DIR" - -last_sha="" -[ -f "$STAMP_FILE" ] && last_sha="$(cat "$STAMP_FILE")" - -# --- Fetch --- -if ! git fetch --quiet origin "$DEPLOY_BRANCH"; then - err "git fetch origin ${DEPLOY_BRANCH} failed" - exit 3 -fi -new_sha="$(git rev-parse "origin/${DEPLOY_BRANCH}")" - -# --- Force mode --- -# A `FORCE=1` env var (or a `.force-rebuild` sentinel under the state -# dir) bypasses the SHA-unchanged fast path and treats this run as if -# nothing was ever deployed. Used by `force-rebuild.sh` for the -# "redeploy current branch right now" path. The sentinel is consumed -# so the next ordinary timer fire is a no-op again. -force=0 -if [ "${FORCE:-0}" = "1" ]; then - force=1 -fi -if [ -f "${STATE_DIR}/.force-rebuild" ]; then - rm -f "${STATE_DIR}/.force-rebuild" - force=1 -fi -if [ "$force" = "1" ]; then - log "force mode: ignoring stamp comparison and path filter" - last_sha="" -fi - -if [ "$force" = "0" ] && [ "$new_sha" = "$last_sha" ]; then - # Common case: nothing new since last fire. Silent on stdout to - # keep the journal clean. - exit 0 -fi - -# --- Path filter --- -# Rebuild + restart only when commits in the range touch website code, -# the workspace lockfile, or workspace Cargo manifests. Other changes -# (e.g. vortex-array fixes) update the working tree but don't restart. -filter_paths=( - benchmarks-website/server - benchmarks-website/migrate - Cargo.lock - Cargo.toml -) - -if [ -z "$last_sha" ] || ! git cat-file -e "${last_sha}^{commit}" 2>/dev/null; then - # First run, or stamp points at a commit we no longer have. Treat - # as "must rebuild" so we don't silently skip a real change. Clear - # last_sha in the vanished-commit branch too so the hash-equal - # restart-skip fast path below sees an empty last_sha and forces a - # full restart + /health verify (the prior stamp commit being gone - # means we can't trust whatever the symlink currently points at). 
- log "first run / unknown stamp '${last_sha:-}'; full rebuild" - last_sha="" - relevant_changed=1 -else - if git diff --name-only "${last_sha}" "${new_sha}" -- "${filter_paths[@]}" | grep -q .; then - relevant_changed=1 - else - relevant_changed=0 - fi -fi - -# --- Sync the working tree to origin/$DEPLOY_BRANCH --- -# `git pull --ff-only` breaks the moment the tracked branch is -# force-pushed (typical during PR iteration). The deploy worker's -# checkout is build-only - no human edits live here - so a destructive -# `git checkout --force --detach $new_sha` is the right semantics. -# Detached HEAD avoids any local-branch ref drift. -if ! git checkout --quiet --force --detach "$new_sha"; then - err "git checkout --force --detach ${new_sha} failed" - exit 3 -fi - -if [ "$relevant_changed" = "0" ]; then - log "no website-relevant paths changed in ${last_sha:0:7}..${new_sha:0:7}; skipping rebuild" - # Atomic stamp write so a kill mid-redirect cannot leave a truncated - # stamp the next tick would treat as a vanished commit. - printf '%s\n' "$new_sha" > "${STAMP_FILE}.tmp" - mv -f "${STAMP_FILE}.tmp" "$STAMP_FILE" - exit 0 -fi - -# --- Build --- -prev_short="${last_sha:0:7}" -log "building ${new_sha:0:7} (was ${prev_short:-})" -if ! cargo build --release --quiet -p vortex-bench-server; then - err "cargo build -p vortex-bench-server failed" - exit 4 -fi -new_binary="${REPO_DIR}/target/release/vortex-bench-server" -if [ ! -x "$new_binary" ]; then - err "expected binary not found at ${new_binary}" - exit 4 -fi - -# --- Compare hashes; skip restart if cargo produced byte-identical output --- -# Force mode (FORCE=1 / .force-rebuild) explicitly opts out of this fast -# path: the operator asked for "redeploy and reverify", not "skip if the -# byte image matches", so we must still restart + /health-poll. 
-# -# An empty last_sha (no stamp file, OR a stamp file we just rejected as -# pointing at a vanished commit at line 130) ALSO opts out: the only -# guarantee that the symlink's current target was health-verified comes -# from the stamp file being present. Without it the live binary could -# be a partially-applied prior deploy that crashed between symlink swap -# and the restart, in which case the byte-equal path would stamp -# success without ever restarting the server. -new_hash="$(sha256sum "$new_binary" | awk '{print $1}')" -current_hash="" -if [ -L "$BIN_SYMLINK" ] && [ -e "$BIN_SYMLINK" ]; then - current_hash="$(sha256sum "$BIN_SYMLINK" | awk '{print $1}')" -fi -if [ "$force" = "0" ] && [ -n "$last_sha" ] && [ "$new_hash" = "$current_hash" ]; then - log "binary unchanged (sha256 ${new_hash:0:12}); skipping restart" - printf '%s\n' "$new_sha" > "${STAMP_FILE}.tmp" - mv -f "${STAMP_FILE}.tmp" "$STAMP_FILE" - exit 0 -fi - -# --- Install + atomic symlink swap --- -# `ln -sfnT` is unlink+create - there is a brief window where $BIN_SYMLINK -# does not exist, and a concurrent reader (e.g. systemctl restart firing -# from another timer fire) would see ENOENT. Do the swap in two steps so -# the final transition is `rename(2)`, which IS atomic on POSIX: create -# the new symlink under a sibling name, then `mv -Tf` it onto $BIN_SYMLINK. -# Same pattern is used in both rollback paths below. -# Include a per-process suffix so two deploys within the same UTC -# second (e.g. timer fire racing with a manual force-rebuild) cannot -# collide on the versioned filename. Without it, the second `install` -# would overwrite the first's binary and `prev_target` could end up -# pointing at a path whose contents are not the prior binary anymore. 
-ts="$(date -u +%Y%m%dT%H%M%SZ)" -versioned="${BIN_DIR}/vortex-bench-server.${ts}.$$" -install -m 0755 "$new_binary" "$versioned" -prev_target="" -if [ -L "$BIN_SYMLINK" ]; then - prev_target="$(readlink "$BIN_SYMLINK")" -fi - -atomic_symlink() { - # $1 = symlink target, $2 = symlink path - local target="$1" path="$2" staging - staging="${path}.new.$$" - ln -snT -- "$target" "$staging" - mv -Tf -- "$staging" "$path" -} - -atomic_symlink "$versioned" "$BIN_SYMLINK" -log "swapped symlink → ${versioned}" - -# --- Restart + verify --- -if ! sudo /bin/systemctl restart vortex-bench-server; then - err "systemctl restart failed" - if [ -n "$prev_target" ]; then - atomic_symlink "$prev_target" "$BIN_SYMLINK" - # Rollback restart is best-effort; the worst case (rollback ALSO - # failed /health) escalates to exit 7 below via the same - # /health re-probe used by the /health-failure path. Without - # the re-probe, a doubly-broken host would silently exit 5 and - # the next timer tick would loop on the same broken binary. - sudo /bin/systemctl restart vortex-bench-server || true - roll_deadline=$(( $(date +%s) + 30 )) - roll_healthy=0 - while [ "$(date +%s)" -lt "$roll_deadline" ]; do - if curl -fsS --max-time 3 "${SERVER_URL}/health" >/dev/null 2>&1; then - roll_healthy=1 - break - fi - sleep 1 - done - if [ "$roll_healthy" = "1" ]; then - log "rolled back symlink to ${prev_target} (verified healthy after systemctl-restart failure)" - exit 5 - fi - err "rollback to ${prev_target} ALSO failed /health - server is down; manual intervention required" - exit 7 - fi - exit 5 -fi - -# Give it a moment to come up, then poll /health. 
-deadline=$(( $(date +%s) + 30 )) -healthy=0 -while [ "$(date +%s)" -lt "$deadline" ]; do - if curl -fsS --max-time 3 "${SERVER_URL}/health" >/dev/null 2>&1; then - healthy=1 - break - fi - sleep 1 -done -if [ "$healthy" != "1" ]; then - err "/health did not respond within 30s - rolling back" - if [ -n "$prev_target" ]; then - atomic_symlink "$prev_target" "$BIN_SYMLINK" - sudo /bin/systemctl restart vortex-bench-server || true - # Verify the rolled-back binary is itself healthy before claiming - # clean rollback. A "previous binary" that's also broken (e.g. a - # prior failed deploy nobody caught) needs a louder signal. - roll_deadline=$(( $(date +%s) + 30 )) - roll_healthy=0 - while [ "$(date +%s)" -lt "$roll_deadline" ]; do - if curl -fsS --max-time 3 "${SERVER_URL}/health" >/dev/null 2>&1; then - roll_healthy=1 - break - fi - sleep 1 - done - if [ "$roll_healthy" = "1" ]; then - log "rolled back symlink to ${prev_target} (verified healthy)" - exit 6 - fi - err "rollback to ${prev_target} ALSO failed /health - server is down; manual intervention required" - exit 7 - else - err "no previous binary to roll back to" - fi - exit 6 -fi - -# --- Success: update stamp, prune old binaries --- -printf '%s\n' "$new_sha" > "${STAMP_FILE}.tmp" -mv -f "${STAMP_FILE}.tmp" "$STAMP_FILE" -log "deploy ok: ${new_sha:0:7} → live (binary ${ts})" - -# Keep the most recent $KEEP_BINARIES versioned binaries, drop the rest. -# Glob is restricted to the digit-prefix timestamp form so a stale staging -# symlink (`vortex-bench-server.new.`) left over from a killed -# `atomic_symlink` call cannot be picked up here and either inflate the -# keep-N count or be selected for deletion as if it were an old binary. 
-mapfile -t binaries < <(ls -1 "${BIN_DIR}"/vortex-bench-server.[0-9]* 2>/dev/null | sort) -if [ "${#binaries[@]}" -gt "$KEEP_BINARIES" ]; then - drop_count=$(( ${#binaries[@]} - KEEP_BINARIES )) - for b in "${binaries[@]:0:$drop_count}"; do - # Never delete what the symlink currently points at. - if [ "$b" != "$(readlink -f "$BIN_SYMLINK")" ]; then - rm -f "$b" - log "pruned ${b}" - fi - done -fi diff --git a/benchmarks-website/ops/force-rebuild.sh b/benchmarks-website/ops/force-rebuild.sh deleted file mode 100755 index 61a3e2d1363..00000000000 --- a/benchmarks-website/ops/force-rebuild.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Force a rebuild + restart of origin/$DEPLOY_BRANCH right now, even -# if origin hasn't moved since the last successful deploy. Drops a -# sentinel that the next deploy.sh fire consumes, then triggers it. -# -# Use cases: -# - You changed /etc/vortex-bench.env and want a fresh binary build -# (e.g. a feature flag baked into config) rather than just a -# `systemctl restart` of the existing one. -# - You flipped DEPLOY_BRANCH and want the new tip in <60s rather -# than waiting for the timer. -# - Build artefacts got wedged and you want a clean rebuild. -# -# For "build whatever I have locally checked out" rather than fetching -# origin, edit /etc/vortex-bench.env to point DEPLOY_BRANCH at a -# branch the local tip is already on, then run this. The deploy -# script always builds origin's tip - there is no "use local HEAD" -# mode by design; push to a branch first. - -set -euo pipefail - -STATE_DIR="${STATE_DIR:-/var/lib/vortex-bench}" - -if [ ! -d "$STATE_DIR" ]; then - echo "ERROR: ${STATE_DIR} not found - has install.sh run?" >&2 - exit 2 -fi - -# The sentinel file needs to be writable by the user the deploy -# service runs as. install.sh chowns STATE_DIR to that user, so this -# works without sudo. 
If you're running as a different user, sudo. -if ! touch "${STATE_DIR}/.force-rebuild" 2>/dev/null; then - echo "ERROR: cannot write ${STATE_DIR}/.force-rebuild - run as the install user or sudo" >&2 - exit 2 -fi - -echo "[force-rebuild] sentinel dropped; firing deploy service" -sudo /bin/systemctl start vortex-bench-deploy.service -echo "[force-rebuild] tail with: journalctl -fu vortex-bench-deploy.service" diff --git a/benchmarks-website/ops/inspect.sh b/benchmarks-website/ops/inspect.sh deleted file mode 100755 index 5362a6f5c98..00000000000 --- a/benchmarks-website/ops/inspect.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Run a read-only SQL query against the live DuckDB without stopping -# the server. Calls /api/admin/sql and prints the duckdb-cli-style -# table. -# -# Usage: -# ./inspect.sh "SELECT COUNT(*) FROM commits;" -# echo "PRAGMA table_info('commits');" | ./inspect.sh -# ./inspect.sh -j "SELECT * FROM compression_sizes LIMIT 3" # raw json -# -# The server allows SELECT, WITH, PRAGMA, SHOW, DESCRIBE, EXPLAIN. -# Anything else is rejected with 403 by the server (so a typo'd UPDATE -# can't run). - -set -euo pipefail - -ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}" -if [ ! -f "$ENV_FILE" ]; then - echo "ERROR: missing ${ENV_FILE}" >&2 - exit 2 -fi -set -a -# shellcheck disable=SC1090 -source "$ENV_FILE" -set +a -: "${ADMIN_BEARER_TOKEN:?ADMIN_BEARER_TOKEN must be set in ${ENV_FILE}}" -# Admin SQL lives on the loopback-only admin listener; the public bind -# (`SERVER_URL`) does not match `/api/admin/*` at all. -: "${ADMIN_URL:=http://127.0.0.1:3001}" - -format=table -if [ "${1:-}" = "-j" ] || [ "${1:-}" = "--json" ]; then - format=json - shift -fi - -if [ -n "${1:-}" ]; then - sql="$1" -else - sql="$(cat)" -fi - -# Build the JSON body with `jq --arg` so quoting in the SQL is a non-issue -# (or python3's json.dumps if jq is missing). 
-if command -v jq >/dev/null 2>&1; then - body=$(jq -nc --arg sql "$sql" '{sql: $sql}') -elif command -v python3 >/dev/null 2>&1; then - body=$(python3 -c 'import json,sys; print(json.dumps({"sql": sys.argv[1]}))' "$sql") -else - echo "ERROR: install jq or python3 to call /api/admin/sql safely" >&2 - exit 2 -fi - -# Write the Authorization header to a 0600 file and pass via `-H @path` -# so the bearer token never appears in argv (visible to anyone reading -# `ps aux`). Same pattern in backup.sh. -scratch="$(mktemp -d "${TMPDIR:-/tmp}/vortex-bench-inspect.XXXXXX")" -trap 'rm -rf "$scratch"' EXIT -auth_header="${scratch}/auth.hdr" -umask 077 -# Suppress xtrace for the one line that holds the bearer so `bash -x inspect.sh` -# does not leak the token to stderr. -{ _xtrace="$(set +o | grep xtrace)"; set +x; } 2>/dev/null -printf 'Authorization: Bearer %s\n' "${ADMIN_BEARER_TOKEN}" > "$auth_header" -eval "$_xtrace" 2>/dev/null || true - -curl -fsS \ - -X POST \ - -H "@${auth_header}" \ - -H "Content-Type: application/json" \ - --data-binary "$body" \ - "${ADMIN_URL}/api/admin/sql?format=${format}" -echo diff --git a/benchmarks-website/ops/install.sh b/benchmarks-website/ops/install.sh deleted file mode 100755 index 244aac4b52f..00000000000 --- a/benchmarks-website/ops/install.sh +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# One-time bootstrap of vortex-bench-server on a fresh EC2 host. -# Idempotent - safe to re-run after editing units or to recover from -# partial state. See ops/README.md for the full operator runbook. -# -# Run as a user with sudo (typically ec2-user). The script will: -# 1. Create state directories under /var/lib/vortex-bench, owned by -# $RUN_USER. Logs go to journalctl, not to a file. -# 2. Drop a sudoers fragment that lets $RUN_USER restart the server -# service without a password (so the deploy timer can run as the -# service user). 
-# 3. Copy /etc/vortex-bench.env from the template if missing (mode 0600). -# 4. Install the systemd units and reload systemd. -# 5. Symlink the ops/ directory into /var/lib/vortex-bench so the -# systemd units have a stable path (the repo can move). -# 6. Enable + start the server, deploy timer, and backup timer. -# -# Usage: -# ./benchmarks-website/ops/install.sh -# REPO_DIR=$HOME/vortex ./benchmarks-website/ops/install.sh -# -# Only REPO_DIR is honored as an env override; the run user, state-dir, -# env-file, systemd-dir, and sudoers-file paths are pinned (they have -# to match the shipped systemd units, which hard-code these values). - -set -euo pipefail - -# The installed systemd units hard-code `User=ec2-user`, -# `EnvironmentFile=/etc/vortex-bench.env`, and the -# `/var/lib/vortex-bench` state-dir. Keep these values aligned with -# the units in `systemd/` and the runbook in `README.md`; the script -# does NOT template the units at install time. Anyone running on a -# different user / state-dir / env-file path needs to hand-edit the -# units before this script copies them into /etc/systemd/system. -RUN_USER="ec2-user" -RUN_GROUP="${RUN_USER}" -REPO_DIR="${REPO_DIR:-$HOME/vortex}" -STATE_DIR="/var/lib/vortex-bench" -ENV_FILE="/etc/vortex-bench.env" -SYSTEMD_DIR="/etc/systemd/system" -SUDOERS_FILE="/etc/sudoers.d/vortex-bench" - -ops_dir="${REPO_DIR}/benchmarks-website/ops" -if [ ! -d "$ops_dir" ]; then - echo "ERROR: ${ops_dir} not found. Set REPO_DIR=." >&2 - exit 2 -fi - -# Preflight: every external command the autopilot will call from this -# point on. Fail loudly here with a one-line summary instead of producing -# systemd units that 5xx silently in the journal hours later. 
-missing=() -for cmd in git cargo openssl jq flock aws tar gzip curl; do - command -v "$cmd" >/dev/null 2>&1 || missing+=("$cmd") -done -if [ "${#missing[@]}" -gt 0 ]; then - echo "ERROR: missing required commands: ${missing[*]}" >&2 - echo " (install per BOOTSTRAP.md Phase 2 before re-running install.sh)" >&2 - exit 2 -fi - -# The deploy timer runs as ${RUN_USER} with no SSH agent, so an SSH -# remote fails with "Permission denied (publickey)" on every fire. -# Public-repo HTTPS reads need no auth - warn early so this is not the -# first surprise out of the gate. -if [ -d "${REPO_DIR}/.git" ]; then - origin_url="$(git -C "$REPO_DIR" remote get-url origin 2>/dev/null || true)" - case "$origin_url" in - git@*|ssh://*) - echo "WARNING: ${REPO_DIR}'s origin is ${origin_url}." >&2 - echo " The deploy timer cannot fetch over SSH (no agent). Fix with:" >&2 - echo " git -C ${REPO_DIR} remote set-url origin https://github.com/vortex-data/vortex.git" >&2 - ;; - esac -fi - -log() { printf '[install] %s\n' "$*"; } - -# --- 1. State directories --- -log "creating ${STATE_DIR} (owner ${RUN_USER}:${RUN_GROUP})" -sudo install -d -m 0755 -o "$RUN_USER" -g "$RUN_GROUP" \ - "$STATE_DIR" \ - "${STATE_DIR}/bin" \ - "${STATE_DIR}/snapshots" \ - "${STATE_DIR}/duckdb-extensions" - -# --- 2. Sudoers fragment --- -# Let RUN_USER restart/start/stop/status the v3 systemd units without a -# password. Scope is intentionally tight: only the four units the -# autopilot owns, only the four verbs ops/migrate.sh and ops/deploy.sh -# call. The other v3 admin operations (editing /etc/vortex-bench.env, -# moving /var/lib/vortex-bench files) still require a full sudo session -# the operator can audit. -log "writing sudoers fragment to ${SUDOERS_FILE}" -# Write to a tempfile, validate with visudo, ONLY THEN move into place. 
-# A broken sudoers fragment in /etc/sudoers.d/ breaks sudo system-wide -# and removes the operator's repair path; the validate-before-rename -# pattern below makes a syntax error fail safely. -# -# Sudoers matches argv exactly. Each authorized invocation must be a -# SINGLE-unit `systemctl ` call; multi-unit invocations -# (`stop A B C`) would NOT be authorized by these per-unit lines, so -# every caller (deploy.sh, restart.sh, migrate.sh, force-rebuild.sh) -# splits its stops/starts into one call per unit. -sudoers_tmp="$(sudo mktemp /etc/sudoers.d/.vortex-bench.XXXXXX)" -sudo tee "$sudoers_tmp" >/dev/null </dev/null; then - sudo rm -f "$sudoers_tmp" - echo "ERROR: sudoers fragment failed visudo validation; refusing to install." >&2 - exit 3 -fi -sudo mv -f "$sudoers_tmp" "$SUDOERS_FILE" - -# --- 3. Env file --- -if [ ! -f "$ENV_FILE" ]; then - log "creating ${ENV_FILE} from template (mode 0600 owned by ${RUN_USER})" - sudo install -m 0600 -o "$RUN_USER" -g "$RUN_GROUP" \ - "${ops_dir}/config/vortex-bench.env.example" \ - "$ENV_FILE" - log "EDIT ${ENV_FILE} to set INGEST_BEARER_TOKEN, ADMIN_BEARER_TOKEN, REPO_DIR" -else - log "${ENV_FILE} already present - leaving alone" -fi - -# --- 4. Symlink ops/ into the state dir --- -# Gives systemd units a stable path that doesn't depend on the repo -# checkout location moving. Stage-and-rename so the symlink is never -# missing for a window where a concurrent timer fire's ExecStart could -# ENOENT (matches the atomic-symlink pattern in deploy.sh). -log "symlinking ${ops_dir} -> ${STATE_DIR}/ops" -sudo ln -snT "$ops_dir" "${STATE_DIR}/ops.new" -sudo mv -Tf "${STATE_DIR}/ops.new" "${STATE_DIR}/ops" - -# --- 5. 
systemd units --- -log "installing systemd units to ${SYSTEMD_DIR}" -for unit in \ - vortex-bench-server.service \ - vortex-bench-deploy.service \ - vortex-bench-deploy.timer \ - vortex-bench-backup.service \ - vortex-bench-backup.timer -do - sudo install -m 0644 -o root -g root \ - "${ops_dir}/systemd/${unit}" \ - "${SYSTEMD_DIR}/${unit}" -done -sudo systemctl daemon-reload - -# --- 6. Enable (and start, if tokens are set) --- -# The server unit needs a binary at /var/lib/vortex-bench/bin/vortex-bench-server -# before it can start. If the symlink isn't there yet, the deploy timer -# will lay one down on its first run; until then the server will fail. -if [ ! -e "${STATE_DIR}/bin/vortex-bench-server" ]; then - log "no binary at ${STATE_DIR}/bin/vortex-bench-server yet" - log " → the first deploy-timer fire (after start) will build + install one." - log " → tail it with: journalctl -fu vortex-bench-deploy.service" -fi - -# Detect whether the operator has filled in the bearer tokens. An empty -# INGEST_BEARER_TOKEN makes the server fail startup; an empty -# ADMIN_BEARER_TOKEN leaves the admin listener unbound. Both cases mean -# starting the units now would just produce noisy failures - enable but -# defer the start instead. Source the env file in a subshell and test -# the runtime values: the prior `grep '^X=.+'` heuristic matched -# explicitly-empty `X=""` lines (the two quote characters satisfy `.+`) -# and started units that immediately failed validate_bearer_token. -tokens_set=$( - set -a - # shellcheck disable=SC1090 - . "$ENV_FILE" - set +a - if [ -n "${INGEST_BEARER_TOKEN:-}" ] && [ -n "${ADMIN_BEARER_TOKEN:-}" ]; then - echo yes - fi -) - -if [ "$tokens_set" = "yes" ]; then - log "tokens present in ${ENV_FILE} - enabling + starting deploy/backup timers" - # NB: we do NOT start vortex-bench-server.service here. 
The binary - # at /var/lib/vortex-bench/bin/vortex-bench-server does not exist - # until the deploy timer's first fire builds and installs one; the - # server then comes up automatically when deploy.sh runs `systemctl - # restart vortex-bench-server` after the symlink swap. Starting - # the server unit before that would just produce a noisy failure. - sudo systemctl enable --now vortex-bench-deploy.timer - sudo systemctl enable --now vortex-bench-backup.timer - sudo systemctl enable vortex-bench-server.service -else - log "tokens not set in ${ENV_FILE} - timers enabled but not started" - sudo systemctl enable vortex-bench-deploy.timer - sudo systemctl enable vortex-bench-backup.timer - sudo systemctl enable vortex-bench-server.service - log "after editing ${ENV_FILE}, run:" - log " sudo systemctl start vortex-bench-deploy.timer" - log " sudo systemctl start vortex-bench-backup.timer" - log " (server starts automatically on the deploy timer's first fire)" -fi - -log "" -log "install complete. Next steps:" -log " 1. Edit ${ENV_FILE} (chmod 0600, owned by ${RUN_USER}):" -log " - INGEST_BEARER_TOKEN=" -log " - ADMIN_BEARER_TOKEN=" -log " - confirm REPO_DIR points at the actual checkout" -log " 2. After starting the timers, watch the first deploy fire build the" -log " binary and bring the server up with an empty DuckDB:" -log " journalctl -fu vortex-bench-deploy.service" -log " curl http://127.0.0.1:3000/health" -log " 3. Populate the DB with the v2 to v3 migration (server is stopped" -log " and restarted automatically; see BOOTSTRAP.md Phase 7.A for the" -log " migrator's CLI flag set):" -log " ${STATE_DIR}/ops/migrate.sh run --help" -log " 4. 
(If preserving an existing \$HOME/bench.duckdb instead of" -log " re-migrating, copy it into place before step 3:" -log " sudo systemctl stop vortex-bench-server" -log " mv \$HOME/bench.duckdb ${STATE_DIR}/bench.duckdb" -log " sudo systemctl start vortex-bench-server" -log " and skip step 3.)" diff --git a/benchmarks-website/ops/migrate.sh b/benchmarks-website/ops/migrate.sh deleted file mode 100755 index 13ea316a36c..00000000000 --- a/benchmarks-website/ops/migrate.sh +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Manual v2 to v3 migration wrapper. The migration tool needs exclusive -# access to the DB file, so the server is stopped first, the current DB -# is snapshotted to bench.prev-.duckdb for instant rollback, the -# migrate binary runs, and the server is started back up. -# -# Run from any directory while SSH'd onto the EC2 host. The args are -# passed through verbatim to `cargo run -p vortex-bench-migrate --`, so -# the operator owns the migrator's CLI surface (which has been changing -# while v3 stabilises). The wrapper only handles stop / snapshot prev -# DB / restart. -# -# Examples: -# /var/lib/vortex-bench/ops/migrate.sh run --output "$VORTEX_BENCH_DB" -# -# (Run as ec2-user is fine - we sudo only for systemctl.) - -set -euo pipefail - -ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}" -if [ ! -f "$ENV_FILE" ]; then - echo "ERROR: missing ${ENV_FILE}" >&2 - exit 2 -fi -set -a -# shellcheck disable=SC1090 -source "$ENV_FILE" -set +a -: "${REPO_DIR:?REPO_DIR must be set in ${ENV_FILE}}" -: "${VORTEX_BENCH_DB:?VORTEX_BENCH_DB must be set in ${ENV_FILE}}" -: "${SERVER_URL:=http://127.0.0.1:3000}" - -log() { printf '[migrate %s] %s\n' "$(date -u +%H:%M:%SZ)" "$*"; } - -if [ ! -d "${REPO_DIR}/.git" ]; then - echo "ERROR: REPO_DIR=${REPO_DIR} is not a git checkout" >&2 - exit 2 -fi - -# shellcheck disable=SC1091 -. 
"$HOME/.cargo/env" 2>/dev/null || true - -# Pause the autopilot for the duration of the migration. Stopping the -# timers alone is not enough - if deploy.service or backup.service is -# already mid-run, the active oneshot keeps going and can restart the -# server or call /api/admin/snapshot while the migrator owns the DB. -# Stop the services first (interrupting any active run, idempotent -# no-op if inactive), then the timers, then the server. -# -# The migration_succeeded flag is flipped to 1 only after the server -# comes back healthy. The trap restores the autopilot on success; on -# failure the autopilot stays paused so the operator can perform the -# documented mv-rollback without the deploy timer trying to re-fetch -# origin and run a fresh build on top of the half-rolled-back DB. -migration_succeeded=0 -# The sudoers fragment install.sh writes lists each unit on its own -# Cmnd line, and sudoers requires argv match exactly: multi-unit -# `systemctl stop A B C D` would NOT be authorized by per-unit entries -# and must be split into N single-unit invocations. Same on the -# success-restore path. We deliberately do NOT use `|| true` so a real -# sudo failure surfaces in the journal instead of silently no-op'ing the -# autopilot pause. Install the trap BEFORE the stop calls so a partial -# stop (one of the four sudo calls failing under set -e) still triggers -# the restore path. 
-restore_autopilot() { - if [ "$migration_succeeded" = "1" ]; then - log "restoring autopilot timers (deploy + backup)" - sudo /bin/systemctl start vortex-bench-deploy.timer - sudo /bin/systemctl start vortex-bench-backup.timer - else - log "migration did not complete - leaving autopilot timers stopped" - log " inspect with: systemctl status vortex-bench-server \\" - log " vortex-bench-deploy.service vortex-bench-deploy.timer \\" - log " vortex-bench-backup.service vortex-bench-backup.timer" - log " after rollback and verification, restart timers with:" - log " sudo systemctl start vortex-bench-deploy.timer" - log " sudo systemctl start vortex-bench-backup.timer" - fi -} -trap restore_autopilot EXIT - -log "stopping autopilot services (deploy + backup) + timers for migration window" -sudo /bin/systemctl stop vortex-bench-deploy.timer -sudo /bin/systemctl stop vortex-bench-deploy.service -sudo /bin/systemctl stop vortex-bench-backup.timer -sudo /bin/systemctl stop vortex-bench-backup.service - -log "stopping vortex-bench-server" -sudo /bin/systemctl stop vortex-bench-server - -# Snapshot the current DB so a botched migration can be reverted with -# one mv. WAL is folded in by DuckDB on next clean shutdown; if it -# survives a stop, copy it too. -ts="$(date -u +%Y%m%dT%H%M%SZ)" -prev="${VORTEX_BENCH_DB%.duckdb}.prev-${ts}.duckdb" -if [ -f "$VORTEX_BENCH_DB" ]; then - log "snapshotting ${VORTEX_BENCH_DB} → ${prev}" - cp -p "$VORTEX_BENCH_DB" "$prev" - [ -f "${VORTEX_BENCH_DB}.wal" ] && cp -p "${VORTEX_BENCH_DB}.wal" "${prev}.wal" -fi - -log "running cargo run --release -p vortex-bench-migrate -- (${#} args; argv NOT logged so future flags carrying secrets do not leak to journal)" -pushd "$REPO_DIR" >/dev/null -# Pass through whatever args the operator gave us. Don't inject a path -# flag - the migrator's CLI is owned by that crate. -if ! cargo run --release --quiet -p vortex-bench-migrate -- "$@"; then - popd >/dev/null - echo "ERROR: migration failed. 
Server is still stopped." >&2 - echo " Roll back:" >&2 - echo " mv \"$prev\" \"$VORTEX_BENCH_DB\"" >&2 - echo " [ -f \"${prev}.wal\" ] && mv \"${prev}.wal\" \"${VORTEX_BENCH_DB}.wal\" || true" >&2 - echo " Then start the server and re-enable autopilot timers:" >&2 - echo " sudo systemctl start vortex-bench-server" >&2 - echo " sudo systemctl start vortex-bench-deploy.timer" >&2 - echo " sudo systemctl start vortex-bench-backup.timer" >&2 - exit 3 -fi -popd >/dev/null - -log "starting vortex-bench-server" -sudo /bin/systemctl start vortex-bench-server - -# Give it a few seconds to come up. -deadline=$(( $(date +%s) + 30 )) -while [ "$(date +%s)" -lt "$deadline" ]; do - if curl -fsS --max-time 3 "${SERVER_URL}/health" >/dev/null 2>&1; then - migration_succeeded=1 - log "migrate ok - server is up" - log " prev DB kept at ${prev} (delete when you've verified data)" - exit 0 - fi - sleep 1 -done -echo "ERROR: server did not respond on /health within 30s" >&2 -# Stop the server BEFORE printing rollback instructions: the unit has -# Restart=on-failure RestartSec=2, so leaving it running would loop a -# broken/half-migrated binary against the new DB, and the rollback `mv` -# below races against the still-open file handle (on Linux the mv -# succeeds but the live server keeps writing to the unlinked inode). -# Do NOT swallow a `systemctl stop` failure with `|| true` here: if the -# stop fails (mid-procedure sudoers regression, systemd bus stuck), the -# rollback `mv` below races a still-running server and the operator -# corrupts the prev DB by following the printed instructions verbatim. -# Bail loudly so the operator fixes the stop path before any mv. -if ! sudo /bin/systemctl stop vortex-bench-server; then - echo "CRITICAL: 'sudo systemctl stop vortex-bench-server' failed." >&2 - echo " Do NOT execute the documented rollback 'mv' commands until the server is" >&2 - echo " verifiably stopped (check 'systemctl status vortex-bench-server')." 
>&2 - echo " The autopilot timers stay paused; debug the stop first." >&2 - exit 4 -fi -echo " server stopped. Roll back:" >&2 -echo " mv \"$prev\" \"$VORTEX_BENCH_DB\"" >&2 -echo " [ -f \"${prev}.wal\" ] && mv \"${prev}.wal\" \"${VORTEX_BENCH_DB}.wal\" || true" >&2 -echo " sudo systemctl start vortex-bench-server" >&2 -echo " sudo systemctl start vortex-bench-deploy.timer" >&2 -echo " sudo systemctl start vortex-bench-backup.timer" >&2 -exit 1 diff --git a/benchmarks-website/ops/restart.sh b/benchmarks-website/ops/restart.sh deleted file mode 100755 index 290675f9597..00000000000 --- a/benchmarks-website/ops/restart.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Restart the vortex-bench-server binary in place (no rebuild), with -# visible before/after state so you don't have to wonder if it worked. -# -# Prints, in order: -# - the running pid + start time + binary path the symlink points at, -# before the restart -# - the systemctl exit code -# - the running pid + start time + /health response after the restart -# - "RESTART OK" / "RESTART FAILED" + non-zero exit on failure -# -# Use this instead of `sudo systemctl restart vortex-bench-server` -# when you want any sign that it actually happened. - -set -euo pipefail - -ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}" -# Source the env file first so any SERVER_URL in /etc/vortex-bench.env is -# picked up, THEN apply the local default if both env-file and caller env -# left it unset. (Matches the sibling scripts; replaces the prior -# `default-then-source-then-no-op-default` shape that was misleading.) -if [ -f "$ENV_FILE" ]; then - set -a - # shellcheck disable=SC1090 - . "$ENV_FILE" - set +a -fi -SERVER_URL="${SERVER_URL:-http://127.0.0.1:3000}" - -snap() { - # Use systemd as the source of truth for the running pid (matches - # whatever it would restart). Falls back to "?" 
if the unit is - # already inactive. - local pid started binary health - pid="$(systemctl show -p MainPID --value vortex-bench-server 2>/dev/null || echo 0)" - started="$(systemctl show -p ActiveEnterTimestamp --value vortex-bench-server 2>/dev/null || echo '?')" - if [ -L /var/lib/vortex-bench/bin/vortex-bench-server ]; then - binary="$(readlink /var/lib/vortex-bench/bin/vortex-bench-server)" - else - binary="?" - fi - health="$(curl -fsS --max-time 2 "${SERVER_URL}/health" 2>/dev/null \ - | (command -v jq >/dev/null && jq -c . || cat) \ - || echo '(unreachable)')" - printf ' pid: %s\n started: %s\n binary: %s\n /health: %s\n' \ - "$pid" "$started" "$binary" "$health" -} - -echo "BEFORE:" -snap - -echo -echo "running: sudo systemctl restart vortex-bench-server" -if ! sudo /bin/systemctl restart vortex-bench-server; then - echo "ERROR: systemctl restart returned non-zero" >&2 - echo - echo "AFTER (restart did not complete):" - snap - exit 1 -fi - -# Wait up to 30s for the new process to take requests. -deadline=$(( $(date +%s) + 30 )) -ok=0 -while [ "$(date +%s)" -lt "$deadline" ]; do - if curl -fsS --max-time 2 "${SERVER_URL}/health" >/dev/null 2>&1; then - ok=1 - break - fi - sleep 0.5 -done - -echo -echo "AFTER:" -snap - -if [ "$ok" = "1" ]; then - echo - echo "RESTART OK" - exit 0 -else - echo - echo "RESTART FAILED - /health did not respond within 30s" >&2 - echo "Inspect with: journalctl -u vortex-bench-server --since '1 min ago' --no-pager" >&2 - exit 1 -fi diff --git a/benchmarks-website/ops/systemd/vortex-bench-backup.service b/benchmarks-website/ops/systemd/vortex-bench-backup.service deleted file mode 100644 index 05bd56168fa..00000000000 --- a/benchmarks-website/ops/systemd/vortex-bench-backup.service +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Oneshot unit invoked by vortex-bench-backup.timer hourly. 
Calls -# /api/admin/snapshot to land a per-table Vortex snapshot (schema.sql -# plus one
.vortex file per table) under -# $VORTEX_BENCH_SNAPSHOT_DIR//, `tar czf`s the directory into a -# single archive, then `aws s3 cp` uploads it to -# $S3_BACKUP_PREFIX/.tar.gz and deletes the local copies. -# -# PrivateTmp keeps the script's /tmp scratch (response capture) isolated -# from any concurrent manual `bash backup.sh` an operator might fire. - -[Unit] -Description=Vortex bench v3 hourly DB snapshot to S3 -After=vortex-bench-server.service network-online.target -Wants=network-online.target - -[Service] -Type=oneshot -User=ec2-user -Group=ec2-user -EnvironmentFile=/etc/vortex-bench.env -WorkingDirectory=/var/lib/vortex-bench -ExecStart=/var/lib/vortex-bench/ops/backup.sh -StandardOutput=journal -StandardError=journal - -# Hardening - backup only needs to write snapshot scratch + tarballs in -# the state dir and call `aws s3 cp` via the instance role. -ProtectSystem=strict -ReadWritePaths=/var/lib/vortex-bench -ProtectHome=read-only -PrivateTmp=true -NoNewPrivileges=true -ProtectKernelTunables=true -ProtectKernelModules=true -ProtectKernelLogs=true -ProtectControlGroups=true -RestrictNamespaces=true -RestrictSUIDSGID=true -RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX -LockPersonality=true -SystemCallFilter=@system-service -SystemCallFilter=~@privileged @resources -CapabilityBoundingSet= -AmbientCapabilities= diff --git a/benchmarks-website/ops/systemd/vortex-bench-backup.timer b/benchmarks-website/ops/systemd/vortex-bench-backup.timer deleted file mode 100644 index 3c1910fe892..00000000000 --- a/benchmarks-website/ops/systemd/vortex-bench-backup.timer +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Hourly snapshot timer. Persistent=true means a missed hour catches up -# after a reboot. - -[Unit] -Description=Vortex bench v3 hourly DB snapshot timer -Requires=vortex-bench-backup.service - -[Timer] -# Top of every hour, UTC. 
-OnCalendar=hourly -Persistent=true -RandomizedDelaySec=2min -Unit=vortex-bench-backup.service - -[Install] -WantedBy=timers.target diff --git a/benchmarks-website/ops/systemd/vortex-bench-deploy.service b/benchmarks-website/ops/systemd/vortex-bench-deploy.service deleted file mode 100644 index 2681b8a1baa..00000000000 --- a/benchmarks-website/ops/systemd/vortex-bench-deploy.service +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Oneshot unit invoked by vortex-bench-deploy.timer every 60s. Runs -# ops/deploy.sh, which is a no-op when origin/$DEPLOY_BRANCH hasn't -# moved or moved without touching benchmarks-website code. -# -# This unit deliberately runs as ec2-user - `git pull` and `cargo build` -# don't need root. The script asks for sudo only to `systemctl restart` -# vortex-bench-server, and the sudoers fragment ops/install.sh writes -# allows that single command without a password. - -[Unit] -Description=Vortex bench v3 auto-deploy -# Run after the server unit is up; if the server is failing, restarting -# it from the deploy script is what we want anyway. -After=vortex-bench-server.service network-online.target -Wants=network-online.target - -[Service] -Type=oneshot -User=ec2-user -Group=ec2-user -EnvironmentFile=/etc/vortex-bench.env -WorkingDirectory=/var/lib/vortex-bench -ExecStart=/var/lib/vortex-bench/ops/deploy.sh -StandardOutput=journal -StandardError=journal -# Exit 1 = lock contention (another deploy is in progress). This is the -# common steady-state outcome when a manual force-rebuild races the timer -# fire, and is not a real failure - treat it as success so `systemctl -# status` does not show repeated failed runs in the journal. Real -# failures (exits 2 through 7) still surface as failed. -SuccessExitStatus=0 1 -# Deploy runs cargo build (writes to $HOME/.cargo and $REPO_DIR/target); -# hardening this unit beyond ec2-user privileges would break the build. 
-# The narrower hardening lives on vortex-bench-server.service. diff --git a/benchmarks-website/ops/systemd/vortex-bench-deploy.timer b/benchmarks-website/ops/systemd/vortex-bench-deploy.timer deleted file mode 100644 index e4536e958ff..00000000000 --- a/benchmarks-website/ops/systemd/vortex-bench-deploy.timer +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Timer that fires the auto-deploy oneshot every 60s. AccuracySec=10s -# keeps the wakeups from coalescing too aggressively; the work itself -# is a no-op when nothing relevant changed. - -[Unit] -Description=Vortex bench v3 auto-deploy timer -Requires=vortex-bench-deploy.service - -[Timer] -# Fire 30s after boot, then every 60s. -OnBootSec=30s -OnUnitActiveSec=60s -AccuracySec=10s -# Catch up after a reboot in case OnUnitActiveSec slot was missed while -# the host was down. Matches vortex-bench-backup.timer's idiom; deploy -# itself is idempotent (path-filter + SHA-stamp + hash-equal-skip) so -# a missed-tick catch-up is at worst a no-op. -Persistent=true -Unit=vortex-bench-deploy.service - -[Install] -WantedBy=timers.target diff --git a/benchmarks-website/ops/systemd/vortex-bench-server.service b/benchmarks-website/ops/systemd/vortex-bench-server.service deleted file mode 100644 index 8113c29f351..00000000000 --- a/benchmarks-website/ops/systemd/vortex-bench-server.service +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# systemd unit for vortex-bench-server. Owns the DuckDB file at -# $VORTEX_BENCH_DB and serves :3000. -# -# Installed by ops/install.sh into /etc/systemd/system/. Restart on -# crash, never give up. 
Start order: after a successful binary install -# at /var/lib/vortex-bench/bin/vortex-bench-server (the deploy timer -# atomically swaps this symlink in place; this unit is `restart`ed via -# the sudoers fragment ops/install.sh writes). - -[Unit] -Description=Vortex bench v3 server (bench.vortex.dev) -After=network-online.target -Wants=network-online.target -# A naive `systemctl start vortex-bench-server` before the deploy timer -# has laid down a binary at the symlink would emit a confusing -# `execve: No such file or directory` in the journal; gate on the -# symlink so the unit reports "condition failed, not started" instead. -ConditionPathExists=/var/lib/vortex-bench/bin/vortex-bench-server - -[Service] -Type=simple -User=ec2-user -Group=ec2-user -WorkingDirectory=/var/lib/vortex-bench -EnvironmentFile=/etc/vortex-bench.env -ExecStart=/var/lib/vortex-bench/bin/vortex-bench-server -Restart=on-failure -RestartSec=2 -TimeoutStopSec=60s -# `journalctl -u vortex-bench-server` is the canonical log. Keep stdout -# unbuffered so `tail -f` works. -Environment=RUST_BACKTRACE=1 -StandardOutput=journal -StandardError=journal - -# Conservative hardening - server only needs DB writes and outbound TCP -# (none in steady state, but cargo build's hyper would). 
-ProtectSystem=strict -ReadWritePaths=/var/lib/vortex-bench -ProtectHome=read-only -PrivateTmp=true -NoNewPrivileges=true -ProtectKernelTunables=true -ProtectKernelModules=true -ProtectKernelLogs=true -ProtectControlGroups=true -RestrictNamespaces=true -RestrictSUIDSGID=true -RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX -LockPersonality=true -MemoryDenyWriteExecute=true -SystemCallFilter=@system-service -SystemCallFilter=~@privileged @resources -CapabilityBoundingSet= -AmbientCapabilities= - -[Install] -WantedBy=multi-user.target diff --git a/benchmarks-website/server/ARCHITECTURE.md b/benchmarks-website/server/ARCHITECTURE.md deleted file mode 100644 index e790b277a46..00000000000 --- a/benchmarks-website/server/ARCHITECTURE.md +++ /dev/null @@ -1,87 +0,0 @@ - - -# Benchmark Server Architecture - -The benchmark website is optimized around a materialized latest-100 read path. -DuckDB is the source of truth, but normal landing-page and group-open -hydration does not run SQL, serialize JSON, or compress responses per request. - -## Hot Read Path - -On startup the server builds a `ReadGeneration` from one DuckDB snapshot. That -generation contains precomputed JSON artifacts for: - -- `/api/groups` -- default `/api/chart/{slug}` latest-100 payloads -- default `/api/group/{slug}` latest-100 compatibility payloads -- versioned group shard payloads under - `/api/artifacts/{generation}/groups/{group_slug}/shards/{index}` - -Each artifact is stored in memory as identity, gzip, and brotli bytes. Request -handlers negotiate `Accept-Encoding` and serve those bytes directly with -`ETag`, `Vary: Accept-Encoding`, `Content-Length`, and cache headers. - -## Page Hydration - -The landing page and `/group/{slug}` render group metadata plus chart shells, -not inline chart payloads. Each group carries the active read generation, shard -count, and shard URL prefix. 
`chart-init.js` fetches shard 0 on intent or group -open so charts paint quickly, then queues the remaining latest-100 shards with -bounded per-tab concurrency. - -Latest-100 chart payloads include additive `history` metadata: - -- `total_commits`: full x-axis length for the chart -- `start_index`: where this payload starts in the full x-axis -- `loaded_commits`: number of loaded commits -- `complete`: whether the payload covers the full x-axis - -The client normalizes incomplete latest-100 payloads onto the full virtual -x-axis. Older unloaded commits are represented by blank labels and null series -values, so the range strip, zoom limits, and slider bounds behave as if the -whole history is present without fabricating data. - -## Full-History Warmup - -Opening a group queues `/api/chart/{slug}?n=all` for that group's charts in a -separate low-concurrency priority queue. A later-opened group gets higher -priority than queued work for older groups. If the user pans or zooms into an -unloaded virtual range before warmup finishes, that chart's queued full-history -request is promoted. In-flight requests are not cancelled. - -When the full payload arrives, the client replaces the virtual latest-100 -payload in place and preserves the current x-range when possible. - -## Fallback Paths - -`?n=all` and non-default `?n=` windows still use the DB-backed fallback path. -Those reads go through `QueryCache` single-flight entries and the DB read -semaphore so cold or unusual requests do not stampede DuckDB. Ingest writes do -not consume read permits. - -## Ingest And Rebuild - -Successful ingest invalidates `QueryCache` and schedules a read-model rebuild. -The active generation remains live while rebuilding. Repeated rebuild requests -coalesce, and a failed rebuild keeps serving the old generation. 
The server -keeps the active generation plus up to `RETAINED_PREVIOUS_GENERATIONS` previous -generations (currently 8) in a bounded `VecDeque` so already loaded pages can -continue resolving immutable shard URLs across a swap. The window is sized for -the worst observed CI dual-write burst; under-sizing surfaces as 404s on -`/api/artifacts/{generation}/.../shards/{i}` for stale tabs, which auto-recover -on the next group reopen. See `RETAINED_PREVIOUS_GENERATIONS` in -`src/read_model.rs`. - -## Main Files - -- `src/read_model.rs`: materialized generation and encoded artifact serving -- `src/api/mod.rs`: API routing between materialized artifacts and fallbacks -- `src/api/charts.rs`: chart DTO construction and `history` metadata -- `src/html/mod.rs`, `src/html/landing.rs`: shell/shard HTML rendering -- `static/chart-init.js`: virtual-axis normalization, shard hydration, and - full-history priority warmup -- `src/query_cache.rs`: single-flight fallback cache -- `src/db.rs`: DuckDB connection cloning and read backpressure diff --git a/benchmarks-website/server/Cargo.toml b/benchmarks-website/server/Cargo.toml deleted file mode 100644 index 8966a6fc154..00000000000 --- a/benchmarks-website/server/Cargo.toml +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -[package] -name = "vortex-bench-server" -version = "0.1.0-alpha.0" -edition = "2024" -rust-version = "1.91.0" -license = "Apache-2.0" -description = "bench.vortex.dev v3 alpha server: HTTP API + HTML + DuckDB on local disk" -publish = false - -[lib] -name = "vortex_bench_server" -path = "src/lib.rs" - -[[bin]] -name = "vortex-bench-server" -path = "src/main.rs" - -# This is a leaf binary, not part of the vortex-* public API surface. -# Errors use anyhow / thiserror and the crate is intentionally outside -# the workspace. 
- -[dependencies] -anyhow = { workspace = true } -axum = "0.8" -base64 = "0.22" -brotli = "8.0.2" -bytes = "1.11" -dashmap = { workspace = true } -# track vortex-duckdb's bundled engine version (build.rs) -duckdb = { version = "1.10502", features = ["bundled"] } -flate2 = "1" -futures = { workspace = true, features = ["std"] } -maud = { version = "0.27", features = ["axum"] } -parking_lot = { workspace = true } -serde = { workspace = true, features = ["derive", "rc"] } -serde_json = { workspace = true } -subtle = "2.6" -thiserror = { workspace = true } -tokio = { workspace = true, features = [ - "rt-multi-thread", - "macros", - "net", - "signal", - "sync", -] } -tower = "0.5" -tower-http = { version = "0.6", features = [ - "compression-br", - "compression-gzip", - "trace", -] } -tracing = { workspace = true, features = ["std"] } -tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] } -twox-hash = "2.1" -vortex-utils = { workspace = true } - -[dev-dependencies] -# `filters` enables `Settings::add_filter`, which we use in -# `tests/common/mod.rs` to redact the build SHA from the HTML -# snapshots so they don't churn on every commit. 
-insta = { workspace = true, features = ["filters"] } -reqwest = { workspace = true, features = ["json"] } -tempfile = { workspace = true } -tokio = { workspace = true, features = ["rt-multi-thread", "macros", "net"] } diff --git a/benchmarks-website/server/build.rs b/benchmarks-website/server/build.rs deleted file mode 100644 index bd6f909c752..00000000000 --- a/benchmarks-website/server/build.rs +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use std::process::Command; - -fn main() { - if std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows") { - println!("cargo:rustc-link-lib=dylib=rstrtmgr"); - } - - // Capture the git SHA at build time so /health can confirm the - // running process matches what the deploy timer last saw. Emit the - // full 40-hex SHA so operators can compare directly to the value - // in `/var/lib/vortex-bench/last-deployed-sha` (also full SHA); - // the runbook tells them to verify equality with no truncation. - // Falls back to "unknown" outside a git checkout (e.g. shallow CI - // clones, source tarballs) so the build never fails on this. - let sha = Command::new("git") - .args(["rev-parse", "HEAD"]) - .output() - .ok() - .filter(|o| o.status.success()) - .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_owned()) - .unwrap_or_else(|| "unknown".to_owned()); - println!("cargo:rustc-env=VORTEX_BENCH_BUILD_SHA={sha}"); - - // HEAD covers the common deploy.sh path - // (`git checkout --force --detach `); refs/heads/* covers - // local branches if anyone runs the binary from a checked-out - // branch. Both are no-ops if the file doesn't exist. 
- println!("cargo:rerun-if-changed=../../.git/HEAD"); - println!("cargo:rerun-if-changed=../../.git/refs/heads"); -} diff --git a/benchmarks-website/server/fixtures/envelope.json b/benchmarks-website/server/fixtures/envelope.json deleted file mode 100644 index 14861f54a3c..00000000000 --- a/benchmarks-website/server/fixtures/envelope.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "run_meta": { - "benchmark_id": "fixture", - "schema_version": 1, - "started_at": "2026-04-25T00:00:00Z" - }, - "commit": { - "sha": "0123456789abcdef0123456789abcdef01234567", - "timestamp": "2026-04-25T00:00:00Z", - "message": "fixture commit", - "author_name": "Test Author", - "author_email": "author@example.com", - "committer_name": "Test Committer", - "committer_email": "committer@example.com", - "tree_sha": "fedcba9876543210fedcba9876543210fedcba98", - "url": "https://github.com/vortex-data/vortex/commit/0123456789abcdef0123456789abcdef01234567" - }, - "records": [ - { - "kind": "query_measurement", - "commit_sha": "0123456789abcdef0123456789abcdef01234567", - "dataset": "tpch", - "scale_factor": "1", - "query_idx": 1, - "storage": "nvme", - "engine": "datafusion", - "format": "vortex-file-compressed", - "value_ns": 1234567, - "all_runtimes_ns": [1200000, 1234567, 1300000], - "env_triple": "x86_64-linux-gnu" - }, - { - "kind": "compression_time", - "commit_sha": "0123456789abcdef0123456789abcdef01234567", - "dataset": "tpch-lineitem", - "format": "vortex-file-compressed", - "op": "encode", - "value_ns": 9999, - "all_runtimes_ns": [9000, 9999, 10500] - }, - { - "kind": "compression_size", - "commit_sha": "0123456789abcdef0123456789abcdef01234567", - "dataset": "tpch-lineitem", - "format": "vortex-file-compressed", - "value_bytes": 4242 - }, - { - "kind": "random_access_time", - "commit_sha": "0123456789abcdef0123456789abcdef01234567", - "dataset": "taxi", - "format": "vortex-file-compressed", - "value_ns": 555, - "all_runtimes_ns": [500, 555, 600] - }, - { - "kind": "vector_search_run", - 
"commit_sha": "0123456789abcdef0123456789abcdef01234567", - "dataset": "cohere-large-10m", - "layout": "partitioned", - "flavor": "vortex-turboquant", - "threshold": 0.75, - "value_ns": 7777, - "all_runtimes_ns": [7700, 7777, 7800], - "matches": 42, - "rows_scanned": 1000000, - "bytes_scanned": 5000000, - "iterations": 3 - } - ] -} diff --git a/benchmarks-website/server/src/admin.rs b/benchmarks-website/server/src/admin.rs deleted file mode 100644 index 0273d61468b..00000000000 --- a/benchmarks-website/server/src/admin.rs +++ /dev/null @@ -1,763 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Admin endpoints - bearer-gated DuckDB snapshot and read-only SQL. -//! -//! Mounted at `/api/admin/*` only when `ADMIN_BEARER_TOKEN` is set on the -//! server, surfaced through [`crate::app::AppState::with_admin`]. Both routes -//! require an `Authorization: Bearer ` header - the -//! `INGEST_BEARER_TOKEN` will not work here, so the two can rotate -//! independently. The operator workflow is documented in -//! `benchmarks-website/ops/README.md`. -//! -//! ## Routes -//! -//! ### `POST /api/admin/snapshot?ts=` -//! -//! Writes a snapshot directory `//` containing: -//! - `schema.sql` - concatenated DDL ([`crate::schema::COMMITS_DDL`] plus -//! every [`crate::family::FAMILIES`] entry's `schema_ddl`), so a -//! restore knows how to recreate the tables before bulk-loading. -//! - `
.vortex` for every table in [`crate::schema::TABLES`] - -//! each produced by a `COPY (SELECT * FROM
) TO … -//! (FORMAT vortex)`. The vortex DuckDB extension is auto-installed -//! from the DuckDB core extension repo on first call, then `LOAD`ed. -//! -//! Vortex compresses the BIGINT[] runtime arrays and string columns -//! roughly an order of magnitude better than gzipped CSV on this shape; -//! it is also the project's own format, which is the obvious dogfood. -//! -//! `ts` must match `[A-Za-z0-9_-]{1,64}`; the snapshot script -//! conventionally passes a UTC timestamp like `20260508T010000Z`. The -//! target subdirectory must not already exist (409 otherwise). All -//! per-table COPY statements run on a connection cloned from the -//! shared handle, so concurrent ingest writes are not blocked. -//! -//! ### `POST /api/admin/sql` -//! -//! Body: `{ "sql": "SELECT ..." }`. Query: `?format=json|table` (default -//! `json`). Only `SELECT`, `WITH`, `PRAGMA`, `SHOW`, `DESCRIBE`, and -//! `EXPLAIN` statements are allowed - anything else is rejected with 403. -//! Results are capped at `ADMIN_SQL_ROW_LIMIT` rows; responses past -//! that cap include `"truncated": true`. The handler runs each query on -//! its own cloned connection inside a `BEGIN TRANSACTION READ ONLY` -//! wrapper, so concurrent ingest writes proceed without contention. 
- -use std::fmt::Write as _; -use std::path::Path; -use std::path::PathBuf; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; - -use anyhow::Context as _; -use anyhow::Result; -use axum::Json; -use axum::extract::Query; -use axum::extract::Request; -use axum::extract::State; -use axum::http::StatusCode; -use axum::http::header::AUTHORIZATION; -use axum::http::header::CONTENT_TYPE; -use axum::middleware::Next; -use axum::response::IntoResponse; -use axum::response::Response; -use duckdb::Connection; -use duckdb::types::ValueRef; -use serde::Deserialize; -use serde::Serialize; -use serde_json::Value; -use serde_json::json; -use subtle::ConstantTimeEq; -use thiserror::Error; - -use crate::app::AppState; -use crate::db; -use crate::family; -use crate::schema; - -const ADMIN_SQL_ROW_LIMIT: usize = 10_000; - -/// Errors surfaced by `/api/admin/*` handlers. Auth (401) is handled by -/// [`require_admin_bearer`] and never reaches a handler. -#[derive(Debug, Error)] -pub enum AdminError { - /// 400 - request shape is malformed (bad `ts`, bad SQL JSON body, …). - #[error("bad request: {0}")] - BadRequest(String), - /// 403 - request is well-formed but the SQL statement is not on the - /// read-only allow-list. - #[error("forbidden: {0}")] - Forbidden(String), - /// 409 - snapshot target directory already exists. - #[error("conflict: {0}")] - Conflict(String), - /// 500 - anything else (DB error, IO error, …). 
- #[error("internal server error")] - Internal(#[from] anyhow::Error), -} - -impl IntoResponse for AdminError { - fn into_response(self) -> Response { - let (status, body) = match &self { - Self::BadRequest(msg) => ( - StatusCode::BAD_REQUEST, - json!({ "error": "bad_request", "message": msg }), - ), - Self::Forbidden(msg) => ( - StatusCode::FORBIDDEN, - json!({ "error": "forbidden", "message": msg }), - ), - Self::Conflict(msg) => ( - StatusCode::CONFLICT, - json!({ "error": "conflict", "message": msg }), - ), - Self::Internal(err) => { - tracing::error!(error = ?err, "admin internal error"); - ( - StatusCode::INTERNAL_SERVER_ERROR, - json!({ "error": "internal" }), - ) - } - }; - (status, Json(body)).into_response() - } -} - -/// Axum middleware enforcing the admin bearer token on `/api/admin/*`. -/// 401 if the header is missing, malformed, or wrong; 503 if the server -/// was started without `ADMIN_BEARER_TOKEN` (the admin router is unmounted -/// in that case, so this is just a defensive belt-and-braces check). -pub async fn require_admin_bearer( - State(state): State, - req: Request, - next: Next, -) -> Result { - let Some(expected) = state.admin_bearer_token.as_ref() else { - return Err(( - StatusCode::SERVICE_UNAVAILABLE, - Json(json!({ "error": "admin_not_configured" })), - ) - .into_response()); - }; - let unauthorized = || { - ( - StatusCode::UNAUTHORIZED, - Json(json!({ "error": "unauthorized" })), - ) - .into_response() - }; - let header = req - .headers() - .get(AUTHORIZATION) - .ok_or_else(unauthorized)? - .to_str() - .map_err(|_| unauthorized())?; - let presented = header - .strip_prefix("Bearer ") - .ok_or_else(unauthorized)? - .as_bytes(); - if presented.ct_eq(expected.as_bytes()).into() { - Ok(next.run(req).await) - } else { - Err(unauthorized()) - } -} - -#[derive(Debug, Deserialize)] -pub struct SnapshotQuery { - /// Operator-supplied identifier for the snapshot, used as the leaf - /// directory name. Must match `[A-Za-z0-9_-]{1,64}`. 
- pub ts: String, -} - -#[derive(Debug, Serialize)] -pub struct SnapshotResponse { - /// Absolute path of the directory the export landed in. - pub snapshot_dir: String, -} - -/// Handler for `POST /api/admin/snapshot?ts=`. Writes -/// `schema.sql` plus one `
.vortex` file per fact/dim table into -/// a fresh subdirectory under [`AppState::snapshot_dir`]. -pub async fn snapshot( - State(state): State, - Query(q): Query, -) -> Result { - validate_ts(&q.ts)?; - let target: PathBuf = state.snapshot_dir.join(&q.ts); - - // Process-local `ts` reservation. Two concurrent calls with the - // same `ts` would otherwise both write tmp directories and then - // race at the `rename(2)` step - Linux silently overwrites an - // existing destination, so the loser's snapshot disappears with no - // signal. The reservation closes that race within a single - // `vortex-bench-server` process (the supported deployment). - let _ticket = SnapshotTicket::acquire(&state, &q.ts, &target)?; - - if target.exists() { - return Err(AdminError::Conflict(format!( - "snapshot directory already exists: {}", - target.display() - ))); - } - - let tmp = tmp_snapshot_dir(&target, &q.ts); - if tmp.exists() { - std::fs::remove_dir_all(&tmp) - .with_context(|| format!("removing stale temp snapshot dir {}", tmp.display()))?; - } - std::fs::create_dir_all(&tmp) - .with_context(|| format!("creating temp snapshot dir {}", tmp.display()))?; - - let result = write_snapshot(&state, &tmp).await; - if let Err(err) = result { - cleanup_partial(&tmp); - return Err(AdminError::Internal(err)); - } - // The ticket guarantees no other in-process call has the same `ts` - // reserved, so the final `rename(2)` will land cleanly. We still - // recheck `target.exists()` because a different process or an - // operator hand-creating the dir would also lose data on a silent - // overwrite. 
- if target.exists() { - cleanup_partial(&tmp); - return Err(AdminError::Conflict(format!( - "snapshot directory already exists: {}", - target.display() - ))); - } - if let Err(err) = std::fs::rename(&tmp, &target).with_context(|| { - format!( - "moving snapshot dir {} to {}", - tmp.display(), - target.display() - ) - }) { - cleanup_partial(&tmp); - return Err(AdminError::Internal(err)); - } - Ok(Json(SnapshotResponse { - snapshot_dir: target.display().to_string(), - })) -} - -/// RAII guard that holds a `ts` in [`AppState::pending_snapshots`] for the -/// duration of one `/api/admin/snapshot` call. Dropping the guard always -/// releases the reservation, even on panic or early-return error paths. -struct SnapshotTicket { - state: AppState, - ts: String, -} - -impl SnapshotTicket { - fn acquire(state: &AppState, ts: &str, target: &Path) -> Result { - let inserted = state.pending_snapshots.lock().insert(ts.to_string()); - if !inserted { - return Err(AdminError::Conflict(format!( - "snapshot for ts={ts} is already in flight (target {})", - target.display() - ))); - } - Ok(Self { - state: state.clone(), - ts: ts.to_string(), - }) - } -} - -impl Drop for SnapshotTicket { - fn drop(&mut self) { - self.state.pending_snapshots.lock().remove(&self.ts); - } -} - -/// Best-effort cleanup of a partially-written snapshot tmp dir. Logs the -/// failure rather than silently discarding it, so a wedge (disk full, -/// permission flip) is visible in the journal even when no automated -/// sweeper is wired up. -fn cleanup_partial(path: &Path) { - if let Err(err) = std::fs::remove_dir_all(path) { - // ENOENT just means the dir never got created or was already - // cleaned up by a sibling caller; ignore it. Anything else - // deserves a warn. 
- if err.kind() != std::io::ErrorKind::NotFound { - tracing::warn!( - path = %path.display(), - error = ?err, - "failed to clean up partial snapshot tmp dir; manual sweep may be needed" - ); - } - } -} - -/// Per-call unique temp directory used to stage a snapshot before the atomic -/// rename onto `target`. Includes a process-local counter so two concurrent -/// calls with the same `ts` in the same server process never share a staging -/// directory and clobber each other's in-progress writes. -fn tmp_snapshot_dir(target: &Path, ts: &str) -> PathBuf { - static COUNTER: AtomicU64 = AtomicU64::new(0); - let id = COUNTER.fetch_add(1, Ordering::Relaxed); - target.with_file_name(format!("{ts}.tmp-{}-{}", std::process::id(), id)) -} - -async fn write_snapshot(state: &AppState, target: &Path) -> Result<()> { - // Schema is just our DDL string verbatim; restore reads this with - // `duckdb -init schema.sql` (or `.read schema.sql`) before - // bulk-loading the per-table vortex files. The DDL is assembled - // from the commits dim + every fact-table family in the same order - // `db::open()` applies them. - let mut schema_sql = String::with_capacity(8 * 1024); - schema_sql.push_str(schema::COMMITS_DDL); - for fam in family::FAMILIES { - schema_sql.push_str(fam.schema_ddl); - } - std::fs::write(target.join("schema.sql"), schema_sql) - .with_context(|| format!("writing schema.sql under {}", target.display()))?; - - let target_for_db = target.to_path_buf(); - db::run_blocking(&state.db, move |conn| { - export_snapshot_tables(conn, &target_for_db) - }) - .await -} - -fn export_snapshot_tables(conn: &mut Connection, target: &Path) -> Result<()> { - // Idempotent - `INSTALL` is a no-op if the extension is already - // present, `LOAD` is cheap once the binary is on disk. Vortex is a - // DuckDB core extension (not community), so the unqualified `INSTALL` - // hits the right repo on first call; subsequent calls are local. 
- // Runs outside the snapshot transaction because extension installation - // is not transactional. - conn.execute_batch("INSTALL vortex; LOAD vortex;") - .context("INSTALL/LOAD vortex extension")?; - - // All per-table COPYs share one `READ ONLY` transaction. Otherwise an - // ingest commit between the `commits` export and the - // `query_measurements` export yields an inconsistent backup - facts - // referencing a commit row that is not in the snapshot, or vice - // versa. The transaction's READ ONLY guard also belts-and-braces - // against the snapshot path accidentally writing. - conn.execute_batch("BEGIN TRANSACTION READ ONLY") - .context("begin read-only snapshot transaction")?; - if let Err(err) = copy_each_table(conn, target) { - if let Err(rb_err) = conn.execute_batch("ROLLBACK") { - tracing::warn!( - error = ?rb_err, - "rolling back snapshot read-only transaction failed; the original \ - export error (returned to the caller) is the actionable one" - ); - } - return Err(err); - } - conn.execute_batch("COMMIT") - .context("commit read-only snapshot transaction")?; - Ok(()) -} - -fn copy_each_table(conn: &Connection, target: &Path) -> Result<()> { - for table in schema::TABLES.iter() { - let path = target.join(format!("{table}.vortex")); - let path_str = path - .to_str() - .ok_or_else(|| anyhow::anyhow!("snapshot path is not UTF-8: {}", path.display()))?; - let sql = format!( - "COPY (SELECT * FROM {table}) TO {} (FORMAT vortex)", - db::sql_string_literal(path_str) - ); - conn.execute_batch(&sql) - .with_context(|| format!("COPY {table} TO {path_str}"))?; - } - Ok(()) -} - -fn validate_ts(ts: &str) -> Result<(), AdminError> { - if ts.is_empty() || ts.len() > 64 { - return Err(AdminError::BadRequest("ts must be 1..=64 chars".into())); - } - if !ts - .chars() - .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') - { - return Err(AdminError::BadRequest( - "ts must match [A-Za-z0-9_-]+".into(), - )); - } - Ok(()) -} - -#[derive(Debug, Deserialize)] -pub 
struct SqlBody { - pub sql: String, -} - -#[derive(Debug, Deserialize, Default)] -pub struct SqlQuery { - #[serde(default)] - pub format: SqlFormat, -} - -#[derive(Debug, Deserialize, Default, Clone, Copy)] -#[serde(rename_all = "lowercase")] -pub enum SqlFormat { - /// Returns `{ columns, rows, row_count }` JSON. - #[default] - Json, - /// Returns a `text/plain` ASCII table similar to `duckdb` CLI output. - Table, -} - -/// Handler for `POST /api/admin/sql`. -pub async fn sql( - State(state): State, - Query(q): Query, - Json(body): Json, -) -> Result { - validate_read_only(&body.sql)?; - let format = q.format; - let sql_text = body.sql; - let result = db::run_blocking(&state.db, move |conn| run_select(conn, &sql_text)) - .await - .map_err(AdminError::Internal)?; - Ok(match format { - SqlFormat::Json => Json(json!({ - "columns": result.columns, - "rows": result.rows, - "row_count": result.rows.len(), - "truncated": result.truncated, - })) - .into_response(), - SqlFormat::Table => ( - [(CONTENT_TYPE, "text/plain; charset=utf-8")], - format_table(&result), - ) - .into_response(), - }) -} - -/// Strips leading whitespace, parens, semicolons, and SQL comments (both `--` -/// line comments and `/* ... */` block comments) from `sql`. Returns the byte -/// offset of the first non-comment, non-whitespace token. Used by -/// [`validate_read_only`] so a query like `-- justify the call\nSELECT 1` is -/// not rejected with `only [...] are allowed; got ""`. -fn skip_leading_noise(sql: &str) -> usize { - let bytes = sql.as_bytes(); - let mut i = 0; - while i < bytes.len() { - let b = bytes[i]; - if b == b' ' || b == b'\t' || b == b'\n' || b == b'\r' || b == b'(' || b == b';' { - i += 1; - continue; - } - if b == b'-' && i + 1 < bytes.len() && bytes[i + 1] == b'-' { - // Line comment runs to end of line. 
- i += 2; - while i < bytes.len() && bytes[i] != b'\n' { - i += 1; - } - continue; - } - if b == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'*' { - // Block comment, search for the matching `*/`. - i += 2; - while i + 1 < bytes.len() && !(bytes[i] == b'*' && bytes[i + 1] == b'/') { - i += 1; - } - if i + 1 < bytes.len() { - i += 2; - } else { - // Unterminated block comment; let the SQL parser surface - // the error rather than guessing. - return i; - } - continue; - } - break; - } - i -} - -fn validate_read_only(sql: &str) -> Result<(), AdminError> { - ensure_single_statement(sql)?; - let start = skip_leading_noise(sql); - let first_word: String = sql[start..] - .chars() - .take_while(|c| c.is_ascii_alphabetic()) - .collect::() - .to_ascii_uppercase(); - const ALLOWED: &[&str] = &["SELECT", "WITH", "PRAGMA", "SHOW", "DESCRIBE", "EXPLAIN"]; - if !ALLOWED.contains(&first_word.as_str()) { - return Err(AdminError::Forbidden(format!( - "only {ALLOWED:?} statements are allowed; got {first_word:?}" - ))); - } - Ok(()) -} - -fn ensure_single_statement(sql: &str) -> Result<(), AdminError> { - #[derive(Clone, Copy, PartialEq, Eq)] - enum State { - Normal, - SingleQuote, - DoubleQuote, - LineComment, - BlockComment, - } - - let mut state = State::Normal; - let mut chars = sql.char_indices().peekable(); - while let Some((idx, ch)) = chars.next() { - match state { - State::Normal => match ch { - '\'' => state = State::SingleQuote, - '"' => state = State::DoubleQuote, - '-' if chars.peek().is_some_and(|(_, next)| *next == '-') => { - chars.next(); - state = State::LineComment; - } - '/' if chars.peek().is_some_and(|(_, next)| *next == '*') => { - chars.next(); - state = State::BlockComment; - } - ';' => { - // Allow trailing whitespace and SQL comments after the - // terminator (`SELECT 1; -- note` and `SELECT 1; /* a */` - // are valid single statements). Only error if a - // non-comment, non-whitespace token follows. 
- let suffix_start = idx + ch.len_utf8(); - let after = skip_leading_noise(&sql[suffix_start..]); - if !sql[suffix_start + after..].is_empty() { - return Err(AdminError::Forbidden( - "admin SQL accepts a single statement only".into(), - )); - } - return Ok(()); - } - _ => {} - }, - State::SingleQuote => { - if ch == '\'' { - if chars.peek().is_some_and(|(_, next)| *next == '\'') { - chars.next(); - } else { - state = State::Normal; - } - } - } - State::DoubleQuote => { - if ch == '"' { - if chars.peek().is_some_and(|(_, next)| *next == '"') { - chars.next(); - } else { - state = State::Normal; - } - } - } - State::LineComment => { - if ch == '\n' { - state = State::Normal; - } - } - State::BlockComment => { - if ch == '*' && chars.peek().is_some_and(|(_, next)| *next == '/') { - chars.next(); - state = State::Normal; - } - } - } - } - Ok(()) -} - -struct QueryResult { - columns: Vec, - rows: Vec>, - truncated: bool, -} - -fn run_select(conn: &mut Connection, sql: &str) -> Result { - conn.execute_batch("BEGIN TRANSACTION READ ONLY") - .context("begin read-only admin SQL transaction")?; - let result = run_select_in_transaction(conn, sql); - match result { - Ok(value) => { - conn.execute_batch("COMMIT") - .context("commit read-only admin SQL transaction")?; - Ok(value) - } - Err(err) => { - if let Err(rb_err) = conn.execute_batch("ROLLBACK") { - tracing::warn!( - error = ?rb_err, - "rolling back admin SQL read-only transaction failed; the \ - original query error (returned to the caller) is the \ - actionable one" - ); - } - Err(err) - } - } -} - -fn run_select_in_transaction(conn: &Connection, sql: &str) -> Result { - let mut stmt = conn.prepare(sql).context("prepare SQL")?; - let mut rows_iter = stmt.query([]).context("execute SQL")?; - // duckdb-rs panics on Statement::column_names() if the statement has not - // executed yet - schema is only populated after `query()` runs. Pull it - // off the live `Rows` iterator instead. 
- let columns: Vec = rows_iter - .as_ref() - .map(|s| s.column_names()) - .unwrap_or_default(); - let column_count = columns.len(); - let mut rows: Vec> = Vec::new(); - let mut truncated = false; - while let Some(row) = rows_iter.next().context("row iter")? { - if rows.len() == ADMIN_SQL_ROW_LIMIT { - truncated = true; - break; - } - let mut out = Vec::with_capacity(column_count); - for i in 0..column_count { - let v = row.get_ref(i).context("get col")?; - out.push(value_ref_to_json(v)); - } - rows.push(out); - } - Ok(QueryResult { - columns, - rows, - truncated, - }) -} - -/// Coerce a DuckDB [`ValueRef`] into a JSON [`Value`] for the admin SQL API. -/// -/// `String::from_utf8_lossy` is used for `Text`: non-UTF-8 bytes in a TEXT -/// column are a misuse but not a reason to fail the request; the lossy -/// replacement (U+FFFD) surfaces so the caller can see something is wrong. -/// -/// `Decimal` is rendered via its Display impl. `Timestamp` is rendered as -/// `:` (one of `s|ms|us|ns:`) so it -/// round-trips through JSON unambiguously without pulling chrono / time -/// in as a dependency; consumers that want a human-readable ISO-8601 -/// can post-process the string. Other compound types (`List`, `Struct`, -/// `Array`, `Map`, `Union`, `Enum`) are rare in this database's schema; -/// they fall back to a best-effort Debug rendering tagged with the type -/// name so the caller can see something printable and we can extend -/// this match when we hit one. 
-fn value_ref_to_json(v: ValueRef<'_>) -> Value { - use duckdb::types::TimeUnit; - match v { - ValueRef::Null => Value::Null, - ValueRef::Boolean(b) => Value::Bool(b), - ValueRef::TinyInt(i) => Value::from(i), - ValueRef::SmallInt(i) => Value::from(i), - ValueRef::Int(i) => Value::from(i), - ValueRef::BigInt(i) => Value::from(i), - ValueRef::HugeInt(i) => Value::String(i.to_string()), - ValueRef::UTinyInt(i) => Value::from(i), - ValueRef::USmallInt(i) => Value::from(i), - ValueRef::UInt(i) => Value::from(i), - ValueRef::UBigInt(i) => Value::String(i.to_string()), - ValueRef::Float(f) => f64::from(f).into(), - ValueRef::Double(f) => f.into(), - ValueRef::Decimal(d) => Value::String(d.to_string()), - ValueRef::Timestamp(unit, raw) => { - // DuckDB stores timestamps as an integer count since - // 1970-01-01 UTC at the named precision. Surface them as a - // stable structured string keyed by the unit ("s:1700000000", - // "ms:1700000000000", etc.) so a future consumer can parse - // unambiguously without us reaching for chrono / time as a - // dependency in this slice. 
- let unit_str = match unit { - TimeUnit::Second => "s", - TimeUnit::Millisecond => "ms", - TimeUnit::Microsecond => "us", - TimeUnit::Nanosecond => "ns", - }; - Value::String(format!("{unit_str}:{raw}")) - } - ValueRef::Text(bytes) => Value::String(String::from_utf8_lossy(bytes).into_owned()), - ValueRef::Blob(_) => Value::String("".into()), - other => Value::String(format!("{other:?}")), - } -} - -fn format_table(r: &QueryResult) -> String { - if r.columns.is_empty() { - return "(no columns)\n".into(); - } - let row_strings: Vec> = r - .rows - .iter() - .map(|row| row.iter().map(value_display).collect()) - .collect(); - let mut widths: Vec = r.columns.iter().map(|c| c.chars().count()).collect(); - for row in &row_strings { - for (i, cell) in row.iter().enumerate() { - let w = cell.chars().count(); - if w > widths[i] { - widths[i] = w; - } - } - } - let mut out = String::new(); - write_separator(&mut out, &widths, '┌', '┬', '┐'); - write_row(&mut out, &r.columns, &widths); - write_separator(&mut out, &widths, '├', '┼', '┤'); - for row in &row_strings { - write_row(&mut out, row, &widths); - } - write_separator(&mut out, &widths, '└', '┴', '┘'); - // writeln! into a String only errors if the underlying Write impl - // returns one - fmt::Write for String is infallible - so the - // Result is discarded by design. 
- let _ = writeln!( - out, - "({} row{}{})", - r.rows.len(), - if r.rows.len() == 1 { "" } else { "s" }, - if r.truncated { "; truncated" } else { "" }, - ); - out -} - -fn value_display(v: &Value) -> String { - match v { - Value::Null => "NULL".into(), - Value::String(s) => s.clone(), - Value::Number(n) => n.to_string(), - Value::Bool(b) => b.to_string(), - other => other.to_string(), - } -} - -fn write_row>(out: &mut String, cells: &[S], widths: &[usize]) { - out.push('│'); - for (i, cell) in cells.iter().enumerate() { - let s = cell.as_ref(); - let pad = widths[i].saturating_sub(s.chars().count()); - out.push(' '); - out.push_str(s); - for _ in 0..pad { - out.push(' '); - } - out.push(' '); - out.push('│'); - } - out.push('\n'); -} - -fn write_separator(out: &mut String, widths: &[usize], left: char, mid: char, right: char) { - out.push(left); - for (i, w) in widths.iter().enumerate() { - if i > 0 { - out.push(mid); - } - for _ in 0..(*w + 2) { - out.push('─'); - } - } - out.push(right); - out.push('\n'); -} diff --git a/benchmarks-website/server/src/api/charts.rs b/benchmarks-website/server/src/api/charts.rs deleted file mode 100644 index 47e99c52058..00000000000 --- a/benchmarks-website/server/src/api/charts.rs +++ /dev/null @@ -1,642 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Per-chart payload assembly + the shared `SeriesAccumulator` glue. -//! -//! `chart_payload` dispatches on [`ChartKey`] to one of five -//! `collect_*_chart` functions, each of which runs one SQL query against -//! its fact table, threads the rows through a `SeriesAccumulator`, and -//! returns a [`ChartResponse`]. 
- -use std::collections::BTreeMap; -use std::sync::Arc; - -use anyhow::Context as _; -use anyhow::Result; -use duckdb::Connection; -use duckdb::ToSql; -use duckdb::params_from_iter; - -use super::dto::ChartHistory; -use super::dto::ChartResponse; -use super::dto::CommitPoint; -use super::dto::GroupChartsResponse; -use super::dto::NamedChartResponse; -use super::dto::SeriesTag; -use super::dto::UnitKind; -use super::groups::collect_groups; -use super::window::CommitWindow; -use crate::slug::ChartKey; -use crate::slug::GroupKey; - -/// Build the JSON payload for one chart by key. This is the shared -/// implementation behind `GET /api/chart/{slug}`, the inline `"}"#; - let out = escape_json_for_script(input); - assert!(!out.contains("` tags in `render::favicon_links`. -// v2's `public/favicon-*.png` set is unsuitable - those have white -// backgrounds baked in, so they render as a glaring white square on -// dark-mode tabs. -const ICON_LIGHT_PNG: &[u8] = include_bytes!("../../static/icon-light.png"); -const ICON_DARK_PNG: &[u8] = include_bytes!("../../static/icon-dark.png"); - -/// Cache-busting suffix appended to every static asset URL. Bump on a UI -/// release so cached browsers see the new bytes. -pub(crate) const STATIC_ASSET_VERSION: &str = "bench-v3-ui-27"; - -/// Append the cache-bust query param to a static asset URL. 
-pub(crate) fn versioned_asset(path: &str) -> String { - format!("{path}?v={STATIC_ASSET_VERSION}") -} - -pub(crate) async fn serve_chart_js() -> impl IntoResponse { - static_response(CHART_JS, "application/javascript; charset=utf-8") -} - -pub(crate) async fn serve_chart_zoom_js() -> impl IntoResponse { - static_response(CHART_ZOOM_JS, "application/javascript; charset=utf-8") -} - -pub(crate) async fn serve_chart_init_js() -> impl IntoResponse { - static_response(CHART_INIT_JS, "application/javascript; charset=utf-8") -} - -pub(crate) async fn serve_style_css() -> impl IntoResponse { - static_response(STYLE_CSS, "text/css; charset=utf-8") -} - -pub(crate) async fn serve_vortex_black_png() -> impl IntoResponse { - static_response(VORTEX_BLACK_PNG, "image/png") -} - -pub(crate) async fn serve_vortex_white_png() -> impl IntoResponse { - static_response(VORTEX_WHITE_PNG, "image/png") -} - -pub(crate) async fn serve_icon_light_png() -> impl IntoResponse { - static_response(ICON_LIGHT_PNG, "image/png") -} - -pub(crate) async fn serve_icon_dark_png() -> impl IntoResponse { - static_response(ICON_DARK_PNG, "image/png") -} - -fn static_response(bytes: &'static [u8], content_type: &'static str) -> Response { - ( - [ - (header::CONTENT_TYPE, content_type), - ( - header::CACHE_CONTROL, - "no-cache, max-age=0, must-revalidate", - ), - ], - bytes, - ) - .into_response() -} diff --git a/benchmarks-website/server/src/html/summary.rs b/benchmarks-website/server/src/html/summary.rs deleted file mode 100644 index 331cda64064..00000000000 --- a/benchmarks-website/server/src/html/summary.rs +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Group summary card rendering. -//! -//! Each [`Summary`] variant renders into a small `.benchmark-scores-summary` -//! card that lives above the chart grid. Every variant is rendered the same -//! 
shape — a list of `.score-item` rows — only the rank label, value, and -//! footer change. - -use maud::Markup; -use maud::html; - -use crate::api::Summary; - -/// Render the summary card for a group, or empty markup if `summary` is -/// `None` or every variant's content list is empty. -pub(super) fn summary_markup(summary: Option<&Summary>) -> Markup { - let Some(summary) = summary else { - return html! {}; - }; - match summary { - Summary::RandomAccess { - title, - rankings, - explanation, - } if !rankings.is_empty() => html! { - section.benchmark-scores-summary aria-label=(title) { - h3.scores-title { (title) } - div.scores-list { - @for (idx, item) in rankings.iter().enumerate() { - div.score-item { - span.score-rank { "#" (idx + 1) } - span.score-series title=(item.name) { (item.name) } - span.score-metrics { - span.score-value { (format_time_ns(item.time)) } - span.score-runtime { (format!("{:.2}x", item.ratio)) } - } - } - } - } - div.scores-explanation { (explanation) } - } - }, - Summary::Compression { - title, - compress_ratio, - decompress_ratio, - dataset_count: _, - explanation, - } if compress_ratio.is_some() || decompress_ratio.is_some() => html! { - section.benchmark-scores-summary aria-label=(title) { - h3.scores-title { (title) } - div.scores-list { - @if let Some(v) = compress_ratio { - div.score-item { - span.score-rank { "⚡" } - span.score-series { "Write Speed (Compression)" } - span.score-metrics { - span.score-value { (format!("{v:.2}x")) } - } - } - } - @if let Some(v) = decompress_ratio { - div.score-item { - span.score-rank { "📤" } - span.score-series { "Scan Speed (Decompression)" } - span.score-metrics { - span.score-value { (format!("{v:.2}x")) } - } - } - } - } - div.scores-explanation { (explanation) } - } - }, - Summary::CompressionSize { - title, - min_ratio, - mean_ratio, - max_ratio, - dataset_count: _, - explanation, - } => html! 
{ - section.benchmark-scores-summary aria-label=(title) { - h3.scores-title { (title) } - div.scores-list { - div.score-item { - span.score-rank { "⬇️" } - span.score-series { "Min Size Ratio" } - span.score-metrics { - span.score-value { (format!("{min_ratio:.2}x")) } - } - } - div.score-item { - span.score-rank { "📊" } - span.score-series { "Mean Size Ratio" } - span.score-metrics { - span.score-value { (format!("{mean_ratio:.2}x")) } - } - } - div.score-item { - span.score-rank { "⬆️" } - span.score-series { "Max Size Ratio" } - span.score-metrics { - span.score-value { (format!("{max_ratio:.2}x")) } - } - } - } - div.scores-explanation { (explanation) } - } - }, - Summary::QueryBenchmark { - title, - rankings, - explanation, - } if !rankings.is_empty() => html! { - section.benchmark-scores-summary aria-label=(title) { - h3.scores-title { (title) } - div.scores-list { - @for (idx, item) in rankings.iter().enumerate() { - div.score-item { - span.score-rank { "#" (idx + 1) } - span.score-series title=(item.name) { (item.name) } - span.score-metrics { - span.score-value { (format!("{:.2}x", item.score)) } - span.score-runtime { (format_time_ns(item.total_runtime)) } - } - } - } - } - div.scores-explanation { (explanation) } - } - }, - _ => html! {}, - } -} - -fn format_time_ns(ns: f64) -> String { - let abs = ns.abs(); - if abs >= 1_000_000_000.0 { - format!("{:.2} s", ns / 1_000_000_000.0) - } else if abs >= 1_000_000.0 { - format!("{:.2} ms", ns / 1_000_000.0) - } else if abs >= 1_000.0 { - format!("{:.2} us", ns / 1_000.0) - } else { - format!("{ns:.0} ns") - } -} diff --git a/benchmarks-website/server/src/html/toolbar.rs b/benchmarks-website/server/src/html/toolbar.rs deleted file mode 100644 index 1075632c89b..00000000000 --- a/benchmarks-website/server/src/html/toolbar.rs +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! 
Per-chart toolbar markup: the scope slider + Y-axis switch above each -//! chart, and the range scrollbar strip below it. - -use maud::Markup; -use maud::html; - -/// Render the per-chart toolbar. `idx` namespaces input ids so multiple -/// charts on the same page don't collide on ``. -/// -/// All buttons are `
.vortex` file per table). Default: -//! `/snapshots`. -//! - `VORTEX_BENCH_EXTENSION_DIR` - directory DuckDB installs extensions -//! into. Default: `/duckdb-extensions`. The -//! default lives under `STATE_DIR`, which the systemd unit makes -//! writable; this is what lets `INSTALL vortex` succeed under -//! `ProtectHome=read-only`. -//! - `VORTEX_BENCH_BIND` - `host:port` the **public** listener binds to. -//! Highest priority. Default `127.0.0.1:3000` (after `PORT` fallback). -//! Override to `0.0.0.0:3000` for container deploys. Only ingest, read, -//! HTML, and `/health` are served here - admin routes do not match. -//! - `PORT` - optional PaaS-conventional knob for the **public** listener -//! only. When set and `VORTEX_BENCH_BIND` is not, the public listener -//! binds `0.0.0.0:$PORT`. Does not affect the admin listener. -//! - `VORTEX_BENCH_ADMIN_BIND` - `host:port` the **admin** listener binds -//! to when `ADMIN_BEARER_TOKEN` is set. Default `127.0.0.1:3001`. The -//! address MUST resolve to a loopback IP (`127.0.0.0/8` or `::1`); the -//! server refuses to start otherwise. This is the load-bearing guarantee -//! that `/api/admin/*` never reaches the public network even when -//! `VORTEX_BENCH_BIND=0.0.0.0:3000`. Must also resolve to a different -//! address than the public bind. -//! - `VORTEX_BENCH_LOG` - `tracing-subscriber` env filter spec. Default -//! `info`. -//! -//! On Unix, SIGTERM and SIGINT both trigger a graceful drain - in-flight -//! requests are allowed to finish before the process exits. systemd's -//! `TimeoutStopSec` (default 90s) bounds the grace window, which matters -//! because `systemctl restart` is what the deploy timer fires on every -//! new binary roll. On non-Unix targets only Ctrl-C/SIGINT is wired. 
- -use std::env; -use std::net::SocketAddr; -use std::path::PathBuf; - -use anyhow::Context as _; -use anyhow::Result; -use anyhow::anyhow; -use futures::FutureExt; -use tokio::net::TcpListener; -use tracing_subscriber::EnvFilter; - -#[tokio::main] -async fn main() -> Result<()> { - tracing_subscriber::fmt() - .with_env_filter( - EnvFilter::try_from_env("VORTEX_BENCH_LOG").unwrap_or_else(|_| EnvFilter::new("info")), - ) - .init(); - - let db_path: PathBuf = env::var("VORTEX_BENCH_DB") - .unwrap_or_else(|_| "bench.duckdb".to_string()) - .into(); - let bearer_token = - env::var("INGEST_BEARER_TOKEN").context("INGEST_BEARER_TOKEN env var must be set")?; - let admin_bearer_token = env::var("ADMIN_BEARER_TOKEN") - .ok() - .filter(|token| !token.trim().is_empty()); - // `VORTEX_BENCH_BIND` wins (full `host:port`). If unset, fall back to the - // PaaS-conventional `PORT` env var (binds to `0.0.0.0:$PORT`). Otherwise - // localhost-only on the default port. `PORT` only affects the public - // listener - the admin listener has its own env var below. 
- let public_bind = env::var("VORTEX_BENCH_BIND") - .ok() - .or_else(|| env::var("PORT").ok().map(|p| format!("0.0.0.0:{p}"))) - .unwrap_or_else(|| "127.0.0.1:3000".to_string()); - let admin_bind = - env::var("VORTEX_BENCH_ADMIN_BIND").unwrap_or_else(|_| "127.0.0.1:3001".to_string()); - - let mut state = vortex_bench_server::app::AppState::open(&db_path, bearer_token) - .with_context(|| format!("opening DuckDB at {}", db_path.display()))?; - if let Some(token) = admin_bearer_token { - state = state.with_admin(token); - } else { - tracing::warn!( - "ADMIN_BEARER_TOKEN is unset or empty - /api/admin/* will return 404 \ - (snapshot + read-only SQL disabled)" - ); - } - if let Ok(dir) = env::var("VORTEX_BENCH_SNAPSHOT_DIR") { - state = state.with_snapshot_dir(PathBuf::from(dir)); - } - if let Ok(dir) = env::var("VORTEX_BENCH_EXTENSION_DIR") { - state = state - .with_extension_dir(PathBuf::from(dir)) - .context("applying VORTEX_BENCH_EXTENSION_DIR")?; - } - let snapshot_dir = state.snapshot_dir.clone(); - let extension_dir = state.extension_dir.clone(); - let admin_app = vortex_bench_server::app::admin_router(state.clone()); - let public_app = vortex_bench_server::app::public_router(state); - - // Resolve and validate BOTH addresses before opening any listening - // socket. The earlier order opened the admin socket and only THEN - // checked the resolved address was loopback, which briefly bound a - // potentially-public address to the kernel's SYN queue before the - // refusal. Resolve via `tokio::net::lookup_host` (DNS-aware, matches - // what `TcpListener::bind` would use) and run both checks against the - // resolved `SocketAddr` first. 
- let public_addr = resolve_first_addr(&public_bind) - .await - .with_context(|| format!("resolving public listener bind {public_bind:?}"))?; - let admin_addr_opt = match admin_app.as_ref() { - Some(_) => Some( - resolve_first_addr(&admin_bind) - .await - .with_context(|| format!("resolving admin listener bind {admin_bind:?}"))?, - ), - None => None, - }; - if let Some(admin_addr) = admin_addr_opt { - ensure_admin_is_loopback(&admin_bind, admin_addr)?; - ensure_distinct_binds(public_addr, admin_addr)?; - } - - let public_listener = TcpListener::bind(public_addr) - .await - .with_context(|| format!("binding public listener to {public_addr}"))?; - let public_addr = public_listener.local_addr()?; - - let admin_listener = match (admin_app.as_ref(), admin_addr_opt) { - (Some(_), Some(admin_addr)) => { - let listener = TcpListener::bind(admin_addr) - .await - .with_context(|| format!("binding admin listener to {admin_addr}"))?; - let admin_addr = listener.local_addr()?; - // Defense-in-depth re-check on the post-bind address (a hostname - // resolution that drifted between our resolve and bind would be - // surprising; surface it rather than silently expose the port). - ensure_admin_is_loopback(&admin_bind, admin_addr)?; - Some((listener, admin_addr)) - } - _ => None, - }; - - tracing::info!( - public_addr = %public_addr, - admin_addr = ?admin_listener.as_ref().map(|(_, a)| *a), - db = %db_path.display(), - snapshot_dir = %snapshot_dir.display(), - extension_dir = %extension_dir.display(), - "bench server listening" - ); - - // Both listeners share one shutdown trigger so a single SIGTERM drains - // ingest, admin, and HTML in lockstep. `Shared` lets us hand the same - // future to each `with_graceful_shutdown`. 
- let shutdown = shutdown_signal().shared(); - match (admin_app, admin_listener) { - (Some(admin_app), Some((admin_listener, _))) => { - let public_fut = - axum::serve(public_listener, public_app).with_graceful_shutdown(shutdown.clone()); - let admin_fut = axum::serve(admin_listener, admin_app).with_graceful_shutdown(shutdown); - tokio::try_join!(public_fut, admin_fut)?; - } - _ => { - axum::serve(public_listener, public_app) - .with_graceful_shutdown(shutdown) - .await?; - } - } - Ok(()) -} - -/// Resolve a `host:port` spec to its first reachable [`SocketAddr`]. -async fn resolve_first_addr(spec: &str) -> Result { - tokio::net::lookup_host(spec) - .await - .with_context(|| format!("looking up {spec:?}"))? - .next() - .ok_or_else(|| anyhow!("{spec:?} did not resolve to any address")) -} - -/// Refuse to start if the public and admin listeners would land on the -/// same port. The admin listener exists specifically to keep -/// `/api/admin/*` off the public network, so the two collapsing back into -/// one is a silent rollback of that guarantee. -/// -/// Equality-of-`SocketAddr` alone misses the case where the public bind -/// is `0.0.0.0:3000` and the admin bind is `127.0.0.1:3000` (or any other -/// loopback + same port). The OS would EADDRINUSE at bind time, but the -/// error message ("Address already in use") gives no hint that -/// `VORTEX_BENCH_ADMIN_BIND` is the thing to change. Catch it here with -/// an actionable diagnostic instead. 
-fn ensure_distinct_binds(public: SocketAddr, admin: SocketAddr) -> Result<()> { - let port_collision = public.port() == admin.port() - && (public.ip() == admin.ip() - || public.ip().is_unspecified() - || admin.ip().is_unspecified()); - if port_collision { - return Err(anyhow!( - "public bind {public} and admin bind {admin} would overlap on port \ - {}; keep VORTEX_BENCH_ADMIN_BIND on a port distinct from the public listener", - public.port() - )); - } - Ok(()) -} - -/// Refuse to start if the admin listener resolved to a non-loopback -/// address. Without this guard, `VORTEX_BENCH_ADMIN_BIND=0.0.0.0:3001` -/// (or any public IP / unspecified address / non-loopback hostname) -/// would silently expose `/api/admin/*` - the bearer-gated SQL and -/// snapshot endpoints - to the public network. The contract is that the -/// admin listener is loopback-only and the only way callers reach it is -/// from the same host. An operator who genuinely wants remote admin -/// access should put it behind an SSH tunnel rather than opening the -/// bind, so this check is intentionally strict. -fn ensure_admin_is_loopback(spec: &str, admin: SocketAddr) -> Result<()> { - if !admin.ip().is_loopback() { - return Err(anyhow!( - "admin listener resolved to {admin} (from VORTEX_BENCH_ADMIN_BIND={spec:?}); \ - /api/admin/* must remain loopback-only. Use 127.0.0.1, ::1, or \ - a hostname that resolves to a loopback address; reach admin from \ - elsewhere via an SSH tunnel" - )); - } - Ok(()) -} - -/// Resolves when the process receives SIGINT or SIGTERM. Used as the -/// graceful-shutdown future for `axum::serve` so a `systemctl restart` -/// (SIGTERM) lets in-flight requests finish before the process exits. -/// `systemd`'s `TimeoutStopSec` (default 90s) bounds the grace window - -/// nothing inside the process imposes its own timeout. 
-async fn shutdown_signal() { - let ctrl_c = async { - tokio::signal::ctrl_c() - .await - .expect("install ctrl_c handler"); - }; - #[cfg(unix)] - let terminate = async { - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) - .expect("install SIGTERM handler") - .recv() - .await; - }; - #[cfg(not(unix))] - let terminate = std::future::pending::<()>(); - - tokio::select! { - _ = ctrl_c => tracing::info!("received SIGINT - shutting down"), - _ = terminate => tracing::info!("received SIGTERM - shutting down"), - } -} - -#[cfg(test)] -mod tests { - use std::net::SocketAddr; - - use super::ensure_admin_is_loopback; - use super::ensure_distinct_binds; - - #[test] - fn admin_loopback_v4_passes() { - let addr: SocketAddr = "127.0.0.1:3001".parse().unwrap(); - ensure_admin_is_loopback("127.0.0.1:3001", addr).expect("127.0.0.1 is loopback"); - } - - #[test] - fn admin_loopback_v6_passes() { - let addr: SocketAddr = "[::1]:3001".parse().unwrap(); - ensure_admin_is_loopback("[::1]:3001", addr).expect("::1 is loopback"); - } - - #[test] - fn admin_loopback_127_8_subnet_passes() { - // The entire 127.0.0.0/8 block is loopback. 
- let addr: SocketAddr = "127.1.2.3:3001".parse().unwrap(); - ensure_admin_is_loopback("127.1.2.3:3001", addr).expect("127.0.0.0/8 is loopback"); - } - - #[test] - fn admin_zero_v4_rejected() { - let addr: SocketAddr = "0.0.0.0:3001".parse().unwrap(); - let err = ensure_admin_is_loopback("0.0.0.0:3001", addr) - .expect_err("0.0.0.0 must be rejected as non-loopback"); - let msg = err.to_string(); - assert!(msg.contains("loopback-only"), "{msg}"); - } - - #[test] - fn admin_zero_v6_rejected() { - let addr: SocketAddr = "[::]:3001".parse().unwrap(); - ensure_admin_is_loopback("[::]:3001", addr) - .expect_err(":: must be rejected as non-loopback"); - } - - #[test] - fn admin_public_ip_rejected() { - let addr: SocketAddr = "10.0.0.5:3001".parse().unwrap(); - ensure_admin_is_loopback("10.0.0.5:3001", addr) - .expect_err("private/public IP must be rejected as non-loopback"); - } - - #[test] - fn distinct_binds_passes() { - let p: SocketAddr = "127.0.0.1:3000".parse().unwrap(); - let a: SocketAddr = "127.0.0.1:3001".parse().unwrap(); - ensure_distinct_binds(p, a).expect("different ports are distinct"); - } - - #[test] - fn same_bind_rejected() { - let p: SocketAddr = "127.0.0.1:3000".parse().unwrap(); - let a: SocketAddr = "127.0.0.1:3000".parse().unwrap(); - ensure_distinct_binds(p, a).expect_err("identical binds must be rejected"); - } - - #[test] - fn unspecified_v4_collides_with_loopback_same_port() { - // The motivating case for the cycle-1 ensure_distinct_binds fix: - // public on 0.0.0.0:3000 + admin on 127.0.0.1:3000 would silently - // bind to overlapping ports; the OS would EADDRINUSE at bind time - // with no actionable diagnostic. Catch it pre-bind. 
- let p: SocketAddr = "0.0.0.0:3000".parse().unwrap(); - let a: SocketAddr = "127.0.0.1:3000".parse().unwrap(); - ensure_distinct_binds(p, a) - .expect_err("0.0.0.0:3000 + 127.0.0.1:3000 must be rejected as a port-collision"); - } - - #[test] - fn unspecified_v4_different_port_from_loopback_passes() { - let p: SocketAddr = "0.0.0.0:3000".parse().unwrap(); - let a: SocketAddr = "127.0.0.1:3001".parse().unwrap(); - ensure_distinct_binds(p, a).expect("distinct ports must not be rejected"); - } - - #[test] - fn unspecified_v6_collides_with_v6_loopback_same_port() { - let p: SocketAddr = "[::]:3000".parse().unwrap(); - let a: SocketAddr = "[::1]:3000".parse().unwrap(); - ensure_distinct_binds(p, a) - .expect_err("[::]:3000 + [::1]:3000 must be rejected as a port-collision"); - } -} diff --git a/benchmarks-website/server/src/query_cache.rs b/benchmarks-website/server/src/query_cache.rs deleted file mode 100644 index 3647ab920ec..00000000000 --- a/benchmarks-website/server/src/query_cache.rs +++ /dev/null @@ -1,472 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! In-memory query result cache for the read API. -//! -//! Every chart payload, group discovery result, and filter universe in the v3 -//! site is a deterministic function of the DuckDB snapshot — and that -//! snapshot only changes when `/api/ingest` lands a new envelope (~30 times a -//! day). Without a cache, every concurrent request re-runs the same SQL -//! against the engine, which serialises on DuckDB's internal locks; many tabs -//! / clients open at once peg the read API behind that lock even though the -//! underlying answer hasn't changed in hours. -//! -//! The cache is a generation-keyed, single-flight store of [`Arc`]-wrapped -//! payloads, one [`DashMap`] per result type: -//! - reads check the slot, clone the [`Arc`] out, and return — no DuckDB -//! round-trip on the hot path; -//! 
- the first miss for a key runs `compute` while concurrent waiters share the -//! same async slot; -//! - [`QueryCache::invalidate`] is called from [`crate::ingest`] after a -//! successful commit; it advances the generation and clears the visible -//! maps so old in-flight computes cannot repopulate the current snapshot. -//! -//! The two unkeyed slots — `/api/groups` and the filter universe — use a -//! [`DashMap`] with `()` as the logical key, so every slot in the cache is the -//! same primitive. -//! -//! Cached values are wrapped in [`std::sync::Arc`] and never deep-cloned on -//! the cache-hit path. The JSON bytes are still serialized per fallback -//! response; the materialized latest-100 hot path lives in -//! [`crate::read_model`]. - -use std::future::Future; -use std::hash::Hash; -use std::sync::Arc; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; - -use anyhow::Result; -use dashmap::DashMap; -use tokio::sync::Mutex as AsyncMutex; - -use crate::api::ChartResponse; -use crate::api::CommitWindow; -use crate::api::FilterUniverse; -use crate::api::Group; -use crate::api::GroupChartsResponse; - -/// Composite cache for every read-side DuckDB query. -/// -/// Cheap to clone via [`Arc`]; one instance is owned by [`crate::app::AppState`] -/// for the lifetime of the server. All entries are cleared by -/// [`Self::invalidate`] when ingest changes the underlying snapshot. -#[derive(Default)] -pub struct QueryCache { - /// Monotonically advances whenever ingest invalidates the read snapshot. - generation: AtomicU64, - /// `/api/groups` discovery result. Keyed by `()` because there is only - /// ever one group list per snapshot. - groups: DashMap, CacheSlot>>>, - /// Global filter universe (engines + formats). Also unkeyed. - filter_universe: DashMap, CacheSlot>>, - /// Per-chart payloads, keyed by `(slug, window)`. - chart_payloads: DashMap, CacheSlot>>>, - /// Per-group payloads, keyed by `(slug, window)`. 
- group_charts: DashMap, CacheSlot>>>, -} - -type CacheSlot = Arc>>; - -#[derive(Debug, Clone, Eq, PartialEq, Hash)] -struct VersionedKey { - generation: u64, - key: K, -} - -#[derive(Debug, Clone, Eq, PartialEq, Hash)] -struct ChartCacheKey { - slug: String, - window: CommitWindow, -} - -#[derive(Debug, Clone, Eq, PartialEq, Hash)] -struct GroupCacheKey { - slug: String, - window: CommitWindow, -} - -impl QueryCache { - /// Build an empty cache. Equivalent to [`Self::default`]. - pub fn new() -> Self { - Self::default() - } - - async fn get_or_compute( - &self, - map: &DashMap, CacheSlot>, - key: K, - compute: F, - wrap: Wrap, - ) -> Result - where - K: Clone + Eq + Hash, - V: Clone, - F: FnOnce() -> Fut, - Fut: Future>, - Wrap: FnOnce(Raw) -> V, - { - let generation = self.generation.load(Ordering::Acquire); - let cache_key = VersionedKey { generation, key }; - let slot = map - .entry(cache_key.clone()) - .or_insert_with(|| Arc::new(AsyncMutex::new(None))) - .clone(); - - let mut guard = slot.lock().await; - if let Some(value) = guard.as_ref() { - return Ok(value.clone()); - } - - let fresh = wrap(compute().await?); - if self.generation.load(Ordering::Acquire) == generation { - *guard = Some(fresh.clone()); - } else { - map.remove(&cache_key); - } - Ok(fresh) - } - - /// Get the cached `Arc>` from `/api/groups`, or run `compute` - /// if the slot is empty and store the result. - pub async fn groups(&self, compute: F) -> Result>> - where - F: FnOnce() -> Fut, - Fut: Future>>, - { - self.get_or_compute(&self.groups, (), compute, Arc::new) - .await - } - - /// Get the cached `Arc` for the global filter bar, or - /// run `compute` if the slot is empty and store the result. 
- pub async fn filter_universe(&self, compute: F) -> Result> - where - F: FnOnce() -> Fut, - Fut: Future>, - { - self.get_or_compute(&self.filter_universe, (), compute, Arc::new) - .await - } - - /// Get the cached chart payload for `(slug, window)`, or run `compute` - /// if the entry is absent and store the result. The cached value is - /// `Option>` so a confirmed "no data for this slug" - /// answer is cached too — repeated 404s do not re-hit DuckDB. - pub async fn chart_payload( - &self, - slug: &str, - window: &CommitWindow, - compute: F, - ) -> Result>> - where - F: FnOnce() -> Fut, - Fut: Future>>, - { - let key = ChartCacheKey { - slug: slug.to_string(), - window: *window, - }; - self.get_or_compute(&self.chart_payloads, key, compute, |value| { - value.map(Arc::new) - }) - .await - } - - /// Get the cached per-group payload for `(slug, window)`, or run - /// `compute` if the entry is absent and store the result. - pub async fn group_charts( - &self, - slug: &str, - window: &CommitWindow, - compute: F, - ) -> Result>> - where - F: FnOnce() -> Fut, - Fut: Future>>, - { - let key = GroupCacheKey { - slug: slug.to_string(), - window: *window, - }; - self.get_or_compute(&self.group_charts, key, compute, |value| { - value.map(Arc::new) - }) - .await - } - - /// Drop every cached value. Called from the ingest handler after a - /// successful commit so the next read sees the fresh snapshot. 
- pub fn invalidate(&self) { - self.generation.fetch_add(1, Ordering::AcqRel); - self.groups.clear(); - self.filter_universe.clear(); - self.chart_payloads.clear(); - self.group_charts.clear(); - } -} - -#[cfg(test)] -mod tests { - use std::sync::atomic::AtomicUsize; - use std::sync::atomic::Ordering; - use std::time::Duration; - - use anyhow::anyhow; - use tokio::sync::oneshot; - - use super::*; - use crate::api::FilterUniverse; - - fn empty_universe() -> FilterUniverse { - FilterUniverse::default() - } - - fn universe_with_engine(engine: &str) -> FilterUniverse { - FilterUniverse { - engines: vec![engine.to_string()], - formats: Vec::new(), - } - } - - #[tokio::test] - async fn singleton_caches_and_returns_same_arc() -> Result<()> { - let cache = QueryCache::new(); - let calls = AtomicUsize::new(0); - - let a = cache - .filter_universe(|| { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(empty_universe()) } - }) - .await?; - let b = cache - .filter_universe(|| { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(empty_universe()) } - }) - .await?; - - assert_eq!(calls.load(Ordering::SeqCst), 1, "compute should run once"); - assert!(Arc::ptr_eq(&a, &b), "cache returns the same Arc"); - Ok(()) - } - - #[tokio::test] - async fn invalidate_clears_singleton() -> Result<()> { - let cache = QueryCache::new(); - let calls = AtomicUsize::new(0); - - let a = cache - .filter_universe(|| { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(empty_universe()) } - }) - .await?; - cache.invalidate(); - let b = cache - .filter_universe(|| { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(empty_universe()) } - }) - .await?; - - assert_eq!( - calls.load(Ordering::SeqCst), - 2, - "invalidate should force a recompute" - ); - assert!( - !Arc::ptr_eq(&a, &b), - "post-invalidate read should produce a fresh Arc" - ); - Ok(()) - } - - #[tokio::test] - async fn concurrent_singleton_misses_share_one_compute() -> Result<()> { - let cache = Arc::new(QueryCache::new()); - let 
calls = Arc::new(AtomicUsize::new(0)); - let mut tasks = Vec::new(); - - for _ in 0..16 { - let cache = Arc::clone(&cache); - let calls = Arc::clone(&calls); - tasks.push(tokio::spawn(async move { - cache - .filter_universe(|| { - calls.fetch_add(1, Ordering::SeqCst); - async { - tokio::time::sleep(Duration::from_millis(50)).await; - Ok(empty_universe()) - } - }) - .await - })); - } - - for task in tasks { - task.await??; - } - - assert_eq!( - calls.load(Ordering::SeqCst), - 1, - "concurrent misses for one key should collapse to one compute" - ); - Ok(()) - } - - #[tokio::test] - async fn stale_in_flight_compute_does_not_repopulate_after_invalidate() -> Result<()> { - let cache = Arc::new(QueryCache::new()); - let (started_tx, started_rx) = oneshot::channel(); - let (release_tx, release_rx) = oneshot::channel(); - - let stale_cache = Arc::clone(&cache); - let stale_task = tokio::spawn(async move { - stale_cache - .filter_universe(|| { - started_tx.send(()).expect("test receiver is alive"); - async { - release_rx - .await - .expect("test sender releases stale compute"); - Ok(universe_with_engine("stale")) - } - }) - .await - }); - - started_rx.await?; - cache.invalidate(); - - let fresh = cache - .filter_universe(|| async { Ok(universe_with_engine("fresh")) }) - .await?; - assert_eq!(fresh.engines, ["fresh"]); - - release_tx.send(()).expect("stale compute is waiting"); - let stale = stale_task.await??; - assert_eq!(stale.engines, ["stale"]); - - let cached = cache - .filter_universe(|| async { Ok(universe_with_engine("unexpected")) }) - .await?; - assert_eq!( - cached.engines, - ["fresh"], - "old in-flight computations must not populate the new generation" - ); - Ok(()) - } - - #[tokio::test] - async fn chart_payload_keyed_by_slug_and_window() -> Result<()> { - let cache = QueryCache::new(); - let calls = AtomicUsize::new(0); - - let make = |display_name: &str| { - let display_name = display_name.to_string(); - ChartResponse { - display_name, - unit_kind: 
crate::api::UnitKind::TimeNs, - history: crate::api::ChartHistory { - total_commits: 0, - start_index: 0, - loaded_commits: 0, - complete: true, - }, - commits: Vec::new(), - series: serde_json::Map::new(), - series_meta: std::collections::BTreeMap::new(), - } - }; - - let one = cache - .chart_payload("a", &CommitWindow::All, || { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(Some(make("first"))) } - }) - .await? - .expect("Some"); - let _two = cache - .chart_payload("a", &CommitWindow::All, || { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(Some(make("second"))) } - }) - .await? - .expect("Some"); - // Different window — should be a separate cache slot. - let three = cache - .chart_payload( - "a", - &CommitWindow::Last(std::num::NonZeroU32::new(10).unwrap()), - || { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(Some(make("third"))) } - }, - ) - .await? - .expect("Some"); - - assert_eq!(calls.load(Ordering::SeqCst), 2); - assert_eq!(one.display_name, "first"); - assert_eq!(three.display_name, "third"); - Ok(()) - } - - #[tokio::test] - async fn chart_payload_caches_negative_result() -> Result<()> { - let cache = QueryCache::new(); - let calls = AtomicUsize::new(0); - - let none1 = cache - .chart_payload("missing", &CommitWindow::All, || { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(None) } - }) - .await?; - let none2 = cache - .chart_payload("missing", &CommitWindow::All, || { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(None) } - }) - .await?; - - assert!(none1.is_none() && none2.is_none()); - assert_eq!( - calls.load(Ordering::SeqCst), - 1, - "the second read for a missing slug should hit the cache, not re-query" - ); - Ok(()) - } - - #[tokio::test] - async fn errors_do_not_populate_cache() -> Result<()> { - let cache = QueryCache::new(); - let calls = AtomicUsize::new(0); - - let res = cache - .filter_universe(|| { - calls.fetch_add(1, Ordering::SeqCst); - async { Err::(anyhow!("boom")) } - }) - .await; - 
assert!(res.is_err(), "error path bubbles up"); - - cache - .filter_universe(|| { - calls.fetch_add(1, Ordering::SeqCst); - async { Ok(empty_universe()) } - }) - .await?; - assert_eq!( - calls.load(Ordering::SeqCst), - 2, - "second call must rerun after an errored first call", - ); - Ok(()) - } -} diff --git a/benchmarks-website/server/src/read_model.rs b/benchmarks-website/server/src/read_model.rs deleted file mode 100644 index b2aa0e21868..00000000000 --- a/benchmarks-website/server/src/read_model.rs +++ /dev/null @@ -1,837 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Materialized read model for the hot benchmark website paths. -//! -//! DuckDB remains the source of truth, but the normal latest-100 website -//! payloads are deterministic between ingests. This module builds those -//! payloads once per database snapshot, stores identity/gzip/brotli bytes in -//! memory, and lets handlers serve bytes directly on the hot path. 
- -use std::collections::VecDeque; -use std::future::Future; -use std::hash::Hasher as _; -use std::io::Read as _; -use std::io::Write as _; -use std::num::NonZeroU32; -use std::pin::Pin; -use std::sync::Arc; - -use anyhow::Context as _; -use anyhow::Result; -use axum::body::Body; -use axum::http::HeaderMap; -use axum::http::HeaderValue; -use axum::http::StatusCode; -use axum::http::header; -use axum::response::Response; -use bytes::Bytes; -use duckdb::Connection; -use flate2::Compression; -use flate2::write::GzEncoder; -use parking_lot::RwLock; -use serde::Serialize; -use tokio::sync::Mutex as AsyncMutex; -use twox_hash::XxHash64; -use vortex_utils::aliases::hash_map::HashMap; - -use crate::api; -use crate::api::ChartResponse; -use crate::api::CommitWindow; -use crate::api::FilterUniverse; -use crate::api::Group; -use crate::api::GroupChartsResponse; -use crate::api::GroupsResponse; -use crate::api::NamedChartResponse; -use crate::api::Summary; -use crate::db; -use crate::db::DbHandle; -use crate::slug::ChartKey; - -/// Number of charts included in one materialized group shard response. -pub const GROUP_SHARD_CHARTS: usize = 8; - -/// Superseded read generations retained for pages opened before newer ingests -/// completed. Each generation owns precompressed latest-100 artifacts, so keep -/// this finite while covering ordinary CI bursts. -const RETAINED_PREVIOUS_GENERATIONS: usize = 8; - -/// Cache policy for a materialized artifact route. -#[derive(Debug, Clone, Copy)] -pub enum ArtifactCachePolicy { - /// Stable URLs such as `/api/groups`; browsers should revalidate. - Revalidate, - /// Versioned URLs under `/api/artifacts/{generation}/...`. - Immutable, -} - -impl ArtifactCachePolicy { - fn header_value(self) -> &'static str { - match self { - Self::Revalidate => "no-cache, max-age=0, must-revalidate", - Self::Immutable => "public, max-age=31536000, immutable", - } - } -} - -/// A JSON artifact encoded in every representation the server wants to serve. 
-#[derive(Debug, Clone)] -pub struct EncodedArtifact { - identity: Bytes, - gzip: Bytes, - br: Bytes, - etag: HeaderValue, -} - -impl EncodedArtifact { - fn new(generation_id: &str, identity: Vec) -> Result { - let gzip = gzip_bytes(&identity).context("gzip artifact")?; - let br = brotli_bytes(&identity).context("brotli artifact")?; - let etag = HeaderValue::from_str(&format!("\"{generation_id}\"")) - .context("building artifact ETag")?; - Ok(Self { - identity: Bytes::from(identity), - gzip: Bytes::from(gzip), - br: Bytes::from(br), - etag, - }) - } - - /// Uncompressed bytes, used when an HTML page embeds a single chart. - pub fn identity(&self) -> &Bytes { - &self.identity - } - - /// Build an Axum response using the client's `Accept-Encoding` and - /// `If-None-Match` headers. - pub fn response(&self, request_headers: &HeaderMap, policy: ArtifactCachePolicy) -> Response { - if if_none_match_matches(request_headers, &self.etag) { - return artifact_response_builder(StatusCode::NOT_MODIFIED, policy, &self.etag) - .body(Body::empty()) - .expect("artifact 304 response"); - } - - let (encoding, bytes) = match preferred_encoding(request_headers) { - ArtifactEncoding::Brotli => (Some("br"), self.br.clone()), - ArtifactEncoding::Gzip => (Some("gzip"), self.gzip.clone()), - ArtifactEncoding::Identity => (None, self.identity.clone()), - }; - - let mut builder = artifact_response_builder(StatusCode::OK, policy, &self.etag) - .header(header::CONTENT_TYPE, "application/json") - .header(header::CONTENT_LENGTH, bytes.len().to_string()); - if let Some(encoding) = encoding { - builder = builder.header(header::CONTENT_ENCODING, encoding); - } - builder.body(Body::from(bytes)).expect("artifact response") - } -} - -fn artifact_response_builder( - status: StatusCode, - policy: ArtifactCachePolicy, - etag: &HeaderValue, -) -> axum::http::response::Builder { - Response::builder() - .status(status) - .header(header::CACHE_CONTROL, policy.header_value()) - .header(header::VARY, 
"Accept-Encoding") - .header(header::ETAG, etag.clone()) -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum ArtifactEncoding { - Brotli, - Gzip, - Identity, -} - -fn preferred_encoding(headers: &HeaderMap) -> ArtifactEncoding { - let Some(raw) = headers - .get(header::ACCEPT_ENCODING) - .and_then(|v| v.to_str().ok()) - else { - return ArtifactEncoding::Identity; - }; - if accepts_encoding(raw, "br") { - ArtifactEncoding::Brotli - } else if accepts_encoding(raw, "gzip") { - ArtifactEncoding::Gzip - } else { - ArtifactEncoding::Identity - } -} - -fn accepts_encoding(raw: &str, expected: &str) -> bool { - raw.split(',').any(|part| { - let mut pieces = part.trim().split(';'); - let name = pieces.next().unwrap_or_default().trim(); - if !name.eq_ignore_ascii_case(expected) { - return false; - } - pieces.all(|piece| { - let piece = piece.trim(); - let Some((key, value)) = piece.split_once('=') else { - return true; - }; - if !key.trim().eq_ignore_ascii_case("q") { - return true; - } - // Per RFC 9110 the q value is a positive number in [0, 1]; we - // reject `q=0` (client refuses the encoding) AND reject - // malformed q-values (`q=foo`, `q=`, `q=inf`, `q=NaN`, `q=2`). - // The earlier `map_or(true, |q| q > 0.0)` treated every parse - // failure as accept, which is non-conformant in the - // opposite direction. - match value.trim().parse::() { - Ok(q) if q.is_finite() && (0.0..=1.0).contains(&q) => q > 0.0, - _ => false, - } - }) - }) -} - -fn if_none_match_matches(headers: &HeaderMap, etag: &HeaderValue) -> bool { - let Some(raw) = headers - .get(header::IF_NONE_MATCH) - .and_then(|v| v.to_str().ok()) - else { - return false; - }; - let etag = etag.to_str().unwrap_or_default(); - raw.split(',').any(|candidate| { - let candidate = candidate.trim(); - candidate == "*" || candidate == etag - }) -} - -fn gzip_bytes(bytes: &[u8]) -> Result> { - let mut encoder = GzEncoder::new(Vec::new(), Compression::default()); - encoder.write_all(bytes)?; - Ok(encoder.finish()?) 
-} - -fn brotli_bytes(bytes: &[u8]) -> Result> { - let mut reader = brotli::CompressorReader::new(bytes, 4096, 5, 22); - let mut out = Vec::new(); - reader.read_to_end(&mut out)?; - Ok(out) -} - -type RebuildFuture = Pin> + Send>>; -type RebuildTask = Arc RebuildFuture + Send + Sync>; - -/// Shared in-memory store for the active and recently superseded read -/// generations. -#[derive(Clone)] -pub struct ReadStore { - inner: Arc>, - rebuild: Arc>, -} - -struct ReadStoreInner { - active: Arc, - previous: VecDeque>, -} - -#[derive(Default)] -struct RebuildState { - running: bool, - pending: bool, -} - -impl ReadStore { - /// Build the first generation synchronously during startup. Failure here - /// fails server startup rather than pushing a cold build onto users. - pub fn build_initial(db: &DbHandle) -> Result { - let mut conn = db.connection()?; - let generation = Arc::new(build_generation(&mut conn)?); - Ok(Self { - inner: Arc::new(RwLock::new(ReadStoreInner { - active: generation, - previous: VecDeque::new(), - })), - rebuild: Arc::new(AsyncMutex::new(RebuildState::default())), - }) - } - - /// Current generation. - pub fn active(&self) -> Arc { - Arc::clone(&self.inner.read().active) - } - - /// Find the active or retained previous generation by id. - pub fn generation(&self, id: &str) -> Option> { - let inner = self.inner.read(); - if inner.active.id == id { - return Some(Arc::clone(&inner.active)); - } - inner - .previous - .iter() - .find(|generation| generation.id == id) - .map(Arc::clone) - } - - /// Schedule a background rebuild after ingest. The active generation is - /// retained until the rebuild succeeds, and repeated ingests coalesce into - /// at most one follow-up rebuild. 
- pub async fn schedule_rebuild(&self, db: DbHandle) { - let build: RebuildTask = Arc::new(move || { - let db = db.clone(); - Box::pin(async move { db::run_read_blocking(&db, build_generation).await }) - }); - self.schedule_rebuild_with(build).await; - } - - async fn schedule_rebuild_with(&self, build: RebuildTask) { - let mut state = self.rebuild.lock().await; - if state.running { - state.pending = true; - return; - } - state.running = true; - let store = self.clone(); - tokio::spawn(async move { - store.rebuild_loop(build).await; - }); - } - - async fn rebuild_loop(self, build: RebuildTask) { - loop { - match build().await { - Ok(generation) => self.install(generation), - Err(err) => { - tracing::error!(error = ?err, "read model rebuild failed"); - } - } - - let mut state = self.rebuild.lock().await; - if state.pending { - state.pending = false; - continue; - } - state.running = false; - break; - } - } - - fn install(&self, generation: ReadGeneration) { - let mut inner = self.inner.write(); - let previous = Arc::clone(&inner.active); - inner.active = Arc::new(generation); - inner.previous.push_front(previous); - while inner.previous.len() > RETAINED_PREVIOUS_GENERATIONS { - inner.previous.pop_back(); - } - } -} - -/// One immutable read snapshot. -pub struct ReadGeneration { - id: String, - groups: Arc>, - filter_universe: Arc, - groups_artifact: EncodedArtifact, - chart_artifacts: HashMap, - group_artifacts: HashMap, - group_shards: HashMap, - group_shard_counts: HashMap, - chart_payloads: HashMap>, -} - -impl ReadGeneration { - /// Content-derived generation id. - pub fn id(&self) -> &str { - &self.id - } - - /// Structured groups for HTML rendering. - pub fn groups(&self) -> Arc> { - Arc::clone(&self.groups) - } - - /// Structured filter universe for HTML rendering. - pub fn filter_universe(&self) -> Arc { - Arc::clone(&self.filter_universe) - } - - /// Materialized `/api/groups` body. 
- pub fn groups_artifact(&self) -> &EncodedArtifact { - &self.groups_artifact - } - - /// Materialized latest-100 `/api/chart/{slug}` body. - pub fn chart_artifact(&self, slug: &str) -> Option<&EncodedArtifact> { - self.chart_artifacts.get(slug) - } - - /// Materialized latest-100 `/api/group/{slug}` compatibility body. - pub fn group_artifact(&self, slug: &str) -> Option<&EncodedArtifact> { - self.group_artifacts.get(slug) - } - - /// Materialized latest-100 shard body for landing/group hydration. - pub fn group_shard_artifact(&self, slug: &str, index: usize) -> Option<&EncodedArtifact> { - self.group_shards.get(&GroupShardKey { - slug: slug.to_string(), - index, - }) - } - - /// Number of materialized shards for a group. - pub fn group_shard_count(&self, slug: &str) -> usize { - self.group_shard_counts.get(slug).copied().unwrap_or(0) - } - - /// Structured latest-100 chart payload for single-chart HTML rendering. - pub fn chart_payload(&self, slug: &str) -> Option> { - self.chart_payloads.get(slug).map(Arc::clone) - } -} - -#[derive(Debug, Clone, Eq, PartialEq, Hash)] -struct GroupShardKey { - slug: String, - index: usize, -} - -#[derive(Serialize)] -struct GroupShardResponse { - name: String, - #[serde(skip_serializing_if = "Option::is_none")] - summary: Option, - #[serde(skip_serializing_if = "Option::is_none")] - description: Option, - window: u32, - shard_index: usize, - shard_count: usize, - charts: Vec, -} - -struct RawArtifact { - key: String, - kind: RawArtifactKind, - bytes: Vec, -} - -enum RawArtifactKind { - Groups, - Chart { slug: String }, - Group { slug: String }, - GroupShard { slug: String, index: usize }, -} - -fn build_generation(conn: &mut Connection) -> Result { - api::read_transaction(conn, build_generation_from_snapshot) -} - -fn build_generation_from_snapshot(conn: &Connection) -> Result { - let groups = Arc::new(api::collect_groups(conn)?); - let filter_universe = Arc::new(api::collect_filter_universe(conn)?); - let window = 
CommitWindow::Last( - NonZeroU32::new(api::DEFAULT_COMMIT_WINDOW).expect("default window is non-zero"), - ); - - let mut raw = Vec::new(); - raw.push(RawArtifact { - key: "api:groups".to_string(), - kind: RawArtifactKind::Groups, - bytes: serde_json::to_vec(&GroupsResponse { - groups: Arc::clone(&groups), - }) - .context("serialize groups artifact")?, - }); - - let mut chart_payloads = HashMap::new(); - let mut group_shard_counts = HashMap::new(); - - for group in groups.iter() { - let mut charts = Vec::with_capacity(group.charts.len()); - for link in &group.charts { - let chart = if let Some(chart) = chart_payloads.get(&link.slug) { - Arc::clone(chart) - } else { - let key = ChartKey::from_slug(&link.slug) - .with_context(|| format!("invalid chart slug in group: {}", link.slug))?; - let Some(chart) = api::chart_payload(conn, &key, &window)? else { - continue; - }; - let chart = Arc::new(chart); - raw.push(RawArtifact { - key: format!("api:chart:{}:100", link.slug), - kind: RawArtifactKind::Chart { - slug: link.slug.clone(), - }, - bytes: serde_json::to_vec(chart.as_ref()) - .with_context(|| format!("serialize chart artifact {}", link.slug))?, - }); - chart_payloads.insert(link.slug.clone(), Arc::clone(&chart)); - chart - }; - charts.push(NamedChartResponse { - name: link.name.clone(), - slug: link.slug.clone(), - chart, - }); - } - - if charts.is_empty() { - group_shard_counts.insert(group.slug.clone(), 0); - continue; - } - - let group_response = GroupChartsResponse { - name: group.name.clone(), - summary: group.summary.clone(), - description: group.description.clone(), - charts: charts.clone(), - }; - raw.push(RawArtifact { - key: format!("api:group:{}:100", group.slug), - kind: RawArtifactKind::Group { - slug: group.slug.clone(), - }, - bytes: serde_json::to_vec(&group_response) - .with_context(|| format!("serialize group artifact {}", group.slug))?, - }); - - let shard_count = charts.len().div_ceil(GROUP_SHARD_CHARTS); - 
group_shard_counts.insert(group.slug.clone(), shard_count); - for (shard_index, chunk) in charts.chunks(GROUP_SHARD_CHARTS).enumerate() { - let shard = GroupShardResponse { - name: group.name.clone(), - summary: group.summary.clone(), - description: group.description.clone(), - window: api::DEFAULT_COMMIT_WINDOW, - shard_index, - shard_count, - charts: chunk.to_vec(), - }; - raw.push(RawArtifact { - key: format!("api:group-shard:{}:{shard_index}:100", group.slug), - kind: RawArtifactKind::GroupShard { - slug: group.slug.clone(), - index: shard_index, - }, - bytes: serde_json::to_vec(&shard).with_context(|| { - format!( - "serialize group shard artifact {}#{shard_index}", - group.slug - ) - })?, - }); - } - } - - let id = generation_id(&raw); - let mut groups_artifact = None; - let mut chart_artifacts = HashMap::new(); - let mut group_artifacts = HashMap::new(); - let mut group_shards = HashMap::new(); - - for artifact in raw { - let encoded = EncodedArtifact::new(&id, artifact.bytes) - .with_context(|| format!("encode artifact {}", artifact.key))?; - match artifact.kind { - RawArtifactKind::Groups => groups_artifact = Some(encoded), - RawArtifactKind::Chart { slug } => { - chart_artifacts.insert(slug, encoded); - } - RawArtifactKind::Group { slug } => { - group_artifacts.insert(slug, encoded); - } - RawArtifactKind::GroupShard { slug, index } => { - group_shards.insert(GroupShardKey { slug, index }, encoded); - } - } - } - - Ok(ReadGeneration { - id, - groups, - filter_universe, - groups_artifact: groups_artifact.context("groups artifact missing")?, - chart_artifacts, - group_artifacts, - group_shards, - group_shard_counts, - chart_payloads, - }) -} - -fn generation_id(raw: &[RawArtifact]) -> String { - let mut sorted: Vec<_> = raw.iter().collect(); - sorted.sort_by(|a, b| a.key.cmp(&b.key)); - let mut hash = XxHash64::with_seed(0); - for artifact in sorted { - hash.write_u64(artifact.key.len() as u64); - hash.write(artifact.key.as_bytes()); - 
hash.write_u64(artifact.bytes.len() as u64); - hash.write(&artifact.bytes); - } - format!("{:016x}", hash.finish()) -} - -#[cfg(test)] -mod tests { - use std::sync::atomic::AtomicUsize; - use std::sync::atomic::Ordering; - use std::time::Duration; - - use axum::http::header; - use tokio::sync::Notify; - use tokio::time::sleep; - - use super::*; - - fn raw_artifact(key: &str, bytes: &[u8]) -> RawArtifact { - RawArtifact { - key: key.to_string(), - kind: RawArtifactKind::Groups, - bytes: bytes.to_vec(), - } - } - - fn empty_generation(id: &str) -> Result { - Ok(ReadGeneration { - id: id.to_string(), - groups: Arc::new(Vec::new()), - filter_universe: Arc::new(FilterUniverse::default()), - groups_artifact: EncodedArtifact::new(id, br#"{"groups":[]}"#.to_vec())?, - chart_artifacts: HashMap::new(), - group_artifacts: HashMap::new(), - group_shards: HashMap::new(), - group_shard_counts: HashMap::new(), - chart_payloads: HashMap::new(), - }) - } - - fn test_store(id: &str) -> Result { - Ok(ReadStore { - inner: Arc::new(RwLock::new(ReadStoreInner { - active: Arc::new(empty_generation(id)?), - previous: VecDeque::new(), - })), - rebuild: Arc::new(AsyncMutex::new(RebuildState::default())), - }) - } - - async fn wait_for_rebuild_idle(store: &ReadStore) { - for _ in 0..100 { - if !store.rebuild.lock().await.running { - return; - } - sleep(Duration::from_millis(10)).await; - } - panic!("read model rebuild did not become idle"); - } - - async fn wait_for_calls(calls: &AtomicUsize, expected: usize) { - for _ in 0..100 { - if calls.load(Ordering::SeqCst) >= expected { - return; - } - sleep(Duration::from_millis(10)).await; - } - panic!("read model rebuild did not start"); - } - - #[test] - fn generation_ids_are_content_derived_and_order_stable() { - let a = vec![raw_artifact("b", b"two"), raw_artifact("a", b"one")]; - let b = vec![raw_artifact("a", b"one"), raw_artifact("b", b"two")]; - let c = vec![raw_artifact("a", b"one"), raw_artifact("b", b"changed")]; - - 
assert_eq!(generation_id(&a), generation_id(&b)); - assert_ne!(generation_id(&a), generation_id(&c)); - } - - #[test] - fn encoded_artifact_negotiates_precompressed_variants() -> Result<()> { - let artifact = EncodedArtifact::new("abc123", br#"{"ok":true}"#.to_vec())?; - let mut headers = HeaderMap::new(); - headers.insert(header::ACCEPT_ENCODING, HeaderValue::from_static("gzip")); - - let resp = artifact.response(&headers, ArtifactCachePolicy::Immutable); - assert_eq!(resp.status(), StatusCode::OK); - assert_eq!( - resp.headers() - .get(header::CONTENT_ENCODING) - .and_then(|v| v.to_str().ok()), - Some("gzip") - ); - assert_eq!( - resp.headers() - .get(header::VARY) - .and_then(|v| v.to_str().ok()), - Some("Accept-Encoding") - ); - assert!( - resp.headers() - .get(header::CACHE_CONTROL) - .and_then(|v| v.to_str().ok()) - .is_some_and(|v| v.contains("immutable")) - ); - Ok(()) - } - - #[test] - fn encoded_artifact_honors_nonzero_q_values() -> Result<()> { - let artifact = EncodedArtifact::new("abc123", br#"{"ok":true}"#.to_vec())?; - let mut headers = HeaderMap::new(); - headers.insert( - header::ACCEPT_ENCODING, - HeaderValue::from_static("br;q=0.8, gzip;q=0.5"), - ); - - let resp = artifact.response(&headers, ArtifactCachePolicy::Immutable); - assert_eq!(resp.status(), StatusCode::OK); - assert_eq!( - resp.headers() - .get(header::CONTENT_ENCODING) - .and_then(|v| v.to_str().ok()), - Some("br") - ); - Ok(()) - } - - #[test] - fn encoded_artifact_rejects_zero_q_values() -> Result<()> { - let artifact = EncodedArtifact::new("abc123", br#"{"ok":true}"#.to_vec())?; - let mut headers = HeaderMap::new(); - headers.insert( - header::ACCEPT_ENCODING, - HeaderValue::from_static("br;q=0, gzip;q=0"), - ); - - let resp = artifact.response(&headers, ArtifactCachePolicy::Immutable); - assert_eq!(resp.status(), StatusCode::OK); - assert!( - resp.headers().get(header::CONTENT_ENCODING).is_none(), - "q=0 encodings should fall back to identity" - ); - Ok(()) - } - - /// 
`accepts_encoding` rejects malformed q-values (parse failure, out - /// of range, non-finite). Previously the parser treated every - /// non-numeric q-value as `true` (accept), which was non-conformant - /// in the opposite direction from the original q=0 bug. - #[test] - fn accepts_encoding_rejects_malformed_q_values() { - // Malformed numeric inputs: not a number, empty, out of range, - // non-finite. Each should reject the encoding. - for raw in [ - "gzip;q=foo", - "gzip;q=", - "gzip;q=2", - "gzip;q=1.5", - "gzip;q=-0.1", - "gzip;q=NaN", - "gzip;q=inf", - ] { - assert!( - !accepts_encoding(raw, "gzip"), - "should reject malformed q-value: {raw:?}" - ); - } - // Valid edge q-values still accepted. - for raw in ["gzip", "gzip;q=1", "gzip;q=1.0", "gzip;q=0.001"] { - assert!( - accepts_encoding(raw, "gzip"), - "should accept valid q-value: {raw:?}" - ); - } - } - - #[test] - fn encoded_artifact_returns_304_for_matching_etag() -> Result<()> { - let artifact = EncodedArtifact::new("abc123", br#"{"ok":true}"#.to_vec())?; - let mut headers = HeaderMap::new(); - headers.insert( - header::IF_NONE_MATCH, - HeaderValue::from_static("\"abc123\""), - ); - - let resp = artifact.response(&headers, ArtifactCachePolicy::Revalidate); - assert_eq!(resp.status(), StatusCode::NOT_MODIFIED); - assert_eq!( - resp.headers() - .get(header::ETAG) - .and_then(|v| v.to_str().ok()), - Some("\"abc123\"") - ); - Ok(()) - } - - #[tokio::test] - async fn failed_rebuild_keeps_old_generation_active() -> Result<()> { - let store = test_store("old")?; - let build: RebuildTask = Arc::new(|| Box::pin(async { anyhow::bail!("boom") })); - - store.schedule_rebuild_with(build).await; - wait_for_rebuild_idle(&store).await; - - assert_eq!(store.active().id(), "old"); - assert!(store.generation("old").is_some()); - Ok(()) - } - - #[tokio::test] - async fn install_evicts_only_beyond_retained_generation_limit() -> Result<()> { - let store = test_store("gen0")?; - for idx in 1..=(RETAINED_PREVIOUS_GENERATIONS + 
2) { - store.install(empty_generation(&format!("gen{idx}"))?); - } - - assert_eq!( - store.active().id(), - &format!("gen{}", RETAINED_PREVIOUS_GENERATIONS + 2) - ); - assert!( - store.generation("gen2").is_some(), - "oldest retained generation should stay addressable" - ); - assert!( - store.generation("gen1").is_none(), - "generations beyond the retention window should be evicted" - ); - Ok(()) - } - - #[tokio::test] - async fn concurrent_rebuild_requests_coalesce() -> Result<()> { - let store = test_store("old")?; - let calls = Arc::new(AtomicUsize::new(0)); - let release_first = Arc::new(Notify::new()); - let build: RebuildTask = Arc::new({ - let calls = Arc::clone(&calls); - let release_first = Arc::clone(&release_first); - move || { - let calls = Arc::clone(&calls); - let release_first = Arc::clone(&release_first); - Box::pin(async move { - let call = calls.fetch_add(1, Ordering::SeqCst) + 1; - if call == 1 { - release_first.notified().await; - } - empty_generation(&format!("gen{call}")) - }) - } - }); - - store.schedule_rebuild_with(Arc::clone(&build)).await; - wait_for_calls(&calls, 1).await; - store.schedule_rebuild_with(Arc::clone(&build)).await; - store.schedule_rebuild_with(build).await; - release_first.notify_one(); - wait_for_rebuild_idle(&store).await; - - assert_eq!(calls.load(Ordering::SeqCst), 2); - assert_eq!(store.active().id(), "gen2"); - assert!(store.generation("gen1").is_some()); - assert!(store.generation("old").is_some()); - Ok(()) - } -} diff --git a/benchmarks-website/server/src/records.rs b/benchmarks-website/server/src/records.rs deleted file mode 100644 index 446b701e296..00000000000 --- a/benchmarks-website/server/src/records.rs +++ /dev/null @@ -1,291 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Wire shapes for `POST /api/ingest`. -//! -//! Each [`Record`] variant deserializes one row destined for one of the five -//! fact tables in [`crate::schema`]. 
The producer side of the contract lives -//! in `vortex-bench/src/v3.rs` (the `--gh-json-v3` emitter); when changing a -//! shape here, change both sides in the same commit. -//! -//! ## Records are discriminated by `kind` -//! -//! Every record carries a `kind` field that picks one of the five fact -//! tables; serde drives this with `#[serde(tag = "kind", rename_all = -//! "snake_case")]`. -//! -//! | `kind` | Destination table | -//! |----------------------|-------------------------| -//! | `query_measurement` | `query_measurements` | -//! | `compression_time` | `compression_times` | -//! | `compression_size` | `compression_sizes` | -//! | `random_access_time` | `random_access_times` | -//! | `vector_search_run` | `vector_search_runs` | -//! -//! Every record struct carries `#[serde(deny_unknown_fields)]`, so unknown -//! fields surface as a `400` with the offending record's index — version -//! skew is supposed to fail loudly. Unknown `kind` values produce the same -//! `400` from the outer enum's tag check. -//! -//! ## Ingest envelope -//! -//! `POST /api/ingest` accepts one [`Envelope`] per request. The envelope -//! wraps a heterogeneous batch of records (any mix of `kind`s): -//! -//! - `run_meta` — [`RunMeta`] with `benchmark_id`, `schema_version` -//! (must equal [`crate::schema::SCHEMA_VERSION`]), and `started_at`. -//! - `commit` — [`CommitInfo`] with the columns of the `commits` dim table, -//! keyed by their column names with `commit_sha` renamed to `sha`. The -//! server upserts this row before applying any record. -//! - `records` — array of per-`kind` records. -//! -//! `vortex-bench --gh-json-v3 ` writes JSONL of bare records only — -//! the envelope (`run_meta` + `commit`) is added by the post-ingest script -//! before POSTing, which keeps the Rust emitter dependency-light and lets -//! CI fill the commit fields from `${{ github.sha }}` plus `git show`. - -use serde::Deserialize; - -/// One ingest payload. 
-/// -/// `run_meta` and `commit` are added by the post-ingest script around the -/// JSONL of bare records the Rust emitter writes. -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct Envelope { - /// Per-run metadata, including the wire schema version. - pub run_meta: RunMeta, - /// Commit context — upserted into `commits` before any record is applied. - pub commit: CommitInfo, - /// Heterogeneous batch of fact-table records. - pub records: Vec, -} - -/// Run-level metadata. `schema_version` is checked against -/// [`crate::schema::SCHEMA_VERSION`] before any record is processed. -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct RunMeta { - /// Free-form ID of the producing run (e.g. `bench.yml@`). - pub benchmark_id: String, - /// Wire schema version. Must equal [`crate::schema::SCHEMA_VERSION`]. - pub schema_version: i32, - /// RFC 3339 timestamp at which the run started. - pub started_at: String, -} - -/// Columns for the `commits` dim table. The wire field for `commit_sha` is -/// renamed to `sha` per the contract; every other field name matches the -/// column name in [`crate::schema`]. -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct CommitInfo { - /// 40-hex lowercase commit SHA. - pub sha: String, - /// RFC 3339 / ISO 8601 timestamp of the commit. - pub timestamp: String, - /// Full commit message (the server renders only the first line). - pub message: String, - /// Author's display name. - pub author_name: String, - /// Author's email. - pub author_email: String, - /// Committer's display name. - pub committer_name: String, - /// Committer's email. - pub committer_email: String, - /// Git tree SHA the commit points at. - pub tree_sha: String, - /// GitHub URL for the commit (used as the click-through fallback when - /// no `(#NNNN)` tag is present in the message). - pub url: String, -} - -/// A single ingest record, discriminated by `kind`. 
-#[derive(Debug, Deserialize)] -#[serde(tag = "kind", rename_all = "snake_case")] -pub enum Record { - /// `query_measurement` → `query_measurements` table. - QueryMeasurement(QueryMeasurement), - /// `compression_time` → `compression_times` table. - CompressionTime(CompressionTime), - /// `compression_size` → `compression_sizes` table. - CompressionSize(CompressionSize), - /// `random_access_time` → `random_access_times` table. - RandomAccessTime(RandomAccessTime), - /// `vector_search_run` → `vector_search_runs` table. - VectorSearchRun(VectorSearchRun), -} - -/// SQL query suite measurement (TPC-H, ClickBench, ...). Lands in -/// `query_measurements`. Field names match the schema columns; per-suite dim -/// values are documented on -/// [`vortex_bench::v3::benchmark_dataset_dims`](../../../vortex-bench/src/v3.rs). -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct QueryMeasurement { - /// 40-hex lowercase SHA of the producing commit. - pub commit_sha: String, - /// Top-level suite (e.g. `tpch`, `clickbench`, `public-bi`). - pub dataset: String, - /// Categorical sub-name (Public-BI dataset; ClickBench flavor). - #[serde(default)] - pub dataset_variant: Option, - /// TPC SF as a string. Populated for TPC-H/TPC-DS, NULL elsewhere. - #[serde(default)] - pub scale_factor: Option, - /// Query index within the suite. The convention (0-based or 1-based) is - /// fixed per suite by the producing bench loop; the migrate classifier - /// matches it by parsing literal digits out of `q07`-style v2 chart - /// names. - pub query_idx: i32, - /// Storage backend the run targeted: `nvme` or `s3`. Validated on insert. - pub storage: String, - /// Engine (`datafusion`, `duckdb`, `vortex`, `arrow`). - pub engine: String, - /// On-disk format (`parquet`, `vortex-file-compressed`, `lance`, ...). - pub format: String, - /// Median per-iteration wall time in nanoseconds. 
- pub value_ns: i64, - /// Per-iteration wall times in nanoseconds (median of these is `value_ns`). - pub all_runtimes_ns: Vec, - /// Peak resident-set bytes during the query, when memory tracking was on. - #[serde(default)] - pub peak_physical: Option, - /// Peak virtual-memory bytes during the query, when memory tracking was on. - #[serde(default)] - pub peak_virtual: Option, - /// Resident-set delta across the query, when memory tracking was on. - #[serde(default)] - pub physical_delta: Option, - /// Virtual-memory delta across the query, when memory tracking was on. - #[serde(default)] - pub virtual_delta: Option, - /// Host environment triple (e.g. `x86_64-linux-gnu`). - #[serde(default)] - pub env_triple: Option, -} - -/// Encode-or-decode timing from `compress-bench`. Lands in -/// `compression_times`. -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct CompressionTime { - /// 40-hex lowercase SHA of the producing commit. - pub commit_sha: String, - /// Compression dataset name. - pub dataset: String, - /// Optional dataset variant (reserved; unused at alpha). - #[serde(default)] - pub dataset_variant: Option, - /// On-disk format the timing applies to. - pub format: String, - /// `encode` or `decode`. The server treats it as opaque on the wire. - pub op: String, - /// Median per-iteration wall time in nanoseconds. - pub value_ns: i64, - /// Per-iteration wall times in nanoseconds. - pub all_runtimes_ns: Vec, - /// Host environment triple. - #[serde(default)] - pub env_triple: Option, -} - -/// On-disk size from `compress-bench`. One-shot, no per-iteration data. -/// Lands in `compression_sizes`. Compression ratios (e.g. `vortex/parquet`) -/// are NOT a separate record kind — they are computed at read time from -/// pairs of these rows. -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct CompressionSize { - /// 40-hex lowercase SHA of the producing commit. - pub commit_sha: String, - /// Compression dataset name. 
- pub dataset: String, - /// Optional dataset variant (reserved; unused at alpha). - #[serde(default)] - pub dataset_variant: Option, - /// On-disk format the size applies to. - pub format: String, - /// Compressed-file size in bytes. - pub value_bytes: i64, -} - -/// Take-time timing from `random-access-bench`. Lands in -/// `random_access_times`. Datasets here (chimp, taxi, ...) are a different -/// namespace from the SQL query suites' dataset names. -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct RandomAccessTime { - /// 40-hex lowercase SHA of the producing commit. - pub commit_sha: String, - /// Random-access dataset name. - pub dataset: String, - /// On-disk format the timing applies to. - pub format: String, - /// Median per-iteration wall time in nanoseconds. - pub value_ns: i64, - /// Per-iteration wall times in nanoseconds. - pub all_runtimes_ns: Vec, - /// Host environment triple. - #[serde(default)] - pub env_triple: Option, -} - -/// Cosine-similarity scan from `vector-search-bench`. Lands in -/// `vector_search_runs`. The only family that emits timing **plus** side -/// counters in the same row. -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct VectorSearchRun { - /// 40-hex lowercase SHA of the producing commit. - pub commit_sha: String, - /// Vector dataset name (e.g. `cohere-large-10m`). - pub dataset: String, - /// Train-split layout label. - pub layout: String, - /// Compression flavor label. - pub flavor: String, - /// Cosine threshold passed to the scan filter. - pub threshold: f64, - /// Median per-scan wall time in nanoseconds. - pub value_ns: i64, - /// Per-iteration wall times in nanoseconds. - pub all_runtimes_ns: Vec, - /// Number of rows that survived the cosine filter. - pub matches: i64, - /// Total rows scanned. - pub rows_scanned: i64, - /// Total on-disk bytes scanned. - pub bytes_scanned: i64, - /// Number of timed iterations. Not part of the dim hash. 
- pub iterations: i32, - /// Host environment triple. - #[serde(default)] - pub env_triple: Option, -} - -impl Record { - /// The `commit_sha` referenced by this record. Every record carries one; - /// the server checks the envelope's `commit.sha` matches. - pub fn commit_sha(&self) -> &str { - match self { - Self::QueryMeasurement(r) => &r.commit_sha, - Self::CompressionTime(r) => &r.commit_sha, - Self::CompressionSize(r) => &r.commit_sha, - Self::RandomAccessTime(r) => &r.commit_sha, - Self::VectorSearchRun(r) => &r.commit_sha, - } - } - - /// The wire `kind` string. Useful for logging and error messages. - pub fn kind(&self) -> &'static str { - match self { - Self::QueryMeasurement(_) => "query_measurement", - Self::CompressionTime(_) => "compression_time", - Self::CompressionSize(_) => "compression_size", - Self::RandomAccessTime(_) => "random_access_time", - Self::VectorSearchRun(_) => "vector_search_run", - } - } -} diff --git a/benchmarks-website/server/src/schema.rs b/benchmarks-website/server/src/schema.rs deleted file mode 100644 index c181021142d..00000000000 --- a/benchmarks-website/server/src/schema.rs +++ /dev/null @@ -1,237 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! DuckDB schema applied on server boot - one `commits` dim plus five fact -//! tables, one per measurement family. -//! -//! ## Design principles -//! -//! 1. **One fact table per (dim shape, value shape).** A row in any fact -//! table has every value column populated; NULLs only appear in genuinely -//! optional dimensions. The five families have different dim shapes, so -//! forcing them into one wide table either bloats every row with NULL -//! columns or splits a single scan's results across multiple rows that -//! have to be re-joined to render one chart. -//! 2. **No discriminator columns spanning families.** No `metric_kind` enum -//! forcing the five shapes into one row. -//! 3. 
**No JSON escape hatch.** New benchmark parameters become real columns. -//! Adding a nullable column is cheap; the readability win is worth it. -//! 4. **Hashed primary key per fact table.** Every fact table's -//! `measurement_id` is a deterministic 64-bit hash of `commit_sha` plus -//! that table's dimensional tuple, computed in -//! [`crate::db::measurement_id_query`] et al. Including `commit_sha` -//! makes every (commit, dim) pair a distinct row - that is exactly what -//! the chart pages render as a time series. Re-emission of the same -//! (commit, dim) pair is the upsert case. The hash is **server-internal** -//! and never crosses a process boundary; the wire never carries it. -//! 5. **`commits` is the only dim table.** Engine, format, dataset, etc. -//! stay as inline strings; DuckDB's dictionary encoding makes a lookup -//! table pointless. -//! 6. **Ratios are not stored.** Computed at query time from -//! `compression_sizes`. -//! -//! ## Tables -//! -//! - **`commits`** - dim table. `commit_sha` is the PK. `timestamp`, -//! `tree_sha`, and `url` are required (the server cannot render a chart -//! without them); `message` and the author/committer name + email pair are -//! nullable so v2-imported rows that lacked them survive. Populated on -//! every `/api/ingest` from the envelope's `commit` block, and on every -//! migrator run from `commits.json`. -//! - **`query_measurements`** - SQL query suite measurements (TPC-H, TPC-DS, -//! ClickBench, StatPopGen, PolarSignals, Fineweb, GhArchive, Public-BI). -//! Natural key: `(commit_sha, dataset, dataset_variant, scale_factor, -//! query_idx, storage, engine, format)`. Memory columns -//! (`peak_physical`, `peak_virtual`, `physical_delta`, `virtual_delta`) -//! are populated together when the run was instrumented for memory and -//! are NULL otherwise; the ingest path enforces "all four or none". -//! `dataset_variant` carries a categorical sub-name (Public-BI dataset, -//! 
ClickBench flavor); `scale_factor` is the TPC SF as a string. -//! - **`compression_times`** - encode/decode timings from `compress-bench`. -//! Natural key: `(commit_sha, dataset, dataset_variant, format, op)`, -//! where `op ∈ {encode, decode}`. Encode and decode share a table because -//! they share dim and value shape; keeping them together makes the -//! per-format chart a single SQL query. -//! - **`compression_sizes`** - on-disk sizes from `compress-bench`. One-shot -//! (no per-iteration data, no `all_runtimes_ns`). Natural key: -//! `(commit_sha, dataset, dataset_variant, format)`. Compression ratios -//! (e.g. `vortex:parquet-zstd`) are NOT stored - they are a SELECT over -//! this table joined to itself, computed in `api/summary.rs`. -//! - **`random_access_times`** - take-time timings from -//! `random-access-bench`. Different dataset namespace from -//! `compression_times` (chimp, taxi, etc.) - kept in its own table so -//! dataset filters never have to disambiguate which suite a row belongs -//! to. Natural key: `(commit_sha, dataset, format)`. -//! - **`vector_search_runs`** - cosine-similarity scans from -//! `vector-search-bench`. The only family that emits a timing **plus** -//! side counters (`matches`, `rows_scanned`, `bytes_scanned`) for the -//! same scan; keeping them in one row avoids a 1:N split that has to be -//! re-joined on read. Natural key: `(commit_sha, dataset, layout, -//! flavor, threshold)`. `iterations` is not part of the dim hash - it is -//! a side count, like `matches`. -//! -//! ## Column conventions -//! -//! - `commit_sha` is `TEXT NOT NULL` on every fact table and references the -//! `commits.commit_sha` PK. There is no FK constraint declared at alpha; -//! the ingest path upserts the commit before the records. -//! - `value_ns` is the median per-iteration nanosecond timing for timing -//! tables. `value_bytes` is the on-disk byte count for `compression_sizes`. -//! 
- `all_runtimes_ns BIGINT[]` carries the per-iteration timings inline. -//! DuckDB's list type avoids a child table; chart code only ever reads -//! `value_ns`, so the list is effectively cold storage today, kept for -//! future variance or distribution charts. -//! - `storage` (only on `query_measurements`) is `nvme` or `s3`. Legacy `gcs` -//! was dropped during the v3 design pass. -//! - `env_triple` is the `arch-os-env` host triple captured at run time -//! (e.g. `x86_64-linux-gnu`). Optional everywhere; useful for slicing -//! results by host class once the data set has more than one host class. -//! -//! ## Schema changes -//! -//! There is no migration framework. If you change the schema: -//! -//! 1. Update the per-family DDL constant ([`COMMITS_DDL`] for the dim, -//! [`QUERY_MEASUREMENTS_DDL`] / [`COMPRESSION_TIMES_DDL`] / -//! [`COMPRESSION_SIZES_DDL`] / [`RANDOM_ACCESS_TIMES_DDL`] / -//! [`VECTOR_SEARCH_RUNS_DDL`] for the facts), the matching -//! [`crate::records`] struct, and the [`crate::family::Family`] entry -//! that ties them together. -//! 2. Update or delete any local `bench.duckdb` (the migrator's -//! `open_target_db` already deletes-and-recreates). -//! 3. Bump [`SCHEMA_VERSION`] if the wire envelope's -//! `run_meta.schema_version` semantics change. -//! -//! A real forward-only migration framework is post-cutover work. - -/// DDL for the `commits` dim table. The five fact-table DDLs live with -/// their respective [`crate::family::Family`] declarations; [`crate::db::open`] -/// applies this constant first, then iterates [`crate::family::FAMILIES`]. -pub const COMMITS_DDL: &str = r#" -CREATE TABLE IF NOT EXISTS commits ( - commit_sha TEXT PRIMARY KEY NOT NULL, - timestamp TIMESTAMPTZ NOT NULL, - message TEXT, - author_name TEXT, - author_email TEXT, - committer_name TEXT, - committer_email TEXT, - tree_sha TEXT NOT NULL, - url TEXT NOT NULL -); -"#; - -/// DDL for the `query_measurements` fact table. 
Wired into the -/// `schema_ddl` field of [`crate::family::QUERY_MEASUREMENTS`]. -pub const QUERY_MEASUREMENTS_DDL: &str = r#" -CREATE TABLE IF NOT EXISTS query_measurements ( - measurement_id BIGINT PRIMARY KEY NOT NULL, - commit_sha TEXT NOT NULL, - dataset TEXT NOT NULL, - dataset_variant TEXT, - scale_factor TEXT, - query_idx INTEGER NOT NULL, - storage TEXT NOT NULL, - engine TEXT NOT NULL, - format TEXT NOT NULL, - value_ns BIGINT NOT NULL, - all_runtimes_ns BIGINT[] NOT NULL, - peak_physical BIGINT, - peak_virtual BIGINT, - physical_delta BIGINT, - virtual_delta BIGINT, - env_triple TEXT -); -"#; - -/// DDL for the `compression_times` fact table. Wired into the -/// `schema_ddl` field of [`crate::family::COMPRESSION_TIMES`]. -pub const COMPRESSION_TIMES_DDL: &str = r#" -CREATE TABLE IF NOT EXISTS compression_times ( - measurement_id BIGINT PRIMARY KEY NOT NULL, - commit_sha TEXT NOT NULL, - dataset TEXT NOT NULL, - dataset_variant TEXT, - format TEXT NOT NULL, - op TEXT NOT NULL, - value_ns BIGINT NOT NULL, - all_runtimes_ns BIGINT[] NOT NULL, - env_triple TEXT -); -"#; - -/// DDL for the `compression_sizes` fact table. Wired into the -/// `schema_ddl` field of [`crate::family::COMPRESSION_SIZES`]. -pub const COMPRESSION_SIZES_DDL: &str = r#" -CREATE TABLE IF NOT EXISTS compression_sizes ( - measurement_id BIGINT PRIMARY KEY NOT NULL, - commit_sha TEXT NOT NULL, - dataset TEXT NOT NULL, - dataset_variant TEXT, - format TEXT NOT NULL, - value_bytes BIGINT NOT NULL -); -"#; - -/// DDL for the `random_access_times` fact table. Wired into the -/// `schema_ddl` field of [`crate::family::RANDOM_ACCESS_TIMES`]. 
-pub const RANDOM_ACCESS_TIMES_DDL: &str = r#" -CREATE TABLE IF NOT EXISTS random_access_times ( - measurement_id BIGINT PRIMARY KEY NOT NULL, - commit_sha TEXT NOT NULL, - dataset TEXT NOT NULL, - format TEXT NOT NULL, - value_ns BIGINT NOT NULL, - all_runtimes_ns BIGINT[] NOT NULL, - env_triple TEXT -); -"#; - -/// DDL for the `vector_search_runs` fact table. Wired into the -/// `schema_ddl` field of [`crate::family::VECTOR_SEARCH_RUNS`]. -pub const VECTOR_SEARCH_RUNS_DDL: &str = r#" -CREATE TABLE IF NOT EXISTS vector_search_runs ( - measurement_id BIGINT PRIMARY KEY NOT NULL, - commit_sha TEXT NOT NULL, - dataset TEXT NOT NULL, - layout TEXT NOT NULL, - flavor TEXT NOT NULL, - threshold DOUBLE NOT NULL, - value_ns BIGINT NOT NULL, - all_runtimes_ns BIGINT[] NOT NULL, - matches BIGINT NOT NULL, - rows_scanned BIGINT NOT NULL, - bytes_scanned BIGINT NOT NULL, - iterations INTEGER NOT NULL, - env_triple TEXT -); -"#; - -/// Schema version expected by the server. The ingest envelope's -/// `run_meta.schema_version` must match this exactly at alpha. -/// -/// Coupled sites that MUST agree on this value (see -/// `benchmarks-website/AGENTS.md` → "Wire shapes are a coordinated change"): -/// -/// - This constant. -/// - The producer-side wire-shape source of truth in -/// [`vortex_bench::v3`](../../../vortex-bench/src/v3.rs). -/// - The CI ingest wrapper at `scripts/post-ingest.py`, which fills the -/// envelope's `run_meta.schema_version` from a hardcoded Python constant. -/// Bumping `SCHEMA_VERSION` without bumping `post-ingest.py` makes every -/// CI run 400 at ingest until the script is updated. -pub const SCHEMA_VERSION: i32 = 1; - -/// Every table in the schema, in the order a fresh boot creates them. -/// Used by the snapshot endpoint to drive a per-table `COPY ... TO` -/// across the whole DB and by the restore docs to document the same -/// list. 
`commits` is the dim table; the rest are facts, derived from -/// the registry so adding a new fact table is one entry in -/// [`crate::family::FAMILIES`] and the snapshot endpoint + restore docs -/// pick it up automatically. -pub static TABLES: std::sync::LazyLock> = std::sync::LazyLock::new(|| { - let mut v: Vec<&'static str> = Vec::with_capacity(1 + crate::family::FAMILIES.len()); - v.push("commits"); - v.extend(crate::family::FAMILIES.iter().map(|f| f.table_name)); - v -}); diff --git a/benchmarks-website/server/src/slug.rs b/benchmarks-website/server/src/slug.rs deleted file mode 100644 index 38bd401752f..00000000000 --- a/benchmarks-website/server/src/slug.rs +++ /dev/null @@ -1,222 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Opaque slugs for `/api/chart/:slug` and `/api/group/:slug`. -//! -//! The web-ui treats slugs as opaque strings: it receives them from -//! `/api/groups` and feeds them back unchanged. The server is free to -//! choose any format — slugs here are -//! `.`, where `` names the source -//! fact table and the JSON encodes the chart or group key. Round-tripping -//! the slug back gives a strongly-typed [`ChartKey`] or [`GroupKey`]. - -use anyhow::Context as _; -use anyhow::Result; -use anyhow::anyhow; -use base64::Engine as _; -use base64::engine::general_purpose::URL_SAFE_NO_PAD; -use serde::Deserialize; -use serde::Serialize; - -use crate::family; - -// Slug prefixes live on each [`family::Family`] declaration (the -// `chart_slug_prefix` / `group_slug_prefix` fields). `ChartKey::prefix` -// and `GroupKey::prefix` consult the registry rather than maintaining a -// parallel const table here; the prior `PREFIX_QUERY = "qm"` etc. -// constants were a second source of truth that a test had to assert -// equal-to-the-Family-entries. - -/// The strongly-typed chart key parsed from a slug. 
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -#[serde(tag = "k")] -pub enum ChartKey { - /// `query_measurements` chart: `(dataset, query_idx)` is the chart key - /// per [`crate::schema`]. Group context (`dataset_variant`, - /// `scale_factor`, `storage`) is carried alongside so the slug fully - /// specifies the chart. - QueryMeasurement { - dataset: String, - dataset_variant: Option, - scale_factor: Option, - storage: String, - query_idx: i32, - }, - /// `compression_times` chart: `(dataset, dataset_variant)`. - CompressionTime { - dataset: String, - dataset_variant: Option, - }, - /// `compression_sizes` chart: `(dataset, dataset_variant)`. - CompressionSize { - dataset: String, - dataset_variant: Option, - }, - /// `random_access_times` chart: `dataset`. - RandomAccess { dataset: String }, - /// `vector_search_runs` chart: `(dataset, layout, threshold)`. - VectorSearch { - dataset: String, - layout: String, - threshold: f64, - }, -} - -impl ChartKey { - fn prefix(&self) -> &'static str { - family::family_for_chart_key(self).chart_slug_prefix - } - - /// Render the slug for this chart key. - pub fn to_slug(&self) -> String { - let json = serde_json::to_vec(self).expect("ChartKey is always JSON-serializable"); - format!("{}.{}", self.prefix(), URL_SAFE_NO_PAD.encode(json)) - } - - /// Parse a slug previously produced by [`Self::to_slug`]. - pub fn from_slug(slug: &str) -> Result { - let (_, encoded) = slug - .split_once('.') - .ok_or_else(|| anyhow!("slug missing '.' separator"))?; - let json = URL_SAFE_NO_PAD - .decode(encoded.as_bytes()) - .context("slug payload was not valid base64url")?; - let key: Self = serde_json::from_slice(&json).context("slug payload was not valid JSON")?; - Ok(key) - } -} - -/// Slug for a *group* of charts. Mirrors [`ChartKey`] but at the group -/// granularity: a group is a set of charts that share every dimension except -/// one (e.g. all 22 TPC-H queries at sf=1 on nvme). 
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -#[serde(tag = "k")] -pub enum GroupKey { - /// All `query_measurements` charts at one `(dataset, dataset_variant, - /// scale_factor, storage)`. Charts inside vary by `query_idx`. - QueryGroup { - dataset: String, - dataset_variant: Option, - scale_factor: Option, - storage: String, - }, - /// Every compression-time chart (one per `(dataset, dataset_variant)`). - CompressionTimeGroup, - /// Every compression-size chart. - CompressionSizeGroup, - /// Every random-access chart. - RandomAccessGroup, - /// All vector-search charts at one `(dataset, layout)`. Charts inside - /// vary by `threshold`. - VectorSearchGroup { dataset: String, layout: String }, -} - -impl GroupKey { - fn prefix(&self) -> &'static str { - family::family_for_group_key(self).group_slug_prefix - } - - /// Render the slug for this group key. - pub fn to_slug(&self) -> String { - let json = serde_json::to_vec(self).expect("GroupKey is always JSON-serializable"); - format!("{}.{}", self.prefix(), URL_SAFE_NO_PAD.encode(json)) - } - - /// Parse a slug previously produced by [`Self::to_slug`]. - pub fn from_slug(slug: &str) -> Result { - let (_, encoded) = slug - .split_once('.') - .ok_or_else(|| anyhow!("slug missing '.' 
separator"))?; - let json = URL_SAFE_NO_PAD - .decode(encoded.as_bytes()) - .context("slug payload was not valid base64url")?; - let key: Self = serde_json::from_slice(&json).context("slug payload was not valid JSON")?; - Ok(key) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn roundtrip(key: ChartKey) { - let slug = key.to_slug(); - let parsed = ChartKey::from_slug(&slug).expect("parses back"); - assert_eq!(parsed, key); - } - - #[test] - fn query_measurement_roundtrips() { - roundtrip(ChartKey::QueryMeasurement { - dataset: "tpch".into(), - dataset_variant: None, - scale_factor: Some("1".into()), - storage: "nvme".into(), - query_idx: 7, - }); - } - - #[test] - fn vector_search_roundtrips() { - roundtrip(ChartKey::VectorSearch { - dataset: "cohere-large-10m".into(), - layout: "partitioned".into(), - threshold: 0.75, - }); - } - - #[test] - fn random_access_roundtrips() { - roundtrip(ChartKey::RandomAccess { - dataset: "taxi".into(), - }); - } - - #[test] - fn malformed_slug_rejected() { - assert!(ChartKey::from_slug("not-a-slug").is_err()); - assert!(ChartKey::from_slug("qm.****").is_err()); - } - - fn roundtrip_group(key: GroupKey) { - let slug = key.to_slug(); - let parsed = GroupKey::from_slug(&slug).expect("parses back"); - assert_eq!(parsed, key); - } - - #[test] - fn group_keys_roundtrip() { - roundtrip_group(GroupKey::QueryGroup { - dataset: "tpch".into(), - dataset_variant: None, - scale_factor: Some("1".into()), - storage: "nvme".into(), - }); - roundtrip_group(GroupKey::CompressionTimeGroup); - roundtrip_group(GroupKey::CompressionSizeGroup); - roundtrip_group(GroupKey::RandomAccessGroup); - roundtrip_group(GroupKey::VectorSearchGroup { - dataset: "cohere".into(), - layout: "partitioned".into(), - }); - } - - #[test] - fn group_slug_prefix_distinct_from_chart() { - let chart = ChartKey::CompressionTime { - dataset: "tpch".into(), - dataset_variant: None, - } - .to_slug(); - let group = GroupKey::CompressionTimeGroup.to_slug(); - let 
chart_prefix = chart.split_once('.').unwrap().0; - let group_prefix = group.split_once('.').unwrap().0; - assert_ne!(chart_prefix, group_prefix); - } - - #[test] - fn malformed_group_slug_rejected() { - assert!(GroupKey::from_slug("not-a-slug").is_err()); - assert!(GroupKey::from_slug("qmg.****").is_err()); - } -} diff --git a/benchmarks-website/server/static/CHARTJS_PLUGIN_ZOOM_LICENSE.md b/benchmarks-website/server/static/CHARTJS_PLUGIN_ZOOM_LICENSE.md deleted file mode 100644 index 5d0f4288b27..00000000000 --- a/benchmarks-website/server/static/CHARTJS_PLUGIN_ZOOM_LICENSE.md +++ /dev/null @@ -1,9 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2013-2021 chartjs-plugin-zoom contributors - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/benchmarks-website/server/static/CHART_JS_LICENSE.md b/benchmarks-website/server/static/CHART_JS_LICENSE.md deleted file mode 100644 index f216610fd7e..00000000000 --- a/benchmarks-website/server/static/CHART_JS_LICENSE.md +++ /dev/null @@ -1,9 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2014-2024 Chart.js Contributors - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/benchmarks-website/server/static/chart-init.js b/benchmarks-website/server/static/chart-init.js deleted file mode 100644 index eae6cc57662..00000000000 --- a/benchmarks-website/server/static/chart-init.js +++ /dev/null @@ -1,2595 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -// Hydrate Chart.js charts on /, /chart/:slug, and /group/:slug, plus the -// lazy-fetch-on-toggle behaviour for closed `
` groups. -// -// File map (in source order): -// 1. Constants — throttle delays, fetch knobs, caps. -// 2. Canvas state contract — every `canvas.__bench_*` field. -// 3. Per-card DOM contract — every `data-role` selector. -// 4. Global filter state — engines/formats from the navbar. -// 5. Palette + helpers — colours, formatting, throttle. -// 6. Display unit picker — bytes/time/count formatter switch. -// 7. LTTB — pure largest-triangle downsampler. -// 8. Crosshair plugin — inline Chart.js plugin. -// 9. External tooltip handler — factory that returns a Chart.js -// external tooltip handler. -// 10. Payload + datasets — readInlinePayload, buildDatasets, -// rebuildVisibleAndUpdate. -// 11. Full-history warmup — ensureFullHistory, -// replaceChartPayload, plus the -// slider + downsample-badge sync -// helpers. -// 12. Per-card construction — constructChart. -// 13. Range scrollbar strip — bindRangeStrip + pointer math. -// 14. Per-chart toolbar wiring — bindToolbar, attachWheelPan, -// applyScope, applyY. -// 15. Group hydration — bounded shard fetch queue + UI -// helpers. -// 16. Global filter wiring — chip toggle, URL sync, bindings. -// 17. Per-group toolbar wiring — group-level filter + Y override. -// 18. Header controls — theme toggle, expand/collapse all. -// 19. Page wiring — IntersectionObserver, init. -// -// Per-chart UX (for orientation): -// - Each `.chart-card` carries `data-chart-slug`. The card *owns* its own -// toolbar (`.toolbar--card`) — there is no page-level toolbar. -// - Landing groups fetch materialized latest-100 group shards, with bounded -// concurrency. Opening a group then queues `?n=all` for that group's charts -// so the first paint is fast but full history is already warming. -// - `rebuildVisibleAndUpdate` is the single source of truth for the -// rendered point count. The cap is one constant: at most -// `MAX_VISIBLE_POINTS` *unique commit indices* (x-positions) are -// rendered, **shared across every series**. 
Below the cap we render -// every commit that has data; above it we LTTB the per-commit -// "max-y across series" to pick that many representatives, then -// every series renders at those shared indices. This is what the -// cap is *supposed* to mean: visually, the chart never has more -// than that many x-axis columns regardless of how many lines are -// on it. (Earlier per-series LTTB picked different peaks for each -// series and the union of x-positions blew past the cap.) -// - The slider is throttled to ~16ms (one frame at 60fps) per v2's -// `CONFIG.ZOOM_THROTTLE_DELAY` so dragging the slider feels continuous. -// - Mouse wheel pans horizontally (chartjs-plugin-zoom does not expose -// pan-on-wheel, so a manual `wheel` listener calls `chart.pan(...)`). -// - Drag-pan + drag-rectangle-zoom are wired through the plugin and -// trigger the same `rebuildVisibleAndUpdate` via `onPan`/`onZoom`. -// - A custom inline plugin draws a vertical crosshair at the hovered -// commit; the external tooltip is offset and `pointer-events: none` -// to fix the flicker described in the per-chart UX rebuild brief. -// -// Canvas state contract — every per-chart property we plant on the canvas: -// canvas.__bench_chart Chart.js instance, set in constructChart. -// canvas.__bench_payload Last-fetched ChartResponse (raw, -// unmodified by LTTB). Source of truth -// the tooltip + LTTB rebuild read. -// canvas.__bench_state { y: "linear"|"log", scope: number|"all" } -// — the per-chart toolbar state. -// canvas.__bench_overrides Map of series the -// user has manually toggled on this card. -// Once set, the global filter no longer -// drives that label's visibility here. -// canvas.__bench_strip_render Function bound by bindRangeStrip; called -// from any path that mutates scales.x. -// canvas.__bench_rebuild Throttled `rebuildVisibleAndUpdate` -// wrapper; called from pan/zoom/wheel. 
-// canvas.__bench_wheel_attached true once attachWheelPan has wired -// a wheel listener (idempotency). -// canvas.__bench_inline_trimmed true if the current payload is a -// virtual latest-100 view over a longer -// full-history x-axis. -// canvas.__bench_full_loaded true once a `?n=all` refetch has -// replaced the payload. -// canvas.__bench_full_fetch_pending true while a `?n=all` refetch is in -// flight; dedupes queue/pan promotion. -// canvas.__bench_full_fetch_entry Full-history queue entry, if the fetch -// is queued but not yet complete. -// canvas.__bench_payload_window The server-side commit window used for -// the current payload (`"100"` for -// shard hydration, `"all"` for warmed -// full history). -// canvas.__bench_display_unit The picked display unit (`format`, -// `axisLabel`, `multiplier`) used by the -// tooltip and y-axis label. Recomputed -// after every payload swap and after each -// LTTB rebuild changes the visible window. -// canvas.__bench_y_user_set true once the user has explicitly -// clicked the per-chart Y-axis toolbar. -// The per-group Y override skips charts -// where this flag is set so the local -// click stays sticky. -// -// Per-card DOM contract — every selector the chart cards are queried by: -// .chart-card[data-chart-index][data-chart-slug] The card itself. -// canvas[data-chart-index] The chart canvas. -// .chart-tooltip-host External tooltip host. -// .chart-wrap Canvas wrapper. -// [data-role="downsample-badge"] LTTB badge slot. -// [data-role="scope-slider"] Toolbar scope slider. -// .toolbar--card Toolbar root. -// .toolbar-btn[data-y] Y-axis switch buttons. -// [data-role="range-strip"] Range scrollbar root. -// [data-role="range-window"] Range strip's window. -// [data-role="range-handle-left"] Left resize handle. -// [data-role="range-handle-right"] Right resize handle. -// .group-disclosure The
wrapper. -// .group-details The wrapping
. -// [data-role="global-filter-bar"] Filter dropdown root. -// [data-role="filter-trigger"] Filter dropdown button. -// [data-role="filter-panel"] Filter dropdown body. -// .filter-chip[data-filter][data-value] A single filter chip. -// [data-role="filter-badge"] Badge on the trigger. -// [data-action="expand-all"] Header button. -// [data-action="collapse-all"] Header button. -// [data-role="theme-toggle"] Header button. -// #bench-filter-state Server-emitted filter -// state JSON (script id). -(function () { - "use strict"; - - // ----------------------------------------------------------------------- - // Constants - // ----------------------------------------------------------------------- - var ZOOM_THROTTLE_MS = 16; // one frame at ~60fps for slider drag - var PAN_THROTTLE_MS = 50; // pan/zoom throttle — looser than slider - var FETCH_N = "all"; // explicit full-history upgrade - var DEFAULT_VISIBLE = 100; // initial visible window (last 100 of fetched) - var CHART_FETCH_N = String(DEFAULT_VISIBLE); // materialized shard window - var HYDRATION_CONCURRENCY = 4; // per-tab cap for latest-100 shard requests - var FULL_HISTORY_CONCURRENCY = 2; // per-tab cap for background `?n=all` - var GROUP_OPEN_PRIORITY_STEP = 100; - var INTERACTION_FULL_PRIORITY = 1000000; - - // Resolve the default scope for a chart card. Group-open hydration always - // starts with the latest-100 visible window, even when the x-axis is virtual - // and spans the full chart history. - function defaultScopeForCard(_card) { - return DEFAULT_VISIBLE; - } - // Hard cap on how many points a single series can render at once. When - // the visible commit range has more raw non-null points than this, we - // LTTB-downsample to exactly this number; below it we render raw. 
// So the user always sees at most this many points per series, regardless
// of how far they zoom out, and the rule is one sentence:
//
//   visible <= MAX_VISIBLE_POINTS → raw
//   visible >  MAX_VISIBLE_POINTS → LTTB to MAX_VISIBLE_POINTS
//
// Chart cards are ~600–900px on desktop and Chart.js draws ~2px point
// markers, so 500 points gives roughly 1.5px of horizontal space per
// point — about as dense as the eye can resolve. Bumping higher costs
// render time without visible improvement; lowering loses detail on
// wide cards.
var MAX_VISIBLE_POINTS = 500;

// -----------------------------------------------------------------------
// Global filter state (engine/format chips inside the navbar dropdown).
//
// Model:
//   `globalFilter.engines` / `.formats` track the *active* (visible) set
//   for that dimension. The chip's displayed active state mirrors
//   visibility — every chip active means no filter is applied, exactly
//   one chip inactive hides only that engine/format, and so on. The
//   URL `?engine=`/`?format=` stay as allowlists for stability across
//   refreshes; we omit the param when every chip is active (i.e. the
//   active set equals the universe), so the no-filter URL is clean.
//
// Per-card overrides:
//   Clicking a chart's legend toggles `dataset.hidden` and adds the label
//   to that card's `canvas.__bench_overrides` set. The global apply pass
//   skips overridden labels, so the user's manual call sticks even after
//   subsequent global filter changes.
// -----------------------------------------------------------------------
var globalFilter = readFilterState();
var filterUniverse = readFilterUniverseFromDom();
// `seedActiveFromUrlState` translates the URL state (allowlist) into the
// active set. Empty allowlist in the URL is treated as "no filter" → every
// chip active. Non-empty is taken verbatim, even if a chip has since been
// added or removed from the universe — keeps stale URLs deterministic.
seedActiveFromUrlState();

// Read the server-emitted filter state from the `#bench-filter-state`
// node. Returns `{ engines: [], formats: [] }` when the node is missing or
// its text is not parseable JSON; non-array fields degrade to `[]`.
function readFilterState() {
  var fallback = { engines: [], formats: [] };
  var node = document.getElementById("bench-filter-state");
  if (!node) return fallback;
  try {
    var parsed = JSON.parse(node.textContent);
    return {
      engines: Array.isArray(parsed.engines) ? parsed.engines.slice() : [],
      formats: Array.isArray(parsed.formats) ? parsed.formats.slice() : [],
    };
  } catch (e) {
    return fallback;
  }
}

// Pull the chip universe straight from the rendered panel, so the JS
// doesn't have to mirror the server's enum. If the dropdown isn't on the
// page (shouldn't happen — the header always renders it when there's
// data) we fall back to whatever is in the URL state.
function readFilterUniverseFromDom() {
  var u = { engines: [], formats: [] };
  document.querySelectorAll(
    '[data-role="filter-panel"] .filter-chip[data-value]:not([data-value="*"])',
  ).forEach(function (chip) {
    var dim = chip.getAttribute("data-filter");
    var value = chip.getAttribute("data-value");
    if (!dim || !value) return;
    // Anything that is not the "engine" dimension lands in `formats`.
    var bucket = dim === "engine" ? u.engines : u.formats;
    if (bucket.indexOf(value) === -1) bucket.push(value);
  });
  return u;
}

// Replace an empty active set with a copy of the full universe, per
// dimension. Non-empty sets are left untouched.
function seedActiveFromUrlState() {
  if (!globalFilter.engines.length) {
    globalFilter.engines = filterUniverse.engines.slice();
  }
  if (!globalFilter.formats.length) {
    globalFilter.formats = filterUniverse.formats.slice();
  }
}

// Any-of-universe-missing-from-active means the dimension is filtered.
// `key` is "engines" or "formats"; the check compares set sizes only.
function dimensionIsFiltered(key) {
  return globalFilter[key].length < filterUniverse[key].length;
}

// A series is hidden when its engine/format dimension is filtered AND its
// tag isn't in the active set. Series without an engine tag (e.g.
// compression-time `format:op` series) are unaffected by the engine
// filter — symmetric for format. This keeps the chip semantics intuitive:
// hiding an engine doesn't nuke charts that have no engine dimension.
function seriesPassesFilter(meta) {
  if (!meta) meta = {};
  if (meta.engine && dimensionIsFiltered("engines")
      && globalFilter.engines.indexOf(meta.engine) === -1) {
    return false;
  }
  if (meta.format && dimensionIsFiltered("formats")
      && globalFilter.formats.indexOf(meta.format) === -1) {
    return false;
  }
  return true;
}

// Per-group filter layer. State is a single `hiddenSeries` array of dataset
// labels the user has toggled off via the group's filter dropdown. Engine
// and format chips in the dropdown are macros: clicking them bulk-toggles
// every known series whose `engine`/`format` matches (see
// `applyMacroToHiddenSeries`). The series list itself populates as charts
// in the group hydrate and surface their `payload.series_meta`.
function seriesPassesGroupFilter(filter, label) {
  if (!filter || !filter.hiddenSeries) return true;
  return filter.hiddenSeries.indexOf(label) === -1;
}

// -----------------------------------------------------------------------
// Palette + helpers
// -----------------------------------------------------------------------
var palette = [
  "#2563eb", "#dc2626", "#16a34a", "#ea580c", "#7c3aed",
  "#0891b2", "#ca8a04", "#db2777", "#65a30d", "#475569",
];

// Colour for the i-th series; wraps around once the palette is exhausted.
function colorFor(i) { return palette[i % palette.length]; }

// First 7 characters of a SHA string; non-strings are stringified as-is.
function shortSha(sha) {
  return typeof sha === "string" ? sha.slice(0, 7) : String(sha);
}

// First 10 characters of a timestamp string; "" for non-strings.
function shortDate(ts) {
  if (typeof ts !== "string") return "";
  return ts.slice(0, 10);
}

// Clip `s` to at most `max` characters, ending in "…" when clipped.
function truncate(s, max) {
  if (typeof s !== "string") return "";
  return s.length > max ? s.slice(0, max - 1) + "…" : s;
}

// Text up to (not including) the first newline; "" for non-strings.
function firstLine(s) {
  if (typeof s !== "string") return "";
  var nl = s.indexOf("\n");
  return nl >= 0 ? s.slice(0, nl) : s;
}

// Vortex commits to `develop` are squash-merged from PRs; the squash subject
// ends with `(#NNNN)`. Returning just the number lets callers build either a
// PR or commit URL. Returns the digits of the first `(#NNNN)` found as a
// string, or `null` when there is none.
function parsePrNumber(message) {
  if (typeof message !== "string") return null;
  var m = message.match(/\(#(\d+)\)/);
  return m ? m[1] : null;
}

// Escape the five HTML-significant characters so `s` can be interpolated
// into markup. `&` must be replaced first so the entities produced by the
// later replacements are not themselves re-escaped.
function escapeHtml(s) {
  return String(s)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

// -----------------------------------------------------------------------
// Display unit picker. The wire payload's `unit_kind` says *what* the
// values are (`time_ns`, `bytes`, …); this helper turns that plus the
// magnitude of the loaded values into a `(multiplier, suffix, axisLabel,
// decimals)` tuple. The chart locks that tuple on construction (and again
// after the lazy `?n=all` refetch swaps the payload) so the y-axis stays
// stable while the user pans/zooms — recomputing per-frame would shift
// the unit out from under them.
//
// Worked example: a `time_ns` series whose median is 12,000,000,000 ns
// picks `{ multiplier: 1e-9, suffix: "s", axisLabel: "Time (s)" }`, so
// `12,000,000,000` renders as `12 s` on the axis and in the tooltip.
// -----------------------------------------------------------------------
var IDENTITY_UNIT = {
  multiplier: 1,
  suffix: "",
  axisLabel: "",
  decimals: 2,
};

// Median of finite, nonzero |v|. Zeros and NaNs aren't informative for the
// magnitude pick (a chart with all zeros isn't readable anyway), so we
// skip them; if every value is filtered out, return `null` and callers
// fall back to the kind's smallest display unit.
function magnitudeReference(values) {
  if (!Array.isArray(values) || values.length === 0) return null;
  var sample = [];
  for (var i = 0; i < values.length; i++) {
    var v = values[i];
    if (v === null || v === undefined) continue;
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    var a = Math.abs(v);
    if (a === 0) continue;
    sample.push(a);
  }
  if (sample.length === 0) return null;
  sample.sort(function (a, b) { return a - b; });
  var mid = Math.floor(sample.length / 2);
  // Odd length: the middle element. Even length: mean of the two middles.
  return (sample.length % 2)
    ? sample[mid]
    : (sample[mid - 1] + sample[mid]) / 2;
}

// Walk every series in the loaded payload and concatenate the non-null
// values. The picker works off the merged distribution so a chart with one
// very fast and one very slow series still picks the unit that keeps the
// larger magnitudes readable. Toggling a series visibility via the global
// filter does NOT call this — the unit is locked at payload-load time.
function collectAllValues(payload) {
  var out = [];
  var series = (payload && payload.series) || {};
  var keys = Object.keys(series);
  for (var i = 0; i < keys.length; i++) {
    var arr = series[keys[i]];
    if (!Array.isArray(arr)) continue;
    for (var j = 0; j < arr.length; j++) {
      var v = arr[j];
      if (v !== null && v !== undefined && Number.isFinite(v)) out.push(v);
    }
  }
  return out;
}

// Pick `{ multiplier, suffix, decimals }` for a nanosecond-valued series
// given its magnitude reference `ref` (or `null` when there is none).
function pickTimeUnit(ref) {
  // Steps: ns → µs (1e3) → ms (1e6) → s (1e9). Pick by the median's
  // magnitude so the y-axis tick numbers fit in 1–4 digits.
  if (ref === null || ref < 1e3) {
    return { multiplier: 1, suffix: "ns", decimals: 0 };
  }
  if (ref < 1e6) return { multiplier: 1e-3, suffix: "µs", decimals: 2 };
  if (ref < 1e9) return { multiplier: 1e-6, suffix: "ms", decimals: 2 };
  return { multiplier: 1e-9, suffix: "s", decimals: 2 };
}

// Pick `{ multiplier, suffix, decimals }` for a byte-valued series given
// its magnitude reference `ref` (or `null` when there is none).
function pickBytesUnit(ref) {
  // Binary multiples to match how DuckDB and on-disk file sizes are
  // typically reported. Steps: B → KiB (1024) → MiB → GiB → TiB.
  var k = 1024;
  if (ref === null || ref < k) {
    return { multiplier: 1, suffix: "B", decimals: 0 };
  }
  if (ref < k * k) return { multiplier: 1 / k, suffix: "KiB", decimals: 2 };
  if (ref < k * k * k) return { multiplier: 1 / (k * k), suffix: "MiB", decimals: 2 };
  if (ref < k * k * k * k) return { multiplier: 1 / (k * k * k), suffix: "GiB", decimals: 2 };
  return { multiplier: 1 / (k * k * k * k), suffix: "TiB", decimals: 2 };
}

// Map a wire `unitKind` plus the loaded `values` to the display unit
// `{ multiplier, suffix, axisLabel, decimals }`. Only `time_ns` and `bytes`
// scale with magnitude; the other known kinds are fixed, and unknown kinds
// fall back to `IDENTITY_UNIT`.
function pickDisplayUnit(unitKind, values) {
  var ref = magnitudeReference(values);
  if (unitKind === "time_ns") {
    var t = pickTimeUnit(ref);
    return {
      multiplier: t.multiplier,
      suffix: t.suffix,
      axisLabel: "Time (" + t.suffix + ")",
      decimals: t.decimals,
    };
  }
  if (unitKind === "bytes") {
    var b = pickBytesUnit(ref);
    return {
      multiplier: b.multiplier,
      suffix: b.suffix,
      axisLabel: "Size (" + b.suffix + ")",
      decimals: b.decimals,
    };
  }
  if (unitKind === "throughput_mb_s") {
    return {
      multiplier: 1,
      suffix: "MB/s",
      axisLabel: "Throughput (MB/s)",
      decimals: 2,
    };
  }
  if (unitKind === "ratio" || unitKind === "count") {
    // Dimensionless: no scaling, no suffix, no axis title — leaving the
    // axis unlabeled keeps a "1.2× speedup" axis from being read as
    // "1200 m" by an axis-title-driven label.
    return {
      multiplier: 1,
      suffix: "",
      axisLabel: "",
      decimals: unitKind === "count" ? 0 : 2,
    };
  }
  // Unknown kind (forward-compat with a future server enum). Identity is
  // the safest fallback — values render verbatim, no unit.
  return IDENTITY_UNIT;
}

// Tooltip formatter: applies the chart's locked display unit so the tooltip
// value matches the y-axis tick numbers exactly. Raw `null`/`NaN` collapse
// to an em-dash so a missing data point reads as a clear gap rather than
// a literal `0`.
function formatDisplayValue(rawValue, displayUnit) {
  if (rawValue === null || rawValue === undefined || Number.isNaN(rawValue)) {
    return "—";
  }
  var u = displayUnit || IDENTITY_UNIT;
  var scaled = rawValue * u.multiplier;
  // Non-finite results (e.g. ±Infinity) also render as the em-dash.
  var text = Number.isFinite(scaled) ? scaled.toFixed(u.decimals) : "—";
  return u.suffix ? text + " " + u.suffix : text;
}

// Throttle to a max call rate; trailing call is preserved so the final
// slider position is honoured. (`requestAnimationFrame` is conceptually
// similar but we want a hard ceiling regardless of when the browser
// schedules a frame.)
function throttle(fn, ms) {
  var lastRan = 0;
  var pending = null;
  var pendingArgs = null;
  return function () {
    var now = Date.now();
    // Always remember the newest arguments: a trailing call that is
    // already scheduled will run with these, not the ones it was
    // scheduled with.
    pendingArgs = arguments;
    if (now - lastRan >= ms) {
      lastRan = now;
      fn.apply(null, pendingArgs);
    } else if (!pending) {
      var wait = ms - (now - lastRan);
      pending = setTimeout(function () {
        lastRan = Date.now();
        pending = null;
        fn.apply(null, pendingArgs);
      }, wait);
    }
  };
}

// -----------------------------------------------------------------------
// LTTB (Largest-Triangle-Three-Buckets) downsampler.
//
// Returns the indices into `xs` / `ys` to keep, including index 0 and
// `n - 1`. `xs` must be strictly increasing. When `threshold >= n` or
// `threshold < 3`, returns `[0, 1, ..., n-1]` unchanged.
//
// Algorithm (standard LTTB): per bucket, pick the
// point that forms the largest triangle with the previously kept point
// and the average of the next bucket.
// -----------------------------------------------------------------------
function lttbIndices(xs, ys, threshold) {
  var n = xs.length;
  if (threshold >= n || threshold < 3) {
    var all = new Array(n);
    for (var i = 0; i < n; i++) all[i] = i;
    return all;
  }
  var out = new Array(threshold);
  out[0] = 0;
  // Fractional bucket width over the interior points (first and last are
  // always kept, so they are excluded from bucketing).
  var bucket = (n - 2) / (threshold - 2);
  var a = 0;
  for (var bi = 0; bi < threshold - 2; bi++) {
    // Average of the *next* bucket — the "C" point in the triangle.
    var nextStart = Math.floor((bi + 1) * bucket) + 1;
    var nextEnd = Math.min(n, Math.floor((bi + 2) * bucket) + 1);
    var count = Math.max(1, nextEnd - nextStart);
    var ax = 0, ay = 0;
    for (var j = nextStart; j < nextEnd; j++) { ax += xs[j]; ay += ys[j]; }
    ax /= count; ay /= count;

    // Search this bucket for the point with the largest triangle area
    // against (a, avg_next).
    var rangeStart = Math.floor(bi * bucket) + 1;
    var rangeEnd = Math.floor((bi + 1) * bucket) + 1;
    var pax = xs[a], pay = ys[a];
    var maxArea = -1;
    var maxIdx = rangeStart;
    for (var k = rangeStart; k < rangeEnd; k++) {
      var area = Math.abs((pax - ax) * (ys[k] - pay) - (pax - xs[k]) * (ay - pay)) * 0.5;
      if (area > maxArea) { maxArea = area; maxIdx = k; }
    }
    out[bi + 1] = maxIdx;
    a = maxIdx;
  }
  out[threshold - 1] = n - 1;
  return out;
}

// -----------------------------------------------------------------------
// Crosshair plugin: draws a vertical line at the chart's active hover
// index. Using an inline plugin is cheaper than pulling in
// chartjs-plugin-crosshair, which is overkill for this one feature.
// -----------------------------------------------------------------------
var crosshairPlugin = {
  id: "benchCrosshair",
  afterDatasetsDraw: function (chart) {
    var active = chart.tooltip && chart.tooltip.getActiveElements
      ? chart.tooltip.getActiveElements()
      : [];
    if (!active || !active.length) return;
    // The line is drawn at the first active element's x, spanning the
    // full height of the y scale.
    var x = active[0].element.x;
    var ya = chart.scales && chart.scales.y;
    if (!ya || !Number.isFinite(x)) return;
    var ctx = chart.ctx;
    ctx.save();
    // `--muted` from the page theme — read it lazily so dark mode picks
    // up the right colour.
    var muted = getComputedStyle(document.documentElement)
      .getPropertyValue("--muted").trim() || "#9ca3af";
    ctx.strokeStyle = muted;
    ctx.lineWidth = 1;
    ctx.setLineDash([4, 4]);
    ctx.beginPath();
    ctx.moveTo(x, ya.top);
    ctx.lineTo(x, ya.bottom);
    ctx.stroke();
    ctx.restore();
  },
};

// -----------------------------------------------------------------------
// External tooltip with offset + flip-on-overflow.
//
// Factory contract: returns a Chart.js `external` tooltip handler closed
// over `canvas` (the rendered canvas element, used to read the cached
// payload via `canvas.__bench_payload`) and `host` (the
// `
` element to render markup into; - // `host.parentNode` is the chart-card and is used as the positioning - // origin). The returned handler is invoked by Chart.js with one argument - // `context = { tooltip, chart }`; it mutates `host` in place and is a - // no-op when `tooltip.opacity === 0`. - // - // Flicker fix: the tooltip host is **always** `pointer-events: none`. The - // previous implementation flipped it to `auto` when visible; the cursor - // would land on the tooltip, fire mouseout on the canvas, the tooltip - // would hide, the cursor would re-enter the canvas, and the cycle would - // repeat at event-loop frequency. Clicks on a data point are handled by - // the chart's `onClick` (opens the PR or commit URL in a new tab), so the - // tooltip itself never needs to be interactive. - // ----------------------------------------------------------------------- - function externalTooltipHandler(canvas, host) { - return function (context) { - var tt = context.tooltip; - if (!host) return; - if (tt.opacity === 0) { - host.style.opacity = "0"; - return; - } - - var chart = context.chart; - var payload = canvas.__bench_payload || { commits: [] }; - var firstDp = tt.dataPoints && tt.dataPoints[0]; - if (!firstDp) { - host.style.opacity = "0"; - return; - } - // Snap to a single commit. We use `mode: "nearest"` on the chart - // options, so `firstDp.dataIndex` is the single closest data point - // to the cursor (skipping nulls in `dataset.data`). If the cursor - // falls between two LTTB-kept points, exactly one wins — no more - // rendering both columns at once. - var idx = firstDp.dataIndex; - var commit = (payload.commits || [])[idx] || {}; - // Tooltip values must match the locked y-axis unit. Raw values still - // live on `dataset.rawData`; the display unit is what scales them - // into ms / MiB / etc. for the visible text. 
- var displayUnit = canvas.__bench_display_unit || IDENTITY_UNIT; - var rawLen = (chart.data.labels || []).length; - - // Build one row per dataset, reading values from each series' - // `rawData` (the unmodified payload) so the tooltip shows raw - // measurements even when LTTB has nulled out `dataset.data[idx]`. - // Iterating `chart.data.datasets` directly — instead of mapping - // `tt.dataPoints` — guarantees one row per series at this single - // commit; `tt.dataPoints` could otherwise contain points from - // multiple `dataIndex` values when the cursor sits between two - // closely-packed LTTB columns. - var rowItems = chart.data.datasets.map(function (ds, dsIndex) { - // Skip datasets the user (or filter bar) has hidden. - var meta = chart.getDatasetMeta && chart.getDatasetMeta(dsIndex); - if (meta && meta.hidden) return null; - if (ds.hidden) return null; - var raw = (ds.rawData || [])[idx]; - if (raw === null || raw === undefined || Number.isNaN(raw)) { - return null; - } - // Per-row delta is `(current - previous) / previous`, where - // "previous" is the chronologically preceding commit. The - // `commits[]` array is sorted oldest-first by SQL — index 0 is - // the oldest commit, the last index is the newest — so the - // predecessor lives at `idx - 1`. Walk further back across - // null-valued slots so series that didn't run on every commit - // still get a meaningful baseline. - var prevIdx = idx - 1; - var prevRaw = null; - while (prevIdx >= 0) { - var pv = (ds.rawData || [])[prevIdx]; - if (pv !== null && pv !== undefined && !Number.isNaN(pv)) { - prevRaw = pv; - break; - } - prevIdx--; - } - var deltaHtml = ""; - if (prevRaw !== null && prevRaw !== 0) { - var pct = ((raw - prevRaw) / prevRaw) * 100; - var cls = pct > 0 ? "tt-delta tt-delta--worse" - : pct < 0 ? "tt-delta tt-delta--better" : "tt-delta"; - var sign = pct > 0 ? 
"+" : ""; - deltaHtml = '' + sign + pct.toFixed(1) + "%"; - } - return { - label: ds.label, - color: ds.borderColor, - raw: raw, - deltaHtml: deltaHtml, - }; - }).filter(Boolean); - - // Top-to-bottom order matches the visual stack of lines at this x. - rowItems.sort(function (a, b) { return b.raw - a.raw; }); - - var rows = rowItems - .map(function (r) { - return '
' - + '' - + '' + escapeHtml(r.label) + '' - + '' - + escapeHtml(formatDisplayValue(r.raw, displayUnit)) + '' - + r.deltaHtml - + "
"; - }) - .join(""); - - // If every series was hidden / had no value at this commit, treat - // this as a no-op hover instead of flashing an empty popup. - if (!rows) { - host.style.opacity = "0"; - return; - } - - var titleHtml = '
' - + escapeHtml(shortSha(commit.sha)) + ' · ' - + escapeHtml(shortDate(commit.timestamp)) - + "
"; - - // Show short SHA + first-line commit message, truncated. The full URL - // (or PR URL) is wired up via the chart's onClick handler, so we don't - // render it as text here. - var msg = truncate(firstLine(commit.message || ""), 80); - var footerLine = commit.sha - ? (msg ? escapeHtml(shortSha(commit.sha)) + " · " + escapeHtml(msg) - : escapeHtml(shortSha(commit.sha))) - : escapeHtml(msg); - var footerHtml = footerLine - ? '" - : ""; - - host.innerHTML = titleHtml + '
' + rows + "
" + footerHtml; - - // Position the tooltip relative to its container, offset 12px from - // the cursor. Flip horizontally if it would overflow. - var canvasRect = context.chart.canvas.getBoundingClientRect(); - var hostRect = host.parentNode.getBoundingClientRect(); - var x = canvasRect.left - hostRect.left + tt.caretX; - var y = canvasRect.top - hostRect.top + tt.caretY; - host.style.opacity = "1"; - host.style.left = x + "px"; - host.style.top = y + "px"; - // Measure after content swap so flipping is correct. - var ttWidth = host.offsetWidth || 0; - var containerWidth = host.parentNode.clientWidth || 0; - var flip = (x + ttWidth + 24) > containerWidth; - host.style.transform = flip - ? "translate(calc(-100% - 12px), 12px)" - : "translate(12px, 12px)"; - }; - } - - // ----------------------------------------------------------------------- - // Payload + datasets - // ----------------------------------------------------------------------- - function readInlinePayload(idx) { - var s = document.getElementById("chart-data-" + idx); - if (!s) return null; - try { return JSON.parse(s.textContent); } catch (e) { return null; } - } - - function labelForCommit(commit) { - return commit && commit.sha ? shortSha(commit.sha) : ""; - } - - function canonicalHistory(payload) { - var commits = Array.isArray(payload && payload.commits) ? payload.commits : []; - var history = payload && payload.history ? payload.history : {}; - var loaded = Number.isFinite(history.loaded_commits) - ? history.loaded_commits - : commits.length; - var total = Number.isFinite(history.total_commits) - ? history.total_commits - : commits.length; - var start = Number.isFinite(history.start_index) ? 
history.start_index : 0; - loaded = Math.max(0, Math.floor(loaded)); - total = Math.max(loaded, Math.floor(total)); - start = Math.max(0, Math.min(Math.floor(start), Math.max(0, total - loaded))); - return { - total_commits: total, - start_index: start, - loaded_commits: loaded, - complete: history.complete === true || (start === 0 && loaded === total), - }; - } - - function normalizeChartPayload(payload) { - if (!payload) return payload; - if (payload.__bench_normalized) return payload; - var commits = Array.isArray(payload.commits) ? payload.commits : []; - var history = canonicalHistory(payload); - if (history.complete && history.start_index === 0 && history.total_commits === commits.length) { - payload.history = history; - payload.__bench_normalized = true; - return payload; - } - - var total = history.total_commits; - var start = history.start_index; - var normalizedCommits = new Array(total); - for (var i = 0; i < total; i++) normalizedCommits[i] = null; - for (var ci = 0; ci < commits.length && start + ci < total; ci++) { - normalizedCommits[start + ci] = commits[ci]; - } - - var rawSeries = payload.series || {}; - var normalizedSeries = {}; - Object.keys(rawSeries).forEach(function (name) { - var values = Array.isArray(rawSeries[name]) ? 
rawSeries[name] : []; - var out = new Array(total); - for (var zi = 0; zi < total; zi++) out[zi] = null; - for (var vi = 0; vi < values.length && start + vi < total; vi++) { - out[start + vi] = values[vi]; - } - normalizedSeries[name] = out; - }); - - var clone = {}; - Object.keys(payload).forEach(function (key) { - if (key !== "commits" && key !== "series") clone[key] = payload[key]; - }); - clone.history = history; - clone.commits = normalizedCommits; - clone.series = normalizedSeries; - clone.__bench_normalized = true; - return clone; - } - - function rangeTouchesUnloadedHistory(payload, min, max) { - var history = payload && payload.history; - if (!history || history.complete) return false; - var start = history.start_index || 0; - var end = start + (history.loaded_commits || 0) - 1; - return Math.floor(min) < start || Math.ceil(max) > end; - } - - // Build the per-series dataset shells. `data` starts as a full-length - // null-padded array; `rebuildVisibleAndUpdate` fills it in based on the - // current visible range. `rawData` holds a reference to the original - // payload so the tooltip can show raw values regardless of LTTB. - function buildDatasets(payload) { - var raw = payload.series || {}; - var meta = payload.series_meta || {}; - var n = (payload.commits || []).length; - var names = Object.keys(raw).sort(); - return names.map(function (name, i) { - var seriesMeta = meta[name] || {}; - var rawValues = Array.isArray(raw[name]) ? raw[name] : []; - // `data` starts null-padded; `rebuildVisibleAndUpdate` fills the - // current visible window with raw or LTTB-kept values. Chart.js's - // `spanGaps: true` connects the line across nulls so a series with - // partial coverage (a benchmark crashed at one commit, a series - // only runs nightly, etc.) still draws as a continuous trend - // through the surrounding measurements. 
The point markers - // themselves are only drawn at non-null indices, so the missing - // commits are visible as a "no marker" beat in the line — the line - // itself bridges to the next available data point. - var data = new Array(n); - for (var j = 0; j < n; j++) data[j] = null; - return { - label: name, - data: data, - rawData: rawValues, - borderColor: colorFor(i), - backgroundColor: colorFor(i) + "20", - borderWidth: 1.5, - spanGaps: true, - tension: 0, - pointRadius: 2, - pointHoverRadius: 5, - pointHitRadius: 8, - pointStyle: "cross", - // Custom field (Chart.js ignores unknown keys). Used by the global - // filter to decide which datasets to hide/show in bulk. - benchMeta: { engine: seriesMeta.engine, format: seriesMeta.format }, - hidden: !seriesPassesFilter(seriesMeta), - }; - }); - } - - // ----------------------------------------------------------------------- - // The single source of truth for the rendered point count. - // - // Walks the visible `[rangeMin, rangeMax]` window of the raw payload and, - // for each series, renders raw when the visible count is at or below - // `MAX_VISIBLE_POINTS` and LTTB-downsamples to exactly that number when - // above. The result is written into `dataset.data` with nulls outside - // the kept set so Chart.js renders just the kept points; with - // `spanGaps: true`, the line connects across the nulls to the next - // non-null point so a sparse series still reads as a continuous trend. - // - // Mutates `dataset.data` in place to avoid GC churn on every pan frame. - // Updates the per-card downsample badge as a side effect. 
- // ----------------------------------------------------------------------- - function rebuildVisibleAndUpdate(card, chart, rangeMin, rangeMax, allowFullFetch) { - var canvas = chart.canvas; - var payload = canvas.__bench_payload; - if (!payload) return; - var datasets = chart.data.datasets; - var n = (payload.commits || []).length; - if (n === 0) return; - - var min = Math.max(0, Math.floor(rangeMin)); - var max = Math.min(n - 1, Math.ceil(rangeMax)); - if (max < min) max = min; - - // Build one "virtual series" for LTTB: walk every commit index in the - // visible range and, for each index, take the max non-null value - // across all datasets. This is the union of x-positions, with a - // representative y per position. Series in a Vortex chart share both - // unit and overall scale (they're the same benchmark with different - // engines/formats), so max-across-series picks visually salient peaks - // without per-series scale skew. - // - // This becomes our LTTB input: we then pick AT MOST MAX_VISIBLE_POINTS - // commit indices and every dataset renders only at those shared - // indices. Without this, per-series LTTB picked different peaks for - // each series and the union of x-positions grew with the series - // count — visually you saw way more than MAX_VISIBLE_POINTS dots - // even though each line only had MAX_VISIBLE_POINTS. - var unionIdxs = []; - var unionVals = []; - for (var i = min; i <= max; i++) { - var bestY = null; - for (var di = 0; di < datasets.length; di++) { - var rawValues = datasets[di].rawData; - if (!Array.isArray(rawValues)) continue; - var v = rawValues[i]; - if (v !== null && v !== undefined && !Number.isNaN(v) - && (bestY === null || v > bestY)) { - bestY = v; - } - } - if (bestY !== null) { - unionIdxs.push(i); - unionVals.push(bestY); - } - } - - // Decide which commit indices to render — shared across all series. 
- var keptSet = {}; - var anyDownsampled = false; - if (unionIdxs.length <= MAX_VISIBLE_POINTS) { - // Below the cap: render every commit that has data anywhere. - for (var u = 0; u < unionIdxs.length; u++) keptSet[unionIdxs[u]] = true; - } else { - // Above the cap: LTTB the union down to MAX_VISIBLE_POINTS exactly. - // The selected indices are then *shared* across every dataset; that - // is the cap's only correct interpretation of "max points on the - // chart at a time". - var localIndices = lttbIndices(unionIdxs, unionVals, MAX_VISIBLE_POINTS); - for (var li = 0; li < localIndices.length; li++) { - keptSet[unionIdxs[localIndices[li]]] = true; - } - anyDownsampled = true; - } - - // Plant the shared kept set into every dataset.data. Series that have - // no value at a kept index simply remain null there; with - // `spanGaps: true`, the line connects to the next non-null point so - // a series with partial coverage (a benchmark crashed, a series only - // runs nightly) still draws as a continuous trend through the - // surrounding measurements. Markers are only drawn at non-null - // indices, so the gap is still visible as a missing point — just not - // as a broken line. - // - // We deliberately do NOT plant nearest-neighbour values for indices - // outside `[min, max]`: extending the line past the visible edges - // sounds nice (the line goes off-screen toward the next real - // measurement instead of stopping at the rightmost in-range point), - // but Chart.js's y-axis auto-scale uses every non-null value in the - // dataset regardless of `scales.x.min/max`. An off-screen neighbour - // with a very different y value (an old benchmark configuration, a - // first-run cold cache, anything) blows up the y-axis range and - // squashes the in-window values into a flat line near the floor. - // Fixing that would mean overriding `scales.y.min/max` per rebuild - // from only the in-window values, which changes the "y-axis stays - // stable across x-zoom" UX. 
Out of scope here; if a user wants to - // see how the line connects across the edge they can zoom out. - // Pull the chart's locked display-unit multiplier. Applied here, not on - // ingest or in the SQL, so the wire payload stays in base units (ns, - // bytes, …) — the unit transform is purely cosmetic. - var displayUnit = canvas.__bench_display_unit || IDENTITY_UNIT; - var multiplier = displayUnit.multiplier; - for (var dj = 0; dj < datasets.length; dj++) { - var ds = datasets[dj]; - var dsRaw = ds.rawData; - if (!Array.isArray(dsRaw)) continue; - var data = ds.data; - if (!Array.isArray(data) || data.length !== n) { - data = new Array(n); - ds.data = data; - } - for (var z = 0; z < n; z++) data[z] = null; - for (var idxStr in keptSet) { - var idx = +idxStr; - var val = dsRaw[idx]; - if (val !== null && val !== undefined && !Number.isNaN(val)) { - data[idx] = val * multiplier; - } - } - } - - var visibleCommits = max - min + 1; - var keptCommits = 0; - for (var _u in keptSet) keptCommits++; - chart.update("none"); - syncSliderFromRange(card, visibleCommits); - syncDownsampleBadge(card, keptCommits, visibleCommits, anyDownsampled); - // If the user moves into the virtual, not-yet-loaded part of the x-axis, - // promote this chart's queued full-history fetch. Group-open warmup should - // usually have it in flight already; this just puts direct interaction - // ahead of background work that has not started. - if (allowFullFetch && rangeTouchesUnloadedHistory(payload, min, max)) { - ensureFullHistory(card, INTERACTION_FULL_PRIORITY); - } - } - - // ----------------------------------------------------------------------- - // Full-history warmup. Opening a group queues `?n=all` for every chart in - // that group. Direct interaction with an unloaded virtual range promotes the - // same queued entry instead of issuing a duplicate request. 
- // ----------------------------------------------------------------------- - function ensureFullHistory(card, priority) { - var canvas = card.querySelector("canvas"); - if (!canvas) return Promise.resolve(); - if (canvas.__bench_full_loaded) return Promise.resolve(); - if (canvas.__bench_full_fetch_entry) { - if (priority) { - canvas.__bench_full_fetch_entry.priority = Math.max( - canvas.__bench_full_fetch_entry.priority, - priority - ); - drainFullHistoryQueue(); - } - return canvas.__bench_full_fetch_pending || Promise.resolve(); - } - var slug = card.getAttribute("data-chart-slug"); - if (!slug) return Promise.resolve(); - var entry = scheduleFullHistory(function () { - var url = "/api/chart/" + encodeURIComponent(slug) - + "?n=" + encodeURIComponent(FETCH_N); - return fetch(url, { headers: { "accept": "application/json" } }) - .then(function (r) { - if (r.status === 404) return null; - if (!r.ok) throw new Error("HTTP " + r.status); - return r.json(); - }) - .then(function (full) { - if (!full) return; - replaceChartPayload(card, full); - canvas.__bench_full_loaded = true; - canvas.__bench_inline_trimmed = false; - showCardLoading(card, false); - var group = card.closest(".group-details"); - if (!canvas.__bench_chart && (!group || groupIsOpen(group))) { - fetchAndConstruct(card); - } - }); - }, priority || 0); - canvas.__bench_full_fetch_entry = entry; - canvas.__bench_full_fetch_pending = entry.promise.catch(function (err) { - // Quiet — the latest-100 virtual payload is still usable. Surface to the - // console for debugging. - if (window && window.console) { - window.console.warn("bench: full history fetch failed", err); - } - }).then(function () { - canvas.__bench_full_fetch_entry = null; - canvas.__bench_full_fetch_pending = null; - }); - return canvas.__bench_full_fetch_pending; - } - - // Swap the chart's labels + datasets to a freshly fetched, unbounded - // payload while preserving the current x-range. 
The virtual latest-100 - // payload and the full payload share a full-history x-axis, so the chart - // should not jump when the real older values arrive. - function replaceChartPayload(card, payload) { - var canvas = card.querySelector("canvas"); - var chart = canvas && canvas.__bench_chart; - if (!canvas || !payload) return; - payload = normalizeChartPayload(payload); - canvas.__bench_payload = payload; - canvas.__bench_payload_window = FETCH_N; - if (!chart) return; - // Re-pick the display unit against the now-wider window. The first - // payload was the latest-100 slice; the refetch may surface older - // commits with a different magnitude, and we'd rather move the - // y-axis once at the refetch boundary than leave the chart on a - // stale unit. The axis title is updated to match. - canvas.__bench_display_unit = pickDisplayUnit( - payload.unit_kind, collectAllValues(payload), - ); - var yAxis = chart.options.scales && chart.options.scales.y; - if (yAxis && yAxis.title) { - yAxis.title.display = !!canvas.__bench_display_unit.axisLabel; - yAxis.title.text = canvas.__bench_display_unit.axisLabel; - } - var newLabels = (payload.commits || []).map(labelForCommit); - var newDatasets = buildDatasets(payload); - // Re-apply per-card legend overrides + global filter to the new datasets, - // matching the visibility state the user had before the refetch. - var overrides = canvas.__bench_overrides || {}; - for (var i = 0; i < newDatasets.length; i++) { - var ds = newDatasets[i]; - if (overrides[ds.label]) { - // Honour any explicit legend toggle the user had made already. - var prev = chart.data.datasets.find(function (p) { - return p.label === ds.label; - }); - if (prev) ds.hidden = !!prev.hidden; - } - } - chart.data.labels = newLabels; - chart.data.datasets = newDatasets; - // Re-evaluate per-group + global filter on the swapped dataset so the - // visibility state matches what was on screen before the refetch. 
Also - // refresh the group's series chip row in case the wider window surfaces - // a series that was absent from the inline payload. - applyFiltersToChart(card); - noteSeriesFromCard(card); - var newMaxIdx = Math.max(0, newLabels.length - 1); - var zoomLimits = chart.options.plugins - && chart.options.plugins.zoom - && chart.options.plugins.zoom.limits - && chart.options.plugins.zoom.limits.x; - if (zoomLimits) { - zoomLimits.max = newMaxIdx; - } - syncSliderBounds(card, newLabels.length); - var sx = chart.options.scales.x; - var prevMin = Number.isFinite(sx.min) ? sx.min : 0; - var prevMax = Number.isFinite(sx.max) ? sx.max : 0; - if (Number.isFinite(prevMin) && Number.isFinite(prevMax)) { - sx.min = Math.max(0, Math.min(newMaxIdx, prevMin)); - sx.max = Math.max(sx.min, Math.min(newMaxIdx, prevMax)); - } else { - var prevVisible = Math.max(1, prevMax - prevMin + 1); - sx.max = newMaxIdx; - sx.min = Math.max(0, newMaxIdx - (prevVisible - 1)); - } - rebuildVisibleAndUpdate(card, chart, sx.min, sx.max); - if (canvas.__bench_strip_render) canvas.__bench_strip_render(); - } - - // Mirror the chart's current visible commit count onto the toolbar - // slider. Called from `rebuildVisibleAndUpdate` so every path that - // changes the visible range — toolbar slider drag, drag-pan, - // drag-rectangle-zoom, wheel-pan, range-strip drag — keeps the - // slider in sync. Programmatic value writes do not fire the slider's - // `input` event, so this never re-enters `applyScope`. - function syncSliderFromRange(card, visibleCommits) { - var slider = card.querySelector('[data-role="scope-slider"]'); - if (!slider) return; - var lo = parseInt(slider.min, 10) || 1; - var hi = parseInt(slider.max, 10) || visibleCommits; - slider.value = String(Math.max(lo, Math.min(hi, visibleCommits))); - } - - // Show the badge when at least one series in the visible range was - // downsampled. 
The numbers are commit counts: how many distinct - // commits the chart is rendering, and how many are in the visible - // range. Both come from the slider's mental model so "300 / 3000" in - // the badge matches "showing the last 3000" on the slider. - function syncDownsampleBadge(card, keptCommits, visibleCommits, anyDownsampled) { - var badge = card.querySelector('[data-role="downsample-badge"]'); - if (!badge) return; - if (!anyDownsampled || keptCommits >= visibleCommits) { - badge.setAttribute("hidden", ""); - badge.textContent = ""; - return; - } - badge.removeAttribute("hidden"); - badge.textContent = "downsampled · " + keptCommits + " / " + visibleCommits; - badge.setAttribute( - "title", - "Showing " + keptCommits + " of " + visibleCommits - + " commits in view. Each series renders at most " - + MAX_VISIBLE_POINTS + " points at a time; when more are in " - + "view, we apply LTTB (Largest Triangle, Three Buckets), an " - + "algorithm that picks representative points by maximising " - + "the area of triangles formed with neighbouring buckets. " - + "Visual peaks and valleys are preserved while the chart " - + "stays responsive. Zoom in past " + MAX_VISIBLE_POINTS - + " visible commits to see every raw measurement." - ); - } - - // ----------------------------------------------------------------------- - // Per-card construction. The set of `canvas.__bench_*` fields planted - // by this function (and read elsewhere) is documented at the top of - // this file under "Canvas state contract". 
- // ----------------------------------------------------------------------- - function constructChart(card) { - var idx = card.getAttribute("data-chart-index"); - var canvas = card.querySelector('canvas[data-chart-index="' + idx + '"]'); - if (!canvas || typeof Chart === "undefined") return null; - if (canvas.__bench_chart) return canvas.__bench_chart; - - var payload = normalizeChartPayload(canvas.__bench_payload || readInlinePayload(idx)); - if (!payload) return null; - canvas.__bench_payload = payload; - // Latest-100 payloads are normalized onto the full x-axis. `history` - // tells us whether old indices are virtual placeholders or real data. - if (canvas.__bench_full_loaded === undefined) { - var history = canonicalHistory(payload); - canvas.__bench_full_loaded = !!history.complete; - canvas.__bench_inline_trimmed = !canvas.__bench_full_loaded; - } - - var state = canvas.__bench_state - || { y: "linear", scope: defaultScopeForCard(card) }; - canvas.__bench_state = state; - // Series labels the user has explicitly toggled on this card. Once a - // label lands here, the global filter no longer drives that series's - // hidden-state on this card — only direct legend clicks do. - if (!canvas.__bench_overrides) canvas.__bench_overrides = {}; - // Lock the display unit for the lifetime of this loaded payload. We - // recompute only when `replaceChartPayload` swaps in a wider window - // after a `?n=all` refetch — toggling a series via the global filter - // never touches it. See `pickDisplayUnit` for the full rationale. - canvas.__bench_display_unit = pickDisplayUnit( - payload.unit_kind, collectAllValues(payload), - ); - - var labels = (payload.commits || []).map(labelForCommit); - var datasets = buildDatasets(payload); - var host = card.querySelector(".chart-tooltip-host"); - var range = visibleRange(labels.length, state.scope); - var legendPosition = (window.matchMedia - && window.matchMedia("(max-width: 768px)").matches) ? 
"top" : "bottom"; - - // Throttled rebuild for pan/zoom. Both axes mutate scales.x.min/max - // continuously during interaction, so we re-derive the rendered - // points on every frame (capped to PAN_THROTTLE_MS) and refresh the - // range strip to match. Single throttle so LTTB and the strip never - // diverge. - var throttledRebuild = throttle(function (chart) { - var sx = chart.scales && chart.scales.x; - if (!sx) return; - rebuildVisibleAndUpdate(card, chart, sx.min, sx.max, true); - if (canvas.__bench_strip_render) canvas.__bench_strip_render(); - }, PAN_THROTTLE_MS); - - var chart = new Chart(canvas, { - type: "line", - data: { labels: labels, datasets: datasets }, - plugins: [crosshairPlugin], - options: { - responsive: true, - maintainAspectRatio: false, - animation: false, - // Snap to the single nearest commit *that has rendered data*. - // After LTTB downsampling most commit indices are null in - // `dataset.data`; `mode: "index"` would happily pick one of - // those null indices and produce an empty tooltip, while - // `mode: "x"` would pick multiple closely-packed LTTB columns - // at once and the tooltip would render duplicate rows for the - // same series at different commits. `mode: "nearest"` returns - // exactly one closest data point — its `dataIndex` is then - // used by the external handler as the single hovered commit, - // and the handler iterates `chart.data.datasets` itself to - // build one row per series. `intersect: false` keeps it - // active anywhere on the chart and, combined with - // `pointer-events: none` on the tooltip host, is also the - // flicker fix. 
- interaction: { mode: "nearest", intersect: false, axis: "x" }, - onClick: function (event, _activeElements, chart) { - var points = chart.getElementsAtEventForMode( - event, "nearest", { intersect: false, axis: "x" }, true, - ); - if (!points.length) return; - var pIdx = points[0].index; - var commits = (canvas.__bench_payload || {}).commits || []; - var commit = commits[pIdx]; - if (!commit) return; - var pr = parsePrNumber(commit.message); - var url = pr - ? "https://github.com/vortex-data/vortex/pull/" + pr - : commit.url; - if (url) window.open(url, "_blank", "noopener"); - }, - scales: { - y: { - type: state.y === "log" ? "logarithmic" : "linear", - beginAtZero: state.y !== "log", - // Axis title reflects the locked display unit. Empty string when - // the kind is dimensionless (`ratio`, `count`) so a "1.2× speedup" - // chart doesn't get an arbitrary "value" label and a "12 m" chart - // doesn't get read as anything other than `12 ms` / `12 s` / etc. - title: { - display: !!canvas.__bench_display_unit.axisLabel, - text: canvas.__bench_display_unit.axisLabel, - }, - }, - x: { - min: range.min, - max: range.max, - title: { display: false }, - // With a 5000-commit history rendering one tick per commit - // is unreadable anyway. Cap it; Chart.js will pick a sensible - // subset of label indices to draw. - ticks: { maxTicksLimit: 12, autoSkip: true }, - }, - }, - plugins: { - legend: { - position: legendPosition, - // Wrap the default toggle so we record the per-card override - // and keep `dataset.hidden` in sync with the legend's - // `_hiddenInLegend` flag — the global filter pass writes to - // `dataset.hidden`, so they need to track each other or - // subsequent global changes look stale. 
- onClick: function (e, item, legend) { - var ci = legend.chart; - var ds = ci.data.datasets[item.datasetIndex]; - var label = ds && ds.label; - if (label && ci.canvas && ci.canvas.__bench_overrides) { - ci.canvas.__bench_overrides[label] = true; - } - var visible = ci.isDatasetVisible(item.datasetIndex); - ci.setDatasetVisibility(item.datasetIndex, !visible); - if (ds) ds.hidden = visible; // flipped: was visible → now hidden, etc. - ci.update(); - }, - }, - tooltip: { - enabled: false, - external: externalTooltipHandler(canvas, host), - // Row ordering is handled inside the external handler now — - // we iterate `chart.data.datasets` ourselves rather than the - // tooltip's `dataPoints`, so `itemSort` here would be dead - // code. - }, - // chartjs-plugin-zoom config — wheel-zoom is disabled because we - // want wheel-pan instead (handled by the canvas wheel listener - // below). Drag-pan and drag-rectangle-zoom are free. - zoom: { - zoom: { - wheel: { enabled: false }, - drag: { - enabled: true, - backgroundColor: "rgba(37, 99, 235, 0.10)", - }, - mode: "x", - onZoom: function (ctx) { throttledRebuild(ctx.chart); }, - }, - pan: { - enabled: true, - mode: "x", - modifierKey: null, - onPan: function (ctx) { throttledRebuild(ctx.chart); }, - }, - limits: { - x: { min: 0, max: Math.max(0, labels.length - 1), minRange: 4 }, - }, - }, - }, - }, - }); - - canvas.__bench_chart = chart; - canvas.__bench_rebuild = throttledRebuild; - attachWheelPan(canvas, chart, throttledRebuild); - syncSliderBounds(card, labels.length); - // Initial render: the chart is constructed with empty (null) data; - // populate it for the initial visible window. Strip is bound after the - // rebuild so its first paint reflects the same range Chart.js shows. 
- rebuildVisibleAndUpdate(card, chart, range.min, range.max); - bindRangeStrip(card, chart); - if (canvas.__bench_strip_render) canvas.__bench_strip_render(); - // `buildDatasets` seeded `hidden` from the global filter; reapply through - // the layered helper so a per-group filter set before this card hydrated - // also takes effect. Then surface this card's series labels to the - // group's filter dropdown so the chip row picks them up. - applyFiltersToChart(card); - noteSeriesFromCard(card); - return chart; - } - - // ----------------------------------------------------------------------- - // Range scrollbar strip — the thin track below each canvas. Spans the full - // commit history; the highlighted "window" matches the chart's currently - // visible x-range and can be dragged or its edges resized to pan/zoom. - // ----------------------------------------------------------------------- - function bindRangeStrip(card, chart) { - var strip = card.querySelector('[data-role="range-strip"]'); - if (!strip || strip.__bench_bound) return; - strip.__bench_bound = true; - var win = strip.querySelector('[data-role="range-window"]'); - var leftHandle = strip.querySelector('[data-role="range-handle-left"]'); - var rightHandle = strip.querySelector('[data-role="range-handle-right"]'); - if (!win || !leftHandle || !rightHandle) return; - - var canvas = card.querySelector("canvas"); - - function commitCount() { - return (chart.data.labels || []).length; - } - - function visibleBounds() { - var n = commitCount(); - if (n <= 0) return { min: 0, max: 0 }; - var maxIdx = n - 1; - var sx = chart.options.scales.x || {}; - var min = Number.isFinite(sx.min) ? sx.min : 0; - var max = Number.isFinite(sx.max) ? 
sx.max : maxIdx; - min = Math.max(0, Math.min(maxIdx, min)); - max = Math.max(min, Math.min(maxIdx, max)); - return { min: min, max: max }; - } - - function render() { - var n = commitCount(); - if (n <= 0) { - win.style.left = "0%"; - win.style.width = "100%"; - return; - } - var b = visibleBounds(); - var span = Math.max(1, n - 1); - var leftPct = (b.min / span) * 100; - var widthPct = ((b.max - b.min) / span) * 100; - // A minimum visible width keeps the handles grabbable when zoomed in - // tight on a single commit. - if (widthPct < 1.5) widthPct = 1.5; - if (leftPct + widthPct > 100) leftPct = 100 - widthPct; - win.style.left = leftPct + "%"; - win.style.width = widthPct + "%"; - } - - function setRange(newMin, newMax) { - var n = commitCount(); - if (n <= 0) return; - var maxIdx = n - 1; - var minRange = 1; // matches plugin `limits.x.minRange = 4` loosely; allow tighter via strip - newMin = Math.max(0, Math.min(maxIdx - minRange, newMin)); - newMax = Math.max(newMin + minRange, Math.min(maxIdx, newMax)); - chart.options.scales.x.min = newMin; - chart.options.scales.x.max = newMax; - // Track scope on the canvas so the toolbar slider stays consistent - // when the user later drags it. - if (canvas && canvas.__bench_state) { - canvas.__bench_state.scope = Math.round(newMax - newMin + 1); - } - // Re-derive what Chart.js renders against the new visible window. - // `rebuildVisibleAndUpdate` calls `chart.update("none")`, applies - // LTTB, and mirrors the new scope onto the toolbar slider, so the - // strip-driven pan/resize stays in lockstep with both the data - // density and the slider readout. 
- rebuildVisibleAndUpdate(card, chart, newMin, newMax, true); - render(); - } - - function pxToIndex(px, trackWidth) { - var n = commitCount(); - if (n <= 1 || trackWidth <= 0) return 0; - var pct = Math.max(0, Math.min(1, px / trackWidth)); - return pct * (n - 1); - } - - var dragState = null; - - function onPointerDown(e) { - if (e.button !== undefined && e.button !== 0) return; - var role = e.target.getAttribute && e.target.getAttribute("data-role"); - var rect = strip.getBoundingClientRect(); - var trackWidth = rect.width; - var b = visibleBounds(); - var idxAtCursor = pxToIndex(e.clientX - rect.left, trackWidth); - - var mode; - if (role === "range-handle-left") mode = "resize-left"; - else if (role === "range-handle-right") mode = "resize-right"; - else if (role === "range-window") mode = "pan"; - else { - // Click on bare track: jump the window so its centre lands at the - // cursor, then begin a pan drag. - var width = b.max - b.min; - var newMin = idxAtCursor - width / 2; - setRange(newMin, newMin + width); - b = visibleBounds(); - mode = "pan"; - } - dragState = { - mode: mode, - rect: rect, - startX: e.clientX, - startMin: b.min, - startMax: b.max, - pointerId: e.pointerId, - }; - try { strip.setPointerCapture(e.pointerId); } catch (err) {} - e.preventDefault(); - strip.classList.add("chart-range-strip--dragging"); - } - - function onPointerMove(e) { - if (!dragState) return; - var n = commitCount(); - if (n <= 1) return; - var trackWidth = dragState.rect.width; - var dxPx = e.clientX - dragState.startX; - var dxIdx = (dxPx / Math.max(1, trackWidth)) * (n - 1); - if (dragState.mode === "pan") { - setRange(dragState.startMin + dxIdx, dragState.startMax + dxIdx); - } else if (dragState.mode === "resize-left") { - setRange(dragState.startMin + dxIdx, dragState.startMax); - } else if (dragState.mode === "resize-right") { - setRange(dragState.startMin, dragState.startMax + dxIdx); - } - } - - function onPointerUp(e) { - if (!dragState) return; - try { 
strip.releasePointerCapture(dragState.pointerId); } catch (err) {} - dragState = null; - strip.classList.remove("chart-range-strip--dragging"); - } - - strip.addEventListener("pointerdown", onPointerDown); - strip.addEventListener("pointermove", onPointerMove); - strip.addEventListener("pointerup", onPointerUp); - strip.addEventListener("pointercancel", onPointerUp); - - // Expose the strip's render function so other code paths (toolbar - // slider, wheel-pan, the throttled LTTB rebuild) can keep the strip - // in lockstep without each having to know strip internals. The chart - // options' `onPan` / `onZoom` callbacks call this via the throttled - // rebuild rather than overriding them here, so LTTB and the strip - // refresh as one unit. - canvas.__bench_strip_render = render; - render(); - } - - // Cap the toolbar slider's `max` to the chart's full x-axis length. For a - // latest-100 virtual payload this is intentionally larger than the loaded - // point count, so "Show all" can expose the unloaded older range while the - // full-history fetch is warming. - function syncSliderBounds(card, commitCount) { - var slider = card.querySelector('[data-role="scope-slider"]'); - if (!slider) return; - var max = Math.max(5, commitCount); - slider.max = String(max); - // Pick a step that gives ~200 stops across the slider so dragging - // feels continuous regardless of history size. - var step = Math.max(1, Math.round(max / 200)); - slider.step = String(step); - var current = parseInt(slider.value, 10); - if (!Number.isFinite(current) || current > max) { - var def = defaultScopeForCard(card); - var seed = def === "all" ? max : Math.min(def, max); - slider.value = String(seed); - } - } - - // Wheel = horizontal pan. Chart.js zoom plugin doesn't support wheel-pan - // out of the box (wheel is always zoom in its config), so we attach a - // `wheel` listener that translates `deltaY`/`deltaX` into `chart.pan` and - // re-runs the rebuild after panning. 
- function attachWheelPan(canvas, chart, rebuild) { - if (canvas.__bench_wheel_attached) return; - canvas.__bench_wheel_attached = true; - canvas.addEventListener("wheel", function (e) { - // Treat horizontal-wheel-or-shift+wheel as horizontal pan; otherwise - // also pan on plain vertical wheel so trackpad scroll-up/down moves - // through commit history without needing modifier keys. - var dx = (Math.abs(e.deltaX) > Math.abs(e.deltaY)) ? e.deltaX : e.deltaY; - if (!dx) return; - e.preventDefault(); - // Browser wheel-down reports a positive delta. In Chart.js pan space, - // positive x moves the visible window toward older commits, while - // negative x moves back toward newer commits. - chart.pan({ x: dx * 0.5 }, undefined, "none"); - // `rebuild` recomputes LTTB on the new visible range AND, via the - // throttled wrapper, also calls `canvas.__bench_strip_render`. - rebuild(chart); - }, { passive: false }); - } - - // ----------------------------------------------------------------------- - // Recompute helpers driven by the per-chart toolbar. - // ----------------------------------------------------------------------- - // Invariant: when `currentRange` is supplied AND the chart is already - // panned away from the right edge, a scope change preserves the visible - // CENTER instead of snapping to the most recent N commits. With no - // `currentRange` (initial render) or a view that already covers - // everything / sits flush with the newest commit, anchor to the right — - // the right default at first load and after "show all". 
- function visibleRange(commitCount, scope, currentRange) { - if (commitCount <= 0) return { min: undefined, max: undefined }; - var maxIdx = commitCount - 1; - if (scope === "all" || !Number.isFinite(scope) || scope <= 0 || scope >= commitCount) { - return { min: 0, max: maxIdx }; - } - var width = scope; - var rightAnchored = { min: Math.max(0, maxIdx - (width - 1)), max: maxIdx }; - if (!currentRange) return rightAnchored; - var curMin = Number.isFinite(currentRange.min) ? currentRange.min : 0; - var curMax = Number.isFinite(currentRange.max) ? currentRange.max : maxIdx; - var coversAll = curMin <= 0 && curMax >= maxIdx; - // Half-commit tolerance: pan/zoom can leave fractional drift even when - // the user is effectively still flush with the newest commit. - var atRightEdge = curMax >= maxIdx - 0.5; - if (coversAll || atRightEdge) return rightAnchored; - var center = (curMin + curMax) / 2; - var halfWidth = (width - 1) / 2; - var newMin = Math.round(center - halfWidth); - var newMax = newMin + (width - 1); - if (newMin < 0) { - newMin = 0; - newMax = width - 1; - } else if (newMax > maxIdx) { - newMax = maxIdx; - newMin = maxIdx - (width - 1); - } - return { min: newMin, max: newMax }; - } - - function applyScope(card, scopeValue) { - var canvas = card.querySelector("canvas"); - var chart = canvas && canvas.__bench_chart; - if (!chart) return; - var commits = chart.data.labels.length; - var scope = scopeValue === "all" ? "all" : parseInt(scopeValue, 10); - canvas.__bench_state.scope = scope; - // Capture the chart's existing visible window BEFORE we overwrite it, - // so `visibleRange` can preserve the center when the user has panned - // away from the right edge. - var sx = chart.options.scales.x; - var currentRange = sx ? 
{ min: sx.min, max: sx.max } : null; - var range = visibleRange(commits, scope, currentRange); - chart.options.scales.x.min = range.min; - chart.options.scales.x.max = range.max; - rebuildVisibleAndUpdate(card, chart, range.min, range.max, true); - syncToolbarUi(card, "scope", String(scopeValue)); - if (canvas.__bench_strip_render) canvas.__bench_strip_render(); - } - - // `userInitiated` defaults to true. Once set, the chart is "sticky" — the - // per-group Y apply pass skips it on subsequent group-level clicks, - // honouring the user's explicit per-card choice. The per-group toolbar - // passes `false` so it doesn't pollute the flag while broadcasting. - function applyY(card, yValue, userInitiated) { - var canvas = card.querySelector("canvas"); - var chart = canvas && canvas.__bench_chart; - if (!chart) return; - if (userInitiated !== false) { - canvas.__bench_y_user_set = true; - } - canvas.__bench_state.y = yValue; - chart.options.scales.y.type = yValue === "log" ? "logarithmic" : "linear"; - chart.options.scales.y.beginAtZero = yValue !== "log"; - chart.update("none"); - syncToolbarUi(card, "y", yValue); - } - - function syncToolbarUi(card, group, value) { - var attr = "data-" + group; - card.querySelectorAll(".toolbar-btn[" + attr + "]").forEach(function (b) { - b.classList.toggle("toolbar-btn--active", b.getAttribute(attr) === value); - }); - if (group === "scope") { - var slider = card.querySelector('[data-role="scope-slider"]'); - if (slider && /^\d+$/.test(value)) slider.value = value; - } - } - - function bindToolbar(card) { - var toolbar = card.querySelector(".toolbar--card"); - if (!toolbar || toolbar.__bench_bound) return; - toolbar.__bench_bound = true; - - toolbar.addEventListener("click", function (e) { - var btn = e.target.closest(".toolbar-btn"); - if (!btn || !toolbar.contains(btn)) return; - if (btn.hasAttribute("data-y")) applyY(card, btn.getAttribute("data-y")); - }); - - var slider = toolbar.querySelector('[data-role="scope-slider"]'); - if 
(slider) { - // `input` (continuous), throttled so dragging stays at ~60fps even on - // pages with dozens of charts. Last value still lands because - // `throttle` preserves the trailing call. - var throttled = throttle(function () { - applyScope(card, slider.value); - }, ZOOM_THROTTLE_MS); - slider.addEventListener("input", throttled); - } - } - - // ----------------------------------------------------------------------- - // Group hydration. Every landing group renders chart shells with versioned - // shard metadata. On intent or first open we fetch materialized latest-100 - // group shards through a small per-tab queue and construct each chart as - // soon as its shard arrives. - // ----------------------------------------------------------------------- - var hydrationActive = 0; - var hydrationQueue = []; - var fullHistoryActive = 0; - var fullHistoryQueue = []; - var groupOpenPriority = 0; - - function scheduleHydration(task, priority) { - var entry = { - task: task, - priority: priority || 0, - promise: null, - resolve: null, - reject: null, - }; - entry.promise = new Promise(function (resolve, reject) { - entry.resolve = resolve; - entry.reject = reject; - }); - hydrationQueue.push(entry); - drainHydrationQueue(); - return entry; - } - - function drainHydrationQueue() { - while (hydrationActive < HYDRATION_CONCURRENCY && hydrationQueue.length) { - hydrationQueue.sort(function (a, b) { return b.priority - a.priority; }); - var item = hydrationQueue.shift(); - hydrationActive++; - Promise.resolve() - .then(item.task) - .then( - function (value) { - hydrationActive--; - item.resolve(value); - drainHydrationQueue(); - }, - function (err) { - hydrationActive--; - item.reject(err); - drainHydrationQueue(); - } - ); - } - } - - function scheduleFullHistory(task, priority) { - var entry = { - task: task, - priority: priority || 0, - promise: null, - resolve: null, - reject: null, - }; - entry.promise = new Promise(function (resolve, reject) { - entry.resolve = 
resolve; - entry.reject = reject; - }); - fullHistoryQueue.push(entry); - drainFullHistoryQueue(); - return entry; - } - - function drainFullHistoryQueue() { - while (fullHistoryActive < FULL_HISTORY_CONCURRENCY && fullHistoryQueue.length) { - fullHistoryQueue.sort(function (a, b) { return b.priority - a.priority; }); - var item = fullHistoryQueue.shift(); - fullHistoryActive++; - Promise.resolve() - .then(item.task) - .then( - function (value) { - fullHistoryActive--; - item.resolve(value); - drainFullHistoryQueue(); - }, - function (err) { - fullHistoryActive--; - item.reject(err); - drainFullHistoryQueue(); - } - ); - } - } - - function priorityForGroupOpen(group) { - groupOpenPriority += GROUP_OPEN_PRIORITY_STEP; - group.__bench_group_priority = groupOpenPriority; - return groupOpenPriority; - } - - function fetchAndConstruct(card) { - var canvas = card.querySelector("canvas"); - if (!canvas) return Promise.resolve(); - if (canvas.__bench_chart) return Promise.resolve(); - if (constructChart(card)) { - bindToolbar(card); - } - return Promise.resolve(); - } - - function groupCards(group) { - return Array.prototype.slice.call( - group.querySelectorAll(".chart-card[data-chart-slug]") - ); - } - - function cardHasPayloadAvailable(card) { - var canvas = card.querySelector("canvas"); - if (!canvas) return true; - if (canvas.__bench_payload) return true; - var idx = card.getAttribute("data-chart-index"); - return idx != null && !!readInlinePayload(idx); - } - - function groupShardCount(group) { - var n = parseInt(group.getAttribute("data-group-shard-count") || "0", 10); - return Number.isFinite(n) && n > 0 ? n : 0; - } - - function groupShardPrefix(group) { - return group.getAttribute("data-group-shard-prefix") || ""; - } - - function groupShardUrl(group, index) { - var prefix = groupShardPrefix(group); - return prefix ? 
prefix + encodeURIComponent(String(index)) : ""; - } - - function groupHydrationState(group) { - if (!group.__bench_group_hydration) { - group.__bench_group_hydration = { - loaded: {}, - pending: {}, - entries: {}, - errors: {}, - }; - } - return group.__bench_group_hydration; - } - - function cardBySlug(group, slug) { - var cards = groupCards(group); - for (var i = 0; i < cards.length; i++) { - if (cards[i].getAttribute("data-chart-slug") === slug) return cards[i]; - } - return null; - } - - function showCardLoading(card, on) { - var existing = card.querySelector(".chart-loading"); - if (on) { - if (existing) return; - var el = document.createElement("div"); - el.className = "chart-loading"; - el.textContent = "loading…"; - card.appendChild(el); - } else if (existing) { - existing.remove(); - } - } - - function showCardError(card, msg) { - var existing = card.querySelector(".chart-error"); - if (existing) existing.remove(); - var el = document.createElement("div"); - el.className = "chart-error"; - el.textContent = msg; - card.appendChild(el); - setTimeout(function () { if (el.parentNode) el.remove(); }, 4000); - } - - function applyGroupShard(group, shard) { - if (!shard || !Array.isArray(shard.charts)) return; - shard.charts.forEach(function (payload) { - if (!payload || !payload.slug) return; - var card = cardBySlug(group, payload.slug); - var canvas = card && card.querySelector("canvas"); - if (!canvas) return; - if (!canvas.__bench_full_loaded) { - canvas.__bench_payload = normalizeChartPayload(payload); - canvas.__bench_payload_window = CHART_FETCH_N; - } - showCardLoading(card, false); - if (groupIsOpen(group)) fetchAndConstruct(card); - }); - } - - function fetchGroupShard(group, index, priority) { - var state = groupHydrationState(group); - if (state.loaded[index]) return Promise.resolve(); - if (state.pending[index]) { - if (state.entries[index] && priority) { - state.entries[index].priority = Math.max(state.entries[index].priority, priority); - 
drainHydrationQueue(); - } - return state.pending[index]; - } - var url = groupShardUrl(group, index); - if (!url) return Promise.resolve(); - var entry = scheduleHydration(function () { - return fetch(url, { headers: { "accept": "application/json" } }) - .then(function (r) { - if (r.status === 404) throw new Error("not found"); - if (!r.ok) throw new Error("HTTP " + r.status); - return r.json(); - }) - .then(function (payload) { - state.loaded[index] = true; - state.errors[index] = null; - applyGroupShard(group, payload); - }); - }, priority); - state.entries[index] = entry; - state.pending[index] = entry.promise.catch(function (err) { - state.errors[index] = err; - if (index === 0) { - groupCards(group).forEach(function (card) { - if (!cardHasPayloadAvailable(card)) { - showCardLoading(card, false); - showCardError(card, "failed to load: " + (err.message || "unknown error")); - } - }); - } - }).then(function () { - state.pending[index] = null; - state.entries[index] = null; - }); - return state.pending[index]; - } - - function groupIsOpen(group) { - var details = group.querySelector("details.group-disclosure"); - return !details || details.open; - } - - function queueRemainingGroupShards(group, priority) { - var count = groupShardCount(group); - for (var i = 1; i < count; i++) fetchGroupShard(group, i, priority || 0); - } - - function queueGroupFullHistory(group, priority) { - groupCards(group).forEach(function (card) { - ensureFullHistory(card, priority || 0); - }); - } - - function hydrateGroupShardZero(group, showLoading, priority) { - if (showLoading) { - groupCards(group).forEach(function (card) { - if (!cardHasPayloadAvailable(card)) showCardLoading(card, true); - }); - } - return fetchGroupShard(group, 0, priority || (showLoading ? 
1 : 0)).then(function () { - if (showLoading) { - groupCards(group).forEach(function (card) { - if (cardHasPayloadAvailable(card)) showCardLoading(card, false); - }); - } - }); - } - - function hydrateOpenGroup(disclosure) { - if (!disclosure || !disclosure.open) return; - var group = disclosure.closest(".group-details"); - if (!group) return; - var priority = priorityForGroupOpen(group); - hydrateGroupShardZero(group, true, priority + 20).then(function () { - queueRemainingGroupShards(group, priority + 10); - queueGroupFullHistory(group, priority); - }); - } - - function prefetchGroupOnIntent(group) { - fetchGroupShard(group, 0, 0); - } - - // ----------------------------------------------------------------------- - // Global filter bar wiring. - // - // Chips live in `.global-filter-bar`. Click a non-"all" chip to toggle - // that engine/format in/out of the active set; click "all" to clear the - // filter for that dimension. After every change we: - // 1. Re-paint the chips. - // 2. Walk every chart on the page and re-apply the filter (skipping - // series the user has explicitly overridden on that card). - // 3. Sync the URL with `history.replaceState` so a refresh / share - // preserves the view. - // ----------------------------------------------------------------------- - // Apply the layered filter on a single card. Layer order matches the - // resolution rule documented at the top of the file: - // 1. Per-card legend overrides (`canvas.__bench_overrides`) win. - // 2. Per-group filter (`section.__bench_group_filter`) hides next. - // 3. Global filter hides last. - // 4. Otherwise show. - // Used by every code path that mutates filter state (global chip clicks, - // per-group chip clicks, post-construction seeding). 
- function applyFiltersToChart(card) { - var canvas = card.querySelector("canvas"); - var chart = canvas && canvas.__bench_chart; - if (!chart) return; - var overrides = canvas.__bench_overrides || {}; - var section = card.closest(".group-details"); - var groupFilter = section && section.__bench_group_filter; - var datasets = chart.data.datasets || []; - for (var i = 0; i < datasets.length; i++) { - var ds = datasets[i]; - if (overrides[ds.label]) continue; - var hidden = false; - if (!seriesPassesGroupFilter(groupFilter, ds.label)) { - hidden = true; - } else if (!seriesPassesFilter(ds.benchMeta)) { - hidden = true; - } - // Use the dataset.hidden field directly so the legend stays in sync; - // setDatasetVisibility writes into a separate visibility map. - ds.hidden = hidden; - } - chart.update("none"); - } - - function applyGlobalFilterEverywhere() { - document.querySelectorAll(".chart-card[data-chart-index]").forEach(function (card) { - applyFiltersToChart(card); - }); - } - - function syncFilterChipsUi() { - var bar = document.querySelector('[data-role="global-filter-bar"]'); - if (!bar) return; - bar.querySelectorAll(".filter-chip").forEach(function (chip) { - var dim = chip.getAttribute("data-filter"); - var value = chip.getAttribute("data-value"); - var key = dim === "engine" ? "engines" : "formats"; - var list = globalFilter[key]; - var active; - if (value === "*") { - // The "all" chip is a one-shot reset, never a "current state" - // indicator — leave it inactive. Pressing it forces every other - // chip in the row back to active. - active = false; - } else { - active = list.indexOf(value) !== -1; - } - chip.classList.toggle("filter-chip--active", active); - chip.setAttribute("aria-pressed", active ? "true" : "false"); - }); - syncFilterBadge(); - } - - // Show a badge on the trigger that counts how many chips are *off* - // (i.e. how many things the global filter is hiding). 
Hidden when the - // filter is fully open, so it's noise-free in the resting state. - function syncFilterBadge() { - var trigger = document.querySelector('[data-role="filter-trigger"]'); - if (!trigger) return; - var hidden = - Math.max(0, filterUniverse.engines.length - globalFilter.engines.length) + - Math.max(0, filterUniverse.formats.length - globalFilter.formats.length); - var badge = trigger.querySelector('[data-role="filter-badge"]'); - if (hidden === 0) { - if (badge) badge.remove(); - return; - } - if (!badge) { - badge = document.createElement("span"); - badge.className = "filter-badge"; - badge.setAttribute("data-role", "filter-badge"); - trigger.appendChild(badge); - } - badge.textContent = String(hidden); - } - - function syncFilterUrl() { - if (!window.history || !window.history.replaceState) return; - var url = new URL(window.location.href); - // URL stays as an allowlist (`?engine=duckdb` = "show only duckdb"). We - // emit the param only when the active set is a strict subset of the - // universe; an all-active row leaves the URL clean. - syncDimensionUrl(url, "engine", "engines"); - syncDimensionUrl(url, "format", "formats"); - window.history.replaceState(null, "", url.toString()); - } - - function syncDimensionUrl(url, paramName, key) { - if (dimensionIsFiltered(key)) { - url.searchParams.set(paramName, globalFilter[key].join(",")); - } else { - url.searchParams.delete(paramName); - } - } - - // Toggle one chip independently. The "all" chip resets the dimension to - // every-chip-active; specific chips just flip their own active state. - function toggleFilterValue(dim, value) { - var key = dim === "engine" ? "engines" : "formats"; - if (value === "*") { - globalFilter[key] = filterUniverse[key].slice(); - return; - } - var list = globalFilter[key]; - var idx = list.indexOf(value); - if (idx === -1) { - list.push(value); - } else { - list.splice(idx, 1); - } - } - - // Toggle the dropdown panel open/closed. 
Click outside or press Escape - // to close. The panel is anchored under the trigger button via CSS. - function bindFilterDropdown() { - var bar = document.querySelector('[data-role="global-filter-bar"]'); - if (!bar) return; - var trigger = bar.querySelector('[data-role="filter-trigger"]'); - var panel = bar.querySelector('[data-role="filter-panel"]'); - if (!trigger || !panel) return; - - function setOpen(open) { - if (open) { - panel.removeAttribute("hidden"); - } else { - panel.setAttribute("hidden", ""); - } - trigger.setAttribute("aria-expanded", open ? "true" : "false"); - bar.classList.toggle("filter-dropdown--open", open); - } - trigger.addEventListener("click", function (e) { - e.stopPropagation(); - var isOpen = !panel.hasAttribute("hidden"); - setOpen(!isOpen); - }); - document.addEventListener("click", function (e) { - if (!bar.contains(e.target)) setOpen(false); - }); - document.addEventListener("keydown", function (e) { - if (e.key === "Escape") setOpen(false); - }); - } - - function initGlobalFilterBar() { - var bar = document.querySelector('[data-role="global-filter-bar"]'); - if (!bar) return; - bar.addEventListener("click", function (e) { - var chip = e.target.closest(".filter-chip"); - if (!chip || !bar.contains(chip)) return; - var dim = chip.getAttribute("data-filter"); - var value = chip.getAttribute("data-value"); - if (!dim || !value) return; - toggleFilterValue(dim, value); - syncFilterChipsUi(); - applyGlobalFilterEverywhere(); - syncFilterUrl(); - }); - bindFilterDropdown(); - syncFilterChipsUi(); - } - - // ----------------------------------------------------------------------- - // Per-group toolbar wiring. - // - // Each `.group-details` section carries a `[data-role="group-toolbar"]` - // with Y-axis buttons and a centered filter dropdown. State lives on the - // section node: - // section.__bench_group_filter = { hiddenSeries: [, ...] } - // section.__bench_group_y = "linear" | "log" | null - // section.__bench_known_series = {
` - // URLs that come from ingested COMMITS rows (the fixtures use - // 1111..., 2222..., 3333... so a regression that misrenders those - // links is still pinned). The hex match is `{7,40}` so an extended - // disambiguating short-SHA (git auto-extends to 8-12 chars when the - // 7-char prefix is ambiguous) is also caught. - s.add_filter( - concat!( - r#"class="site-footer-sha"[^>]*href="https://github\.com/vortex-data/vortex/commit/"#, - r#"[0-9a-f]{7,40}"[^>]*>[0-9a-f]{7,40}"#, - ), - r#"class="site-footer-sha" href="https://github.com/vortex-data/vortex/commit/" rel="noopener noreferrer" target="_blank">"#, - ); - s -} - -/// Lift a single chart slug from `/api/groups`, picking from a group whose -/// name matches `predicate`. Used by tests that need a real slug to drive -/// `/chart/{slug}` and `/api/chart/{slug}` round-trips. -pub(crate) async fn pick_chart_slug( - server: &Server, - predicate: impl Fn(&str) -> bool, -) -> Result { - let client = reqwest::Client::new(); - let groups: Value = client - .get(server.url("/api/groups")) - .send() - .await? - .json() - .await?; - groups["groups"] - .as_array() - .context("groups is array")? - .iter() - .find(|g| g["name"].as_str().is_some_and(&predicate)) - .and_then(|g| g["charts"].as_array()) - .and_then(|c| c.first()) - .and_then(|c| c["slug"].as_str()) - .map(str::to_string) - .context("matching chart slug") -} - -/// Lift a single group slug from `/api/groups`, picking the first group -/// whose name matches `predicate`. -pub(crate) async fn pick_group_slug( - server: &Server, - predicate: impl Fn(&str) -> bool, -) -> Result { - let client = reqwest::Client::new(); - let groups: Value = client - .get(server.url("/api/groups")) - .send() - .await? - .json() - .await?; - groups["groups"] - .as_array() - .context("groups is array")? 
- .iter() - .find(|g| g["name"].as_str().is_some_and(&predicate)) - .and_then(|g| g["slug"].as_str()) - .map(str::to_string) - .context("matching group slug") -} - -/// Look up a group entry by its `name` field inside an `/api/groups` -/// response. -pub(crate) fn group_by_name<'a>(groups: &'a Value, name: &str) -> Result<&'a Value> { - groups["groups"] - .as_array() - .context("groups is array")? - .iter() - .find(|g| g["name"].as_str() == Some(name)) - .with_context(|| format!("group {name:?} exists")) -} - -/// Fuzzy `f64` equality for test assertions. The summary rollups round-trip -/// through SQL so exact equality isn't safe even on integer-valued inputs. -pub(crate) fn assert_close(actual: f64, expected: f64) { - let delta = (actual - expected).abs(); - assert!( - delta < 0.000_001, - "expected {actual} to be close to {expected}" - ); -} - -/// Pull just the `
` substring of the -/// filter dropdown - its trigger button and the chip panel. Keeps the -/// snapshot focused on the chip markup and stable against changes elsewhere -/// on the page. -pub(crate) fn filter_bar_section(body: &str) -> String { - let needle = r#"
".to_string(); - }; - let tail = &body[start..]; - // The dropdown is `
...
`. - // We need to find the matching `
` for the outer wrapper. The - // simplest robust approach is to scan and balance. - let bytes = tail.as_bytes(); - let mut depth = 0usize; - let mut i = 0usize; - while i < bytes.len() { - if bytes[i] == b'<' { - if tail[i..].starts_with("") { - depth -= 1; - if depth == 0 { - return tail[..i + "
".len()].to_string(); - } - i += "".len(); - continue; - } - } - i += 1; - } - tail.to_string() -} - -/// Pull the `
` containing chips for one -/// dimension (`"engine"` or `"format"`). -pub(crate) fn filter_section(body: &str, dim: &str) -> String { - let bar = filter_bar_section(body); - let needle = format!(r#"data-filter="{dim}""#); - let Some(_) = bar.find(&needle) else { - return String::new(); - }; - // Walk back to the enclosing `
`. - let row_open = r#"
"#; - let row_close = "
"; - bar.split(row_open) - .find(|chunk| chunk.contains(&needle)) - .and_then(|chunk| chunk.split(row_close).next()) - .map(str::to_string) - .unwrap_or_default() -} - -/// Pull a single chip's opening tag for assertions. -pub(crate) fn extract_chip(section: &str, value: &str) -> String { - let needle = format!(r#"data-value="{value}""#); - let Some(idx) = section.find(&needle) else { - return String::new(); - }; - let head = §ion[..idx]; - let chip_start = head.rfind("').map(|p| p + 1).unwrap_or(tail.len()); - tail[..chip_end].to_string() -} diff --git a/benchmarks-website/server/tests/group_api.rs b/benchmarks-website/server/tests/group_api.rs deleted file mode 100644 index dbc33473c07..00000000000 --- a/benchmarks-website/server/tests/group_api.rs +++ /dev/null @@ -1,290 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Integration tests for `/group/{slug}` and `/api/group/{slug}` plus the -//! v2-compatible group summary contract on `/api/groups`. 
- -mod common; - -use anyhow::Context as _; -use anyhow::Result; -use serde_json::Value; - -use self::common::Server; -use self::common::assert_close; -use self::common::attr_value; -use self::common::group_by_name; -use self::common::insta_settings; -use self::common::pick_group_slug; -use self::common::seed; -use self::common::seed_long_history; - -#[tokio::test] -async fn group_page_snapshot() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let slug = pick_group_slug(&server, |s| s.starts_with("TPC-H")).await?; - - let resp = client - .get(server.url(&format!("/group/{slug}"))) - .send() - .await?; - assert_eq!(resp.status(), 200); - let body = resp.text().await?; - assert!( - !body.contains(r#"id="chart-data-0""#), - "group page should hydrate through materialized shards, not inline chart payloads" - ); - assert!( - body.contains(r#"data-artifact-generation="#) - && body.contains(r#"data-group-shard-prefix="#) - && body.contains(r#"open"#), - "group page should render an open shard-hydrated group shell" - ); - assert!( - body.contains(r#"class="toolbar toolbar--card""#), - "per-chart toolbar must be rendered on group page" - ); - insta_settings().bind(|| { - insta::assert_snapshot!("group_page_query", body); - }); - Ok(()) -} - -#[tokio::test] -async fn group_api_returns_charts() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let slug = pick_group_slug(&server, |s| s.starts_with("TPC-H")).await?; - - let client = reqwest::Client::new(); - let resp = client - .get(server.url(&format!("/api/group/{slug}"))) - .send() - .await?; - assert_eq!(resp.status(), 200); - let body: Value = resp.json().await?; - let charts = body["charts"].as_array().context("charts is array")?; - assert!(!charts.is_empty(), "group must have at least one chart"); - let first = &charts[0]; - assert!(first["slug"].as_str().is_some(), "chart slug present"); - 
assert!(first["name"].as_str().is_some(), "chart name present"); - assert!( - first["commits"].as_array().is_some(), - "embedded chart commits" - ); - assert!( - first["series"].as_object().is_some(), - "embedded chart series" - ); - assert_eq!( - body["summary"]["type"].as_str(), - Some("queryBenchmark"), - "group API should include the server-computed summary" - ); - Ok(()) -} - -#[tokio::test] -async fn group_api_respects_commit_window() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let slug = pick_group_slug(&server, |s| s.starts_with("TPC-H")).await?; - - let client = reqwest::Client::new(); - let resp = client - .get(server.url(&format!("/api/group/{slug}?n=1"))) - .send() - .await?; - assert_eq!(resp.status(), 200); - let body: Value = resp.json().await?; - let charts = body["charts"].as_array().context("charts is array")?; - assert!(!charts.is_empty(), "group must have at least one chart"); - for chart in charts { - let commits = chart["commits"].as_array().context("commits is array")?; - assert!( - commits.len() <= 1, - "group hydration should honor the requested commit window" - ); - assert!(chart["series"].as_object().is_some(), "series is present"); - } - Ok(()) -} - -#[tokio::test] -async fn group_shard_artifact_returns_bounded_chart_payloads() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let landing = client.get(server.url("/")).send().await?.text().await?; - let generation = attr_value(&landing, "data-artifact-generation") - .context("landing exposes artifact generation")?; - let group_slug = - attr_value(&landing, "data-group-slug").context("landing exposes group slug")?; - - let resp = client - .get(server.url(&format!( - "/api/artifacts/{generation}/groups/{group_slug}/shards/0" - ))) - .send() - .await?; - assert_eq!(resp.status(), 200); - let etag = resp - .headers() - .get(reqwest::header::ETAG) - .and_then(|v| v.to_str().ok()) - 
.unwrap_or_default() - .to_string(); - assert!(!etag.is_empty(), "artifact response should carry an ETag"); - let vary = resp - .headers() - .get(reqwest::header::VARY) - .and_then(|v| v.to_str().ok()) - .unwrap_or_default() - .to_ascii_lowercase(); - assert!( - vary.contains("accept-encoding"), - "artifact response should vary on Accept-Encoding" - ); - - let body: Value = resp.json().await?; - assert_eq!(body["window"].as_u64(), Some(100)); - assert_eq!(body["shard_index"].as_u64(), Some(0)); - assert!( - body["shard_count"].as_u64().unwrap_or_default() >= 1, - "shard count should be present" - ); - let charts = body["charts"].as_array().context("charts is array")?; - assert!(!charts.is_empty(), "first shard should include charts"); - assert!(charts.len() <= 8, "shards should carry at most 8 charts"); - for chart in charts { - let commits = chart["commits"].as_array().context("commits is array")?; - assert!( - commits.len() <= 100, - "artifact hydration should use the latest-100 window" - ); - assert!(chart["series"].as_object().is_some(), "series is present"); - } - Ok(()) -} - -#[tokio::test] -async fn group_shard_artifact_carries_chart_history_metadata() -> Result<()> { - let server = Server::start().await?; - seed_long_history(&server, 125).await?; - - let client = reqwest::Client::new(); - let landing = client.get(server.url("/")).send().await?.text().await?; - let generation = attr_value(&landing, "data-artifact-generation") - .context("landing exposes artifact generation")?; - let group_slug = - attr_value(&landing, "data-group-slug").context("landing exposes group slug")?; - - let body: Value = client - .get(server.url(&format!( - "/api/artifacts/{generation}/groups/{group_slug}/shards/0" - ))) - .send() - .await? 
- .json() - .await?; - let first_chart = body["charts"][0] - .as_object() - .context("first chart in shard")?; - assert_eq!( - first_chart["commits"].as_array().map(Vec::len), - Some(100), - "group shard remains a latest-100 payload" - ); - assert_eq!(first_chart["history"]["total_commits"].as_u64(), Some(125)); - assert_eq!(first_chart["history"]["start_index"].as_u64(), Some(25)); - assert_eq!(first_chart["history"]["loaded_commits"].as_u64(), Some(100)); - assert_eq!(first_chart["history"]["complete"].as_bool(), Some(false)); - Ok(()) -} - -#[tokio::test] -async fn unknown_group_shard_artifact_returns_404() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let resp = client - .get(server.url("/api/artifacts/not-a-generation/groups/not-a-group/shards/0")) - .send() - .await?; - assert_eq!(resp.status(), 404); - Ok(()) -} - -#[tokio::test] -async fn group_summaries_match_v2_contract() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let groups: Value = client - .get(server.url("/api/groups")) - .send() - .await? 
- .json() - .await?; - - let random_access = &group_by_name(&groups, "Random Access")?["summary"]; - assert_eq!(random_access["type"].as_str(), Some("randomAccess")); - let rankings = random_access["rankings"] - .as_array() - .context("random access rankings")?; - assert_eq!(rankings[0]["name"].as_str(), Some("vortex-file-compressed")); - assert_eq!(rankings[1]["name"].as_str(), Some("parquet")); - assert_close(rankings[1]["ratio"].as_f64().context("random ratio")?, 2.0); - - let compression = &group_by_name(&groups, "Compression")?["summary"]; - assert_eq!(compression["type"].as_str(), Some("compression")); - assert_close( - compression["compressRatio"] - .as_f64() - .context("compressRatio")?, - 2.0, - ); - assert_close( - compression["decompressRatio"] - .as_f64() - .context("decompressRatio")?, - 2.0, - ); - assert_eq!(compression["datasetCount"].as_u64(), Some(1)); - - let compression_size = &group_by_name(&groups, "Compression Size")?["summary"]; - assert_eq!(compression_size["type"].as_str(), Some("compressionSize")); - assert_close( - compression_size["meanRatio"] - .as_f64() - .context("meanRatio")?, - 0.5, - ); - assert_eq!(compression_size["datasetCount"].as_u64(), Some(1)); - - let query = &group_by_name(&groups, "TPC-H (NVMe) (SF=1)")?["summary"]; - assert_eq!(query["type"].as_str(), Some("queryBenchmark")); - let rankings = query["rankings"].as_array().context("query rankings")?; - assert_eq!( - rankings[0]["name"].as_str(), - Some("datafusion:vortex-file-compressed"), - "query summary should include v2's missing-series penalty" - ); - assert_eq!(rankings[1]["name"].as_str(), Some("duckdb:parquet")); - let first_score = rankings[0]["score"].as_f64().context("first score")?; - let second_score = rankings[1]["score"].as_f64().context("second score")?; - assert!( - first_score < second_score, - "lower query score should rank first" - ); - - Ok(()) -} diff --git a/benchmarks-website/server/tests/ingest.rs b/benchmarks-website/server/tests/ingest.rs 
deleted file mode 100644 index ba4af6e29b4..00000000000 --- a/benchmarks-website/server/tests/ingest.rs +++ /dev/null @@ -1,385 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Integration tests for `POST /api/ingest` — round-trips the bearer -//! check, the all-or-nothing transaction, the schema-version gate, and -//! the upsert path. - -use std::net::SocketAddr; - -use anyhow::Context as _; -use anyhow::Result; -use serde_json::Value; -use serde_json::json; -use tempfile::TempDir; -use tokio::net::TcpListener; -use tokio::task::JoinHandle; -use tokio::time::Duration; -use vortex_bench_server::app::AppState; -use vortex_bench_server::app::router; - -const TOKEN: &str = "test-bearer-token"; - -struct Server { - addr: SocketAddr, - _tmp: TempDir, - handle: JoinHandle<()>, -} - -impl Server { - async fn start() -> Result { - let tmp = TempDir::new()?; - let db_path = tmp.path().join("bench.duckdb"); - let state = AppState::open(&db_path, TOKEN.to_string())?; - let app = router(state); - - let listener = TcpListener::bind("127.0.0.1:0").await?; - let addr = listener.local_addr()?; - let handle = tokio::spawn(async move { - axum::serve(listener, app).await.unwrap(); - }); - Ok(Self { - addr, - _tmp: tmp, - handle, - }) - } - - fn url(&self, path: &str) -> String { - format!("http://{}{}", self.addr, path) - } -} - -impl Drop for Server { - fn drop(&mut self) { - self.handle.abort(); - } -} - -async fn wait_for_groups(client: &reqwest::Client, server: &Server) -> Result { - let mut last_len = 0usize; - for _ in 0..100 { - let resp = client.get(server.url("/api/groups")).send().await?; - assert_eq!(resp.status(), 200); - let body: Value = resp.json().await?; - last_len = body["groups"].as_array().context("groups is array")?.len(); - if last_len > 0 { - return Ok(body); - } - tokio::time::sleep(Duration::from_millis(25)).await; - } - anyhow::bail!("read model did not rebuild groups; last len {last_len}") 
-} - -fn fixture_envelope() -> Value { - let raw = include_str!("../fixtures/envelope.json"); - serde_json::from_str(raw).expect("fixture envelope is valid JSON") -} - -#[tokio::test] -async fn happy_path_then_idempotent_reingest() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - let envelope = fixture_envelope(); - - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 200, "first ingest should be 200"); - let body: Value = resp.json().await?; - assert_eq!(body["inserted"].as_u64(), Some(5)); - assert_eq!(body["updated"].as_u64(), Some(0)); - - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 200, "second ingest should be 200"); - let body: Value = resp.json().await?; - assert_eq!(body["inserted"].as_u64(), Some(0), "no new rows on re-emit"); - assert!( - body["updated"].as_u64().context("updated is u64")? 
> 0, - "re-emit must report at least one updated row" - ); - Ok(()) -} - -#[tokio::test] -async fn accepts_payloads_above_axum_default_body_limit() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - let mut envelope = fixture_envelope(); - envelope["commit"]["message"] = json!("x".repeat(3 * 1024 * 1024)); - - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 200); - Ok(()) -} - -#[tokio::test] -async fn missing_bearer_is_unauthorized() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - let envelope = fixture_envelope(); - - let resp = client - .post(server.url("/api/ingest")) - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 401); - Ok(()) -} - -#[tokio::test] -async fn wrong_bearer_is_unauthorized() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - let envelope = fixture_envelope(); - - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth("not-the-real-token") - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 401); - Ok(()) -} - -#[tokio::test] -async fn unknown_kind_is_400() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let envelope = json!({ - "run_meta": { - "benchmark_id": "fixture", - "schema_version": 1, - "started_at": "2026-04-25T00:00:00Z" - }, - "commit": { - "sha": "0123456789abcdef0123456789abcdef01234567", - "timestamp": "2026-04-25T00:00:00Z", - "message": "x", "author_name": "x", "author_email": "x@x", - "committer_name": "x", "committer_email": "x@x", - "tree_sha": "fedcba9876543210fedcba9876543210fedcba98", - "url": "https://example.com" - }, - "records": [ - { "kind": "made_up_kind", "commit_sha": "0123456789abcdef0123456789abcdef01234567" } - ] - }); - let resp = client - .post(server.url("/api/ingest")) - 
.bearer_auth(TOKEN) - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 400); - Ok(()) -} - -#[tokio::test] -async fn unknown_field_is_400() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let mut envelope = fixture_envelope(); - envelope["records"][0]["surprise_field"] = json!("oops"); - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 400); - Ok(()) -} - -#[tokio::test] -async fn schema_version_too_new_is_409() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let mut envelope = fixture_envelope(); - envelope["run_meta"]["schema_version"] = json!(99); - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 409); - Ok(()) -} - -#[tokio::test] -async fn invalid_storage_is_400_record_error() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let mut envelope = fixture_envelope(); - envelope["records"][0]["storage"] = json!("gcs"); - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&envelope) - .send() - .await?; - assert_eq!(resp.status(), 400); - let body: Value = resp.json().await?; - assert_eq!(body["record_index"], json!(0)); - Ok(()) -} - -#[tokio::test] -async fn health_reports_after_ingest() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - // Pre-ingest: counts are zero. - let resp = client.get(server.url("/health")).send().await?; - assert_eq!(resp.status(), 200); - let body: Value = resp.json().await?; - assert_eq!(body["status"], "ok"); - assert_eq!(body["schema_version"], 1); - assert_eq!(body["row_counts"]["commits"], 0); - assert!(body["latest_commit_timestamp"].is_null()); - - // Ingest, then re-check. 
- client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&fixture_envelope()) - .send() - .await?; - - let resp = client.get(server.url("/health")).send().await?; - let body: Value = resp.json().await?; - assert_eq!(body["row_counts"]["commits"], 1); - assert_eq!(body["row_counts"]["query_measurements"], 1); - assert_eq!(body["row_counts"]["compression_times"], 1); - assert_eq!(body["row_counts"]["compression_sizes"], 1); - assert_eq!(body["row_counts"]["random_access_times"], 1); - assert_eq!(body["row_counts"]["vector_search_runs"], 1); - assert!(!body["latest_commit_timestamp"].is_null()); - Ok(()) -} - -#[tokio::test] -async fn read_routes_serve_after_ingest() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&fixture_envelope()) - .send() - .await?; - - let body = wait_for_groups(&client, &server).await?; - let groups = body["groups"].as_array().context("groups is array")?; - assert!( - !groups.is_empty(), - "groups should not be empty after ingest" - ); - - // Pick the first chart slug and round-trip it. - let first_chart = groups - .iter() - .find_map(|g| g["charts"].as_array().and_then(|c| c.first())) - .context("at least one chart")?; - let slug = first_chart["slug"] - .as_str() - .context("slug is a string")? - .to_string(); - - let resp = client - .get(server.url(&format!("/api/chart/{slug}"))) - .send() - .await?; - assert_eq!(resp.status(), 200, "chart {slug} should resolve"); - let body: Value = resp.json().await?; - assert!(body["display_name"].is_string()); - assert!(body["unit_kind"].is_string()); - assert!(body["commits"].is_array()); - assert_eq!( - body["commits"] - .as_array() - .context("commits is array")? 
- .len(), - 1 - ); - assert!(body["series"].is_object()); - Ok(()) -} - -#[tokio::test] -async fn unknown_slug_is_404() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let resp = client - .get(server.url("/api/chart/qm.aGVsbG8")) - .send() - .await?; - // Either 400 (couldn't decode JSON) or 404 (decoded but no rows). Both are - // acceptable per the contract; we just need it to not be a 500. - assert!( - resp.status() == 400 || resp.status() == 404, - "got {}", - resp.status() - ); - Ok(()) -} - -/// Reads are served from the materialized read model. Ingest schedules a -/// background rebuild and keeps the old generation live until the new one is -/// ready. -#[tokio::test] -async fn read_model_rebuilds_after_ingest() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - // Warm the cache against an empty DB. - let resp = client.get(server.url("/api/groups")).send().await?; - assert_eq!(resp.status(), 200); - let body: Value = resp.json().await?; - assert_eq!( - body["groups"].as_array().context("groups is array")?.len(), - 0, - "groups should be empty before any ingest" - ); - - // Ingest some data. The handler invalidates fallback caches and schedules - // the materialized read-model rebuild after the commit. - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(&fixture_envelope()) - .send() - .await?; - assert_eq!(resp.status(), 200, "ingest should succeed"); - - // The old empty generation may be served briefly, but the rebuilt - // generation must become visible shortly after ingest. 
- let body = wait_for_groups(&client, &server).await?; - let groups = body["groups"].as_array().context("groups is array")?; - assert!( - !groups.is_empty(), - "groups must repopulate after the read-model rebuild" - ); - Ok(()) -} diff --git a/benchmarks-website/server/tests/landing.rs b/benchmarks-website/server/tests/landing.rs deleted file mode 100644 index 213b92576c9..00000000000 --- a/benchmarks-website/server/tests/landing.rs +++ /dev/null @@ -1,518 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Integration tests for the landing page (`GET /`). - -mod common; - -use anyhow::Context as _; -use anyhow::Result; -use serde_json::Value; - -use self::common::Server; -use self::common::attr_value; -use self::common::extract_chip; -use self::common::filter_bar_section; -use self::common::filter_section; -use self::common::insta_settings; -use self::common::pick_chart_slug; -use self::common::pick_group_slug; -use self::common::seed; -use self::common::seed_long_history; - -#[tokio::test] -async fn landing_page_snapshot() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let resp = client.get(server.url("/")).send().await?; - assert_eq!(resp.status(), 200); - let content_type = resp - .headers() - .get(reqwest::header::CONTENT_TYPE) - .and_then(|v| v.to_str().ok()) - .unwrap_or(""); - assert!( - content_type.starts_with("text/html"), - "expected text/html, got {content_type:?}" - ); - let body = resp.text().await?; - - // Canvas shells render immediately, but chart data comes from - // versioned group shard artifacts instead of inline JSON. 
- assert!( - body.contains("" - ); - assert!( - !body.contains(r#"id="chart-data-0""#), - "landing page should not inline chart payloads" - ); - assert!( - body.contains(r#"data-chart-slug="#), - "every chart card carries data-chart-slug for the lazy-fetch path" - ); - assert!( - body.contains(r#"data-group-slug="#), - "every group carries data-group-slug as stable metadata" - ); - assert!( - body.contains(r#"data-artifact-generation="#) - && body.contains(r#"data-group-shard-count="#) - && body.contains(r#"data-group-shard-prefix="#), - "every group should carry versioned shard hydration metadata" - ); - assert!( - attr_value(&body, "data-artifact-generation").is_some_and(|v| !v.is_empty()), - "artifact generation should be non-empty" - ); - assert!( - !body.contains(r#"id="group-search""#), - "landing page should not render the old group search bar" - ); - assert!( - body.contains(r#"class="sticky-header""#), - "landing page should render the v2-style top navbar" - ); - assert!( - body.contains(r#"data-action="expand-all""#) - && body.contains(r#"data-action="collapse-all""#), - "navbar should expose expand/collapse controls" - ); - assert!( - body.contains(r#"data-role="theme-toggle""#), - "navbar should expose a theme toggle" - ); - assert!( - body.contains(r#"class="btn-icon""#) - || body.contains(r#"class="btn-icon theme-icon theme-icon-light""#), - "navbar controls should render icons" - ); - assert!( - body.contains(r#"Vortex_Black_NoBG.png"#) && body.contains(r#"Vortex_White_NoBG.png"#), - "navbar should render the Vortex logo assets" - ); - assert!( - body.contains("⚡") && body.contains("📤") && body.contains("⬇️") && body.contains("📊"), - "summaries should render the v2 summary icons" - ); - - insta_settings().bind(|| { - insta::assert_snapshot!("landing_page", body); - }); - Ok(()) -} - -/// All group disclosures render closed by default — the user picks which -/// to expand. 
Chart payloads are intentionally not inlined; the disclosure -/// carries shard metadata so JS can fetch the materialized latest-100 -/// artifact on intent/open. -#[tokio::test] -async fn details_all_groups_closed_by_default() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let body = client.get(server.url("/")).send().await?.text().await?; - - let opens: Vec<_> = body - .match_indices(r#"
').map(|p| i + p).unwrap_or(i); - body[i..=tag_end].contains(" open") - }) - .collect(); - assert!(!opens.is_empty(), "landing page must render
"); - for (i, is_open) in opens.iter().enumerate() { - assert!(!is_open, "group #{i} must be closed by default"); - } - assert!( - !body.contains(r#"id="chart-data-0""#), - "landing page should hydrate charts from materialized artifacts", - ); - assert!( - body.contains(r#"data-group-shard-count="#), - "closed groups should still carry shard metadata", - ); - Ok(()) -} - -#[tokio::test] -async fn collapsed_groups_still_show_summaries() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let body = client.get(server.url("/")).send().await?.text().await?; - - let mut found_visible_summary = false; - for (group_start, _) in body.match_indices(r#"
') - .map(|p| details_start + p) - .context("details tag closes")?; - let is_open = body[details_start..=details_tag_end].contains(" open"); - if is_open { - continue; - } - - let summary_end = body[details_start..] - .find("
") - .map(|p| details_start + p) - .context("disclosure closes")?; - let chart_grid_start = body[summary_end..] - .find(r#"
"#) - .map(|p| summary_end + p) - .context("details contains chart grid")?; - let visible_region = &body[summary_end..chart_grid_start]; - if visible_region.contains(r#"class="benchmark-scores-summary""#) { - found_visible_summary = true; - break; - } - } - - assert!( - found_visible_summary, - "at least one closed group should render its score summary before the hidden chart grid" - ); - Ok(()) -} - -/// Every `.chart-card` carries a compact `.toolbar.toolbar--card` so the user -/// has per-chart controls. There is no page-level toolbar, no preset scope -/// button row, and no abs/rel mode toggle. -#[tokio::test] -async fn chart_card_carries_per_chart_toolbar() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let body = client.get(server.url("/")).send().await?.text().await?; - - let card_count = body.matches(r#"
0, "landing page must render chart cards"); - assert_eq!( - toolbar_count, card_count, - "every chart-card must contain a toolbar--card ({card_count} cards / {toolbar_count} toolbars)" - ); - assert_eq!( - strip_count, card_count, - "every chart-card must carry a range-strip below the canvas \ - ({card_count} cards / {strip_count} strips)" - ); - assert!( - body.contains(r#"data-role="range-window""#) - && body.contains(r#"data-role="range-handle-left""#) - && body.contains(r#"data-role="range-handle-right""#), - "range-strip must include a draggable window and two resize handles" - ); - assert!( - !body.contains(r#"data-mode="#), - "abs/rel mode buttons should not render" - ); - assert!( - !body.contains(r#"data-scope="#), - "preset scope buttons should not render; use the slider instead" - ); - assert!( - body.contains(r#"data-role="scope-slider""#), - "scope slider should remain available" - ); - assert!( - !body.contains(r#"scope-slider-label"#), - "scope value labels should not add repeated numbers to every card" - ); - - // Same invariant on /chart/{slug}. - let slug = pick_chart_slug(&server, |s| s.starts_with("TPC-H")).await?; - let body = client - .get(server.url(&format!("/chart/{slug}"))) - .send() - .await? - .text() - .await?; - assert!( - body.contains(r#"class="toolbar toolbar--card""#), - "chart page must carry a per-chart toolbar" - ); - assert!(!body.contains(r#"data-mode="#)); - assert!(!body.contains(r#"data-scope="#)); - assert!(body.contains(r#"data-role="scope-slider""#)); - assert!(!body.contains(r#"scope-slider-label"#)); - - // Same invariant on /group/{slug}. - let group_slug = pick_group_slug(&server, |s| s.starts_with("TPC-H")).await?; - let body = client - .get(server.url(&format!("/group/{group_slug}"))) - .send() - .await? 
- .text() - .await?; - assert!( - body.contains(r#"class="toolbar toolbar--card""#), - "group page must carry per-chart toolbars" - ); - assert!(!body.contains(r#"data-mode="#)); - assert!(!body.contains(r#"data-scope="#)); - assert!(body.contains(r#"data-role="scope-slider""#)); - assert!(!body.contains(r#"scope-slider-label"#)); - Ok(()) -} - -/// Landing-page `
` summaries appear in the canonical v2 order: the -/// fixture seeds Random Access, Compression, Compression Size, TPC-H, and a -/// vector-search group. The first three are in `api::GROUP_ORDER` in the -/// expected positions; TPC-H follows; the unknown vector-search group sorts -/// last (alphabetical fallback after the listed names). -#[tokio::test] -async fn landing_groups_render_in_v2_order() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let body = client.get(server.url("/")).send().await?.text().await?; - - // Extract group names in render order from the `data-group-name=` attrs. - let mut names = Vec::new(); - for window in body.split("data-group-name=\"").skip(1) { - if let Some(end) = window.find('"') { - names.push(window[..end].to_string()); - } - } - let expected = [ - "Random Access", - "Compression", - "Compression Size", - "TPC-H (NVMe) (SF=1)", - "cohere-large-10m / partitioned", - ]; - assert_eq!(names, expected, "v2 ordering"); - Ok(()) -} - -#[tokio::test] -async fn empty_landing_page_renders() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let resp = client.get(server.url("/")).send().await?; - assert_eq!(resp.status(), 200); - let body = resp.text().await?; - assert!(body.contains("No data ingested yet")); - Ok(()) -} - -/// Landing page renders the global filter dropdown inside the sticky -/// header, with chip rows for engine and format sourced from the seeded -/// data — no hard-coding. -#[tokio::test] -async fn landing_page_renders_global_filter_bar() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let body = client.get(server.url("/")).send().await?.text().await?; - - // The dropdown lives inside the sticky header so it stays on-screen - // while the user scrolls. 
- let header_chunk = body - .split(r#"class="sticky-header""#) - .nth(1) - .and_then(|s| s.split("").next()) - .context("sticky header chunk")?; - assert!( - header_chunk.contains(r#"data-role="global-filter-bar""#), - "filter dropdown must live inside the sticky header" - ); - assert!(header_chunk.contains(r#"data-role="filter-trigger""#)); - assert!(header_chunk.contains(r#"data-role="filter-panel""#)); - assert!(header_chunk.contains(r#"data-filter="engine""#)); - assert!(header_chunk.contains(r#"data-filter="format""#)); - // Engines + formats from the seed fixture must appear as chips. - assert!(body.contains(r#"data-value="datafusion""#)); - assert!(body.contains(r#"data-value="duckdb""#)); - assert!(body.contains(r#"data-value="vortex-file-compressed""#)); - assert!(body.contains(r#"data-value="parquet""#)); - // Both rows have an "all" reset chip. - assert!(body.matches(r#"data-value="*""#).count() >= 2); - // The "all" chip is now a one-shot reset and is never rendered active — - // active chips reflect the visible engine/format set. - assert!( - !body.contains(r#"class="filter-chip filter-chip--all filter-chip--active""#), - "the 'all' chip should never start active" - ); - // No filter applied by default → every specific chip is active. - let engine_section = filter_section(&body, "engine"); - for engine in ["datafusion", "duckdb"] { - assert!( - extract_chip(&engine_section, engine).contains("filter-chip--active"), - "engine chip {engine} should be active when no filter is applied" - ); - } - // No badge on the trigger when nothing is hidden. - assert!( - !body.contains(r#"data-role="filter-badge""#), - "filter badge should be absent when no chips are off" - ); - // Embedded filter state JSON for the client to pick up. 
- assert!(body.contains(r#"id="bench-filter-state""#)); - - insta_settings().bind(|| { - insta::assert_snapshot!("landing_page_filter_bar", filter_bar_section(&body)); - }); - Ok(()) -} - -/// Landing page honours `?engine=`/`?format=` and reflects them as the -/// active chip set + initial filter-state JSON, so a refresh preserves view. -#[tokio::test] -async fn landing_page_honours_filter_query_params() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let body = client - .get(server.url("/?engine=duckdb&format=vortex-file-compressed")) - .send() - .await? - .text() - .await?; - - assert!( - body.contains(r#"{"engines":["duckdb"],"formats":["vortex-file-compressed"]}"#), - "filter state JSON should reflect query params" - ); - let engine_section = filter_section(&body, "engine"); - assert!( - engine_section.contains(r#"data-value="duckdb""#) - && extract_chip(&engine_section, "duckdb").contains("filter-chip--active"), - "duckdb chip should be active" - ); - assert!( - !extract_chip(&engine_section, "datafusion").contains("filter-chip--active"), - "datafusion chip should NOT be active when engine=duckdb" - ); - assert!( - !extract_chip(&engine_section, "*").contains("filter-chip--active"), - "the 'all' chip is a reset, never active" - ); - // Trigger should show a badge counting the off chips (1 engine + 1 format). - assert!( - body.contains(r#"data-role="filter-badge""#), - "trigger should render a badge when chips are filtered off" - ); - Ok(()) -} - -/// The landing page does not inline chart JSON. Its first materialized shard -/// caps chart payloads at 100 commits regardless of `?n=`; power users get -/// full history via the explicit `/api/chart/{slug}?n=all` refetch. -#[tokio::test] -async fn landing_first_group_shard_caps_commits() -> Result<()> { - // 101 commits is the smallest fixture above the 100-commit artifact cap, so the - // cap actually kicks in. 
`seed_long_history` only seeds the Random-Access - // group; with the canonical group ordering Random Access sorts first. - let server = Server::start().await?; - seed_long_history(&server, 101).await?; - - let client = reqwest::Client::new(); - let body = client.get(server.url("/")).send().await?.text().await?; - assert!( - !body.contains(r#"id="chart-data-0""#), - "landing page should not inline chart JSON" - ); - - let generation = attr_value(&body, "data-artifact-generation") - .context("landing exposes artifact generation")?; - let group_slug = attr_value(&body, "data-group-slug").context("landing exposes group slug")?; - let shard: Value = client - .get(server.url(&format!( - "/api/artifacts/{generation}/groups/{group_slug}/shards/0" - ))) - .send() - .await? - .json() - .await?; - let commits = shard["charts"][0]["commits"] - .as_array() - .context("shard chart commits array")?; - assert!( - commits.len() <= 100, - "landing shard must cap commits at 100, \ - got {}", - commits.len(), - ); - // Sanity check: the cap actually fired on this fixture (≥ 100 commits - // seeded). Without this we'd silently regress to "always small fixture". - assert_eq!( - commits.len(), - 100, - "with 101 seeded commits the shard payload should be exactly the \ - 100-commit cap; got {}", - commits.len(), - ); - - // ?n=all on the URL still parses without panicking and still leaves the - // landing page as shell-only metadata. - let body_all = client - .get(server.url("/?n=all")) - .send() - .await? - .text() - .await?; - assert!( - !body_all.contains(r#"id="chart-data-0""#), - "?n=all on the landing page must not inline full-history chart data" - ); - Ok(()) -} - -/// Sanity smoke test: round-trip every chart slug `/api/groups` returns -/// through `/chart/{slug}` to make sure each slug shape's HTML route is -/// wired up. 
-#[tokio::test] -async fn chart_page_round_trips_every_slug() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let groups: Value = client - .get(server.url("/api/groups")) - .send() - .await? - .json() - .await?; - let slugs: Vec = groups["groups"] - .as_array() - .context("groups is array")? - .iter() - .flat_map(|g| g["charts"].as_array().cloned().unwrap_or_default()) - .filter_map(|c| c["slug"].as_str().map(str::to_string)) - .collect(); - anyhow::ensure!(!slugs.is_empty(), "expected at least one chart slug"); - - for slug in &slugs { - let resp = client - .get(server.url(&format!("/chart/{slug}"))) - .send() - .await?; - assert_eq!( - resp.status(), - 200, - "chart page for slug {slug} should be 200" - ); - let body = resp.text().await?; - assert!( - body.contains(r#"id="chart-data-0""#), - "missing inline chart data for slug {slug}" - ); - } - Ok(()) -} diff --git a/benchmarks-website/server/tests/permalinks.rs b/benchmarks-website/server/tests/permalinks.rs deleted file mode 100644 index 3b8b3919119..00000000000 --- a/benchmarks-website/server/tests/permalinks.rs +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Integration tests for `/chart/{slug}` and `/group/{slug}` permalink -//! behaviour: full-history payloads, embedded filter state, 404s on -//! unknown slugs. - -mod common; - -use anyhow::Context as _; -use anyhow::Result; - -use self::common::Server; -use self::common::attr_value; -use self::common::extract_chart_data; -use self::common::pick_chart_slug; -use self::common::pick_group_slug; -use self::common::seed; -use self::common::seed_long_history; - -/// `/chart/{slug}` defaults to the materialized latest-100 window and -/// upgrades to full raw history only with `?n=all`. 
`/group/{slug}` renders -/// shell markup and hydrates the same latest-100 materialized shards as the -/// landing page. -#[tokio::test] -async fn permalink_pages_default_to_latest_100_and_opt_into_full_history() -> Result<()> { - let server = Server::start().await?; - seed_long_history(&server, 101).await?; - - let chart_slug = pick_chart_slug(&server, |s| s == "Random Access").await?; - let group_slug = pick_group_slug(&server, |s| s == "Random Access").await?; - let client = reqwest::Client::new(); - - let chart_body = client - .get(server.url(&format!("/chart/{chart_slug}"))) - .send() - .await? - .text() - .await?; - let chart_payload = - extract_chart_data(&chart_body, 0).context("chart inline payload present")?; - assert_eq!( - chart_payload["commits"] - .as_array() - .context("commits is array")? - .len(), - 100, - "/chart permalink should default to the latest-100 materialized window", - ); - - let chart_all_body = client - .get(server.url(&format!("/chart/{chart_slug}?n=all"))) - .send() - .await? - .text() - .await?; - let chart_all_payload = - extract_chart_data(&chart_all_body, 0).context("chart all inline payload present")?; - assert_eq!( - chart_all_payload["commits"] - .as_array() - .context("all commits is array")? - .len(), - 101, - "/chart?n=all should inline the full raw history", - ); - - let group_body = client - .get(server.url(&format!("/group/{group_slug}"))) - .send() - .await? 
- .text() - .await?; - assert!( - !group_body.contains(r#"id="chart-data-0""#), - "/group permalink should not inline chart payloads" - ); - let generation = attr_value(&group_body, "data-artifact-generation") - .context("group page exposes artifact generation")?; - let shard_prefix = attr_value(&group_body, "data-group-shard-prefix") - .context("group page exposes shard prefix")?; - let shard_path = shard_prefix - .strip_prefix('/') - .map(|s| format!("/{s}0")) - .context("absolute shard prefix")?; - let shard: serde_json::Value = client - .get(server.url(&shard_path)) - .send() - .await? - .json() - .await?; - assert_eq!( - shard["charts"][0]["commits"] - .as_array() - .context("shard commits is array")? - .len(), - 100, - "/group permalink should hydrate the latest-100 shard", - ); - assert!( - shard_path.contains(&generation), - "group shard URL should be versioned by generation" - ); - - Ok(()) -} - -/// Permalink pages render the same filter dropdown in the navbar (so the -/// user can adjust visibility from any page) and embed the filter-state -/// JSON so chart-init.js applies the filter on hydration. -#[tokio::test] -async fn permalink_pages_embed_filter_state() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let chart_slug = pick_chart_slug(&server, |s| s.starts_with("TPC-H")).await?; - let group_slug = pick_group_slug(&server, |s| s.starts_with("TPC-H")).await?; - - let chart_body = client - .get(server.url(&format!("/chart/{chart_slug}?engine=duckdb&format=parquet"))) - .send() - .await? - .text() - .await?; - assert!( - chart_body.contains(r#"id="bench-filter-state""#), - "chart permalink must embed filter state" - ); - assert!( - chart_body.contains(r#"{"engines":["duckdb"],"formats":["parquet"]}"#), - "chart permalink must echo the query-param filter state" - ); - - let group_body = client - .get(server.url(&format!("/group/{group_slug}?engine=duckdb"))) - .send() - .await? 
- .text() - .await?; - assert!( - group_body.contains(r#"{"engines":["duckdb"],"formats":[]}"#), - "group permalink must echo the query-param filter state" - ); - Ok(()) -} - -#[tokio::test] -async fn unknown_slug_renders_404() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let resp = client.get(server.url("/chart/qm.aGVsbG8")).send().await?; - assert_eq!(resp.status(), 404); - let body = resp.text().await?; - assert!(body.contains("chart not found")); - Ok(()) -} diff --git a/benchmarks-website/server/tests/snapshots/chart_page_query.snap b/benchmarks-website/server/tests/snapshots/chart_page_query.snap deleted file mode 100644 index 7c576b55234..00000000000 --- a/benchmarks-website/server/tests/snapshots/chart_page_query.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: benchmarks-website/server/tests/chart_api.rs -assertion_line: 52 -expression: body ---- -tpch sf=1 Q1 [nvme] - Vortex Benchmarks

unit: ns · 2 series · 3 commits

Show
Y
build
diff --git a/benchmarks-website/server/tests/snapshots/group_page_query.snap b/benchmarks-website/server/tests/snapshots/group_page_query.snap deleted file mode 100644 index a1716e15501..00000000000 --- a/benchmarks-website/server/tests/snapshots/group_page_query.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: benchmarks-website/server/tests/group_api.rs -assertion_line: 51 -expression: body ---- -TPC-H (NVMe) (SF=1) - Vortex Benchmarks
TPC-H (NVMe) (SF=1)2 charts

Performance Summary

#1datafusion:vortex-file-compressed1.11x1.80 ms
#2duckdb:parquet1.60x900.00 us
Geomean of query time ratio to fastest (lower is better)
Y

Q1

Show
Y

Q2

Show
Y
build
diff --git a/benchmarks-website/server/tests/snapshots/landing_page.snap b/benchmarks-website/server/tests/snapshots/landing_page.snap deleted file mode 100644 index 8337ed6dc05..00000000000 --- a/benchmarks-website/server/tests/snapshots/landing_page.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: benchmarks-website/server/tests/landing.rs -assertion_line: 102 -expression: body ---- -Vortex Benchmarks
Random Access1 chart

Random Access Performance

#1vortex-file-compressed100.50 us1.00x
#2parquet201.00 us2.00x
Random access time | Ratio to fastest (lower is better)
Y

taxi

Show
Y
Compression1 chart

Compression Throughput vs Parquet

Write Speed (Compression)2.00x
📤Scan Speed (Decompression)2.00x
Inverse geomean of Vortex/Parquet ratios (higher is better)
Y

tpch-lineitem

Show
Y
Compression Size1 chart

Compression Size Summary

⬇️Min Size Ratio0.50x
📊Mean Size Ratio0.50x
⬆️Max Size Ratio0.50x
Geomean of Vortex/Parquet size ratios (lower is better)
Y

tpch-lineitem

Show
Y
TPC-H (NVMe) (SF=1)2 charts

Performance Summary

#1datafusion:vortex-file-compressed1.11x1.80 ms
#2duckdb:parquet1.60x900.00 us
Geomean of query time ratio to fastest (lower is better)
Y

Q1

Show
Y

Q2

Show
Y
cohere-large-10m / partitioned1 chart
Y

threshold=0.75

Show
Y
build
diff --git a/benchmarks-website/server/tests/snapshots/landing_page_filter_bar.snap b/benchmarks-website/server/tests/snapshots/landing_page_filter_bar.snap deleted file mode 100644 index 1a995f8a01f..00000000000 --- a/benchmarks-website/server/tests/snapshots/landing_page_filter_bar.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: benchmarks-website/server/tests/web_ui.rs -expression: filter_bar_section(&body) ---- -
diff --git a/benchmarks-website/server/tests/static_assets.rs b/benchmarks-website/server/tests/static_assets.rs deleted file mode 100644 index 8533419a416..00000000000 --- a/benchmarks-website/server/tests/static_assets.rs +++ /dev/null @@ -1,211 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Integration tests for the bundled `/static/...` asset routes plus the -//! response compression layer. - -mod common; - -use std::io::Read as _; - -use anyhow::Result; -use flate2::read::GzDecoder; - -use self::common::Server; -use self::common::seed; - -#[tokio::test] -async fn static_assets_are_served() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - for (path, ct_prefix) in [ - ("/static/chart.umd.js", "application/javascript"), - ( - "/static/chartjs-plugin-zoom.umd.min.js", - "application/javascript", - ), - ("/static/chart-init.js", "application/javascript"), - ("/static/style.css", "text/css"), - ("/static/icon-light.png", "image/png"), - ("/static/icon-dark.png", "image/png"), - ("/Vortex_Black_NoBG.png", "image/png"), - ("/Vortex_White_NoBG.png", "image/png"), - ] { - let resp = client.get(server.url(path)).send().await?; - assert_eq!(resp.status(), 200, "GET {path} should be 200"); - let ct = resp - .headers() - .get(reqwest::header::CONTENT_TYPE) - .and_then(|v| v.to_str().ok()) - .unwrap_or_default() - .to_string(); - assert!( - ct.starts_with(ct_prefix), - "GET {path}: content-type {ct:?} should start with {ct_prefix:?}" - ); - let cache_control = resp - .headers() - .get(reqwest::header::CACHE_CONTROL) - .and_then(|v| v.to_str().ok()) - .unwrap_or_default() - .to_string(); - assert!( - cache_control.contains("no-cache"), - "GET {path}: static assets should revalidate so UI CSS/JS changes are not stale" - ); - let bytes = resp.bytes().await?; - assert!(!bytes.is_empty(), "GET {path}: body must not be empty"); - } - Ok(()) -} - -#[tokio::test] 
-async fn chart_init_uses_bounded_group_hydration() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let js = client - .get(server.url("/static/chart-init.js")) - .send() - .await? - .text() - .await?; - - assert!( - js.contains("data-group-shard-prefix") && js.contains("fetchGroupShard"), - "landing hydration should fetch versioned group shard artifacts" - ); - assert!( - !js.contains(r#""/api/group/""#), - "landing hydration should not put whole-group payloads on the hot path" - ); - assert!( - js.contains(r#""/api/chart/""#) && js.contains("ensureFullHistory"), - "chart-init should use /api/chart for queued full-history warmup" - ); - assert!( - js.contains("HYDRATION_CONCURRENCY") && js.contains("FULL_HISTORY_CONCURRENCY"), - "landing hydration and full-history warmup should bound per-tab fanout" - ); - assert!( - !js.contains("startBackgroundPrefetch();"), - "chart-init must not fan out full-history chart fetches on page load" - ); - assert!( - js.contains("normalizeChartPayload"), - "chart-init should normalize latest-100 payloads onto a virtual full-history x-axis" - ); - assert!( - js.contains("queueGroupFullHistory"), - "opening a group should queue full-history fetches for that group" - ); - assert!( - !js.contains("function maybeRefetchFullPayload"), - "full-history fetches should not depend on the old covers-all interaction gate" - ); - assert!( - !js.contains("WIDE_DEFAULT_GROUPS"), - "group-open hydration should default every group to the latest-100 window" - ); - Ok(()) -} - -/// Every response — landing HTML, chart JSON, bundled JS — flows through -/// `tower-http`'s `CompressionLayer` so a client advertising -/// `Accept-Encoding: gzip` gets a gzipped (or brotli) body. 
The -/// reqwest dev-dependency is built without `gzip`/`brotli` features, so the -/// transport hands us the compressed bytes verbatim and we can both inspect -/// the `content-encoding` response header and decompress the body manually -/// to confirm it matches the uncompressed snapshot. -#[tokio::test] -async fn responses_are_compressed_when_client_accepts_gzip() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - - // 1. Landing HTML. - let plain_body = client.get(server.url("/")).send().await?.text().await?; - let resp = client - .get(server.url("/")) - .header(reqwest::header::ACCEPT_ENCODING, "gzip") - .send() - .await?; - assert_eq!(resp.status(), 200); - let encoding = resp - .headers() - .get(reqwest::header::CONTENT_ENCODING) - .and_then(|v| v.to_str().ok()) - .unwrap_or_default() - .to_string(); - assert_eq!( - encoding, "gzip", - "GET / with Accept-Encoding: gzip should respond with gzip" - ); - let compressed = resp.bytes().await?; - assert!( - compressed.len() < plain_body.len(), - "compressed body ({} B) should be smaller than plain body ({} B)", - compressed.len(), - plain_body.len(), - ); - let mut decoded = String::new(); - GzDecoder::new(&compressed[..]).read_to_string(&mut decoded)?; - assert_eq!( - decoded, plain_body, - "gzipped landing body should decompress to the uncompressed body" - ); - - // 2. Bundled JS — the heaviest static asset; gzip is the whole point. - let plain_js = client - .get(server.url("/static/chart.umd.js")) - .send() - .await? 
- .bytes() - .await?; - let js_resp = client - .get(server.url("/static/chart.umd.js")) - .header(reqwest::header::ACCEPT_ENCODING, "gzip") - .send() - .await?; - assert_eq!(js_resp.status(), 200); - let js_encoding = js_resp - .headers() - .get(reqwest::header::CONTENT_ENCODING) - .and_then(|v| v.to_str().ok()) - .unwrap_or_default() - .to_string(); - assert_eq!( - js_encoding, "gzip", - "/static/chart.umd.js must compress so the cold load isn't dominated by ~200KB of JS" - ); - let compressed_js = js_resp.bytes().await?; - let mut decoded_js = Vec::new(); - GzDecoder::new(&compressed_js[..]).read_to_end(&mut decoded_js)?; - assert_eq!( - decoded_js, - plain_js.as_ref(), - "decompressed chart.umd.js should match the unencoded body byte-for-byte" - ); - - // 3. Brotli is also offered when the client prefers it. - let br_resp = client - .get(server.url("/")) - .header(reqwest::header::ACCEPT_ENCODING, "br") - .send() - .await?; - assert_eq!(br_resp.status(), 200); - let br_encoding = br_resp - .headers() - .get(reqwest::header::CONTENT_ENCODING) - .and_then(|v| v.to_str().ok()) - .unwrap_or_default() - .to_string(); - assert_eq!( - br_encoding, "br", - "GET / with Accept-Encoding: br should respond with brotli" - ); - - Ok(()) -} diff --git a/benchmarks-website/server/tests/web_ui.rs b/benchmarks-website/server/tests/web_ui.rs deleted file mode 100644 index 83edf2c86e6..00000000000 --- a/benchmarks-website/server/tests/web_ui.rs +++ /dev/null @@ -1,506 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Integration tests for the v3 web UI features that span the API and -//! HTML routes: -//! -//! - **Per-group hover descriptions** (Task A): editorial blurbs port from -//! v2's `BENCHMARK_DESCRIPTIONS` + `getBenchmarkDescription`. Asserted on -//! the landing page and on the `/group/{slug}` permalink. -//! - **Partial-coverage commits** (Task B): a chart's x-axis includes -//! 
commits that have NO row in the fact table for this chart, so -//! missing measurements render as visible gaps rather than silently -//! bridged lines. - -mod common; - -use anyhow::Context as _; -use anyhow::Result; -use serde_json::Value; -use serde_json::json; - -use self::common::Server; -use self::common::TOKEN; -use self::common::pick_chart_slug; -use self::common::pick_group_slug; -use self::common::seed; -use self::common::wait_for_materialized_first_chart_commits; - -// ============================================================================= -// Task A — per-group hover descriptions -// ============================================================================= - -/// The landing page renders a small ⓘ icon next to every group title that -/// has a canonical description, with the description surfaced via the -/// `data-tooltip` attribute (CSS-only hover/focus tooltip). The description -/// also appears on `/api/groups`. -#[tokio::test] -async fn landing_page_renders_group_descriptions() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let body = client.get(server.url("/")).send().await?.text().await?; - - // Random Access — verbatim v2 description. - assert!( - body.contains(r#"data-tooltip="Tests performance of selecting arbitrary row indices from a file on NVMe storage""#), - "Random Access description must appear as a hover tooltip on the landing page" - ); - // Compression — verbatim v2 description (the longer wording, not the - // shorter `getBenchmarkDescription` fallback). - assert!( - body.contains(r#"data-tooltip="Measures encoding and decoding throughput (MB/s) for Vortex files and Parquet files (with zstd page compression)""#), - "Compression description must appear as a hover tooltip on the landing page" - ); - // Compression Size — verbatim v2 description. 
- assert!( - body.contains(r#"data-tooltip="Compares compressed file sizes and compression ratios across different encoding strategies""#), - "Compression Size description must appear as a hover tooltip on the landing page" - ); - // TPC-H NVMe SF=1 — derived description with scale-bytes annotation. - assert!( - body.contains( - r#"data-tooltip="TPC-H benchmark queries on local NVMe storage at SF=1 (~1GB of data)""# - ), - "TPC-H description with scale-bytes annotation must appear on the landing page" - ); - - // The icon itself is keyboard-focusable + role-annotated for a11y. - assert!( - body.contains(r#" Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let slug = pick_group_slug(&server, |s| s == "Random Access").await?; - let body = client - .get(server.url(&format!("/group/{slug}"))) - .send() - .await? - .text() - .await?; - - assert!( - body.contains(r#"data-tooltip="Tests performance of selecting arbitrary row indices from a file on NVMe storage""#), - "group permalink page must render the same description tooltip as the landing page" - ); - assert!( - body.contains(r#" Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let body = client.get(server.url("/")).send().await?.text().await?; - - // Locate the vector-search section and assert no info-icon inside its - // disclosure summary. - let needle = r#"data-group-name="cohere-large-10m / partitioned""#; - let start = body.find(needle).context("vector-search section present")?; - // The `` tag is the disclosure header; we want the slice - // between this section's start and the end of its ``. - let summary_end = body[start..] 
- .find("") - .map(|p| start + p) - .context("section contains ")?; - let summary = &body[start..summary_end]; - assert!( - !summary.contains("group-info-icon"), - "vector-search group should not render an info-icon (no canonical description), got: {summary}" - ); - Ok(()) -} - -/// `/api/groups` carries the description on every group entry as a `description` -/// field, so external API consumers can render their own UI without having to -/// hard-code v2's description list. -#[tokio::test] -async fn groups_api_carries_description_field() -> Result<()> { - let server = Server::start().await?; - seed(&server).await?; - - let client = reqwest::Client::new(); - let groups: Value = client - .get(server.url("/api/groups")) - .send() - .await? - .json() - .await?; - let arr = groups["groups"].as_array().context("groups[] array")?; - let by_name = |n: &str| { - arr.iter() - .find(|g| g["name"].as_str() == Some(n)) - .with_context(|| format!("group {n:?} present")) - }; - assert_eq!( - by_name("Random Access")?["description"].as_str(), - Some("Tests performance of selecting arbitrary row indices from a file on NVMe storage"), - ); - assert_eq!( - by_name("TPC-H (NVMe) (SF=1)")?["description"].as_str(), - Some("TPC-H benchmark queries on local NVMe storage at SF=1 (~1GB of data)"), - ); - // Vector-search group has no canonical description; the `description` - // key should be absent (skip_serializing_if). - let vsg = by_name("cohere-large-10m / partitioned")?; - assert!( - vsg.get("description").is_none(), - "vector-search group should not carry a description field, got: {vsg}" - ); - Ok(()) -} - -// ============================================================================= -// Task B — partial-coverage commits -// ============================================================================= - -/// Build an envelope that records a `random_access_time` measurement only -/// for the listed `(format, value_ns)` pairs. 
The fixture commits' SHAs are -/// deterministic so tests can assert on them. -fn ra_envelope(sha: &str, ts: &str, msg: &str, rows: &[(&str, i64)]) -> Value { - json!({ - "run_meta": { - "benchmark_id": "partial-coverage-fixture", - "schema_version": 1, - "started_at": ts - }, - "commit": { - "sha": sha, - "timestamp": ts, - "message": msg, - "author_name": "Test Author", - "author_email": "author@example.com", - "committer_name": "Test Committer", - "committer_email": "committer@example.com", - "tree_sha": "fedcba9876543210fedcba9876543210fedcba98", - "url": format!("https://github.com/vortex-data/vortex/commit/{sha}") - }, - "records": rows.iter().map(|(format, value_ns)| json!({ - "kind": "random_access_time", - "commit_sha": sha, - "dataset": "taxi", - "format": format, - "value_ns": value_ns, - "all_runtimes_ns": [value_ns] - })).collect::>() - }) -} - -/// Regression test for "charts have invisible gaps where commits should be." -/// -/// Seed three commits A, B, C in chronological order: -/// * A — series X and Y both have data -/// * B — only series Y has data (X crashed; this is the partial-coverage case) -/// * C — series X and Y both have data -/// -/// The chart's `commits[]` must include all three commits (B included), -/// and series X's value at B must be `null`. Before the fix the chart -/// silently dropped B because `SeriesAccumulator::ensure_commit` only -/// registered commits that had at least one row in the fact table. 
-#[tokio::test] -async fn chart_includes_commits_with_partial_series_coverage() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - let envelopes = [ - ra_envelope( - "aaaa111111111111111111111111111111111111", - "2026-04-23T12:00:00Z", - "A: both series", - &[("vortex-file-compressed", 500), ("parquet", 1_000)], - ), - ra_envelope( - "bbbb222222222222222222222222222222222222", - "2026-04-24T12:00:00Z", - "B: only parquet (vortex crashed)", - &[("parquet", 1_100)], - ), - ra_envelope( - "cccc333333333333333333333333333333333333", - "2026-04-25T12:00:00Z", - "C: both series", - &[("vortex-file-compressed", 600), ("parquet", 1_200)], - ), - ]; - for env in &envelopes { - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(env) - .send() - .await?; - anyhow::ensure!( - resp.status().is_success(), - "seed ingest failed: {}", - resp.status() - ); - } - wait_for_materialized_first_chart_commits(&server, 3).await?; - - let slug = pick_chart_slug(&server, |s| s == "Random Access").await?; - let chart: Value = client - .get(server.url(&format!("/api/chart/{slug}"))) - .send() - .await? - .json() - .await?; - - let commits = chart["commits"].as_array().context("commits[] array")?; - let shas: Vec<&str> = commits.iter().filter_map(|c| c["sha"].as_str()).collect(); - assert_eq!( - shas, - vec![ - "aaaa111111111111111111111111111111111111", - "bbbb222222222222222222222222222222222222", - "cccc333333333333333333333333333333333333", - ], - "all three commits must appear in commits[], including the partial-coverage commit B" - ); - - // Series X (vortex-file-compressed) has data at A and C, NULL at B. 
- let vortex = chart["series"]["vortex-file-compressed"] - .as_array() - .context("vortex-file-compressed series array")?; - assert_eq!(vortex.len(), 3, "series array aligns with commits[]"); - assert_eq!(vortex[0].as_f64(), Some(500.0)); - assert!( - vortex[1].is_null(), - "vortex-file-compressed must be null at the partial-coverage commit, got {:?}", - vortex[1], - ); - assert_eq!(vortex[2].as_f64(), Some(600.0)); - - // Series Y (parquet) has data at all three commits. - let parquet = chart["series"]["parquet"] - .as_array() - .context("parquet series array")?; - assert_eq!(parquet[0].as_f64(), Some(1_000.0)); - assert_eq!(parquet[1].as_f64(), Some(1_100.0)); - assert_eq!(parquet[2].as_f64(), Some(1_200.0)); - - Ok(()) -} - -/// A commit with NO row in the chart's fact table (every benchmark crashed -/// for that commit) still appears on the chart's x-axis as long as it falls -/// within the chart's window — i.e. ≥ the earliest commit that has data. -/// -/// Seed two commits with random-access data and one commit that only has a -/// `compression_size` row. The compression-size-only commit is in the -/// `commits` dim but has nothing in `random_access_times`, so the random- -/// access chart should still place it on the x-axis with NULL for every -/// series. -#[tokio::test] -async fn chart_includes_commits_with_zero_rows_in_fact_table() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - // Commit A: random-access only. - let env_a = ra_envelope( - "aaaa111111111111111111111111111111111111", - "2026-04-23T12:00:00Z", - "A", - &[("parquet", 1_000)], - ); - // Commit B (chronologically between A and C): a compression_size row, - // nothing in random_access_times. 
- let env_b = json!({ - "run_meta": { - "benchmark_id": "partial-coverage-fixture", - "schema_version": 1, - "started_at": "2026-04-24T12:00:00Z" - }, - "commit": { - "sha": "bbbb222222222222222222222222222222222222", - "timestamp": "2026-04-24T12:00:00Z", - "message": "B: random-access did not run (only compression_size emitted)", - "author_name": "Test Author", - "author_email": "author@example.com", - "committer_name": "Test Committer", - "committer_email": "committer@example.com", - "tree_sha": "fedcba9876543210fedcba9876543210fedcba98", - "url": "https://github.com/vortex-data/vortex/commit/bbbb222222222222222222222222222222222222" - }, - "records": [ - { - "kind": "compression_size", - "commit_sha": "bbbb222222222222222222222222222222222222", - "dataset": "tpch-lineitem", - "format": "parquet", - "value_bytes": 4_000, - }, - ], - }); - // Commit C: random-access again. - let env_c = ra_envelope( - "cccc333333333333333333333333333333333333", - "2026-04-25T12:00:00Z", - "C", - &[("parquet", 1_200)], - ); - - for env in [&env_a, &env_b, &env_c] { - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(env) - .send() - .await?; - anyhow::ensure!( - resp.status().is_success(), - "seed ingest failed: {}", - resp.status() - ); - } - wait_for_materialized_first_chart_commits(&server, 3).await?; - - let slug = pick_chart_slug(&server, |s| s == "Random Access").await?; - let chart: Value = client - .get(server.url(&format!("/api/chart/{slug}"))) - .send() - .await? - .json() - .await?; - - let shas: Vec<&str> = chart["commits"] - .as_array() - .context("commits[] array")? - .iter() - .filter_map(|c| c["sha"].as_str()) - .collect(); - assert_eq!( - shas, - vec![ - "aaaa111111111111111111111111111111111111", - "bbbb222222222222222222222222222222222222", - "cccc333333333333333333333333333333333333", - ], - "the commit with zero rows in the fact table must still appear in commits[]" - ); - - // The parquet series has data only at A and C. 
- let parquet = chart["series"]["parquet"] - .as_array() - .context("parquet series array")?; - assert_eq!(parquet.len(), 3); - assert_eq!(parquet[0].as_f64(), Some(1_000.0)); - assert!( - parquet[1].is_null(), - "parquet must be null at the zero-rows commit" - ); - assert_eq!(parquet[2].as_f64(), Some(1_200.0)); - - Ok(()) -} - -/// Commits older than the earliest fact-table row for this chart are NOT -/// included on the x-axis. Without this lower bound a chart's first commit -/// could be from before the benchmark even existed — the spec calls this -/// out explicitly as "Beware: don't accidentally include EVERY commit ever." -#[tokio::test] -async fn chart_excludes_commits_before_first_fact_row() -> Result<()> { - let server = Server::start().await?; - let client = reqwest::Client::new(); - - // Commit A: a `compression_time` row (random-access does not exist for A). - let env_a = json!({ - "run_meta": { - "benchmark_id": "partial-coverage-fixture", - "schema_version": 1, - "started_at": "2026-04-22T12:00:00Z" - }, - "commit": { - "sha": "aaaa111111111111111111111111111111111111", - "timestamp": "2026-04-22T12:00:00Z", - "message": "A: pre-history of the random-access bench", - "author_name": "Test Author", - "author_email": "author@example.com", - "committer_name": "Test Committer", - "committer_email": "committer@example.com", - "tree_sha": "fedcba9876543210fedcba9876543210fedcba98", - "url": "https://github.com/vortex-data/vortex/commit/aaaa111111111111111111111111111111111111" - }, - "records": [ - { - "kind": "compression_time", - "commit_sha": "aaaa111111111111111111111111111111111111", - "dataset": "tpch-lineitem", - "format": "parquet", - "op": "encode", - "value_ns": 9_000, - "all_runtimes_ns": [9_000], - }, - ], - }); - // Commit B: first random-access row appears. 
- let env_b = ra_envelope( - "bbbb222222222222222222222222222222222222", - "2026-04-23T12:00:00Z", - "B: random-access bench begins", - &[("parquet", 1_000)], - ); - - for env in [&env_a, &env_b] { - let resp = client - .post(server.url("/api/ingest")) - .bearer_auth(TOKEN) - .json(env) - .send() - .await?; - anyhow::ensure!( - resp.status().is_success(), - "seed ingest failed: {}", - resp.status() - ); - } - wait_for_materialized_first_chart_commits(&server, 1).await?; - - let slug = pick_chart_slug(&server, |s| s == "Random Access").await?; - let chart: Value = client - .get(server.url(&format!("/api/chart/{slug}"))) - .send() - .await? - .json() - .await?; - - let shas: Vec<&str> = chart["commits"] - .as_array() - .context("commits[] array")? - .iter() - .filter_map(|c| c["sha"].as_str()) - .collect(); - assert_eq!( - shas, - vec!["bbbb222222222222222222222222222222222222"], - "commit A predates the first random-access row, so it must not be on the x-axis" - ); - Ok(()) -} diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index b2e49412a35..8448bd518b6 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -45,7 +45,6 @@ humansize = { workspace = true } inventory = { workspace = true } itertools = { workspace = true } jiff = { workspace = true } -multiversion = { workspace = true } num-traits = { workspace = true } num_enum = { workspace = true } parking_lot = { workspace = true } diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index fc1766c599f..3b793c6124a 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -30,7 +30,6 @@ arrow-array = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } async-trait = { workspace = true } -bytes = { workspace = true } bzip2 = { workspace = true } clap = { workspace = true, features = ["derive"] } futures = { workspace = true } diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 3991eccb8c7..34a98f85b73 
100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -18,8 +18,6 @@ itertools = { workspace = true } num-traits = { workspace = true } pco = { workspace = true, optional = true } rand = { workspace = true } -rustc-hash = { workspace = true } -tracing = { workspace = true } vortex-alp = { workspace = true } vortex-array = { workspace = true } vortex-buffer = { workspace = true } @@ -29,7 +27,6 @@ vortex-decimal-byte-parts = { workspace = true } vortex-error = { workspace = true } vortex-fastlanes = { workspace = true } vortex-fsst = { workspace = true } -vortex-mask = { workspace = true } vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true, optional = true } vortex-runend = { workspace = true } diff --git a/vortex-mask/Cargo.toml b/vortex-mask/Cargo.toml index a2af3d27cb0..7aa5bb2ef85 100644 --- a/vortex-mask/Cargo.toml +++ b/vortex-mask/Cargo.toml @@ -14,14 +14,12 @@ rust-version = { workspace = true } version = { workspace = true } [features] -arrow = ["dep:arrow-buffer"] serde = ["dep:serde", "vortex-buffer/serde"] [package.metadata.docs.rs] all-features = true [dependencies] -arrow-buffer = { workspace = true, optional = true } itertools = { workspace = true } serde = { workspace = true, optional = true, features = ["rc"] } vortex-buffer = { workspace = true, features = ["arrow"] } diff --git a/vortex-tensor/Cargo.toml b/vortex-tensor/Cargo.toml index b0521170e67..a293e604b5b 100644 --- a/vortex-tensor/Cargo.toml +++ b/vortex-tensor/Cargo.toml @@ -21,7 +21,6 @@ vortex-array = { workspace = true } vortex-buffer = { workspace = true } vortex-compressor = { workspace = true } vortex-error = { workspace = true } -vortex-fastlanes = { workspace = true } vortex-session = { workspace = true } vortex-utils = { workspace = true } diff --git a/vortex-utils/Cargo.toml b/vortex-utils/Cargo.toml index 66b2576f98d..ca6c639ec0d 100644 --- a/vortex-utils/Cargo.toml +++ b/vortex-utils/Cargo.toml @@ -16,7 +16,6 @@ version 
= { workspace = true } [dependencies] dashmap = { workspace = true, optional = true } hashbrown = { workspace = true } -parking_lot = { workspace = true, optional = true } vortex-error = { workspace = true } [lints]