diff --git a/.github/workflows/benchmark-linux.yaml b/.github/workflows/benchmark-linux.yaml new file mode 100644 index 000000000..7cd097d0c --- /dev/null +++ b/.github/workflows/benchmark-linux.yaml @@ -0,0 +1,40 @@ +name: benchmark + +on: + push: + branches: + - main + - dev + pull_request: + branches: + - main + - dev + +jobs: + ubuntu-build: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + with: + submodules: "recursive" + + - name: Build + id: build + run: sudo su -c "PG_VERSION=15 USE_SOURCE=1 ./ci/scripts/build-linux.sh" + env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + + - name: Run benchmarking + id: test + run: sudo su postgres -c "./ci/scripts/run-benchmarks.sh" + env: + BASE_REF: ${{ github.base_ref }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload to artifacts + uses: actions/upload-artifact@v3 + with: + name: benchmark-results + path: | + /tmp/benchmarks-out.json \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 10a0fdfd0..7f2aeeada 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,9 +184,26 @@ add_custom_target( test COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test - ) +) + +# BENCHMARK +add_custom_target( + benchmark + COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_benchmarks.sh + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/build +) +add_custom_target( + benchmark-skip-setup + COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_benchmarks.sh --skip-setup + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/build +) +add_custom_target( + benchmark-print-only + COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_benchmarks.sh --print-only + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/build +) - # DEVELOPMENT +# DEVELOPMENT find_program(CLANG_FORMAT NAMES clang-format) if (CLANG_FORMAT) @@ -226,12 +243,6 @@ if (CLANG_FORMAT) ) endif() -add_custom_target( - bench - COMMAND ${CMAKE_SOURCE_DIR}/scripts/bench.sh - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - ) - # Package universal 
install script string(REGEX MATCH "^PostgreSQL (\[0-9]+).*" PostgreSQL_VERSION_NUMBER ${PostgreSQL_VERSION_STRING}) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 19a456912..b77b38eea 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,47 +1,55 @@ -Thanks for considering to contribute! -The information below is intended to *help* you contribute. +Thanks for considering contributing! The information below is intended to help you contribute. -## Runing tests +## Running tests -``` --- run all regression tests +```bash +# run all regression tests make test --- only run regression tests that have $FILTER in regression sql file path +# only run regression tests that have $FILTER in regression sql file path make test FILTER=hnsw ``` +## Running benchmarks +This requires Python to be installed. Please check the `Dockerfile.dev` for pip requirements. +```bash +# set up benchmarking, run benchmarks, and print results +make benchmark + +# run benchmarks and print results (skip setup) +make benchmark-skip-setup + +# print most recent benchmark results (skip setup and running benchmarks) +make benchmark-print-only +``` + ## VSCode and IntelliSense `.vscode/c_cpp_properties` is configured to use `./build/compile_commands.json`. -If you build lanterndb in a different directory, make sure to update ``.vscode` config appropriately -in order to have IntelliSense working. +If you build lanterndb in a different directory, make sure to update `.vscode` config appropriately in order to have IntelliSense working. ## Debugging the C codebase -If you make changes to the C codebase, in addition to `make test`, you can also use the `livedebug.py` utility -in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks. +If you make changes to the C codebase, in addition to `make test`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks. 
Below is a short recording demonstrating the use of `livedebug.py`: [![asciicast](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt.svg)](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt) ## Adding/modifying LanternDB's SQL interface -When modifying the SQL interface, you add relevant SQL logic under `sql/`. In addition, you add an update script -under `sql/updates`, in a file named `[CURRENT_VERSION]--latest.sql`. You should create this file if it does not exist. +When modifying the SQL interface, you add relevant SQL logic under `sql/`. In addition, you add an update script under `sql/updates`, in a file named `[CURRENT_VERSION]--latest.sql`. You should create this file if it does not exist. Note that you never modify an already existing update file that does not have `latest` in its name. -The files that do not have `latest` in the name are part of a previous releases and help LanternDB users update -to a newer version of the extension via `ALTER EXTENSION lanterndb UPDATE`. + +The files that do not have `latest` in the name are part of previous releases and help LanternDB users update to a newer version of the extension via `ALTER EXTENSION lanterndb UPDATE`. ## Browsing the Postgres repository offline You can download PostgreSQL source code from [their ftp server](https://www.postgresql.org/ftp/source/). Alternatively, can clone their git repository. ```bash -#full repository +# full repository git clone https://git.postgresql.org/git/postgresql.git -#release head only +# release head only git clone --single-branch --branch REL_15_STABLE https://git.postgresql.org/git/postgresql.git --depth=1 - ``` diff --git a/Dockerfile.dev b/Dockerfile.dev index ea7a4a095..f13c0ac85 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -31,4 +31,11 @@ RUN rm -rf build \ && mkdir build \ && cd build \ && cmake -DUSEARCH_NO_MARCH_NATIVE=ON -DCMAKE_BUILD_TYPE=Debug .. 
\ - && make install \ No newline at end of file + && make install + +# Install benchmarking tools in build folder +RUN git clone https://github.com/lanterndata/benchmark \ + && cd benchmark \ + && pip install -r core/requirements.txt --break-system-packages \ + && pip install -r external/requirements.txt --break-system-packages +ENV DATABASE_URL=postgres://postgres:postgres@localhost:5432/postgres \ No newline at end of file diff --git a/bench.sql b/bench.sql deleted file mode 100644 index 7dd97c422..000000000 --- a/bench.sql +++ /dev/null @@ -1,74 +0,0 @@ - -\set ON_ERROR_STOP on -\timing - -DROP EXTENSION IF EXISTS vector CASCADE; -CREATE EXTENSION IF NOT EXISTS vector; -DROP EXTENSION IF EXISTS lanterndb CASCADE; -CREATE EXTENSION IF NOT EXISTS lanterndb; - --- Create SIFT tables for benchmarking -DROP TABLE IF EXISTS sift_base10k; - CREATE TABLE sift_base10k ( - id SERIAL PRIMARY KEY, - v vector(128)); - - \copy sift_base10k (v) FROM 'base10k.csv' with csv; - --- CREATE TABLE sift_base1m ( --- id SERIAL PRIMARY KEY, --- v vector(128)); - --- CREATE TABLE gist_base1m ( --- id SERIAL PRIMARY KEY, --- v vector(960)); - --- CREATE TABLE sift_base1b ( --- id SERIAL PRIMARY KEY, --- v vector(128)); - --- \copy sift_base1m (v) FROM 'base1m.csv' with csv; - -select v as v4444 from sift_base10k where id = 4444 \gset -EXPLAIN (ANALYZE, TIMING FALSE) select * from sift_base10k order by v <-> :'v4444' -limit 10; - -select id, vector_l2sq_dist(v, :'v4444') -as dist -from sift_base10k order by dist limit 10; - -\set GROUP_LIMIT 10000 - --- CREATE INDEX ON sift_base1m USING hnsw (v vector_l2_ops) WITH (M=2, ef_construction=14, alg="diskann"); -CREATE INDEX ON sift_base10k USING hnsw (v dist_vec_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, alg="diskann"); -CREATE INDEX ON sift_base10k USING ivfflat (v vector_l2_ops); - -\echo "running" v4444 "vector queries" -\echo "@@@@@@@@@@@@@@@@@@@@ ivfflat index is also created @@@@@@@@@@@@@@" -begin; -drop index sift_base10k_v_idx; 
-explain (analyze,buffers) select q.id AS query_id, - ARRAY_AGG(b.id ORDER BY q.v <-> b.v) AS base_ids -FROM - sift_base10k q -JOIN LATERAL ( - SELECT id,v - FROM sift_base10k - ORDER BY q.v <-> v limit 10 -) b ON true -GROUP BY - q.id limit :GROUP_LIMIT; -rollback; -\echo "^^^^^^^^^^^^^^^^^^^^ ivfflat performance above ^^^^^^^^^^^^^^" - -explain (analyze,buffers) select q.id AS query_id, - ARRAY_AGG(b.id ORDER BY q.v <-> b.v) AS base_ids -FROM - sift_base10k q -JOIN LATERAL ( - SELECT id,v - FROM sift_base10k - ORDER BY q.v <-> v limit 10 -) b ON true -GROUP BY - q.id limit :GROUP_LIMIT; diff --git a/ci/scripts/run-benchmarks.sh b/ci/scripts/run-benchmarks.sh new file mode 100755 index 000000000..8d971659f --- /dev/null +++ b/ci/scripts/run-benchmarks.sh @@ -0,0 +1,42 @@ +#!/bin/bash +set -e + +wait_for_pg(){ + tries=0 + until pg_isready -U postgres 2>/dev/null; do + if [ $tries -eq 10 ]; + then + echo "Can not connect to postgres" + exit 1 + fi + + sleep 1 + tries=$((tries+1)) + done +} + +export WORKDIR=/tmp/lanterndb +export PG_VERSION=15 +export GITHUB_OUTPUT=/dev/null +export PGDATA=/etc/postgresql/$PG_VERSION/main/ + +echo "port = 5432" >> $PGDATA/postgresql.conf +# Enable auth without password +echo "local all all trust" > $PGDATA/pg_hba.conf +echo "host all all 127.0.0.1/32 trust" >> $PGDATA/pg_hba.conf +echo "host all all ::1/128 trust" >> $PGDATA/pg_hba.conf + +POSTGRES_HOST_AUTH_METHOD=trust /usr/lib/postgresql/$PG_VERSION/bin/postgres 1>/tmp/pg-out.log 2>/tmp/pg-error.log & +wait_for_pg +cd $WORKDIR/build + +export DATABASE_URL=postgresql://localhost:5432/postgres +git clone https://github.com/lanterndata/benchmark +cd benchmark +pip install -r core/requirements.txt +pip install -r external/requirements.txt +cd .. 
+ +make benchmark +killall postgres + diff --git a/ci/scripts/run-tests.sh b/ci/scripts/run-tests.sh index b5c007e80..974c0c1d8 100755 --- a/ci/scripts/run-tests.sh +++ b/ci/scripts/run-tests.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e wait_for_pg(){ tries=0 diff --git a/scripts/bench.sh b/scripts/bench.sh deleted file mode 100755 index 8e820a13d..000000000 --- a/scripts/bench.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -PSQL=psql -${PSQL} bench_db < ./bench.sql \ No newline at end of file diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh new file mode 100755 index 000000000..95749f128 --- /dev/null +++ b/scripts/run_benchmarks.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -e + +# Benchmarking parameters +BASE_PARAMS="--extension lantern --dataset sift --N 10k" +INDEX_PARAMS="--m 4 --ef_construction 128 --ef 10" +PARAMS="$BASE_PARAMS $INDEX_PARAMS --K 5" + +# Settings +SKIP_SETUP=0 +PRINT_ONLY=0 +while [[ "$#" -gt 0 ]]; do + case $1 in + --skip-setup) SKIP_SETUP=1 ;; + --print-only) PRINT_ONLY=1 ;; + esac + shift +done + +# Go to benchmark directory +cd benchmark + +# Run setup +if [ "$SKIP_SETUP" -ne 1 ] && [ "$PRINT_ONLY" -ne 1 ]; then + echo "Running data setup" + python3 -m core.setup --datapath /tmp/benchmark_data $BASE_PARAMS +else + echo "Skipping data setup" +fi + +# Run benchmarks +if [ "$PRINT_ONLY" -ne 1 ]; then + echo "Running benchmarks" + python3 -m external.run_benchmarks $PARAMS +fi + +# Render benchmarks +python3 -m external.show_benchmarks $PARAMS +python3 -m external.validate_benchmarks $PARAMS +python3 -m external.get_benchmarks_json $PARAMS > /tmp/benchmarks-out.json \ No newline at end of file diff --git a/test/sql/test_helpers/princeton_places.sql b/test/sql/test_helpers/princeton_places.sql deleted file mode 100644 index 09e0d5324..000000000 --- a/test/sql/test_helpers/princeton_places.sql +++ /dev/null @@ -1,8 +0,0 @@ -CREATE TABLE princeton_places ( - name text, - street text, - long float, - lat float, - v vector(2) -); -\copy 
pton_area(name, street, long, lat) FROM '/tmp/lanterndb/vector_datasets/sift_base1k.csv' DELIMITER E',';