Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a setup script that downloads data and creates recall tables #1

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,17 @@ Scripts you can run
- experiments/recall_experiment.py
- experiments/create_experiment.py
- experiments/select_experiment.py
- experiments/disk_usage_experiment.py
- experiments/disk_usage_experiment.py


### Examples

Run recall experiments on Lantern
```bash
python ./experiments/recall_experiment.py --dataset sift --extension pgvector --N 10k 100k
```

Use a custom database URL to run the experiments
```bash
DATABASE_URL='postgresql://ngalstyan:abra@localhost:5432/testdb' python ./experiments/recall_experiment.py --dataset sift --extension pgvector --N 10k 100k
```
38 changes: 19 additions & 19 deletions experiments/scripts/create_tables.sql
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
-- DROP TABLE IF EXISTS sift_base10k;
-- DROP TABLE IF EXISTS sift_base1m;
-- DROP TABLE IF EXISTS gist_base1m;
-- DROP TABLE IF EXISTS sift_base1b;
DROP TABLE IF EXISTS sift_base10k;
DROP TABLE IF EXISTS sift_base1m;
DROP TABLE IF EXISTS gist_base1m;
DROP TABLE IF EXISTS sift_base1b;

-- CREATE TABLE IF NOT EXISTS sift_base10k (
-- id SERIAL PRIMARY KEY,
-- v VECTOR(128)
-- );
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v VECTOR(128)
);

-- \COPY sift_base10k (v) FROM '/app/data/siftsmall/siftsmall_base.csv' WITH csv;
COPY sift_base10k (v) FROM '/tmp/lanterndb/vector_datasets/siftsmall/siftsmall_base.csv' WITH csv;

-- CREATE TABLE IF NOT EXISTS sift_base1m (
-- id SERIAL PRIMARY KEY,
-- v VECTOR(128)
-- );
CREATE TABLE IF NOT EXISTS sift_base1m (
id SERIAL PRIMARY KEY,
v VECTOR(128)
);

-- \COPY sift_base1m (v) FROM '/app/data/sift/sift_base.csv' WITH csv;
COPY sift_base1m (v) FROM '/tmp/lanterndb/vector_datasets/sift/sift_base.csv' WITH csv;

-- CREATE TABLE IF NOT EXISTS gist_base1m (
-- id SERIAL PRIMARY KEY,
Expand All @@ -24,9 +24,9 @@

-- \COPY gist_base1m (v) FROM '/app/data/gist/gist_base.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_base1b (
id SERIAL PRIMARY KEY,
v VECTOR(128)
);
-- CREATE TABLE IF NOT EXISTS sift_base1b (
-- id SERIAL PRIMARY KEY,
-- v VECTOR(128)
-- );

\COPY sift_base1b (v) FROM '/app/data/siftbig/bigann_base.csv' WITH csv;
-- \COPY sift_base1b (v) FROM '/app/data/siftbig/bigann_base.csv' WITH csv;
39 changes: 0 additions & 39 deletions experiments/scripts/create_tables_derived.sql
Original file line number Diff line number Diff line change
Expand Up @@ -122,42 +122,3 @@ SELECT * FROM sift_base1m WHERE id <= 600000;

INSERT INTO sift_base800k
SELECT * FROM sift_base1m WHERE id <= 800000;

INSERT INTO sift_base2m
SELECT * FROM sift_base1b WHERE id <= 2000000;

INSERT INTO sift_base5m
SELECT * FROM sift_base1b WHERE id <= 5000000;

INSERT INTO sift_base10m
SELECT * FROM sift_base1b WHERE id <= 10000000;

INSERT INTO sift_base20m
SELECT * FROM sift_base1b WHERE id <= 20000000;

INSERT INTO sift_base50m
SELECT * FROM sift_base1b WHERE id <= 50000000;

INSERT INTO sift_base100m
SELECT * FROM sift_base1b WHERE id <= 100000000;

INSERT INTO sift_base200m
SELECT * FROM sift_base1b WHERE id <= 200000000;

INSERT INTO sift_base500m
SELECT * FROM sift_base1b WHERE id <= 500000000;

INSERT INTO gist_base100k
SELECT * FROM gist_base1m WHERE id <= 100000;

INSERT INTO gist_base200k
SELECT * FROM gist_base1m WHERE id <= 200000;

INSERT INTO gist_base400k
SELECT * FROM gist_base1m WHERE id <= 400000;

INSERT INTO gist_base600k
SELECT * FROM gist_base1m WHERE id <= 600000;

INSERT INTO gist_base800k
SELECT * FROM gist_base1m WHERE id <= 800000;
98 changes: 14 additions & 84 deletions experiments/scripts/create_tables_recall.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,114 +15,44 @@ DROP TABLE IF EXISTS sift_truth200m;
DROP TABLE IF EXISTS sift_truth500m;
DROP TABLE IF EXISTS sift_truth1b;

-- \set SIFTSMALL_PATH '/tmp/lanterndb/vector_datasets/siftsmall'
-- \set SIFT_PATH '/tmp/lanterndb/vector_datasets/sift'

CREATE TABLE IF NOT EXISTS sift_query10k (
id SERIAL PRIMARY KEY,
v VECTOR(128)
);

\COPY sift_query10k (v) FROM '/app/data/siftsmall/siftsmall_query.csv' WITH csv;
COPY sift_query10k (v) FROM '/tmp/lanterndb/vector_datasets/siftsmall/siftsmall_query.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_query1m (
id SERIAL PRIMARY KEY,
v VECTOR(128)
);

\COPY sift_query1m (v) FROM '/app/data/sift/sift_query.csv' WITH csv;
COPY sift_query1m (v) FROM '/tmp/lanterndb/vector_datasets/sift/sift_query.csv' WITH csv;
-- Some scripts appear to expect this table under the name sift_query100k (unconfirmed),
-- so the same sift_query.csv is also loaded under that name.
CREATE TABLE IF NOT EXISTS sift_query100k (
id SERIAL PRIMARY KEY,
v VECTOR(128)
);
COPY sift_query100k (v) FROM '/tmp/lanterndb/vector_datasets/sift/sift_query.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth10k (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth10k (indices) FROM '/app/data/siftsmall/siftsmall_truth.csv' WITH csv;
COPY sift_truth10k (indices) FROM '/tmp/lanterndb/vector_datasets/siftsmall/siftsmall_truth.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth1m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth1m (indices) FROM '/app/data/sift/sift_truth.csv' WITH csv;
COPY sift_truth1m (indices) FROM '/tmp/lanterndb/vector_datasets/sift/sift_truth.csv' WITH csv;

CREATE TABLE IF NOT EXISTS gist_query1m (
id SERIAL PRIMARY KEY,
v VECTOR(960)
);

\COPY gist_query1m (v) FROM '/app/data/gist/gist_query.csv' WITH csv;

CREATE TABLE IF NOT EXISTS gist_truth1m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY gist_truth1m (indices) FROM '/app/data/gist/gist_truth.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_query1b (
id SERIAL PRIMARY KEY,
v VECTOR(128)
);

\COPY sift_query1b (v) FROM '/app/data/siftbig/bigann_query.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth2m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth2m (indices) FROM '/app/data/siftbig/gnd/idx_2M.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth5m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth5m (indices) FROM '/app/data/siftbig/gnd/idx_5M.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth10m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth10m (indices) FROM '/app/data/siftbig/gnd/idx_10M.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth20m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth20m (indices) FROM '/app/data/siftbig/gnd/idx_20M.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth50m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth50m (indices) FROM '/app/data/siftbig/gnd/idx_50M.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth100m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth100m (indices) FROM '/app/data/siftbig/gnd/idx_100M.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth200m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth200m (indices) FROM '/app/data/siftbig/gnd/idx_200M.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth500m (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth500m (indices) FROM '/app/data/siftbig/gnd/idx_500M.csv' WITH csv;

CREATE TABLE IF NOT EXISTS sift_truth1b (
id SERIAL PRIMARY KEY,
indices INTEGER[]
);

\COPY sift_truth1b (indices) FROM '/app/data/siftbig/gnd/idx_1000M.csv' WITH csv;
);
51 changes: 51 additions & 0 deletions experiments/scripts/setup_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Download the SIFT benchmark CSV files and create the experiment tables.

The script makes sure the ``siftsmall`` and ``sift`` CSV files exist under the
data directory (downloading whichever ones are missing), then executes the
table-creation and init SQL scripts against the database named by the
``DATABASE_URL`` environment variable.

The SQL scripts load data with server-side ``COPY ... FROM '<path>'``, so the
data directory has to be readable by the PostgreSQL server process.
"""
import argparse
import os
import sys
import urllib.request

import psycopg2

# Path that is hard-coded inside the create_tables*.sql scripts.
DEFAULT_DATA_DIR = "/tmp/lanterndb/vector_datasets"
BASE_URL = "https://storage.googleapis.com/lanterndata"

# SQL paths are resolved against this file's directory, not the caller's
# working directory, so the script can be launched from anywhere.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Dataset sub-directory -> files to fetch into it (siftsmall first, then sift).
DATASETS = {
    "siftsmall": ["siftsmall_base.csv", "siftsmall_query.csv", "siftsmall_truth.csv"],
    "sift": ["sift_base.csv", "sift_query.csv", "sift_truth.csv"],
}

# Scripts whose COPY statements reference the data directory.
TABLE_SCRIPTS = ["create_tables.sql", "create_tables_recall.sql"]
# Scripts executed afterwards, unchanged.
INIT_SCRIPTS = ["../../db/init/init_results.sql", "../../db/init/init_util.sql"]


def download_datasets(data_dir):
    """Download every dataset file that is not already present in *data_dir*.

    Each file is fetched to a temporary ``.part`` name and renamed once the
    download has finished, so an interrupted run never leaves a truncated CSV
    that a later run would mistake for a complete one.
    """
    for dataset, filenames in DATASETS.items():
        dataset_dir = os.path.join(data_dir, dataset)
        os.makedirs(dataset_dir, exist_ok=True)
        for filename in filenames:
            target = os.path.join(dataset_dir, filename)
            if os.path.exists(target):
                print(f"{dataset.upper()} {filename} already exists. Skipping download.")
                continue
            print(f"Downloading {dataset.upper()} {filename}")
            partial = target + ".part"
            try:
                urllib.request.urlretrieve(f"{BASE_URL}/{dataset}/{filename}", partial)
                os.replace(partial, target)
            finally:
                # Only still present when the download or rename failed.
                if os.path.exists(partial):
                    os.remove(partial)


def read_sql(relative_path, data_dir=None):
    """Return the text of a SQL script located relative to this file.

    When *data_dir* is given, occurrences of the hard-coded default data
    directory are replaced by it so that ``--datapath`` is honoured by the
    COPY statements. Single quotes are doubled because the path ends up
    inside a SQL string literal.
    """
    with open(os.path.join(SCRIPT_DIR, relative_path), "r") as sqlfile:
        sql = sqlfile.read()
    if data_dir is not None:
        sql = sql.replace(DEFAULT_DATA_DIR, data_dir.replace("'", "''"))
    return sql


def main():
    """Parse arguments, fetch the data and run the SQL setup scripts."""
    parser = argparse.ArgumentParser(
        description="Download SIFT datasets and create the experiment tables."
    )
    parser.add_argument("-d", "--datapath", default=DEFAULT_DATA_DIR, help="Path to data directory")
    args = parser.parse_args()

    # Fail before downloading anything if the database cannot be reached anyway.
    if "DATABASE_URL" not in os.environ:
        sys.exit("DATABASE_URL environment variable must be set to the target database URL.")

    # Server-side COPY needs an absolute path.
    data_dir = os.path.abspath(args.datapath)
    download_datasets(data_dir)

    print("Creating tables...")

    conn = psycopg2.connect(os.environ["DATABASE_URL"])
    try:
        cur = conn.cursor()
        try:
            for script in TABLE_SCRIPTS:
                cur.execute(read_sql(script, data_dir))
            for script in INIT_SCRIPTS:
                cur.execute(read_sql(script))
        finally:
            cur.close()
        # Commit only when every script succeeded; closing the connection
        # without committing discards the partial work.
        conn.commit()
    finally:
        conn.close()

    print("Done!")


if __name__ == "__main__":
    main()