lanterndata · Ngalstyan4 · Aug 7, 2023 · Aug 7, 2023 · Aug 12, 2023
diff --git a/README.md b/README.md
@@ -11,4 +11,17 @@ Scripts you can run
 - experiments/recall_experiment.py
 - experiments/create_experiment.py
 - experiments/select_experiment.py
-- experiments/disk_usage_experiment.py
+- experiments/disk_usage_experiment.py
+
+
+### Examples
+
+Run recall experiments (this example benchmarks the pgvector extension on the SIFT dataset)
+```bash
+python ./experiments/recall_experiment.py --dataset sift --extension pgvector --N 10k 100k
+```
+
+Use a custom database URL to run the experiments (replace the credentials, host and database name with your own)
+```bash
+DATABASE_URL='postgresql://user:password@localhost:5432/testdb' python ./experiments/recall_experiment.py --dataset sift --extension pgvector --N 10k 100k
+```
diff --git a/experiments/scripts/create_tables.sql b/experiments/scripts/create_tables.sql
@@ -1,21 +1,21 @@
--- DROP TABLE IF EXISTS sift_base10k;
--- DROP TABLE IF EXISTS sift_base1m;
--- DROP TABLE IF EXISTS gist_base1m;
--- DROP TABLE IF EXISTS sift_base1b;
+DROP TABLE IF EXISTS sift_base10k;
+DROP TABLE IF EXISTS sift_base1m;
+DROP TABLE IF EXISTS gist_base1m;
+DROP TABLE IF EXISTS sift_base1b;
 
--- CREATE TABLE IF NOT EXISTS sift_base10k (
---   id SERIAL PRIMARY KEY,
---   v VECTOR(128)
--- );
+CREATE TABLE IF NOT EXISTS sift_base10k (
+  id SERIAL PRIMARY KEY,
+  v VECTOR(128)
+);
 
--- \COPY sift_base10k (v) FROM '/app/data/siftsmall/siftsmall_base.csv' WITH csv;
+COPY sift_base10k (v) FROM '/tmp/lanterndb/vector_datasets/siftsmall/siftsmall_base.csv' WITH csv;
 
--- CREATE TABLE IF NOT EXISTS sift_base1m (
---   id SERIAL PRIMARY KEY,
---   v VECTOR(128)
--- );
+CREATE TABLE IF NOT EXISTS sift_base1m (
+  id SERIAL PRIMARY KEY,
+  v VECTOR(128)
+);
 
--- \COPY sift_base1m (v) FROM '/app/data/sift/sift_base.csv' WITH csv;
+COPY sift_base1m (v) FROM '/tmp/lanterndb/vector_datasets/sift/sift_base.csv' WITH csv;
 
 -- CREATE TABLE IF NOT EXISTS gist_base1m (
 --   id SERIAL PRIMARY KEY,
@@ -24,9 +24,9 @@
 
 -- \COPY gist_base1m (v) FROM '/app/data/gist/gist_base.csv' WITH csv;
 
-CREATE TABLE IF NOT EXISTS sift_base1b (
-  id SERIAL PRIMARY KEY,
-  v VECTOR(128)
-);
+-- CREATE TABLE IF NOT EXISTS sift_base1b (
+--   id SERIAL PRIMARY KEY,
+--   v VECTOR(128)
+-- );
 
-\COPY sift_base1b (v) FROM '/app/data/siftbig/bigann_base.csv' WITH csv;
+-- \COPY sift_base1b (v) FROM '/app/data/siftbig/bigann_base.csv' WITH csv;
diff --git a/experiments/scripts/create_tables_derived.sql b/experiments/scripts/create_tables_derived.sql
@@ -122,42 +122,3 @@ SELECT * FROM sift_base1m WHERE id <= 600000;
 
 INSERT INTO sift_base800k
 SELECT * FROM sift_base1m WHERE id <= 800000;
-
-INSERT INTO sift_base2m
-SELECT * FROM sift_base1b WHERE id <= 2000000;
-
-INSERT INTO sift_base5m
-SELECT * FROM sift_base1b WHERE id <= 5000000;
-
-INSERT INTO sift_base10m
-SELECT * FROM sift_base1b WHERE id <= 10000000;
-
-INSERT INTO sift_base20m
-SELECT * FROM sift_base1b WHERE id <= 20000000;
-
-INSERT INTO sift_base50m
-SELECT * FROM sift_base1b WHERE id <= 50000000;
-
-INSERT INTO sift_base100m
-SELECT * FROM sift_base1b WHERE id <= 100000000;
-
-INSERT INTO sift_base200m
-SELECT * FROM sift_base1b WHERE id <= 200000000;
-
-INSERT INTO sift_base500m
-SELECT * FROM sift_base1b WHERE id <= 500000000;
-
-INSERT INTO gist_base100k
-SELECT * FROM gist_base1m WHERE id <= 100000;
-
-INSERT INTO gist_base200k
-SELECT * FROM gist_base1m WHERE id <= 200000;
-
-INSERT INTO gist_base400k
-SELECT * FROM gist_base1m WHERE id <= 400000;
-
-INSERT INTO gist_base600k
-SELECT * FROM gist_base1m WHERE id <= 600000;
-
-INSERT INTO gist_base800k
-SELECT * FROM gist_base1m WHERE id <= 800000;
diff --git a/experiments/scripts/create_tables_recall.sql b/experiments/scripts/create_tables_recall.sql
@@ -15,114 +15,44 @@ DROP TABLE IF EXISTS sift_truth200m;
 DROP TABLE IF EXISTS sift_truth500m;
 DROP TABLE IF EXISTS sift_truth1b;
 
+-- \set SIFTSMALL_PATH '/tmp/lanterndb/vector_datasets/siftsmall'
+-- \set SIFT_PATH '/tmp/lanterndb/vector_datasets/sift'
+
 CREATE TABLE IF NOT EXISTS sift_query10k (
   id SERIAL PRIMARY KEY,
   v VECTOR(128)
 );
 
-\COPY sift_query10k (v) FROM '/app/data/siftsmall/siftsmall_query.csv' WITH csv;
+COPY sift_query10k (v) FROM '/tmp/lanterndb/vector_datasets/siftsmall/siftsmall_query.csv' WITH csv;
 
 CREATE TABLE IF NOT EXISTS sift_query1m (
   id SERIAL PRIMARY KEY,
   v VECTOR(128)
 );
 
-\COPY sift_query1m (v) FROM '/app/data/sift/sift_query.csv' WITH csv;
+COPY sift_query1m (v) FROM '/tmp/lanterndb/vector_datasets/sift/sift_query.csv' WITH csv;
+-- NOTE: some scripts appear to expect the SIFT query table under the name sift_query100k, so the same query file is loaded into it as well (TODO confirm).
+CREATE TABLE IF NOT EXISTS sift_query100k (
+  id SERIAL PRIMARY KEY,
+  v VECTOR(128)
+);
+COPY sift_query100k (v) FROM '/tmp/lanterndb/vector_datasets/sift/sift_query.csv' WITH csv;
 
 CREATE TABLE IF NOT EXISTS sift_truth10k (
   id SERIAL PRIMARY KEY,
   indices INTEGER[]
 );
 
-\COPY sift_truth10k (indices) FROM '/app/data/siftsmall/siftsmall_truth.csv' WITH csv;
+COPY sift_truth10k (indices) FROM '/tmp/lanterndb/vector_datasets/siftsmall/siftsmall_truth.csv' WITH csv;
 
 CREATE TABLE IF NOT EXISTS sift_truth1m (
   id SERIAL PRIMARY KEY,
   indices INTEGER[]
 );
 
-\COPY sift_truth1m (indices) FROM '/app/data/sift/sift_truth.csv' WITH csv;
+COPY sift_truth1m (indices) FROM '/tmp/lanterndb/vector_datasets/sift/sift_truth.csv' WITH csv;
 
 CREATE TABLE IF NOT EXISTS gist_query1m (
   id SERIAL PRIMARY KEY,
   v VECTOR(960)
-);
-
-\COPY gist_query1m (v) FROM '/app/data/gist/gist_query.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS gist_truth1m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY gist_truth1m (indices) FROM '/app/data/gist/gist_truth.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_query1b (
-  id SERIAL PRIMARY KEY,
-  v VECTOR(128)
-);
-
-\COPY sift_query1b (v) FROM '/app/data/siftbig/bigann_query.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth2m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth2m (indices) FROM '/app/data/siftbig/gnd/idx_2M.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth5m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth5m (indices) FROM '/app/data/siftbig/gnd/idx_5M.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth10m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth10m (indices) FROM '/app/data/siftbig/gnd/idx_10M.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth20m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth20m (indices) FROM '/app/data/siftbig/gnd/idx_20M.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth50m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth50m (indices) FROM '/app/data/siftbig/gnd/idx_50M.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth100m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth100m (indices) FROM '/app/data/siftbig/gnd/idx_100M.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth200m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth200m (indices) FROM '/app/data/siftbig/gnd/idx_200M.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth500m (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth500m (indices) FROM '/app/data/siftbig/gnd/idx_500M.csv' WITH csv;
-
-CREATE TABLE IF NOT EXISTS sift_truth1b (
-  id SERIAL PRIMARY KEY,
-  indices INTEGER[]
-);
-
-\COPY sift_truth1b (indices) FROM '/app/data/siftbig/gnd/idx_1000M.csv' WITH csv;
+);
diff --git a/experiments/scripts/setup_tables.py b/experiments/scripts/setup_tables.py
@@ -0,0 +1,90 @@
+"""Download the SIFT benchmark datasets and create the experiment tables.
+
+The CSV files are fetched into the data directory unless they are already
+present, then the SQL scripts are executed against the PostgreSQL database
+named by the DATABASE_URL environment variable.
+
+NOTE: the SQL scripts use server-side COPY, so the data directory must be
+readable by the PostgreSQL server process itself.
+"""
+import os
+import argparse
+import urllib.request
+import psycopg2
+
+# Data directory that is hard-coded in the COPY statements of the SQL scripts.
+DEFAULT_DATA_DIR = "/tmp/lanterndb/vector_datasets"
+BASE_URL = "https://storage.googleapis.com/lanterndata"
+DATASET_FILES = {
+    "siftsmall": ["siftsmall_base.csv", "siftsmall_query.csv", "siftsmall_truth.csv"],
+    "sift": ["sift_base.csv", "sift_query.csv", "sift_truth.csv"],
+}
+
+# Resolve the SQL scripts relative to this file so that the script works from
+# any working directory, not only from experiments/scripts.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+SQL_FILES = [
+    os.path.join(SCRIPT_DIR, "create_tables.sql"),
+    os.path.join(SCRIPT_DIR, "create_tables_recall.sql"),
+    os.path.join(SCRIPT_DIR, "..", "..", "db", "init", "init_results.sql"),
+    os.path.join(SCRIPT_DIR, "..", "..", "db", "init", "init_util.sql"),
+]
+
+
+def download_datasets(data_dir):
+    """Download every dataset file that is not already present in data_dir."""
+    for dataset, filenames in DATASET_FILES.items():
+        dataset_dir = os.path.join(data_dir, dataset)
+        # exist_ok: one of the dataset directories may exist without the other.
+        os.makedirs(dataset_dir, exist_ok=True)
+        for filename in filenames:
+            target = os.path.join(dataset_dir, filename)
+            if os.path.exists(target):
+                print(f"{target} exists. Skipping download.")
+                continue
+            print(f"Downloading {dataset.upper()} {filename}")
+            # Download to a temporary name and rename afterwards, so that an
+            # interrupted download is not mistaken for a complete file later.
+            partial = target + ".part"
+            urllib.request.urlretrieve(f"{BASE_URL}/{dataset}/{filename}", partial)
+            os.replace(partial, target)
+
+
+def create_tables(database_url, data_dir):
+    """Execute the SQL scripts, loading CSVs from data_dir, and commit once at the end."""
+    # Single quotes are doubled because the path lands inside SQL string literals.
+    sql_data_dir = data_dir.replace("'", "''")
+    conn = psycopg2.connect(database_url)
+    try:
+        with conn.cursor() as cur:
+            for path in SQL_FILES:
+                with open(path, "r") as sqlfile:
+                    # Point the hard-coded COPY paths at the chosen data directory.
+                    cur.execute(sqlfile.read().replace(DEFAULT_DATA_DIR, sql_data_dir))
+        conn.commit()
+    finally:
+        # Close the connection even when one of the scripts fails.
+        conn.close()
+
+
+def main():
+    """Parse the command line, download the data and create the tables."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", "--datapath", default=DEFAULT_DATA_DIR, help="Path to data directory")
+    args = parser.parse_args()
+    # Make the path absolute: the server would resolve a relative COPY path
+    # against its own working directory, not ours.
+    data_dir = os.path.abspath(args.datapath)
+
+    download_datasets(data_dir)
+
+    database_url = os.environ.get("DATABASE_URL")
+    if not database_url:
+        parser.error("the DATABASE_URL environment variable must be set")
+
+    print("Creating tables...")
+    create_tables(database_url, data_dir)
+    print("Done!")
+
+
+if __name__ == "__main__":
+    main()