Fix missing handling for operator usage in Materialized queries

Ngalstyan4 · Ngalstyan4 · commit e7895a65b030 · 2023-11-25T21:34:46.000Z
- Add comment about debugging with a default branch in plan_tree_walker

- Add success case and a more complex error case
diff --git a/src/hooks/plan_tree_walker.c b/src/hooks/plan_tree_walker.c
@@ -8,8 +8,31 @@
 #include <nodes/parsenodes.h>
 #include <nodes/plannodes.h>
 
+#include "../hnsw/utils.h"
+
 bool base_plan_walker(Plan *plan, bool (*walker_func)(Node *plan, void *context), void *context)
 {
+    /*
+        If there is a need to debug this function, follow the steps below:
+        0. Add the following as the default branch in plan_tree_walker
+            default:
+                {
+                    ldb_dlog("plan_tree_walker: unsupported plan node type: %d", nodeTag(plan));
+                    return false;
+                }
+            This will print all nodes that are not explicitly handled by the walker.
+            Currently there are several such nodes which probably means there are more
+            latent issues here.
+        1. Attach gdb to the postgres process
+        2. Set a breakpoint at the function entry
+        3. Navigate through relevant paths via gdb
+        4. Debug-print Plan* nodes via
+            p (char*) nodeToString(plan);
+
+        Note: for non-trivial Plan* nodes you may need to run:
+            set print elements 0
+        in gdb to make sure the node string is not truncated.
+    */
     if(walker_func((Node *)plan->targetlist, context)) return true;
     if(walker_func((Node *)plan->qual, context)) return true;
     if(walker_func((Node *)plan->lefttree, context)) return true;
@@ -124,6 +147,12 @@ bool plan_tree_walker(Plan *plan, bool (*walker_func)(Node *plan, void *context)
             if(walker_func((Node *)append->appendplans, context)) return true;
             break;
         }
+        case T_Material:
+        {
+            Material *material = (Material *)plan;
+            if(base_plan_walker(&(material->plan), walker_func, context)) return true;
+            break;
+        }
         default:
             return false;
     }
diff --git a/test/expected/hnsw_dist_func.out b/test/expected/hnsw_dist_func.out
@@ -222,7 +222,77 @@ WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}' LIMIT 1) SELECT id, COUNT
 ERROR:  Operator <-> can only be used inside of an index
 WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}') SELECT id FROM t UNION SELECT id FROM t;
 ERROR:  Operator <-> can only be used inside of an index
+-- issue #227
+SELECT * from test2 JOIN LATERAL (SELECT * FROM (SELECT id FROM test2 ORDER BY v <-> '{1,2}') as forall) haha on TRUE;
+ERROR:  Operator <-> can only be used inside of an index
+-- more complex setup of the above
+SELECT forall.id, nearest_per_id.* FROM
+(SELECT * FROM
+  test2) AS forall
+  JOIN LATERAL (
+    SELECT
+      ARRAY_AGG(id ORDER BY id) AS near_ids,
+      ARRAY_AGG(dist ORDER BY id) AS near_dists
+    FROM
+      (
+        SELECT
+          id,
+          l2sq_dist(v, forall.v) as dist
+        FROM
+          test2
+        ORDER BY
+          v <-> forall.v
+        LIMIT
+          5
+      ) as __unused_name
+  ) nearest_per_id on TRUE
+ORDER BY
+  forall.id
+LIMIT
+  9;
+ERROR:  Operator <-> can only be used inside of an index
 \set ON_ERROR_STOP on
+-- cross-lateral joins work as expected when appropriate index exists
+-- nearest element for each vector
+-- Note: The limit below is 4 to make sure all neighbors with distance 1 are included
+-- and none of distance 2 are included. if we include some of distance 2, then we need
+-- further sorting to make sure ties among nodes with distance 2 are broken consistently
+SELECT forall.id, nearest_per_id.* FROM
+(SELECT * FROM
+  small_world_l2) AS forall
+  JOIN LATERAL (
+    SELECT
+      ARRAY_AGG(id ORDER BY dist, id) AS near_ids,
+      ARRAY_AGG(dist ORDER BY dist, id) AS near_dists
+    FROM
+      (
+        SELECT
+          id,
+          l2sq_dist(v, forall.v) as dist
+        FROM
+          small_world_l2
+        ORDER BY
+          v <-> forall.v
+        LIMIT
+          4
+      ) as __unused_name
+  ) nearest_per_id on TRUE
+ORDER BY
+  forall.id
+LIMIT
+  9;
+ id  |     near_ids      | near_dists 
+-----+-------------------+------------
+ 000 | {000,001,010,100} | {0,1,1,1}
+ 001 | {001,000,011,101} | {0,1,1,1}
+ 010 | {010,000,011,110} | {0,1,1,1}
+ 011 | {011,001,010,111} | {0,1,1,1}
+ 100 | {100,000,101,110} | {0,1,1,1}
+ 101 | {101,001,100,111} | {0,1,1,1}
+ 110 | {110,010,100,111} | {0,1,1,1}
+ 111 | {111,011,101,110} | {0,1,1,1}
+(8 rows)
+
 -- Check that hamming distance query results are sorted correctly
 CREATE TABLE extra_small_world_ham (
     id SERIAL PRIMARY KEY,
diff --git a/test/sql/hnsw_dist_func.sql b/test/sql/hnsw_dist_func.sql
@@ -92,7 +92,65 @@ SELECT t2_results.id FROM test1 t1 JOIN LATERAL (SELECT t2.id FROM test2 t2 ORDE
 WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}' LIMIT 1) SELECT DISTINCT id FROM t;
 WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}' LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1;
 WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}') SELECT id FROM t UNION SELECT id FROM t;
+
+-- issue #227
+SELECT * from test2 JOIN LATERAL (SELECT * FROM (SELECT id FROM test2 ORDER BY v <-> '{1,2}') as forall) haha on TRUE;
+-- more complex setup of the above
+SELECT forall.id, nearest_per_id.* FROM
+(SELECT * FROM
+  test2) AS forall
+  JOIN LATERAL (
+    SELECT
+      ARRAY_AGG(id ORDER BY id) AS near_ids,
+      ARRAY_AGG(dist ORDER BY id) AS near_dists
+    FROM
+      (
+        SELECT
+          id,
+          l2sq_dist(v, forall.v) as dist
+        FROM
+          test2
+        ORDER BY
+          v <-> forall.v
+        LIMIT
+          5
+      ) as __unused_name
+  ) nearest_per_id on TRUE
+ORDER BY
+  forall.id
+LIMIT
+  9;
+
 \set ON_ERROR_STOP on
+-- cross-lateral joins work as expected when appropriate index exists
+-- nearest element for each vector
+-- Note: The limit below is 4 to make sure all neighbors with distance 1 are included
+-- and none of distance 2 are included. if we include some of distance 2, then we need
+-- further sorting to make sure ties among nodes with distance 2 are broken consistently
+SELECT forall.id, nearest_per_id.* FROM
+(SELECT * FROM
+  small_world_l2) AS forall
+  JOIN LATERAL (
+    SELECT
+      ARRAY_AGG(id ORDER BY dist, id) AS near_ids,
+      ARRAY_AGG(dist ORDER BY dist, id) AS near_dists
+    FROM
+      (
+        SELECT
+          id,
+          l2sq_dist(v, forall.v) as dist
+        FROM
+          small_world_l2
+        ORDER BY
+          v <-> forall.v
+        LIMIT
+          4
+      ) as __unused_name
+  ) nearest_per_id on TRUE
+ORDER BY
+  forall.id
+LIMIT
+  9;
 
 -- Check that hamming distance query results are sorted correctly
 CREATE TABLE extra_small_world_ham (