
Commit 48c2277

knizhnik authored and tristan957 committed

Implement index prefetch for index and index-only scans (#277)

* Implement index prefetch for index and index-only scans
* Move prefetch_blocks array to the end of BTScanOpaqueData struct

1 parent 5a560e3 commit 48c2277

File tree

9 files changed: +301 −6 lines changed

src/backend/access/nbtree/README

+44
@@ -1054,3 +1054,47 @@
 item is irrelevant, and need not be stored at all. This arrangement
 corresponds to the fact that an L&Y non-leaf page has one more pointer
 than key. Suffix truncation's negative infinity attributes behave in
 the same way.
+
+Notes About Index Scan Prefetch
+-------------------------------
+
+Prefetch can significantly improve the speed of OLAP queries.
+To be able to perform prefetch, we need to know which pages will be
+accessed during the scan. This is trivial for heap and bitmap scans,
+but requires more effort for index scans: to implement prefetch for
+index scans, we need to find the subsequent leaf pages.
+
+Postgres links all pages at the same level of the B-Tree in a doubly
+linked list and uses this list for forward and backward iteration.
+This list, however, cannot trivially be used for prefetching, because
+to locate the next page we first need to load the current page. To
+prefetch more than only the next page, we can use the parent page's
+downlinks instead, since the parent contains references to most of the
+target page's sibling pages.
+
+Because Postgres' nbtree pages have no reference to their parent page,
+we need to remember the parent page when descending the btree and use
+it to prefetch subsequent pages. We use the parent level's linked list
+to extend this prefetch scheme past the key range of the current
+parent page.
+
+We should prefetch not only leaf pages, but also the next parent page.
+The trick is to correctly calculate the moment when it will be needed:
+we issue the prefetch request for the next parent page not after
+prefetch requests for all children of the current parent page have
+been issued, but when only effective_io_concurrency line pointers are
+left to prefetch from the current page.
+
+Currently there are two different prefetch implementations, one for
+index-only scans and one for index scans. An index-only scan doesn't
+need to access heap tuples, so it prefetches only B-Tree leaf pages
+(and their parents). Prefetch for an index-only scan is performed only
+if a parallel plan is not used: a parallel index scan obtains the next
+page inside a critical section, and the leaf page is loaded in that
+critical section, so if most of the time is spent loading the page,
+this eliminates any concurrency and makes prefetch useless. For
+relatively small tables Postgres will not choose a parallel plan in
+any case, and for large tables a serial plan can be enforced by
+setting max_parallel_workers_per_gather=0.
+
+Prefetch for a normal (not index-only) index scan tries to prefetch
+the heap tuples referenced from the leaf page. The average number of
+items per page is about 100, which is comparable to the default value
+of effective_io_concurrency, so there is little point in also
+prefetching the next leaf page.
+
+Since it is difficult to estimate the number of entries traversed by
+an index scan, we prefer not to prefetch a large number of pages from
+the very beginning: such useless prefetch can reduce the performance
+of point lookups. Instead, we start with the smallest prefetch
+distance and increase it by INCREASE_PREFETCH_DISTANCE_STEP after
+processing each item, until it reaches effective_io_concurrency. For
+an index-only scan we increase the prefetch distance after processing
+each leaf page, and for an index scan after processing each tuple.
+The only exception is the case when no key bounds are specified: then
+we traverse the whole relation, and it makes sense to start with the
+largest possible prefetch distance from the very beginning.
src/backend/access/nbtree/nbtinsert.c

+1-1
@@ -2159,7 +2159,7 @@ _bt_insert_parent(Relation rel,
 		   BlockNumberIsValid(RelationGetTargetBlock(rel))));
 
 	/* Find the leftmost page at the next level up */
-	pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
+	pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL, NULL);
 	/* Set up a phony stack entry pointing there */
 	stack = &fakestack;
 	stack->bts_blkno = BufferGetBlockNumber(pbuf);

src/backend/access/nbtree/nbtree.c

+1
@@ -368,6 +368,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
+	so->prefetch_maximum = 0;	/* disable prefetch */
 
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not

src/backend/access/nbtree/nbtsearch.c

+210-4
@@ -18,12 +18,14 @@
 #include "access/nbtree.h"
 #include "access/relscan.h"
 #include "access/xact.h"
+#include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "optimizer/cost.h"
 #include "pgstat.h"
 #include "storage/predicate.h"
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
-
+#include "utils/spccache.h"
 
 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
 static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
@@ -47,6 +49,7 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 
+#define INCREASE_PREFETCH_DISTANCE_STEP 1
 
 /*
  * _bt_drop_lock_and_maybe_pin()
@@ -842,6 +845,70 @@ _bt_compare(Relation rel,
 	return 0;
 }
 
+
+/*
+ * _bt_read_parent_for_prefetch - read parent page and extract references
+ * to children for prefetch.  This function returns the offset of the
+ * first item.
+ */
+static int
+_bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir)
+{
+	Relation	rel = scan->indexRelation;
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	Buffer		buf;
+	Page		page;
+	BTPageOpaque opaque;
+	OffsetNumber offnum;
+	OffsetNumber n_child;
+	int			next_parent_prefetch_index;
+	int			i, j;
+
+	buf = _bt_getbuf(rel, parent, BT_READ);
+	page = BufferGetPage(buf);
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	offnum = P_FIRSTDATAKEY(opaque);
+	n_child = PageGetMaxOffsetNumber(page) - offnum + 1;
+
+	/*
+	 * Position where we should insert the prefetch of the next parent
+	 * page: we intentionally use prefetch_maximum here instead of
+	 * current_prefetch_distance, assuming that it will reach
+	 * prefetch_maximum before we reach the end of the parent page.
+	 */
+	next_parent_prefetch_index = (n_child > so->prefetch_maximum)
+		? n_child - so->prefetch_maximum : 0;
+
+	if (ScanDirectionIsForward(dir))
+	{
+		so->next_parent = opaque->btpo_next;
+		if (so->next_parent == P_NONE)
+			next_parent_prefetch_index = -1;
+		for (i = 0, j = 0; i < n_child; i++)
+		{
+			ItemId		itemid = PageGetItemId(page, offnum + i);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+			if (i == next_parent_prefetch_index)
+				so->prefetch_blocks[j++] = so->next_parent;	/* time to prefetch next parent page */
+			so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup);
+		}
+	}
+	else
+	{
+		so->next_parent = opaque->btpo_prev;
+		if (so->next_parent == P_NONE)
+			next_parent_prefetch_index = -1;
+		for (i = 0, j = 0; i < n_child; i++)
+		{
+			ItemId		itemid = PageGetItemId(page, offnum + n_child - i - 1);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+			if (i == next_parent_prefetch_index)
+				so->prefetch_blocks[j++] = so->next_parent;	/* time to prefetch next parent page */
+			so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup);
+		}
+	}
+	so->n_prefetch_blocks = j;
+	so->last_prefetch_index = 0;
+	_bt_relbuf(rel, buf);
+	return offnum;
+}
+
 /*
  * _bt_first() -- Find the first item in a scan.
  *
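The core of `_bt_read_parent_for_prefetch` is the interleaving of child downlinks with the next parent page. A self-contained model of the forward-scan case is sketched below; it is an illustration under simplifying assumptions, not the committed code — block numbers are plain `unsigned` values and `MOCK_P_NONE` stands in for nbtree's `P_NONE`.

```c
#define MOCK_P_NONE 0xFFFFFFFFu	/* stands in for nbtree's P_NONE */

/* Fill "out" with the blocks to prefetch for one parent page, forward
 * scan: the children in order, with the next parent page injected
 * prefetch_maximum entries before the end of the child list.
 * Returns the total number of blocks queued. */
int
build_prefetch_blocks(const unsigned *children, int n_child,
					  unsigned next_parent, int prefetch_maximum,
					  unsigned *out)
{
	int		next_parent_prefetch_index = (n_child > prefetch_maximum)
		? n_child - prefetch_maximum : 0;
	int		i, j;

	if (next_parent == MOCK_P_NONE)
		next_parent_prefetch_index = -1;	/* rightmost parent: nothing to inject */
	for (i = 0, j = 0; i < n_child; i++)
	{
		if (i == next_parent_prefetch_index)
			out[j++] = next_parent;		/* prefetch the next parent page itself */
		out[j++] = children[i];			/* prefetch the child (leaf) page */
	}
	return j;
}
```

Injecting the next parent `prefetch_maximum` entries before the end means its read is already in flight by the time the scan exhausts the current parent's children.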
@@ -1101,6 +1168,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		}
 	}
 
+	/* Neon: initialize prefetch */
+	so->n_prefetch_requests = 0;
+	so->n_prefetch_blocks = 0;
+	so->last_prefetch_index = 0;
+	so->next_parent = P_NONE;
+	so->prefetch_maximum = IsCatalogRelation(rel)
+		? effective_io_concurrency
+		: get_tablespace_io_concurrency(rel->rd_rel->reltablespace);
+
+	if (scan->xs_want_itup)		/* index-only scan */
+	{
+		if (enable_indexonlyscan_prefetch)
+		{
+			/*
+			 * We disable prefetch for parallel index-only scans.  Neon
+			 * prefetch is efficient only if prefetched blocks are accessed
+			 * by the same worker which issued the prefetch request.  The
+			 * logic of splitting pages between parallel workers in an index
+			 * scan doesn't allow us to satisfy this requirement.  Also,
+			 * prefetch of leaf pages is useless if the expected number of
+			 * rows fits in one page.
+			 */
+			if (scan->parallel_scan)
+				so->prefetch_maximum = 0;	/* disable prefetch */
+		}
+		else
+			so->prefetch_maximum = 0;	/* disable prefetch */
+	}
+	else if (!enable_indexscan_prefetch || !scan->heapRelation)
+		so->prefetch_maximum = 0;	/* disable prefetch */
+
+	/*
+	 * If key bounds are not specified, then we will scan the whole relation
+	 * and it makes sense to start with the largest possible prefetch
+	 * distance.
+	 */
+	so->current_prefetch_distance = (keysCount == 0) ? so->prefetch_maximum : 0;
+
 	/*
 	 * If we found no usable boundary keys, we have to start from one end of
 	 * the tree. Walk down that edge to the first or last key, and scan from
@@ -1371,6 +1469,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 */
 	stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
 
+	/* Start prefetching for index-only scan */
+	if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup)	/* index-only scan */
+	{
+		int		first_offset = _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir);
+		int		skip = ScanDirectionIsForward(dir)
+			? stack->bts_offset - first_offset
+			: first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset;
+
+		Assert(so->n_prefetch_blocks >= skip);
+		so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP;
+		so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip);
+		so->last_prefetch_index = skip + so->n_prefetch_requests;
+		for (int i = skip; i < so->last_prefetch_index; i++)
+			PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]);
+	}
+
 	/* don't need to keep the stack around... */
 	_bt_freestack(stack);
 
@@ -1510,9 +1623,63 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 	/* OK, itemIndex says what to return */
 	currItem = &so->currPos.items[so->currPos.itemIndex];
 	scan->xs_heaptid = currItem->heapTid;
-	if (scan->xs_want_itup)
+	if (scan->xs_want_itup)		/* index-only scan */
+	{
 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+	}
+	else if (so->prefetch_maximum > 0)
+	{
+		int		prefetchLimit, prefetchDistance;
+
+		/*
+		 * Neon: prefetch referenced heap pages.  Because it is difficult to
+		 * predict how many items the index scan will return, we do not want
+		 * to prefetch many heap pages from the very beginning: they may not
+		 * be needed.  So we increase the prefetch distance by
+		 * INCREASE_PREFETCH_DISTANCE_STEP at each index scan iteration
+		 * until it reaches prefetch_maximum.
+		 */
+
+		/* Advance prefetch distance until it reaches prefetch_maximum */
+		if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum)
+			so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP;
+		else
+			so->current_prefetch_distance = so->prefetch_maximum;
+
+		/* How much we can prefetch */
+		prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1);
 
+		/* Active prefetch requests */
+		prefetchDistance = so->n_prefetch_requests;
+
+		/*
+		 * Consume one prefetch request (if any)
+		 */
+		if (prefetchDistance != 0)
+			prefetchDistance -= 1;
+
+		/*
+		 * Keep the number of active prefetch requests equal to the current
+		 * prefetch distance.  When the prefetch distance reaches the
+		 * prefetch maximum, this loop performs at most one iteration, but
+		 * at the beginning of an index scan it performs up to
+		 * INCREASE_PREFETCH_DISTANCE_STEP+1 iterations.
+		 */
+		if (ScanDirectionIsForward(dir))
+		{
+			while (prefetchDistance < prefetchLimit && so->currPos.itemIndex + prefetchDistance <= so->currPos.lastItem)
+			{
+				BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + prefetchDistance].heapTid.ip_blkid);
+				PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno);
+				prefetchDistance += 1;
+			}
+		}
+		else
+		{
+			while (prefetchDistance < prefetchLimit && so->currPos.itemIndex - prefetchDistance >= so->currPos.firstItem)
+			{
+				BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex - prefetchDistance].heapTid.ip_blkid);
+				PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno);
+				prefetchDistance += 1;
+			}
+		}
+		so->n_prefetch_requests = prefetchDistance;	/* update number of active prefetch requests */
+	}
 	return true;
 }
 
@@ -1919,6 +2086,30 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 		so->markItemIndex = -1;
 	}
 
+	if (scan->xs_want_itup && so->prefetch_maximum > 0)	/* prefetching of leaf pages for index-only scan */
+	{
+		/* Advance prefetch distance until it reaches prefetch_maximum */
+		if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum)
+			so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP;
+
+		so->n_prefetch_requests -= 1;	/* we load the next leaf page, so decrement the number of active prefetch requests */
+
+		/* Check if there are more children to prefetch at the current parent page */
+		if (so->last_prefetch_index == so->n_prefetch_blocks && so->next_parent != P_NONE)
+		{
+			/* we have prefetched all items from the current parent page, let's move to the next parent page */
+			_bt_read_parent_for_prefetch(scan, so->next_parent, dir);
+			so->n_prefetch_requests -= 1;	/* loading the parent page consumes one more prefetch request */
+		}
+
+		/* Try to keep the number of active prefetch requests equal to the current prefetch distance */
+		while (so->n_prefetch_requests < so->current_prefetch_distance && so->last_prefetch_index < so->n_prefetch_blocks)
+		{
+			so->n_prefetch_requests += 1;
+			PrefetchBuffer(scan->indexRelation, MAIN_FORKNUM, so->prefetch_blocks[so->last_prefetch_index++]);
+		}
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* Walk right to the next page with data */
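The bookkeeping in the `_bt_steppage` hunk above keeps the number of in-flight prefetch requests equal to the current prefetch distance. The simplified model below (a sketch, not the committed code) captures that invariant: one request is consumed per leaf page actually read, then the window is refilled from the remaining `prefetch_blocks` slots.

```c
typedef struct
{
	int		n_prefetch_requests;		/* issued but not yet consumed */
	int		last_prefetch_index;		/* next slot of prefetch_blocks to issue */
	int		n_prefetch_blocks;			/* blocks collected from the parent page */
	int		current_prefetch_distance;	/* target size of the in-flight window */
} PrefetchModel;

/* Advance to the next leaf page; returns how many new prefetch requests
 * were issued (each increment of last_prefetch_index stands in for a
 * PrefetchBuffer() call). */
int
model_step_leaf_page(PrefetchModel *st)
{
	int		issued = 0;

	st->n_prefetch_requests -= 1;	/* the page we read now was prefetched earlier */
	while (st->n_prefetch_requests < st->current_prefetch_distance &&
		   st->last_prefetch_index < st->n_prefetch_blocks)
	{
		st->n_prefetch_requests += 1;
		st->last_prefetch_index += 1;
		issued += 1;
	}
	return issued;
}
```

Once the distance has reached its maximum the refill loop issues at most one request per step; while the distance is still ramping up it issues up to `INCREASE_PREFETCH_DISTANCE_STEP + 1` requests, which is how the window grows.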
@@ -2323,6 +2514,7 @@ _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
  */
 Buffer
 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+				 BlockNumber *parent,
 				 Snapshot snapshot)
 {
 	Buffer		buf;
@@ -2331,6 +2523,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 	OffsetNumber offnum;
 	BlockNumber blkno;
 	IndexTuple	itup;
+	BlockNumber parent_blocknum = P_NONE;
 
 	/*
 	 * If we are looking for a leaf page, okay to descend from fast root;
@@ -2348,6 +2541,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 	page = BufferGetPage(buf);
 	TestForOldSnapshot(snapshot, rel, page);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	blkno = BufferGetBlockNumber(buf);
 
 	for (;;)
 	{
@@ -2386,12 +2580,15 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 		offnum = P_FIRSTDATAKEY(opaque);
 
 		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+		parent_blocknum = blkno;
 		blkno = BTreeTupleGetDownLink(itup);
 
 		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
 		page = BufferGetPage(buf);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	}
+	if (parent)
+		*parent = parent_blocknum;
 
 	return buf;
 }
@@ -2415,13 +2612,13 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 	BTPageOpaque opaque;
 	OffsetNumber start;
 	BTScanPosItem *currItem;
-
+	BlockNumber parent;
 
 	/*
 	 * Scan down to the leftmost or rightmost leaf page. This is a simplified
 	 * version of _bt_search(). We don't maintain a stack since we know we
 	 * won't need it.
 	 */
-	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
+	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), &parent, scan->xs_snapshot);
 
 	if (!BufferIsValid(buf))
 	{
@@ -2434,6 +2631,15 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		return false;
 	}
 
+	/* Start prefetching for index-only scan */
+	if (so->prefetch_maximum > 0 && parent != P_NONE && scan->xs_want_itup)	/* index-only scan */
+	{
+		_bt_read_parent_for_prefetch(scan, parent, dir);
+		so->n_prefetch_requests = so->last_prefetch_index = Min(so->prefetch_maximum, so->n_prefetch_blocks);
+		for (int i = 0; i < so->last_prefetch_index; i++)
+			PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]);
+	}
+
 	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

src/backend/optimizer/path/costsize.c

+2
@@ -151,6 +151,8 @@ bool		enable_parallel_hash = true;
 bool		enable_partition_pruning = true;
 bool		enable_async_append = true;
 bool		enable_seqscan_prefetch = true;
+bool		enable_indexscan_prefetch = true;
+bool		enable_indexonlyscan_prefetch = true;
 
 typedef struct
 {
