Set wasShutdown=true during hot-standby replica startup only when primary is not alive (#365)

knizhnik · Konstantin Knizhnik · hlinnaka · web-flow · commit b9336bc7c0fe · 2024-02-22T15:50:24.000Z
* Set wasShutdown=true during hot-standby replica startup only when primary is not alive * Report fatal error if hot standby replica is started with oldestActiveXid=0 Postgres part of neondatabase/neon#6705 --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech> Co-authored-by: Heikki Linnakangas <heikki.linnakangas@iki.fi>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
@@ -542,6 +542,18 @@ typedef union WALInsertLockPadded
 	char		pad[PG_CACHE_LINE_SIZE];
 } WALInsertLockPadded;
 
+
+/*
+ * NEON: report whether the primary node is running, per the "neon.primary_is_running" GUC received
+ * from the control plane. Returns true only for "on"; a missing option or any other value is false.
+ */
+static bool
+IsPrimaryAlive(void)
+{
+	const char *val = GetConfigOption("neon.primary_is_running", true, false);
+	return val != NULL && strcmp(val, "on") == 0;
+}
+
 /*
  * State of an exclusive backup, necessary to control concurrent activities
  * across sessions when working on exclusive backups.
@@ -7031,7 +7043,25 @@ StartupXLOG(void)
 		EndRecPtr = ControlFile->checkPointCopy.redo;
 
 		memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint));
-		wasShutdown = true;
+		// When primary Neon compute node is started, we pretend that it started after a clean shutdown and
+		// no recovery is needed. We don't need to do WAL replay, the page server does that on a page-by-page basis.
+		// When a read-only replica is started, PostgreSQL normally waits for a shutdown checkpoint or running-xacts
+		// record before enabling hot standby, to establish which transactions are still running in the primary,
+		// and might still commit later. But if we know that the primary is not running - because the control plane
+		// says so - we can skip that. That avoids having to wait indefinitely if the primary is not running. This is
+		// particularly important for Neon because we don't start recovery from a checkpoint record, so there's
+		// no guarantee on when we'll see the next checkpoint or running-xacts record, if ever. So if we know the primary is
+		// not currently running, also set wasShutdown to 'true'.
+		if (StandbyModeRequested &&
+			PrimaryConnInfo != NULL && *PrimaryConnInfo != '\0')
+		{
+			if (!IsPrimaryAlive())
+				wasShutdown = true;
+			else
+				wasShutdown = false;
+		}
+		else
+			wasShutdown = true;
 
 		/* Initialize expectedTLEs, like ReadRecord() does */
 		expectedTLEs = readTimeLineHistory(checkPoint.ThisTimeLineID);
@@ -7201,6 +7231,24 @@ StartupXLOG(void)
 		ereport(PANIC,
 				(errmsg("invalid next transaction ID")));
 
+	if (ZenithRecoveryRequested)
+	{
+		if (wasShutdown)
+			checkPoint.oldestActiveXid = InvalidTransactionId;
+		else if (!TransactionIdIsValid(checkPoint.oldestActiveXid))
+		{
+			/*
+			 * This should not actually happen: the PS (page server) takes oldestActiveXid
+			 * from running-xacts WAL records and includes it in the checkpoint
+			 * sent in the basebackup.
+			 * FirstNormalTransactionId is a conservative estimate of the oldest active XID, unless
+			 * the current XID is greater than 2^31. So it is not a 100% safe solution either, but it is better than an assertion failure.
+			 */
+			elog(FATAL, "oldestActiveXid=%d", checkPoint.oldestActiveXid);
+			checkPoint.oldestActiveXid = FirstNormalTransactionId;
+		}
+	}
+
 	/* initialize shared memory variables from the checkpoint record */
 	ShmemVariableCache->nextXid = checkPoint.nextXid;
 	ShmemVariableCache->nextOid = checkPoint.nextOid;