61
61
#include "replication/walreceiver.h"
62
62
#include "replication/walsender.h"
63
63
#include "storage/bufmgr.h"
64
+ #include "storage/buf_internals.h"
64
65
#include "storage/fd.h"
65
66
#include "storage/ipc.h"
66
67
#include "storage/large_object.h"
@@ -113,6 +114,7 @@ int wal_retrieve_retry_interval = 5000;
113
114
int max_slot_wal_keep_size_mb = -1 ;
114
115
bool track_wal_io_timing = false;
115
116
uint64 predefined_sysidentifier ;
117
+ int lastWrittenLsnCacheSize ;
116
118
117
119
#ifdef WAL_DEBUG
118
120
bool XLOG_DEBUG = false;
@@ -182,6 +184,26 @@ const struct config_enum_entry recovery_target_action_options[] = {
182
184
{NULL , 0 , false}
183
185
};
184
186
187
+
188
+ typedef struct LastWrittenLsnCacheEntry
189
+ {
190
+ BufferTag key ;
191
+ XLogRecPtr lsn ;
192
+ /* double linked list for LRU replacement algorithm */
193
+ dlist_node lru_node ;
194
+ } LastWrittenLsnCacheEntry ;
195
+
196
+
197
+ /*
198
+ * Cache of last written LSN for each relation chunk (hash bucket).
199
+ * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last
200
+ * relation metadata update.
201
+ * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"),
202
+ * pages are replaced using LRU algorithm, based on L2-list.
203
+ * Access to this cache is protected by 'LastWrittenLsnLock'.
204
+ */
205
+ static HTAB * lastWrittenLsnCache ;
206
+
185
207
/*
186
208
* Statistics for current checkpoint are collected in this global struct.
187
209
* Because only the checkpointer or a stand-alone backend can perform
@@ -749,7 +771,17 @@ typedef struct XLogCtlData
749
771
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
750
772
*/
751
773
XLogRecPtr lastFpwDisableRecPtr ;
752
- XLogRecPtr lastWrittenPageLSN ;
774
+
775
+ /*
776
+ * Maximal last written LSN for pages not present in lastWrittenLsnCache
777
+ */
778
+ XLogRecPtr maxLastWrittenLsn ;
779
+
780
+ /*
781
+ * Double linked list to implement LRU replacement policy for last written LSN cache.
782
+ * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'.
783
+ */
784
+ dlist_head lastWrittenLsnLRU ;
753
785
754
786
/* neon: copy of startup's RedoStartLSN for walproposer's use */
755
787
XLogRecPtr RedoStartLSN ;
@@ -772,6 +804,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
772
804
*/
773
805
static ControlFileData * ControlFile = NULL ;
774
806
807
/*
 * Number of consecutive blocks covered by one last-written-LSN cache entry:
 * 1024 blocks = 8MB with the standard 8KB block size.
 */
#define LAST_WRITTEN_LSN_CACHE_BUCKET	1024
775
809
/*
776
810
* Calculate the amount of space left on the page after 'endptr'. Beware
777
811
* multiple evaluation!
@@ -5141,11 +5175,8 @@ LocalProcessControlFile(bool reset)
5141
5175
ReadControlFile ();
5142
5176
}
5143
5177
5144
- /*
5145
- * Initialization of shared memory for XLOG
5146
- */
5147
- Size
5148
- XLOGShmemSize (void )
5178
+ static Size
5179
+ XLOGCtlShmemSize (void )
5149
5180
{
5150
5181
Size size ;
5151
5182
@@ -5185,6 +5216,16 @@ XLOGShmemSize(void)
5185
5216
return size ;
5186
5217
}
5187
5218
5219
+ /*
5220
+ * Initialization of shared memory for XLOG
5221
+ */
5222
+ Size
5223
+ XLOGShmemSize (void )
5224
+ {
5225
+ return XLOGCtlShmemSize () +
5226
+ hash_estimate_size (lastWrittenLsnCacheSize , sizeof (LastWrittenLsnCacheEntry ));
5227
+ }
5228
+
5188
5229
void
5189
5230
XLOGShmemInit (void )
5190
5231
{
@@ -5214,6 +5255,15 @@ XLOGShmemInit(void)
5214
5255
XLogCtl = (XLogCtlData * )
5215
5256
ShmemInitStruct ("XLOG Ctl" , XLOGShmemSize (), & foundXLog );
5216
5257
5258
+ {
5259
+ static HASHCTL info ;
5260
+ info .keysize = sizeof (BufferTag );
5261
+ info .entrysize = sizeof (LastWrittenLsnCacheEntry );
5262
+ lastWrittenLsnCache = ShmemInitHash ("last_written_lsn_cache" ,
5263
+ lastWrittenLsnCacheSize , lastWrittenLsnCacheSize ,
5264
+ & info ,
5265
+ HASH_ELEM | HASH_BLOBS );
5266
+ }
5217
5267
localControlFile = ControlFile ;
5218
5268
ControlFile = (ControlFileData * )
5219
5269
ShmemInitStruct ("Control File" , sizeof (ControlFileData ), & foundCFile );
@@ -8117,7 +8167,8 @@ StartupXLOG(void)
8117
8167
8118
8168
XLogCtl -> LogwrtRqst .Write = EndOfLog ;
8119
8169
XLogCtl -> LogwrtRqst .Flush = EndOfLog ;
8120
- XLogCtl -> lastWrittenPageLSN = EndOfLog ;
8170
+ XLogCtl -> maxLastWrittenLsn = EndOfLog ;
8171
+ dlist_init (& XLogCtl -> lastWrittenLsnLRU );
8121
8172
8122
8173
LocalSetXLogInsertAllowed ();
8123
8174
@@ -8889,29 +8940,141 @@ GetInsertRecPtr(void)
8889
8940
}
8890
8941
8891
8942
/*
8892
- * GetLastWrittenPageLSN -- Returns maximal LSN of written page
8943
+ * GetLastWrittenLSN -- Returns maximal LSN of written page.
8944
+ * It returns an upper bound for the last written LSN of a given page,
8945
+ * either from a cached last written LSN or a global maximum last written LSN.
8946
+ * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn.
8947
+ * If cache is large enough ,iterting through all hash items may be rather expensive.
8948
+ * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
8893
8949
*/
8894
8950
XLogRecPtr
8895
- GetLastWrittenPageLSN ( void )
8951
+ GetLastWrittenLSN ( RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
8896
8952
{
8897
8953
XLogRecPtr lsn ;
8898
- SpinLockAcquire (& XLogCtl -> info_lck );
8899
- lsn = XLogCtl -> lastWrittenPageLSN ;
8900
- SpinLockRelease (& XLogCtl -> info_lck );
8954
+ LastWrittenLsnCacheEntry * entry ;
8955
+
8956
+ LWLockAcquire (LastWrittenLsnLock , LW_SHARED );
8957
+
8958
+ /* Maximal last written LSN among all non-cached pages */
8959
+ lsn = XLogCtl -> maxLastWrittenLsn ;
8960
+
8961
+ if (rnode .relNode != InvalidOid )
8962
+ {
8963
+ BufferTag key ;
8964
+ key .rnode = rnode ;
8965
+ key .forkNum = forknum ;
8966
+ key .blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET ;
8967
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_FIND , NULL );
8968
+ if (entry != NULL )
8969
+ lsn = entry -> lsn ;
8970
+ }
8971
+ else
8972
+ {
8973
+ HASH_SEQ_STATUS seq ;
8974
+ /* Find maximum of all cached LSNs */
8975
+ hash_seq_init (& seq , lastWrittenLsnCache );
8976
+ while ((entry = (LastWrittenLsnCacheEntry * ) hash_seq_search (& seq )) != NULL )
8977
+ {
8978
+ if (entry -> lsn > lsn )
8979
+ lsn = entry -> lsn ;
8980
+ }
8981
+ }
8982
+ LWLockRelease (LastWrittenLsnLock );
8901
8983
8902
8984
return lsn ;
8903
8985
}
8904
8986
8905
8987
/*
8906
- * SetLastWrittenPageLSN -- Set maximal LSN of written page
8988
+ * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
8989
+ * We maintain cache of last written LSNs with limited size and LRU replacement
8990
+ * policy. To reduce cache size we store max LSN not for each page, but for
8991
+ * bucket (1024 blocks). This cache allows to use old LSN when
8992
+ * requesting pages of unchanged or appended relations.
8993
+ *
8994
+ * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated.
8995
+ * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions.
8907
8996
*/
8908
8997
void
8909
- SetLastWrittenPageLSN (XLogRecPtr lsn )
8998
+ SetLastWrittenLSNForBlockRange (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber from , BlockNumber till )
8910
8999
{
8911
- SpinLockAcquire (& XLogCtl -> info_lck );
8912
- if (lsn > XLogCtl -> lastWrittenPageLSN )
8913
- XLogCtl -> lastWrittenPageLSN = lsn ;
8914
- SpinLockRelease (& XLogCtl -> info_lck );
9000
+ if (lsn == InvalidXLogRecPtr )
9001
+ return ;
9002
+
9003
+ LWLockAcquire (LastWrittenLsnLock , LW_EXCLUSIVE );
9004
+ if (rnode .relNode == InvalidOid )
9005
+ {
9006
+ if (lsn > XLogCtl -> maxLastWrittenLsn )
9007
+ XLogCtl -> maxLastWrittenLsn = lsn ;
9008
+ }
9009
+ else
9010
+ {
9011
+ LastWrittenLsnCacheEntry * entry ;
9012
+ BufferTag key ;
9013
+ bool found ;
9014
+ BlockNumber bucket ;
9015
+
9016
+ key .rnode = rnode ;
9017
+ key .forkNum = forknum ;
9018
+ for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET ;
9019
+ bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET ;
9020
+ bucket ++ )
9021
+ {
9022
+ key .blockNum = bucket ;
9023
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_ENTER , & found );
9024
+ if (found )
9025
+ {
9026
+ if (lsn > entry -> lsn )
9027
+ entry -> lsn = lsn ;
9028
+ /* Unlink from LRU list */
9029
+ dlist_delete (& entry -> lru_node );
9030
+ }
9031
+ else
9032
+ {
9033
+ entry -> lsn = lsn ;
9034
+ if (hash_get_num_entries (lastWrittenLsnCache ) > lastWrittenLsnCacheSize )
9035
+ {
9036
+ /* Replace least recently used entry */
9037
+ LastWrittenLsnCacheEntry * victim = dlist_container (LastWrittenLsnCacheEntry , lru_node , dlist_pop_head_node (& XLogCtl -> lastWrittenLsnLRU ));
9038
+ /* Adjust max LSN for not cached relations/chunks if needed */
9039
+ if (victim -> lsn > XLogCtl -> maxLastWrittenLsn )
9040
+ XLogCtl -> maxLastWrittenLsn = victim -> lsn ;
9041
+
9042
+ hash_search (lastWrittenLsnCache , victim , HASH_REMOVE , NULL );
9043
+ }
9044
+ }
9045
+ /* Link to the end of LRU list */
9046
+ dlist_push_tail (& XLogCtl -> lastWrittenLsnLRU , & entry -> lru_node );
9047
+ }
9048
+ }
9049
+ LWLockRelease (LastWrittenLsnLock );
9050
+ }
9051
+
9052
+ /*
9053
+ * SetLastWrittenLSNForBlock -- Set maximal LSN for block
9054
+ */
9055
+ void
9056
+ SetLastWrittenLSNForBlock (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
9057
+ {
9058
+ SetLastWrittenLSNForBlockRange (lsn , rnode , forknum , blkno , blkno );
9059
+ }
9060
+
9061
+ /*
9062
+ * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata
9063
+ */
9064
+ void
9065
+ SetLastWrittenLSNForRelation (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum )
9066
+ {
9067
+ SetLastWrittenLSNForBlock (lsn , rnode , forknum , REL_METADATA_PSEUDO_BLOCKNO );
9068
+ }
9069
+
9070
+ /*
9071
+ * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database
9072
+ */
9073
+ void
9074
+ SetLastWrittenLSNForDatabase (XLogRecPtr lsn )
9075
+ {
9076
+ RelFileNode dummyNode = {InvalidOid , InvalidOid , InvalidOid };
9077
+ SetLastWrittenLSNForBlock (lsn , dummyNode , MAIN_FORKNUM , 0 );
8915
9078
}
8916
9079
8917
9080
/*
0 commit comments