61
61
#include "replication/walreceiver.h"
62
62
#include "replication/walsender.h"
63
63
#include "storage/bufmgr.h"
64
+ #include "storage/buf_internals.h"
64
65
#include "storage/fd.h"
65
66
#include "storage/ipc.h"
66
67
#include "storage/large_object.h"
@@ -113,6 +114,7 @@ int wal_retrieve_retry_interval = 5000;
113
114
int max_slot_wal_keep_size_mb = -1 ;
114
115
bool track_wal_io_timing = false;
115
116
uint64 predefined_sysidentifier ;
117
+ int lastWrittenLsnCacheSize ;
116
118
117
119
#ifdef WAL_DEBUG
118
120
bool XLOG_DEBUG = false;
@@ -182,6 +184,26 @@ const struct config_enum_entry recovery_target_action_options[] = {
182
184
{NULL , 0 , false}
183
185
};
184
186
187
+
188
+ typedef struct LastWrittenLsnCacheEntry
189
+ {
190
+ BufferTag key ;
191
+ XLogRecPtr lsn ;
192
+ /* double linked list for LRU replacement algorithm */
193
+ dlist_node lru_node ;
194
+ } LastWrittenLsnCacheEntry ;
195
+
196
+
197
+ /*
198
+ * Cache of last written LSN for each relation chunk (hash bucket).
199
+ * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last
200
+ * relation metadata update.
201
+ * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"),
202
+ * pages are replaced using LRU algorithm, based on L2-list.
203
+ * Access to this cache is protected by 'LastWrittenLsnLock'.
204
+ */
205
+ static HTAB * lastWrittenLsnCache ;
206
+
185
207
/*
186
208
* Statistics for current checkpoint are collected in this global struct.
187
209
* Because only the checkpointer or a stand-alone backend can perform
@@ -749,7 +771,17 @@ typedef struct XLogCtlData
749
771
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
750
772
*/
751
773
XLogRecPtr lastFpwDisableRecPtr ;
752
- XLogRecPtr lastWrittenPageLSN ;
774
+
775
+ /*
776
+ * Maximal last written LSN for pages not present in lastWrittenLsnCache
777
+ */
778
+ XLogRecPtr maxLastWrittenLsn ;
779
+
780
+ /*
781
+ * Double linked list to implement LRU replacement policy for last written LSN cache.
782
+ * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'.
783
+ */
784
+ dlist_head lastWrittenLsnLRU ;
753
785
754
786
/* neon: copy of startup's RedoStartLSN for walproposer's use */
755
787
XLogRecPtr RedoStartLSN ;
@@ -772,6 +804,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
772
804
*/
773
805
static ControlFileData * ControlFile = NULL ;
774
806
807
/* Blocks per last-written-LSN cache bucket: 1024 blocks = 8MB of 8KB pages */
#define LAST_WRITTEN_LSN_CACHE_BUCKET	1024
808
+
775
809
/*
776
810
* Calculate the amount of space left on the page after 'endptr'. Beware
777
811
* multiple evaluation!
@@ -5141,11 +5175,8 @@ LocalProcessControlFile(bool reset)
5141
5175
ReadControlFile ();
5142
5176
}
5143
5177
5144
- /*
5145
- * Initialization of shared memory for XLOG
5146
- */
5147
- Size
5148
- XLOGShmemSize (void )
5178
+ static Size
5179
+ XLOGCtlShmemSize (void )
5149
5180
{
5150
5181
Size size ;
5151
5182
@@ -5185,6 +5216,16 @@ XLOGShmemSize(void)
5185
5216
return size ;
5186
5217
}
5187
5218
5219
+ /*
5220
+ * Initialization of shared memory for XLOG
5221
+ */
5222
+ Size
5223
+ XLOGShmemSize (void )
5224
+ {
5225
+ return XLOGCtlShmemSize () +
5226
+ hash_estimate_size (lastWrittenLsnCacheSize , sizeof (LastWrittenLsnCacheEntry ));
5227
+ }
5228
+
5188
5229
void
5189
5230
XLOGShmemInit (void )
5190
5231
{
@@ -5214,6 +5255,15 @@ XLOGShmemInit(void)
5214
5255
XLogCtl = (XLogCtlData * )
5215
5256
ShmemInitStruct ("XLOG Ctl" , XLOGShmemSize (), & foundXLog );
5216
5257
5258
+ {
5259
+ static HASHCTL info ;
5260
+ info .keysize = sizeof (BufferTag );
5261
+ info .entrysize = sizeof (LastWrittenLsnCacheEntry );
5262
+ lastWrittenLsnCache = ShmemInitHash ("last_written_lsn_cache" ,
5263
+ lastWrittenLsnCacheSize , lastWrittenLsnCacheSize ,
5264
+ & info ,
5265
+ HASH_ELEM | HASH_BLOBS );
5266
+ }
5217
5267
localControlFile = ControlFile ;
5218
5268
ControlFile = (ControlFileData * )
5219
5269
ShmemInitStruct ("Control File" , sizeof (ControlFileData ), & foundCFile );
@@ -8117,7 +8167,8 @@ StartupXLOG(void)
8117
8167
8118
8168
XLogCtl -> LogwrtRqst .Write = EndOfLog ;
8119
8169
XLogCtl -> LogwrtRqst .Flush = EndOfLog ;
8120
- XLogCtl -> lastWrittenPageLSN = EndOfLog ;
8170
+ XLogCtl -> maxLastWrittenLsn = EndOfLog ;
8171
+ dlist_init (& XLogCtl -> lastWrittenLsnLRU );
8121
8172
8122
8173
LocalSetXLogInsertAllowed ();
8123
8174
@@ -8893,29 +8944,141 @@ GetInsertRecPtr(void)
8893
8944
}
8894
8945
8895
8946
/*
8896
- * GetLastWrittenPageLSN -- Returns maximal LSN of written page
8947
+ * GetLastWrittenLSN -- Returns maximal LSN of written page.
8948
+ * It returns an upper bound for the last written LSN of a given page,
8949
+ * either from a cached last written LSN or a global maximum last written LSN.
8950
+ * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn.
8951
+ * If cache is large enough ,iterting through all hash items may be rather expensive.
8952
+ * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
8897
8953
*/
8898
8954
XLogRecPtr
8899
- GetLastWrittenPageLSN ( void )
8955
+ GetLastWrittenLSN ( RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
8900
8956
{
8901
8957
XLogRecPtr lsn ;
8902
- SpinLockAcquire (& XLogCtl -> info_lck );
8903
- lsn = XLogCtl -> lastWrittenPageLSN ;
8904
- SpinLockRelease (& XLogCtl -> info_lck );
8958
+ LastWrittenLsnCacheEntry * entry ;
8959
+
8960
+ LWLockAcquire (LastWrittenLsnLock , LW_SHARED );
8961
+
8962
+ /* Maximal last written LSN among all non-cached pages */
8963
+ lsn = XLogCtl -> maxLastWrittenLsn ;
8964
+
8965
+ if (rnode .relNode != InvalidOid )
8966
+ {
8967
+ BufferTag key ;
8968
+ key .rnode = rnode ;
8969
+ key .forkNum = forknum ;
8970
+ key .blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET ;
8971
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_FIND , NULL );
8972
+ if (entry != NULL )
8973
+ lsn = entry -> lsn ;
8974
+ }
8975
+ else
8976
+ {
8977
+ HASH_SEQ_STATUS seq ;
8978
+ /* Find maximum of all cached LSNs */
8979
+ hash_seq_init (& seq , lastWrittenLsnCache );
8980
+ while ((entry = (LastWrittenLsnCacheEntry * ) hash_seq_search (& seq )) != NULL )
8981
+ {
8982
+ if (entry -> lsn > lsn )
8983
+ lsn = entry -> lsn ;
8984
+ }
8985
+ }
8986
+ LWLockRelease (LastWrittenLsnLock );
8905
8987
8906
8988
return lsn ;
8907
8989
}
8908
8990
8909
8991
/*
8910
- * SetLastWrittenPageLSN -- Set maximal LSN of written page
8992
+ * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
8993
+ * We maintain cache of last written LSNs with limited size and LRU replacement
8994
+ * policy. To reduce cache size we store max LSN not for each page, but for
8995
+ * bucket (1024 blocks). This cache allows to use old LSN when
8996
+ * requesting pages of unchanged or appended relations.
8997
+ *
8998
+ * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated.
8999
+ * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions.
8911
9000
*/
8912
9001
void
8913
- SetLastWrittenPageLSN (XLogRecPtr lsn )
9002
+ SetLastWrittenLSNForBlockRange (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber from , BlockNumber till )
8914
9003
{
8915
- SpinLockAcquire (& XLogCtl -> info_lck );
8916
- if (lsn > XLogCtl -> lastWrittenPageLSN )
8917
- XLogCtl -> lastWrittenPageLSN = lsn ;
8918
- SpinLockRelease (& XLogCtl -> info_lck );
9004
+ if (lsn == InvalidXLogRecPtr )
9005
+ return ;
9006
+
9007
+ LWLockAcquire (LastWrittenLsnLock , LW_EXCLUSIVE );
9008
+ if (rnode .relNode == InvalidOid )
9009
+ {
9010
+ if (lsn > XLogCtl -> maxLastWrittenLsn )
9011
+ XLogCtl -> maxLastWrittenLsn = lsn ;
9012
+ }
9013
+ else
9014
+ {
9015
+ LastWrittenLsnCacheEntry * entry ;
9016
+ BufferTag key ;
9017
+ bool found ;
9018
+ BlockNumber bucket ;
9019
+
9020
+ key .rnode = rnode ;
9021
+ key .forkNum = forknum ;
9022
+ for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET ;
9023
+ bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET ;
9024
+ bucket ++ )
9025
+ {
9026
+ key .blockNum = bucket ;
9027
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_ENTER , & found );
9028
+ if (found )
9029
+ {
9030
+ if (lsn > entry -> lsn )
9031
+ entry -> lsn = lsn ;
9032
+ /* Unlink from LRU list */
9033
+ dlist_delete (& entry -> lru_node );
9034
+ }
9035
+ else
9036
+ {
9037
+ entry -> lsn = lsn ;
9038
+ if (hash_get_num_entries (lastWrittenLsnCache ) > lastWrittenLsnCacheSize )
9039
+ {
9040
+ /* Replace least recently used entry */
9041
+ LastWrittenLsnCacheEntry * victim = dlist_container (LastWrittenLsnCacheEntry , lru_node , dlist_pop_head_node (& XLogCtl -> lastWrittenLsnLRU ));
9042
+ /* Adjust max LSN for not cached relations/chunks if needed */
9043
+ if (victim -> lsn > XLogCtl -> maxLastWrittenLsn )
9044
+ XLogCtl -> maxLastWrittenLsn = victim -> lsn ;
9045
+
9046
+ hash_search (lastWrittenLsnCache , victim , HASH_REMOVE , NULL );
9047
+ }
9048
+ }
9049
+ /* Link to the end of LRU list */
9050
+ dlist_push_tail (& XLogCtl -> lastWrittenLsnLRU , & entry -> lru_node );
9051
+ }
9052
+ }
9053
+ LWLockRelease (LastWrittenLsnLock );
9054
+ }
9055
+
9056
+ /*
9057
+ * SetLastWrittenLSNForBlock -- Set maximal LSN for block
9058
+ */
9059
+ void
9060
+ SetLastWrittenLSNForBlock (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
9061
+ {
9062
+ SetLastWrittenLSNForBlockRange (lsn , rnode , forknum , blkno , blkno );
9063
+ }
9064
+
9065
+ /*
9066
+ * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata
9067
+ */
9068
+ void
9069
+ SetLastWrittenLSNForRelation (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum )
9070
+ {
9071
+ SetLastWrittenLSNForBlock (lsn , rnode , forknum , REL_METADATA_PSEUDO_BLOCKNO );
9072
+ }
9073
+
9074
+ /*
9075
+ * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database
9076
+ */
9077
+ void
9078
+ SetLastWrittenLSNForDatabase (XLogRecPtr lsn )
9079
+ {
9080
+ RelFileNode dummyNode = {InvalidOid , InvalidOid , InvalidOid };
9081
+ SetLastWrittenLSNForBlock (lsn , dummyNode , MAIN_FORKNUM , 0 );
8919
9082
}
8920
9083
8921
9084
/*
0 commit comments