61
61
#include "replication/walreceiver.h"
62
62
#include "replication/walsender.h"
63
63
#include "storage/bufmgr.h"
64
+ #include "storage/buf_internals.h"
64
65
#include "storage/fd.h"
65
66
#include "storage/ipc.h"
66
67
#include "storage/large_object.h"
@@ -113,6 +114,7 @@ int wal_retrieve_retry_interval = 5000;
113
114
int max_slot_wal_keep_size_mb = -1 ;
114
115
bool track_wal_io_timing = false;
115
116
uint64 predefined_sysidentifier ;
117
+ int lastWrittenLsnCacheSize ;
116
118
117
119
#ifdef WAL_DEBUG
118
120
bool XLOG_DEBUG = false;
@@ -182,6 +184,26 @@ const struct config_enum_entry recovery_target_action_options[] = {
182
184
{NULL , 0 , false}
183
185
};
184
186
187
+
188
/*
 * Entry of the last-written-LSN cache: maps a relation chunk (BufferTag whose
 * blockNum is quantized to LAST_WRITTEN_LSN_CACHE_BUCKET blocks) to the
 * highest LSN written for any page in that chunk.
 */
typedef struct LastWrittenLsnCacheEntry
{
	BufferTag	key;			/* must stay first: used directly as hash key */
	XLogRecPtr	lsn;
	/* double linked list for LRU replacement algorithm */
	dlist_node	lru_node;
} LastWrittenLsnCacheEntry;


/*
 * Cache of last written LSN for each relation chunk (hash bucket).
 * Also to provide request LSN for smgrnblocks, smgrexists there is
 * pseudokey=InvalidBlockId which stores LSN of last relation metadata update.
 * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize
 * ("lsn_cache_size"), pages are replaced using LRU algorithm, based on L2-list.
 * Access to this cache is protected by 'LastWrittenLsnLock'.
 */
static HTAB *lastWrittenLsnCache;
206
+
185
207
/*
186
208
* Statistics for current checkpoint are collected in this global struct.
187
209
* Because only the checkpointer or a stand-alone backend can perform
@@ -749,7 +771,17 @@ typedef struct XLogCtlData
749
771
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
750
772
*/
751
773
XLogRecPtr lastFpwDisableRecPtr ;
752
- XLogRecPtr lastWrittenPageLSN ;
774
+
775
+ /*
776
+ * Maximal last written LSN for pages not present in lastWrittenLsnCache
777
+ */
778
+ XLogRecPtr maxLastWrittenLsn ;
779
+
780
+ /*
781
+ * Double linked list to implement LRU replacement policy for last written LSN cache.
782
+ * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'.
783
+ */
784
+ dlist_head lastWrittenLsnLRU ;
753
785
754
786
/* neon: copy of startup's RedoStartLSN for walproposer's use */
755
787
XLogRecPtr RedoStartLSN ;
@@ -772,6 +804,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
772
804
*/
773
805
static ControlFileData * ControlFile = NULL ;
774
806
807
/*
 * Granularity of the last-written-LSN cache: one cache entry covers this many
 * consecutive blocks of a relation fork.
 */
#define LAST_WRITTEN_LSN_CACHE_BUCKET	1024	/* blocks = 8Mb */
808
+
775
809
/*
776
810
* Calculate the amount of space left on the page after 'endptr'. Beware
777
811
* multiple evaluation!
@@ -5143,11 +5177,8 @@ LocalProcessControlFile(bool reset)
5143
5177
ReadControlFile ();
5144
5178
}
5145
5179
5146
- /*
5147
- * Initialization of shared memory for XLOG
5148
- */
5149
- Size
5150
- XLOGShmemSize (void )
5180
+ static Size
5181
+ XLOGCtlShmemSize (void )
5151
5182
{
5152
5183
Size size ;
5153
5184
@@ -5187,6 +5218,16 @@ XLOGShmemSize(void)
5187
5218
return size ;
5188
5219
}
5189
5220
5221
+ /*
5222
+ * Initialization of shared memory for XLOG
5223
+ */
5224
+ Size
5225
+ XLOGShmemSize (void )
5226
+ {
5227
+ return XLOGCtlShmemSize () +
5228
+ hash_estimate_size (lastWrittenLsnCacheSize , sizeof (LastWrittenLsnCacheEntry ));
5229
+ }
5230
+
5190
5231
void
5191
5232
XLOGShmemInit (void )
5192
5233
{
@@ -5216,6 +5257,15 @@ XLOGShmemInit(void)
5216
5257
XLogCtl = (XLogCtlData * )
5217
5258
ShmemInitStruct ("XLOG Ctl" , XLOGShmemSize (), & foundXLog );
5218
5259
5260
+ {
5261
+ static HASHCTL info ;
5262
+ info .keysize = sizeof (BufferTag );
5263
+ info .entrysize = sizeof (LastWrittenLsnCacheEntry );
5264
+ lastWrittenLsnCache = ShmemInitHash ("last_written_lsn_cache" ,
5265
+ lastWrittenLsnCacheSize , lastWrittenLsnCacheSize ,
5266
+ & info ,
5267
+ HASH_ELEM | HASH_BLOBS );
5268
+ }
5219
5269
localControlFile = ControlFile ;
5220
5270
ControlFile = (ControlFileData * )
5221
5271
ShmemInitStruct ("Control File" , sizeof (ControlFileData ), & foundCFile );
@@ -8119,7 +8169,8 @@ StartupXLOG(void)
8119
8169
8120
8170
XLogCtl -> LogwrtRqst .Write = EndOfLog ;
8121
8171
XLogCtl -> LogwrtRqst .Flush = EndOfLog ;
8122
- XLogCtl -> lastWrittenPageLSN = EndOfLog ;
8172
+ XLogCtl -> maxLastWrittenLsn = EndOfLog ;
8173
+ dlist_init (& XLogCtl -> lastWrittenLsnLRU );
8123
8174
8124
8175
LocalSetXLogInsertAllowed ();
8125
8176
@@ -8895,29 +8946,141 @@ GetInsertRecPtr(void)
8895
8946
}
8896
8947
8897
8948
/*
 * GetLastWrittenLSN -- Returns maximal LSN of written page.
 * It returns an upper bound for the last written LSN of a given page,
 * either from a cached last written LSN or a global maximum last written LSN.
 * If rnode is InvalidOid then we calculate maximum among all cached LSN and
 * maxLastWrittenLsn. If the cache is large enough, iterating through all hash
 * items may be rather expensive. But GetLastWrittenLSN(InvalidOid) is used
 * only by zenith_dbsize which is not performance critical.
 */
XLogRecPtr
GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno)
{
	XLogRecPtr	lsn;
	LastWrittenLsnCacheEntry *entry;

	LWLockAcquire(LastWrittenLsnLock, LW_SHARED);

	/* Maximal last written LSN among all non-cached pages */
	lsn = XLogCtl->maxLastWrittenLsn;

	if (rnode.relNode != InvalidOid)
	{
		BufferTag	key;

		/*
		 * Look up the bucket covering this block.  All BufferTag fields are
		 * assigned before the HASH_BLOBS lookup.
		 */
		key.rnode = rnode;
		key.forkNum = forknum;
		key.blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET;
		entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL);
		if (entry != NULL)
			lsn = entry->lsn;
		/* else: fall back to the global maxLastWrittenLsn upper bound */
	}
	else
	{
		HASH_SEQ_STATUS seq;

		/* Find maximum of all cached LSNs */
		hash_seq_init(&seq, lastWrittenLsnCache);
		while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL)
		{
			if (entry->lsn > lsn)
				lsn = entry->lsn;
		}
	}
	LWLockRelease(LastWrittenLsnLock);

	return lsn;
}
8910
8992
8911
8993
/*
 * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
 * We maintain cache of last written LSNs with limited size and LRU replacement
 * policy. To reduce cache size we store max LSN not for each page, but for
 * bucket (1024 blocks). This cache allows to use old LSN when
 * requesting pages of unchanged or appended relations.
 *
 * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated.
 * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions.
 */
void
SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber till)
{
	/* Invalid LSN carries no information: nothing to record */
	if (lsn == InvalidXLogRecPtr)
		return;

	LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
	if (rnode.relNode == InvalidOid)
	{
		/* Dummy rnode: just advance the global non-cached maximum */
		if (lsn > XLogCtl->maxLastWrittenLsn)
			XLogCtl->maxLastWrittenLsn = lsn;
	}
	else
	{
		LastWrittenLsnCacheEntry *entry;
		BufferTag	key;
		bool		found;
		BlockNumber bucket;

		key.rnode = rnode;
		key.forkNum = forknum;
		/* Touch every cache bucket overlapped by [from, till] */
		for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET;
			 bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET;
			 bucket++)
		{
			key.blockNum = bucket;
			entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found);
			if (found)
			{
				if (lsn > entry->lsn)
					entry->lsn = lsn;
				/* Unlink from LRU list (re-linked at tail below) */
				dlist_delete(&entry->lru_node);
			}
			else
			{
				entry->lsn = lsn;
				if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize)
				{
					/*
					 * Replace least recently used entry.  NOTE(review): this
					 * assumes the LRU list is non-empty here, i.e.
					 * lastWrittenLsnCacheSize >= 1 — confirm the GUC minimum.
					 */
					LastWrittenLsnCacheEntry *victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&XLogCtl->lastWrittenLsnLRU));

					/* Adjust max LSN for not cached relations/chunks if needed */
					if (victim->lsn > XLogCtl->maxLastWrittenLsn)
						XLogCtl->maxLastWrittenLsn = victim->lsn;

					/*
					 * 'victim' doubles as the hash key: 'key' is the first
					 * field of LastWrittenLsnCacheEntry.
					 */
					hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL);
				}
			}
			/* Link to the end of LRU list (most recently used position) */
			dlist_push_tail(&XLogCtl->lastWrittenLsnLRU, &entry->lru_node);
		}
	}
	LWLockRelease(LastWrittenLsnLock);
}
9057
+
9058
/*
 * SetLastWrittenLSNForBlock -- Set maximal LSN for block.
 *
 * Convenience wrapper: updates the single-block range [blkno, blkno].
 */
void
SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno)
{
	SetLastWrittenLSNForBlockRange(lsn, rnode, forknum, blkno, blkno);
}
9066
+
9067
/*
 * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata.
 *
 * Stores the LSN under the pseudo block number REL_METADATA_PSEUDO_BLOCKNO,
 * the per-relation pseudokey consulted for smgrnblocks/smgrexists requests.
 */
void
SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum)
{
	SetLastWrittenLSNForBlock(lsn, rnode, forknum, REL_METADATA_PSEUDO_BLOCKNO);
}
9075
+
9076
+ /*
9077
+ * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database
9078
+ */
9079
+ void
9080
+ SetLastWrittenLSNForDatabase (XLogRecPtr lsn )
9081
+ {
9082
+ RelFileNode dummyNode = {InvalidOid , InvalidOid , InvalidOid };
9083
+ SetLastWrittenLSNForBlock (lsn , dummyNode , MAIN_FORKNUM , 0 );
8921
9084
}
8922
9085
8923
9086
/*
0 commit comments