61
61
#include "replication/walreceiver.h"
62
62
#include "replication/walsender.h"
63
63
#include "storage/bufmgr.h"
64
+ #include "storage/buf_internals.h"
64
65
#include "storage/fd.h"
65
66
#include "storage/ipc.h"
66
67
#include "storage/large_object.h"
@@ -113,6 +114,7 @@ int wal_retrieve_retry_interval = 5000;
113
114
int max_slot_wal_keep_size_mb = -1 ;
114
115
bool track_wal_io_timing = false;
115
116
uint64 predefined_sysidentifier ;
117
+ int lastWrittenLsnCacheSize ; /* capacity, in entries, of the last-written-LSN cache */
116
118
117
119
#ifdef WAL_DEBUG
118
120
bool XLOG_DEBUG = false;
@@ -182,6 +184,26 @@ const struct config_enum_entry recovery_target_action_options[] = {
182
184
{NULL , 0 , false}
183
185
};
184
186
187
+
188
+ typedef struct LastWrittenLsnCacheEntry
189
+ {
190
+ BufferTag key ;
191
+ XLogRecPtr lsn ;
192
+ /* double linked list for LRU replacement algorithm */
193
+ dlist_node lru_node ;
194
+ } LastWrittenLsnCacheEntry ;
195
+
196
+
197
+ /*
198
+ * Cache of last written LSN for each relation chunk (hash bucket).
199
+ * To provide a request LSN for smgrnblocks and smgrexists, there is also a pseudo-key
200
+ * (REL_METADATA_PSEUDO_BLOCKNO) which stores the LSN of the last relation metadata update.
201
+ * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"),
202
+ * entries are replaced using an LRU algorithm based on a doubly linked list.
203
+ * Access to this cache is protected by 'LastWrittenLsnLock'.
204
+ */
205
+ static HTAB * lastWrittenLsnCache ;
206
+
185
207
/*
186
208
* Statistics for current checkpoint are collected in this global struct.
187
209
* Because only the checkpointer or a stand-alone backend can perform
@@ -749,7 +771,17 @@ typedef struct XLogCtlData
749
771
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
750
772
*/
751
773
XLogRecPtr lastFpwDisableRecPtr ;
752
- XLogRecPtr lastWrittenPageLSN ;
774
+
775
+ /*
776
+ * Maximal last written LSN for pages not present in lastWrittenLsnCache
777
+ */
778
+ XLogRecPtr maxLastWrittenLsn ;
779
+
780
+ /*
781
+ * Double linked list to implement LRU replacement policy for last written LSN cache.
782
+ * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'.
783
+ */
784
+ dlist_head lastWrittenLsnLRU ;
753
785
754
786
/* neon: copy of startup's RedoStartLSN for walproposer's use */
755
787
XLogRecPtr RedoStartLSN ;
@@ -772,6 +804,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
772
804
*/
773
805
static ControlFileData * ControlFile = NULL ;
774
806
807
+ #define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks per cache bucket; 8MB assuming 8kB blocks -- TODO confirm BLCKSZ */
808
+
775
809
/*
776
810
* Calculate the amount of space left on the page after 'endptr'. Beware
777
811
* multiple evaluation!
@@ -5135,11 +5169,8 @@ LocalProcessControlFile(bool reset)
5135
5169
ReadControlFile ();
5136
5170
}
5137
5171
5138
- /*
5139
- * Initialization of shared memory for XLOG
5140
- */
5141
- Size
5142
- XLOGShmemSize (void )
5172
+ static Size
5173
+ XLOGCtlShmemSize (void )
5143
5174
{
5144
5175
Size size ;
5145
5176
@@ -5179,6 +5210,16 @@ XLOGShmemSize(void)
5179
5210
return size ;
5180
5211
}
5181
5212
5213
+ /*
5214
+ * Initialization of shared memory for XLOG
5215
+ */
5216
+ Size
5217
+ XLOGShmemSize (void )
5218
+ {
5219
+ return XLOGCtlShmemSize () +
5220
+ hash_estimate_size (lastWrittenLsnCacheSize , sizeof (LastWrittenLsnCacheEntry ));
5221
+ }
5222
+
5182
5223
void
5183
5224
XLOGShmemInit (void )
5184
5225
{
@@ -5208,6 +5249,15 @@ XLOGShmemInit(void)
5208
5249
XLogCtl = (XLogCtlData * )
5209
5250
ShmemInitStruct ("XLOG Ctl" , XLOGShmemSize (), & foundXLog );
5210
5251
5252
+ {
5253
+ static HASHCTL info ;
5254
+ info .keysize = sizeof (BufferTag );
5255
+ info .entrysize = sizeof (LastWrittenLsnCacheEntry );
5256
+ lastWrittenLsnCache = ShmemInitHash ("last_written_lsn_cache" ,
5257
+ lastWrittenLsnCacheSize , lastWrittenLsnCacheSize ,
5258
+ & info ,
5259
+ HASH_ELEM | HASH_BLOBS );
5260
+ }
5211
5261
localControlFile = ControlFile ;
5212
5262
ControlFile = (ControlFileData * )
5213
5263
ShmemInitStruct ("Control File" , sizeof (ControlFileData ), & foundCFile );
@@ -8098,7 +8148,8 @@ StartupXLOG(void)
8098
8148
8099
8149
XLogCtl -> LogwrtRqst .Write = EndOfLog ;
8100
8150
XLogCtl -> LogwrtRqst .Flush = EndOfLog ;
8101
- XLogCtl -> lastWrittenPageLSN = EndOfLog ;
8151
+ XLogCtl -> maxLastWrittenLsn = EndOfLog ;
8152
+ dlist_init (& XLogCtl -> lastWrittenLsnLRU );
8102
8153
8103
8154
LocalSetXLogInsertAllowed ();
8104
8155
@@ -8870,29 +8921,141 @@ GetInsertRecPtr(void)
8870
8921
}
8871
8922
8872
8923
/*
8873
- * GetLastWrittenPageLSN -- Returns maximal LSN of written page
8924
+ * GetLastWrittenLSN -- Returns maximal LSN of written page.
8925
+ * It returns an upper bound for the last written LSN of a given page,
8926
+ * either from a cached last written LSN or a global maximum last written LSN.
8927
+ * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn.
8928
+ * If cache is large enough ,iterting through all hash items may be rather expensive.
8929
+ * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
8874
8930
*/
8875
8931
XLogRecPtr
8876
- GetLastWrittenPageLSN ( void )
8932
+ GetLastWrittenLSN ( RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
8877
8933
{
8878
8934
XLogRecPtr lsn ;
8879
- SpinLockAcquire (& XLogCtl -> info_lck );
8880
- lsn = XLogCtl -> lastWrittenPageLSN ;
8881
- SpinLockRelease (& XLogCtl -> info_lck );
8935
+ LastWrittenLsnCacheEntry * entry ;
8936
+
8937
+ LWLockAcquire (LastWrittenLsnLock , LW_SHARED );
8938
+
8939
+ /* Maximal last written LSN among all non-cached pages */
8940
+ lsn = XLogCtl -> maxLastWrittenLsn ;
8941
+
8942
+ if (rnode .relNode != InvalidOid )
8943
+ {
8944
+ BufferTag key ;
8945
+ key .rnode = rnode ;
8946
+ key .forkNum = forknum ;
8947
+ key .blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET ;
8948
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_FIND , NULL );
8949
+ if (entry != NULL )
8950
+ lsn = entry -> lsn ;
8951
+ }
8952
+ else
8953
+ {
8954
+ HASH_SEQ_STATUS seq ;
8955
+ /* Find maximum of all cached LSNs */
8956
+ hash_seq_init (& seq , lastWrittenLsnCache );
8957
+ while ((entry = (LastWrittenLsnCacheEntry * ) hash_seq_search (& seq )) != NULL )
8958
+ {
8959
+ if (entry -> lsn > lsn )
8960
+ lsn = entry -> lsn ;
8961
+ }
8962
+ }
8963
+ LWLockRelease (LastWrittenLsnLock );
8882
8964
8883
8965
return lsn ;
8884
8966
}
8885
8967
8886
8968
/*
8887
- * SetLastWrittenPageLSN -- Set maximal LSN of written page
8969
+ * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
8970
+ * We maintain cache of last written LSNs with limited size and LRU replacement
8971
+ * policy. To reduce cache size we store max LSN not for each page, but for
8972
+ * bucket (1024 blocks). This cache allows to use old LSN when
8973
+ * requesting pages of unchanged or appended relations.
8974
+ *
8975
+ * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated.
8976
+ * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions.
8888
8977
*/
8889
8978
void
8890
- SetLastWrittenPageLSN (XLogRecPtr lsn )
8979
+ SetLastWrittenLSNForBlockRange (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber from , BlockNumber till )
8891
8980
{
8892
- SpinLockAcquire (& XLogCtl -> info_lck );
8893
- if (lsn > XLogCtl -> lastWrittenPageLSN )
8894
- XLogCtl -> lastWrittenPageLSN = lsn ;
8895
- SpinLockRelease (& XLogCtl -> info_lck );
8981
+ if (lsn == InvalidXLogRecPtr )
8982
+ return ;
8983
+
8984
+ LWLockAcquire (LastWrittenLsnLock , LW_EXCLUSIVE );
8985
+ if (rnode .relNode == InvalidOid )
8986
+ {
8987
+ if (lsn > XLogCtl -> maxLastWrittenLsn )
8988
+ XLogCtl -> maxLastWrittenLsn = lsn ;
8989
+ }
8990
+ else
8991
+ {
8992
+ LastWrittenLsnCacheEntry * entry ;
8993
+ BufferTag key ;
8994
+ bool found ;
8995
+ BlockNumber bucket ;
8996
+
8997
+ key .rnode = rnode ;
8998
+ key .forkNum = forknum ;
8999
+ for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET ;
9000
+ bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET ;
9001
+ bucket ++ )
9002
+ {
9003
+ key .blockNum = bucket ;
9004
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_ENTER , & found );
9005
+ if (found )
9006
+ {
9007
+ if (lsn > entry -> lsn )
9008
+ entry -> lsn = lsn ;
9009
+ /* Unlink from LRU list */
9010
+ dlist_delete (& entry -> lru_node );
9011
+ }
9012
+ else
9013
+ {
9014
+ entry -> lsn = lsn ;
9015
+ if (hash_get_num_entries (lastWrittenLsnCache ) > lastWrittenLsnCacheSize )
9016
+ {
9017
+ /* Replace least recently used entry */
9018
+ LastWrittenLsnCacheEntry * victim = dlist_container (LastWrittenLsnCacheEntry , lru_node , dlist_pop_head_node (& XLogCtl -> lastWrittenLsnLRU ));
9019
+ /* Adjust max LSN for not cached relations/chunks if needed */
9020
+ if (victim -> lsn > XLogCtl -> maxLastWrittenLsn )
9021
+ XLogCtl -> maxLastWrittenLsn = victim -> lsn ;
9022
+
9023
+ hash_search (lastWrittenLsnCache , victim , HASH_REMOVE , NULL );
9024
+ }
9025
+ }
9026
+ /* Link to the end of LRU list */
9027
+ dlist_push_tail (& XLogCtl -> lastWrittenLsnLRU , & entry -> lru_node );
9028
+ }
9029
+ }
9030
+ LWLockRelease (LastWrittenLsnLock );
9031
+ }
9032
+
9033
+ /*
9034
+ * SetLastWrittenLSNForBlock -- Set maximal LSN for block
9035
+ */
9036
+ void
9037
+ SetLastWrittenLSNForBlock (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
9038
+ {
9039
+ SetLastWrittenLSNForBlockRange (lsn , rnode , forknum , blkno , blkno );
9040
+ }
9041
+
9042
+ /*
9043
+ * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata
9044
+ */
9045
+ void
9046
+ SetLastWrittenLSNForRelation (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum )
9047
+ {
9048
+ SetLastWrittenLSNForBlock (lsn , rnode , forknum , REL_METADATA_PSEUDO_BLOCKNO );
9049
+ }
9050
+
9051
+ /*
9052
+ * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database
9053
+ */
9054
+ void
9055
+ SetLastWrittenLSNForDatabase (XLogRecPtr lsn )
9056
+ {
9057
+ RelFileNode dummyNode = {InvalidOid , InvalidOid , InvalidOid };
9058
+ SetLastWrittenLSNForBlock (lsn , dummyNode , MAIN_FORKNUM , 0 );
8896
9059
}
8897
9060
8898
9061
/*
0 commit comments