85
85
#include "replication/walreceiver.h"
86
86
#include "replication/walsender.h"
87
87
#include "storage/bufmgr.h"
88
+ #include "storage/buf_internals.h"
88
89
#include "storage/fd.h"
89
90
#include "storage/ipc.h"
90
91
#include "storage/large_object.h"
@@ -137,6 +138,7 @@ int max_slot_wal_keep_size_mb = -1;
137
138
int wal_decode_buffer_size = 512 * 1024 ;
138
139
bool track_wal_io_timing = false;
139
140
uint64 predefined_sysidentifier ;
141
+ int lastWrittenLsnCacheSize ;
140
142
141
143
#ifdef WAL_DEBUG
142
144
bool XLOG_DEBUG = false;
@@ -199,6 +201,25 @@ const struct config_enum_entry archive_mode_options[] = {
199
201
{NULL , 0 , false}
200
202
};
201
203
204
/*
 * One entry of the last-written-LSN cache: maps a relation chunk (a run of
 * LAST_WRITTEN_LSN_CACHE_BUCKET consecutive blocks of one fork) to the
 * highest LSN written to it so far.
 */
typedef struct LastWrittenLsnCacheEntry
{
	BufferTag	key;			/* rnode + fork, blockNum = chunk number */
	XLogRecPtr	lsn;			/* max last-written LSN of this chunk */
	/* double linked list for LRU replacement algorithm */
	dlist_node	lru_node;
} LastWrittenLsnCacheEntry;


/*
 * Cache of last written LSN for each relation chunk (hash bucket).
 * Also, to provide a request LSN for smgrnblocks/smgrexists, there is a
 * pseudo block number (REL_METADATA_PSEUDO_BLOCKNO, see
 * SetLastWrittenLSNForRelation) which stores the LSN of the last relation
 * metadata update.
 * The size of the cache is limited by the GUC variable
 * lastWrittenLsnCacheSize ("lsn_cache_size"); pages are replaced using an
 * LRU algorithm based on a doubly linked list.
 * Access to this cache is protected by 'LastWrittenLsnLock'.
 */
static HTAB *lastWrittenLsnCache;
202
223
/*
203
224
* Statistics for current checkpoint are collected in this global struct.
204
225
* Because only the checkpointer or a stand-alone backend can perform
@@ -552,7 +573,17 @@ typedef struct XLogCtlData
552
573
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
553
574
*/
554
575
XLogRecPtr lastFpwDisableRecPtr ;
555
- XLogRecPtr lastWrittenPageLSN ;
576
+
577
+ /*
578
+ * Maximal last written LSN for pages not present in lastWrittenLsnCache
579
+ */
580
+ XLogRecPtr maxLastWrittenLsn ;
581
+
582
+ /*
583
+ * Double linked list to implement LRU replacement policy for last written LSN cache.
584
+ * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'.
585
+ */
586
+ dlist_head lastWrittenLsnLRU ;
556
587
557
588
/* neon: copy of startup's RedoStartLSN for walproposer's use */
558
589
XLogRecPtr RedoStartLSN ;
@@ -575,6 +606,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
575
606
*/
576
607
static ControlFileData * ControlFile = NULL ;
577
608
609
/*
 * Chunk size of the last-written-LSN cache: one cache entry covers this many
 * consecutive blocks of a relation fork.  (1024 blocks = 8MB with the
 * default 8KB block size.)
 */
#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024	/* blocks = 8Mb */
578
611
/*
579
612
* Calculate the amount of space left on the page after 'endptr'. Beware
580
613
* multiple evaluation!
@@ -4355,11 +4388,8 @@ LocalProcessControlFile(bool reset)
4355
4388
ReadControlFile ();
4356
4389
}
4357
4390
4358
- /*
4359
- * Initialization of shared memory for XLOG
4360
- */
4361
- Size
4362
- XLOGShmemSize (void )
4391
+ static Size
4392
+ XLOGCtlShmemSize (void )
4363
4393
{
4364
4394
Size size ;
4365
4395
@@ -4408,6 +4438,16 @@ XLOGShmemSize(void)
4408
4438
return size ;
4409
4439
}
4410
4440
4441
+ /*
4442
+ * Initialization of shared memory for XLOG
4443
+ */
4444
+ Size
4445
+ XLOGShmemSize (void )
4446
+ {
4447
+ return XLOGCtlShmemSize () +
4448
+ hash_estimate_size (lastWrittenLsnCacheSize , sizeof (LastWrittenLsnCacheEntry ));
4449
+ }
4450
+
4411
4451
void
4412
4452
XLOGShmemInit (void )
4413
4453
{
@@ -4437,6 +4477,15 @@ XLOGShmemInit(void)
4437
4477
XLogCtl = (XLogCtlData * )
4438
4478
ShmemInitStruct ("XLOG Ctl" , XLOGShmemSize (), & foundXLog );
4439
4479
4480
+ {
4481
+ static HASHCTL info ;
4482
+ info .keysize = sizeof (BufferTag );
4483
+ info .entrysize = sizeof (LastWrittenLsnCacheEntry );
4484
+ lastWrittenLsnCache = ShmemInitHash ("last_written_lsn_cache" ,
4485
+ lastWrittenLsnCacheSize , lastWrittenLsnCacheSize ,
4486
+ & info ,
4487
+ HASH_ELEM | HASH_BLOBS );
4488
+ }
4440
4489
localControlFile = ControlFile ;
4441
4490
ControlFile = (ControlFileData * )
4442
4491
ShmemInitStruct ("Control File" , sizeof (ControlFileData ), & foundCFile );
@@ -5623,7 +5672,8 @@ StartupXLOG(void)
5623
5672
5624
5673
XLogCtl -> LogwrtRqst .Write = EndOfLog ;
5625
5674
XLogCtl -> LogwrtRqst .Flush = EndOfLog ;
5626
- XLogCtl -> lastWrittenPageLSN = EndOfLog ;
5675
+ XLogCtl -> maxLastWrittenLsn = EndOfLog ;
5676
+ dlist_init (& XLogCtl -> lastWrittenLsnLRU );
5627
5677
5628
5678
/*
5629
5679
* Preallocate additional log files, if wanted.
@@ -6047,29 +6097,141 @@ GetInsertRecPtr(void)
6047
6097
}
6048
6098
6049
6099
/*
6050
- * GetLastWrittenPageLSN -- Returns maximal LSN of written page
6100
+ * GetLastWrittenLSN -- Returns maximal LSN of written page.
6101
+ * It returns an upper bound for the last written LSN of a given page,
6102
+ * either from a cached last written LSN or a global maximum last written LSN.
6103
+ * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn.
6104
+ * If cache is large enough ,iterting through all hash items may be rather expensive.
6105
+ * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
6051
6106
*/
6052
6107
XLogRecPtr
6053
- GetLastWrittenPageLSN ( void )
6108
+ GetLastWrittenLSN ( RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
6054
6109
{
6055
6110
XLogRecPtr lsn ;
6056
- SpinLockAcquire (& XLogCtl -> info_lck );
6057
- lsn = XLogCtl -> lastWrittenPageLSN ;
6058
- SpinLockRelease (& XLogCtl -> info_lck );
6111
+ LastWrittenLsnCacheEntry * entry ;
6112
+
6113
+ LWLockAcquire (LastWrittenLsnLock , LW_SHARED );
6114
+
6115
+ /* Maximal last written LSN among all non-cached pages */
6116
+ lsn = XLogCtl -> maxLastWrittenLsn ;
6117
+
6118
+ if (rnode .relNode != InvalidOid )
6119
+ {
6120
+ BufferTag key ;
6121
+ key .rnode = rnode ;
6122
+ key .forkNum = forknum ;
6123
+ key .blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET ;
6124
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_FIND , NULL );
6125
+ if (entry != NULL )
6126
+ lsn = entry -> lsn ;
6127
+ }
6128
+ else
6129
+ {
6130
+ HASH_SEQ_STATUS seq ;
6131
+ /* Find maximum of all cached LSNs */
6132
+ hash_seq_init (& seq , lastWrittenLsnCache );
6133
+ while ((entry = (LastWrittenLsnCacheEntry * ) hash_seq_search (& seq )) != NULL )
6134
+ {
6135
+ if (entry -> lsn > lsn )
6136
+ lsn = entry -> lsn ;
6137
+ }
6138
+ }
6139
+ LWLockRelease (LastWrittenLsnLock );
6059
6140
6060
6141
return lsn ;
6061
6142
}
6062
6143
6063
6144
/*
 * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
 *
 * We maintain a cache of last written LSNs with limited size and LRU
 * replacement policy.  To reduce cache size we store the max LSN not for
 * each page, but per bucket (LAST_WRITTEN_LSN_CACHE_BUCKET blocks).  This
 * cache allows using an old LSN when requesting pages of unchanged or
 * appended relations.
 *
 * rnode.relNode can be InvalidOid; in this case maxLastWrittenLsn is
 * updated instead of any cache entry.  SetLastWrittenLsn with a dummy
 * rnode is used by createdb and dbase_redo functions.
 */
void
SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber till)
{
	/* An invalid LSN carries no information; nothing to record */
	if (lsn == InvalidXLogRecPtr)
		return;

	LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
	if (rnode.relNode == InvalidOid)
	{
		/* Dummy rnode: only bump the global non-cached maximum */
		if (lsn > XLogCtl->maxLastWrittenLsn)
			XLogCtl->maxLastWrittenLsn = lsn;
	}
	else
	{
		LastWrittenLsnCacheEntry *entry;
		BufferTag	key;
		bool		found;
		BlockNumber bucket;

		key.rnode = rnode;
		key.forkNum = forknum;
		/* Update every bucket the [from, till] block range touches */
		for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET;
			 bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET;
			 bucket++)
		{
			key.blockNum = bucket;
			entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found);
			if (found)
			{
				/* Existing entry: only ever move its LSN forward */
				if (lsn > entry->lsn)
					entry->lsn = lsn;
				/* Unlink from LRU list (re-linked at tail below) */
				dlist_delete(&entry->lru_node);
			}
			else
			{
				entry->lsn = lsn;
				if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize)
				{
					/* Replace least recently used entry */
					LastWrittenLsnCacheEntry *victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&XLogCtl->lastWrittenLsnLRU));

					/*
					 * Adjust max LSN for not cached relations/chunks if
					 * needed: evicted information must not be lost, so it
					 * is folded into the global maximum.
					 */
					if (victim->lsn > XLogCtl->maxLastWrittenLsn)
						XLogCtl->maxLastWrittenLsn = victim->lsn;

					/*
					 * NOTE(review): 'victim' is passed where a key pointer
					 * is expected; this relies on 'key' being the first
					 * field of LastWrittenLsnCacheEntry.
					 */
					hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL);
				}
			}
			/* Link to the end (most recently used) of LRU list */
			dlist_push_tail(&XLogCtl->lastWrittenLsnLRU, &entry->lru_node);
		}
	}
	LWLockRelease(LastWrittenLsnLock);
}
6208
+
6209
+ /*
6210
+ * SetLastWrittenLSNForBlock -- Set maximal LSN for block
6211
+ */
6212
+ void
6213
+ SetLastWrittenLSNForBlock (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
6214
+ {
6215
+ SetLastWrittenLSNForBlockRange (lsn , rnode , forknum , blkno , blkno );
6216
+ }
6217
+
6218
+ /*
6219
+ * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata
6220
+ */
6221
+ void
6222
+ SetLastWrittenLSNForRelation (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum )
6223
+ {
6224
+ SetLastWrittenLSNForBlock (lsn , rnode , forknum , REL_METADATA_PSEUDO_BLOCKNO );
6225
+ }
6226
+
6227
+ /*
6228
+ * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database
6229
+ */
6230
+ void
6231
+ SetLastWrittenLSNForDatabase (XLogRecPtr lsn )
6232
+ {
6233
+ RelFileNode dummyNode = {InvalidOid , InvalidOid , InvalidOid };
6234
+ SetLastWrittenLSNForBlock (lsn , dummyNode , MAIN_FORKNUM , 0 );
6073
6235
}
6074
6236
6075
6237
void
@@ -6320,7 +6482,7 @@ LogCheckpointEnd(bool restartpoint)
6320
6482
average_sync_time = 0 ;
6321
6483
if (CheckpointStats .ckpt_sync_rels > 0 )
6322
6484
average_sync_time = CheckpointStats .ckpt_agg_sync_time /
6323
- CheckpointStats .ckpt_sync_rels ;
6485
+ CheckpointStats .ckpt_sync_rels ;
6324
6486
average_msecs = (long ) ((average_sync_time + 999 ) / 1000 );
6325
6487
6326
6488
if (restartpoint )
0 commit comments