85
85
#include "replication/walreceiver.h"
86
86
#include "replication/walsender.h"
87
87
#include "storage/bufmgr.h"
88
+ #include "storage/buf_internals.h"
88
89
#include "storage/fd.h"
89
90
#include "storage/ipc.h"
90
91
#include "storage/large_object.h"
@@ -137,6 +138,7 @@ int max_slot_wal_keep_size_mb = -1;
137
138
int wal_decode_buffer_size = 512 * 1024 ;
138
139
bool track_wal_io_timing = false;
139
140
uint64 predefined_sysidentifier ;
141
+ int lastWrittenLsnCacheSize ;
140
142
141
143
#ifdef WAL_DEBUG
142
144
bool XLOG_DEBUG = false;
@@ -199,6 +201,25 @@ const struct config_enum_entry archive_mode_options[] = {
199
201
{NULL , 0 , false}
200
202
};
201
203
204
+ typedef struct LastWrittenLsnCacheEntry
205
+ {
206
+ BufferTag key ;
207
+ XLogRecPtr lsn ;
208
+ /* double linked list for LRU replacement algorithm */
209
+ dlist_node lru_node ;
210
+ } LastWrittenLsnCacheEntry ;
211
+
212
+
213
+ /*
214
+ * Cache of last written LSN for each relation chunk (hash bucket).
215
+ * Also, to provide a request LSN for smgrnblocks and smgrexists, there is a pseudo-key (InvalidBlockId) which stores the LSN of the last
216
+ * relation metadata update.
217
+ * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"),
218
+ * pages are replaced using the LRU algorithm, based on a doubly linked list (L2-list).
219
+ * Access to this cache is protected by 'LastWrittenLsnLock'.
220
+ */
221
+ static HTAB * lastWrittenLsnCache ;
222
+
202
223
/*
203
224
* Statistics for current checkpoint are collected in this global struct.
204
225
* Because only the checkpointer or a stand-alone backend can perform
@@ -552,7 +573,17 @@ typedef struct XLogCtlData
552
573
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
553
574
*/
554
575
XLogRecPtr lastFpwDisableRecPtr ;
555
- XLogRecPtr lastWrittenPageLSN ;
576
+
577
+ /*
578
+ * Maximal last written LSN for pages not present in lastWrittenLsnCache
579
+ */
580
+ XLogRecPtr maxLastWrittenLsn ;
581
+
582
+ /*
583
+ * Double linked list to implement LRU replacement policy for last written LSN cache.
584
+ * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'.
585
+ */
586
+ dlist_head lastWrittenLsnLRU ;
556
587
557
588
/* neon: copy of startup's RedoStartLSN for walproposer's use */
558
589
XLogRecPtr RedoStartLSN ;
@@ -575,6 +606,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
575
606
*/
576
607
static ControlFileData * ControlFile = NULL ;
577
608
609
+ #define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */
610
+
578
611
/*
579
612
* Calculate the amount of space left on the page after 'endptr'. Beware
580
613
* multiple evaluation!
@@ -4357,11 +4390,8 @@ LocalProcessControlFile(bool reset)
4357
4390
ReadControlFile ();
4358
4391
}
4359
4392
4360
- /*
4361
- * Initialization of shared memory for XLOG
4362
- */
4363
- Size
4364
- XLOGShmemSize (void )
4393
+ static Size
4394
+ XLOGCtlShmemSize (void )
4365
4395
{
4366
4396
Size size ;
4367
4397
@@ -4410,6 +4440,16 @@ XLOGShmemSize(void)
4410
4440
return size ;
4411
4441
}
4412
4442
4443
+ /*
4444
+ * Initialization of shared memory for XLOG
4445
+ */
4446
+ Size
4447
+ XLOGShmemSize (void )
4448
+ {
4449
+ return XLOGCtlShmemSize () +
4450
+ hash_estimate_size (lastWrittenLsnCacheSize , sizeof (LastWrittenLsnCacheEntry ));
4451
+ }
4452
+
4413
4453
void
4414
4454
XLOGShmemInit (void )
4415
4455
{
@@ -4439,6 +4479,15 @@ XLOGShmemInit(void)
4439
4479
XLogCtl = (XLogCtlData * )
4440
4480
ShmemInitStruct ("XLOG Ctl" , XLOGShmemSize (), & foundXLog );
4441
4481
4482
+ {
4483
+ static HASHCTL info ;
4484
+ info .keysize = sizeof (BufferTag );
4485
+ info .entrysize = sizeof (LastWrittenLsnCacheEntry );
4486
+ lastWrittenLsnCache = ShmemInitHash ("last_written_lsn_cache" ,
4487
+ lastWrittenLsnCacheSize , lastWrittenLsnCacheSize ,
4488
+ & info ,
4489
+ HASH_ELEM | HASH_BLOBS );
4490
+ }
4442
4491
localControlFile = ControlFile ;
4443
4492
ControlFile = (ControlFileData * )
4444
4493
ShmemInitStruct ("Control File" , sizeof (ControlFileData ), & foundCFile );
@@ -5625,7 +5674,8 @@ StartupXLOG(void)
5625
5674
5626
5675
XLogCtl -> LogwrtRqst .Write = EndOfLog ;
5627
5676
XLogCtl -> LogwrtRqst .Flush = EndOfLog ;
5628
- XLogCtl -> lastWrittenPageLSN = EndOfLog ;
5677
+ XLogCtl -> maxLastWrittenLsn = EndOfLog ;
5678
+ dlist_init (& XLogCtl -> lastWrittenLsnLRU );
5629
5679
5630
5680
/*
5631
5681
* Preallocate additional log files, if wanted.
@@ -6053,29 +6103,141 @@ GetInsertRecPtr(void)
6053
6103
}
6054
6104
6055
6105
/*
6056
- * GetLastWrittenPageLSN -- Returns maximal LSN of written page
6106
+ * GetLastWrittenLSN -- Returns maximal LSN of written page.
6107
+ * It returns an upper bound for the last written LSN of a given page,
6108
+ * either from a cached last written LSN or a global maximum last written LSN.
6109
+ * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn.
6110
+ * If cache is large enough ,iterting through all hash items may be rather expensive.
6111
+ * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
6057
6112
*/
6058
6113
XLogRecPtr
6059
- GetLastWrittenPageLSN ( void )
6114
+ GetLastWrittenLSN ( RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
6060
6115
{
6061
6116
XLogRecPtr lsn ;
6062
- SpinLockAcquire (& XLogCtl -> info_lck );
6063
- lsn = XLogCtl -> lastWrittenPageLSN ;
6064
- SpinLockRelease (& XLogCtl -> info_lck );
6117
+ LastWrittenLsnCacheEntry * entry ;
6118
+
6119
+ LWLockAcquire (LastWrittenLsnLock , LW_SHARED );
6120
+
6121
+ /* Maximal last written LSN among all non-cached pages */
6122
+ lsn = XLogCtl -> maxLastWrittenLsn ;
6123
+
6124
+ if (rnode .relNode != InvalidOid )
6125
+ {
6126
+ BufferTag key ;
6127
+ key .rnode = rnode ;
6128
+ key .forkNum = forknum ;
6129
+ key .blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET ;
6130
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_FIND , NULL );
6131
+ if (entry != NULL )
6132
+ lsn = entry -> lsn ;
6133
+ }
6134
+ else
6135
+ {
6136
+ HASH_SEQ_STATUS seq ;
6137
+ /* Find maximum of all cached LSNs */
6138
+ hash_seq_init (& seq , lastWrittenLsnCache );
6139
+ while ((entry = (LastWrittenLsnCacheEntry * ) hash_seq_search (& seq )) != NULL )
6140
+ {
6141
+ if (entry -> lsn > lsn )
6142
+ lsn = entry -> lsn ;
6143
+ }
6144
+ }
6145
+ LWLockRelease (LastWrittenLsnLock );
6065
6146
6066
6147
return lsn ;
6067
6148
}
6068
6149
6069
6150
/*
6070
- * SetLastWrittenPageLSN -- Set maximal LSN of written page
6151
+ * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
6152
+ * We maintain cache of last written LSNs with limited size and LRU replacement
6153
+ * policy. To reduce cache size we store max LSN not for each page, but for
6154
+ * bucket (1024 blocks). This cache allows to use old LSN when
6155
+ * requesting pages of unchanged or appended relations.
6156
+ *
6157
+ * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated.
6158
+ * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions.
6071
6159
*/
6072
6160
void
6073
- SetLastWrittenPageLSN (XLogRecPtr lsn )
6161
+ SetLastWrittenLSNForBlockRange (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber from , BlockNumber till )
6074
6162
{
6075
- SpinLockAcquire (& XLogCtl -> info_lck );
6076
- if (lsn > XLogCtl -> lastWrittenPageLSN )
6077
- XLogCtl -> lastWrittenPageLSN = lsn ;
6078
- SpinLockRelease (& XLogCtl -> info_lck );
6163
+ if (lsn == InvalidXLogRecPtr )
6164
+ return ;
6165
+
6166
+ LWLockAcquire (LastWrittenLsnLock , LW_EXCLUSIVE );
6167
+ if (rnode .relNode == InvalidOid )
6168
+ {
6169
+ if (lsn > XLogCtl -> maxLastWrittenLsn )
6170
+ XLogCtl -> maxLastWrittenLsn = lsn ;
6171
+ }
6172
+ else
6173
+ {
6174
+ LastWrittenLsnCacheEntry * entry ;
6175
+ BufferTag key ;
6176
+ bool found ;
6177
+ BlockNumber bucket ;
6178
+
6179
+ key .rnode = rnode ;
6180
+ key .forkNum = forknum ;
6181
+ for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET ;
6182
+ bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET ;
6183
+ bucket ++ )
6184
+ {
6185
+ key .blockNum = bucket ;
6186
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_ENTER , & found );
6187
+ if (found )
6188
+ {
6189
+ if (lsn > entry -> lsn )
6190
+ entry -> lsn = lsn ;
6191
+ /* Unlink from LRU list */
6192
+ dlist_delete (& entry -> lru_node );
6193
+ }
6194
+ else
6195
+ {
6196
+ entry -> lsn = lsn ;
6197
+ if (hash_get_num_entries (lastWrittenLsnCache ) > lastWrittenLsnCacheSize )
6198
+ {
6199
+ /* Replace least recently used entry */
6200
+ LastWrittenLsnCacheEntry * victim = dlist_container (LastWrittenLsnCacheEntry , lru_node , dlist_pop_head_node (& XLogCtl -> lastWrittenLsnLRU ));
6201
+ /* Adjust max LSN for not cached relations/chunks if needed */
6202
+ if (victim -> lsn > XLogCtl -> maxLastWrittenLsn )
6203
+ XLogCtl -> maxLastWrittenLsn = victim -> lsn ;
6204
+
6205
+ hash_search (lastWrittenLsnCache , victim , HASH_REMOVE , NULL );
6206
+ }
6207
+ }
6208
+ /* Link to the end of LRU list */
6209
+ dlist_push_tail (& XLogCtl -> lastWrittenLsnLRU , & entry -> lru_node );
6210
+ }
6211
+ }
6212
+ LWLockRelease (LastWrittenLsnLock );
6213
+ }
6214
+
6215
+ /*
6216
+ * SetLastWrittenLSNForBlock -- Set maximal LSN for block
6217
+ */
6218
+ void
6219
+ SetLastWrittenLSNForBlock (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
6220
+ {
6221
+ SetLastWrittenLSNForBlockRange (lsn , rnode , forknum , blkno , blkno );
6222
+ }
6223
+
6224
+ /*
6225
+ * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata
6226
+ */
6227
+ void
6228
+ SetLastWrittenLSNForRelation (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum )
6229
+ {
6230
+ SetLastWrittenLSNForBlock (lsn , rnode , forknum , REL_METADATA_PSEUDO_BLOCKNO );
6231
+ }
6232
+
6233
+ /*
6234
+ * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database
6235
+ */
6236
+ void
6237
+ SetLastWrittenLSNForDatabase (XLogRecPtr lsn )
6238
+ {
6239
+ RelFileNode dummyNode = {InvalidOid , InvalidOid , InvalidOid };
6240
+ SetLastWrittenLSNForBlock (lsn , dummyNode , MAIN_FORKNUM , 0 );
6079
6241
}
6080
6242
6081
6243
void
@@ -6326,7 +6488,7 @@ LogCheckpointEnd(bool restartpoint)
6326
6488
average_sync_time = 0 ;
6327
6489
if (CheckpointStats .ckpt_sync_rels > 0 )
6328
6490
average_sync_time = CheckpointStats .ckpt_agg_sync_time /
6329
- CheckpointStats .ckpt_sync_rels ;
6491
+ CheckpointStats .ckpt_sync_rels ;
6330
6492
average_msecs = (long ) ((average_sync_time + 999 ) / 1000 );
6331
6493
6332
6494
if (restartpoint )
0 commit comments