85
85
#include "replication/walreceiver.h"
86
86
#include "replication/walsender.h"
87
87
#include "storage/bufmgr.h"
88
+ #include "storage/buf_internals.h"
88
89
#include "storage/fd.h"
89
90
#include "storage/ipc.h"
90
91
#include "storage/large_object.h"
@@ -137,6 +138,7 @@ int max_slot_wal_keep_size_mb = -1;
137
138
int wal_decode_buffer_size = 512 * 1024 ;
138
139
bool track_wal_io_timing = false;
139
140
uint64 predefined_sysidentifier ;
141
+ int lastWrittenLsnCacheSize ;
140
142
141
143
#ifdef WAL_DEBUG
142
144
bool XLOG_DEBUG = false;
@@ -199,6 +201,25 @@ const struct config_enum_entry archive_mode_options[] = {
199
201
{NULL , 0 , false}
200
202
};
201
203
204
+ typedef struct LastWrittenLsnCacheEntry
205
+ {
206
+ BufferTag key ;
207
+ XLogRecPtr lsn ;
208
+ /* double linked list for LRU replacement algorithm */
209
+ dlist_node lru_node ;
210
+ } LastWrittenLsnCacheEntry ;
211
+
212
+
213
+ /*
214
+ * Cache of last written LSN for each relation chunk (hash bucket).
215
+ * To provide a request LSN for smgrnblocks and smgrexists, there is also a pseudo-key (InvalidBlockId) which stores the LSN of the last
216
+ * relation metadata update.
217
+ * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"),
218
+ * entries are replaced using an LRU algorithm, based on a doubly linked list.
219
+ * Access to this cache is protected by 'LastWrittenLsnLock'.
220
+ */
221
+ static HTAB * lastWrittenLsnCache ;
222
+
202
223
/*
203
224
* Statistics for current checkpoint are collected in this global struct.
204
225
* Because only the checkpointer or a stand-alone backend can perform
@@ -552,7 +573,17 @@ typedef struct XLogCtlData
552
573
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
553
574
*/
554
575
XLogRecPtr lastFpwDisableRecPtr ;
555
- XLogRecPtr lastWrittenPageLSN ;
576
+
577
+ /*
578
+ * Maximal last written LSN for pages not present in lastWrittenLsnCache
579
+ */
580
+ XLogRecPtr maxLastWrittenLsn ;
581
+
582
+ /*
583
+ * Doubly linked list implementing the LRU replacement policy for the last written LSN cache.
584
+ * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'.
585
+ */
586
+ dlist_head lastWrittenLsnLRU ;
556
587
557
588
/* neon: copy of startup's RedoStartLSN for walproposer's use */
558
589
XLogRecPtr RedoStartLSN ;
@@ -575,6 +606,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
575
606
*/
576
607
static ControlFileData * ControlFile = NULL ;
577
608
609
+ #define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */
610
+
578
611
/*
579
612
* Calculate the amount of space left on the page after 'endptr'. Beware
580
613
* multiple evaluation!
@@ -4355,11 +4388,8 @@ LocalProcessControlFile(bool reset)
4355
4388
ReadControlFile ();
4356
4389
}
4357
4390
4358
- /*
4359
- * Initialization of shared memory for XLOG
4360
- */
4361
- Size
4362
- XLOGShmemSize (void )
4391
+ static Size
4392
+ XLOGCtlShmemSize (void )
4363
4393
{
4364
4394
Size size ;
4365
4395
@@ -4408,6 +4438,16 @@ XLOGShmemSize(void)
4408
4438
return size ;
4409
4439
}
4410
4440
4441
+ /*
4442
+ * Initialization of shared memory for XLOG
4443
+ */
4444
+ Size
4445
+ XLOGShmemSize (void )
4446
+ {
4447
+ return XLOGCtlShmemSize () +
4448
+ hash_estimate_size (lastWrittenLsnCacheSize , sizeof (LastWrittenLsnCacheEntry ));
4449
+ }
4450
+
4411
4451
void
4412
4452
XLOGShmemInit (void )
4413
4453
{
@@ -4437,6 +4477,15 @@ XLOGShmemInit(void)
4437
4477
XLogCtl = (XLogCtlData * )
4438
4478
ShmemInitStruct ("XLOG Ctl" , XLOGShmemSize (), & foundXLog );
4439
4479
4480
+ {
4481
+ static HASHCTL info ;
4482
+ info .keysize = sizeof (BufferTag );
4483
+ info .entrysize = sizeof (LastWrittenLsnCacheEntry );
4484
+ lastWrittenLsnCache = ShmemInitHash ("last_written_lsn_cache" ,
4485
+ lastWrittenLsnCacheSize , lastWrittenLsnCacheSize ,
4486
+ & info ,
4487
+ HASH_ELEM | HASH_BLOBS );
4488
+ }
4440
4489
localControlFile = ControlFile ;
4441
4490
ControlFile = (ControlFileData * )
4442
4491
ShmemInitStruct ("Control File" , sizeof (ControlFileData ), & foundCFile );
@@ -5623,7 +5672,8 @@ StartupXLOG(void)
5623
5672
5624
5673
XLogCtl -> LogwrtRqst .Write = EndOfLog ;
5625
5674
XLogCtl -> LogwrtRqst .Flush = EndOfLog ;
5626
- XLogCtl -> lastWrittenPageLSN = EndOfLog ;
5675
+ XLogCtl -> maxLastWrittenLsn = EndOfLog ;
5676
+ dlist_init (& XLogCtl -> lastWrittenLsnLRU );
5627
5677
5628
5678
/*
5629
5679
* Preallocate additional log files, if wanted.
@@ -6051,29 +6101,141 @@ GetInsertRecPtr(void)
6051
6101
}
6052
6102
6053
6103
/*
6054
- * GetLastWrittenPageLSN -- Returns maximal LSN of written page
6104
+ * GetLastWrittenLSN -- Returns maximal LSN of written page.
6105
+ * It returns an upper bound for the last written LSN of a given page,
6106
+ * either from a cached last written LSN or a global maximum last written LSN.
6107
+ * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn.
6108
+ * If cache is large enough ,iterting through all hash items may be rather expensive.
6109
+ * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
6055
6110
*/
6056
6111
XLogRecPtr
6057
- GetLastWrittenPageLSN ( void )
6112
+ GetLastWrittenLSN ( RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
6058
6113
{
6059
6114
XLogRecPtr lsn ;
6060
- SpinLockAcquire (& XLogCtl -> info_lck );
6061
- lsn = XLogCtl -> lastWrittenPageLSN ;
6062
- SpinLockRelease (& XLogCtl -> info_lck );
6115
+ LastWrittenLsnCacheEntry * entry ;
6116
+
6117
+ LWLockAcquire (LastWrittenLsnLock , LW_SHARED );
6118
+
6119
+ /* Maximal last written LSN among all non-cached pages */
6120
+ lsn = XLogCtl -> maxLastWrittenLsn ;
6121
+
6122
+ if (rnode .relNode != InvalidOid )
6123
+ {
6124
+ BufferTag key ;
6125
+ key .rnode = rnode ;
6126
+ key .forkNum = forknum ;
6127
+ key .blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET ;
6128
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_FIND , NULL );
6129
+ if (entry != NULL )
6130
+ lsn = entry -> lsn ;
6131
+ }
6132
+ else
6133
+ {
6134
+ HASH_SEQ_STATUS seq ;
6135
+ /* Find maximum of all cached LSNs */
6136
+ hash_seq_init (& seq , lastWrittenLsnCache );
6137
+ while ((entry = (LastWrittenLsnCacheEntry * ) hash_seq_search (& seq )) != NULL )
6138
+ {
6139
+ if (entry -> lsn > lsn )
6140
+ lsn = entry -> lsn ;
6141
+ }
6142
+ }
6143
+ LWLockRelease (LastWrittenLsnLock );
6063
6144
6064
6145
return lsn ;
6065
6146
}
6066
6147
6067
6148
/*
6068
- * SetLastWrittenPageLSN -- Set maximal LSN of written page
6149
+ * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
6150
+ * We maintain cache of last written LSNs with limited size and LRU replacement
6151
+ * policy. To reduce cache size we store max LSN not for each page, but for
6152
+ * bucket (1024 blocks). This cache allows to use old LSN when
6153
+ * requesting pages of unchanged or appended relations.
6154
+ *
6155
+ * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated.
6156
+ * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions.
6069
6157
*/
6070
6158
void
6071
- SetLastWrittenPageLSN (XLogRecPtr lsn )
6159
+ SetLastWrittenLSNForBlockRange (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber from , BlockNumber till )
6072
6160
{
6073
- SpinLockAcquire (& XLogCtl -> info_lck );
6074
- if (lsn > XLogCtl -> lastWrittenPageLSN )
6075
- XLogCtl -> lastWrittenPageLSN = lsn ;
6076
- SpinLockRelease (& XLogCtl -> info_lck );
6161
+ if (lsn == InvalidXLogRecPtr )
6162
+ return ;
6163
+
6164
+ LWLockAcquire (LastWrittenLsnLock , LW_EXCLUSIVE );
6165
+ if (rnode .relNode == InvalidOid )
6166
+ {
6167
+ if (lsn > XLogCtl -> maxLastWrittenLsn )
6168
+ XLogCtl -> maxLastWrittenLsn = lsn ;
6169
+ }
6170
+ else
6171
+ {
6172
+ LastWrittenLsnCacheEntry * entry ;
6173
+ BufferTag key ;
6174
+ bool found ;
6175
+ BlockNumber bucket ;
6176
+
6177
+ key .rnode = rnode ;
6178
+ key .forkNum = forknum ;
6179
+ for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET ;
6180
+ bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET ;
6181
+ bucket ++ )
6182
+ {
6183
+ key .blockNum = bucket ;
6184
+ entry = hash_search (lastWrittenLsnCache , & key , HASH_ENTER , & found );
6185
+ if (found )
6186
+ {
6187
+ if (lsn > entry -> lsn )
6188
+ entry -> lsn = lsn ;
6189
+ /* Unlink from LRU list */
6190
+ dlist_delete (& entry -> lru_node );
6191
+ }
6192
+ else
6193
+ {
6194
+ entry -> lsn = lsn ;
6195
+ if (hash_get_num_entries (lastWrittenLsnCache ) > lastWrittenLsnCacheSize )
6196
+ {
6197
+ /* Replace least recently used entry */
6198
+ LastWrittenLsnCacheEntry * victim = dlist_container (LastWrittenLsnCacheEntry , lru_node , dlist_pop_head_node (& XLogCtl -> lastWrittenLsnLRU ));
6199
+ /* Adjust max LSN for not cached relations/chunks if needed */
6200
+ if (victim -> lsn > XLogCtl -> maxLastWrittenLsn )
6201
+ XLogCtl -> maxLastWrittenLsn = victim -> lsn ;
6202
+
6203
+ hash_search (lastWrittenLsnCache , victim , HASH_REMOVE , NULL );
6204
+ }
6205
+ }
6206
+ /* Link to the end of LRU list */
6207
+ dlist_push_tail (& XLogCtl -> lastWrittenLsnLRU , & entry -> lru_node );
6208
+ }
6209
+ }
6210
+ LWLockRelease (LastWrittenLsnLock );
6211
+ }
6212
+
6213
+ /*
6214
+ * SetLastWrittenLSNForBlock -- Set maximal LSN for block
6215
+ */
6216
+ void
6217
+ SetLastWrittenLSNForBlock (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum , BlockNumber blkno )
6218
+ {
6219
+ SetLastWrittenLSNForBlockRange (lsn , rnode , forknum , blkno , blkno );
6220
+ }
6221
+
6222
+ /*
6223
+ * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata
6224
+ */
6225
+ void
6226
+ SetLastWrittenLSNForRelation (XLogRecPtr lsn , RelFileNode rnode , ForkNumber forknum )
6227
+ {
6228
+ SetLastWrittenLSNForBlock (lsn , rnode , forknum , REL_METADATA_PSEUDO_BLOCKNO );
6229
+ }
6230
+
6231
+ /*
6232
+ * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database
6233
+ */
6234
+ void
6235
+ SetLastWrittenLSNForDatabase (XLogRecPtr lsn )
6236
+ {
6237
+ RelFileNode dummyNode = {InvalidOid , InvalidOid , InvalidOid };
6238
+ SetLastWrittenLSNForBlock (lsn , dummyNode , MAIN_FORKNUM , 0 );
6077
6239
}
6078
6240
6079
6241
void
@@ -6324,7 +6486,7 @@ LogCheckpointEnd(bool restartpoint)
6324
6486
average_sync_time = 0 ;
6325
6487
if (CheckpointStats .ckpt_sync_rels > 0 )
6326
6488
average_sync_time = CheckpointStats .ckpt_agg_sync_time /
6327
- CheckpointStats .ckpt_sync_rels ;
6489
+ CheckpointStats .ckpt_sync_rels ;
6328
6490
average_msecs = (long ) ((average_sync_time + 999 ) / 1000 );
6329
6491
6330
6492
if (restartpoint )
0 commit comments