PostgreSQL数据库WAL——简介

网友投稿 294 2022-12-02

PostgreSQL数据库WAL——简介

PostgreSQL的故障恢复主要基于ARIES方法实现。该方法对应的论文《ARIES: A Transaction Recovery Method Supporting Fine-Granularity Locking and Partial Rollbacks Using Write-Ahead Logging》发表于1992年。它通过构建脏页表、模糊检查点等技术,简化了WAL日志的实现逻辑,缩短了故障恢复的时间,降低了检查点的开销,提高了故障恢复的效率。

当插入、删除等变更动作发生时,PG会生成对应动作的WAL记录,然后写入到内存中的WAL缓冲区。满足以下任意条件时,WAL记录会从缓冲区写入到段文件,以确保数据安全:

- 一个运行中的事务提交或中止
- WAL缓冲区被写入的元组填满(WAL缓冲区大小可由参数wal_buffers控制)
- WalWriter周期性地执行写操作
- 执行checkpoint

当数据库进行恢复时,会从重做点(redo point)开始恢复。重做点是指最近一次checkpoint开始时WAL记录写入的位置。数据库启动时,PG会判断是否需要进行恢复:例如数据库是以immediate模式关闭的,此时就需要进入恢复模式,从WAL中的重做点开始重放WAL记录。WAL记录的位置由LSN(Log Sequence Number)标识,LSN是一个64位的整数,表示该记录在WAL日志流中的字节位置。

WAL缓冲区

在数据库启动时,会从系统中申请一份WAL的共享内存,入口函数是XLOGShmemInit(void)(ControlFile也在这里初始化共享内存)。使用XLogCtlData结构体来管理WAL共享内存。

类型

属性名

描述

XLogwrtRqst

LogwrtRqst

表示当前请求写入系统缓冲区或同步写入磁盘的日志位置

XLogRecPtr

asyncXactLSN

最近需要异步提交的日志位置

XLogwrtResult

LogwrtResult

当前已经写入系统缓冲区或者同步写入磁盘的日志位置

XLogRecPtr*

xlblocks

LSN数组,每个元素对应一个WAL缓冲页,记录该页的结束位置(页首字节位置 + XLOG_BLCKSZ)

int

XLogCacheBlck

WAL缓冲区中已分配缓冲页的最大下标(即缓冲页数减1,初始化时设为XLOGbuffers - 1)

char*

pages

指向WAL缓存Buffer的首地址

/*
 * XLogCtlData: bookkeeping for the WAL (XLOG) subsystem.
 *
 * XLOGShmemInit() obtains one instance of this structure through
 * ShmemInitStruct().  Most fields are protected by info_lck; the
 * exceptions are called out in the comments on the individual fields.
 */
typedef struct XLogCtlData {
	XLogCtlInsert Insert;

	/* Protected by info_lck: */
	XLogwrtRqst LogwrtRqst;
	XLogRecPtr RedoRecPtr;	/* a recent copy of Insert->RedoRecPtr */
	FullTransactionId ckptFullXid;	/* nextFullXid of latest checkpoint */
	XLogRecPtr asyncXactLSN;	/* LSN of newest async commit/abort */
	XLogRecPtr replicationSlotMinLSN;	/* oldest LSN needed by any slot */

	XLogSegNo lastRemovedSegNo;	/* latest removed/recycled XLOG segment */

	/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
	XLogRecPtr unloggedLSN;
	slock_t ulsn_lck;

	/* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
	pg_time_t lastSegSwitchTime;
	XLogRecPtr lastSegSwitchLSN;

	/*
	 * Protected by info_lck and WALWriteLock (you must hold either lock to
	 * read it, but both to update).
	 */
	XLogwrtResult LogwrtResult;

	/*
	 * Latest initialized page in the cache (last byte position + 1).
	 *
	 * To change the identity of a buffer (and InitializedUpTo), you need to
	 * hold WALBufMappingLock.  To change the identity of a buffer that's
	 * still dirty, the old page needs to be written out first, and for that
	 * you need WALWriteLock, and you need to ensure that there are no
	 * in-progress insertions to the page by calling
	 * WaitXLogInsertionsToFinish().
	 */
	XLogRecPtr InitializedUpTo;

	/*
	 * These values do not change after startup, although the pointed-to
	 * pages and xlblocks values certainly do.  xlblocks values are protected
	 * by WALBufMappingLock.
	 */
	char *pages;	/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;	/* 1st byte ptr-s + XLOG_BLCKSZ */
	int XLogCacheBlck;	/* highest allocated xlog buffer index */

	/*
	 * Shared copy of ThisTimeLineID.  Does not change after end-of-recovery.
	 * If we created a new timeline when the system was started up,
	 * PrevTimeLineID is the old timeline's ID that we forked off from.
	 * Otherwise it's equal to ThisTimeLineID.
	 */
	TimeLineID ThisTimeLineID;
	TimeLineID PrevTimeLineID;

	/*
	 * SharedRecoveryState indicates if we're still in crash or archive
	 * recovery.  Protected by info_lck.
	 */
	RecoveryState SharedRecoveryState;

	/*
	 * SharedHotStandbyActive.  Protected by info_lck.
	 *
	 * NOTE(review): the description previously attached to this field was a
	 * verbatim copy of the SharedRecoveryState one ("still in crash or
	 * archive recovery").  Presumably this flag instead tells whether hot
	 * standby queries may currently run -- confirm against upstream xlog.c.
	 */
	bool SharedHotStandbyActive;

	/*
	 * WalWriterSleeping indicates whether the WAL writer is currently in
	 * low-power mode (and hence should be nudged if an async commit occurs).
	 * Protected by info_lck.
	 */
	bool WalWriterSleeping;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
	 * to appear.
	 */
	Latch recoveryWakeupLatch;

	/*
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * lastCheckPointRecPtr points to start of checkpoint record and
	 * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
	 * checkpointer when it wants to create a restartpoint.
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr lastCheckPointRecPtr;
	XLogRecPtr lastCheckPointEndPtr;
	CheckPoint lastCheckPoint;

	/*
	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
	 * replayed.  When we're currently replaying a record, ie. in a redo
	 * function, replayEndRecPtr points to the end+1 of the record being
	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
	 */
	XLogRecPtr lastReplayedEndRecPtr;
	TimeLineID lastReplayedTLI;
	XLogRecPtr replayEndRecPtr;
	TimeLineID replayEndTLI;

	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;

	/* Are we requested to pause recovery? */
	bool recoveryPause;

	/*
	 * lastFpwDisableRecPtr points to the start of the last replayed
	 * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
	 */
	XLogRecPtr lastFpwDisableRecPtr;

	slock_t info_lck;	/* locks shared variables shown above */
} XLogCtlData;

XLOGShmemInit

XLOGShmemInit函数用于在共享内存中为XLogCtlData和ControlFileData申请内存。

static XLogCtlData *XLogCtl = NULL;static ControlFileData *ControlFile = NULL;void XLOGShmemInit(void) { bool foundCFile, foundXLog; char *allocptr; int i; ControlFileData *localControlFile; XLogCtl = (XLogCtlData *)ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); localControlFile = ControlFile; ControlFile = (ControlFileData *)ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); if (foundCFile || foundXLog) { /* both should be present or neither */ Assert(foundCFile && foundXLog); /* Initialize local copy of WALInsertLocks and register the tranche */ WALInsertLocks = XLogCtl->Insert.WALInsertLocks; LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,"wal_insert"); if (localControlFile) pfree(localControlFile); return; } memset(XLogCtl, 0, sizeof(XLogCtlData)); /* * Already have read control file locally, unless in bootstrap mode. Move * contents into shared memory. */ if (localControlFile) { memcpy(ControlFile, localControlFile, sizeof(ControlFileData)); pfree(localControlFile); } /* * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a * multiple of the alignment for same, so no extra alignment padding is * needed here. */ allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData); XLogCtl->xlblocks = (XLogRecPtr *) allocptr; memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers); allocptr += sizeof(XLogRecPtr) * XLOGbuffers; /* WAL insertion locks. 
Ensure they're aligned to the full padded size */ allocptr += sizeof(WALInsertLockPadded) - ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded); WALInsertLocks = XLogCtl->Insert.WALInsertLocks = (WALInsertLockPadded *) allocptr; allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS; LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert"); for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) { LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT); WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr; WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr; } /* * Align the start of the page buffers to a full xlog block size boundary. * This simplifies some calculations in XLOG insertion. It is also * required for O_DIRECT. */ allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); XLogCtl->pages = allocptr; memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); /* * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill * in additional info.) */ XLogCtl->XLogCacheBlck = XLOGbuffers - 1; XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; XLogCtl->SharedHotStandbyActive = false; XLogCtl->WalWriterSleeping = false; SpinLockInit(&XLogCtl->Insert.insertpos_lck); SpinLockInit(&XLogCtl->info_lck); SpinLockInit(&XLogCtl->ulsn_lck); InitSharedLatch(&XLogCtl->recoveryWakeupLatch);}

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:PostgreSQL数据库RelationAM——RelationData
下一篇:一篇文章告诉你如何在Java数组中插入一个字符
相关文章

 发表评论

暂时没有评论,来抢沙发吧~