本节简单介绍了PostgreSQL手工执行vacuum的处理流程,主要分析了ExecVacuum->vacuum->vacuum_rel->heap_vacuum_rel->lazy_scan_heap函数的实现逻辑,该函数扫描已打开的heap relation,清理堆中的每个页面。
一、数据结构
宏定义
Vacuum和Analyze命令选项
/*
 * Option flags for the VACUUM and ANALYZE commands.
 *
 * Every member occupies its own bit, so several options can be OR-ed
 * together into one integer mask and tested individually with '&'
 * (e.g. options & VACOPT_DISABLE_PAGE_SKIPPING).
 */
typedef enum VacuumOption
{
	VACOPT_VACUUM = 0x01,					/* bit 0 */
	VACOPT_ANALYZE = 0x02,					/* bit 1 */
	VACOPT_VERBOSE = 0x04,					/* bit 2 */
	VACOPT_FREEZE = 0x08,					/* bit 3 */
	VACOPT_FULL = 0x10,						/* bit 4 */
	VACOPT_SKIP_LOCKED = 0x20,				/* bit 5 */
	VACOPT_SKIPTOAST = 0x40,				/* bit 6 */
	VACOPT_DISABLE_PAGE_SKIPPING = 0x80		/* bit 7 */
} VacuumOption;
xl_heap_freeze_tuple
该结构表示’freeze plan’,用于存储在vacuum期间冻结tuple所需要的信息
/*
 * Flag bits defined alongside xl_heap_freeze_tuple.
 * NOTE(review): presumably these are the values stored in frzflags; no use
 * of them is visible in this excerpt -- confirm against the full header.
 */
#define XLH_FREEZE_XVAC 0x02
#define XLH_INVALID_XVAC 0x04
/*
 * A "freeze plan": the information needed to freeze one tuple during vacuum.
 *
 * lazy_scan_heap has heap_prepare_freeze_tuple fill in one entry per tuple,
 * stores the tuple's page offset in it, and later passes each entry to
 * heap_execute_freeze_tuple and the whole array to log_heap_freeze.
 */
typedef struct xl_heap_freeze_tuple
{
TransactionId xmax; // presumably the xmax value to install when freezing -- TODO confirm
OffsetNumber offset; // item offset of the tuple on its page (assigned by lazy_scan_heap)
uint16 t_infomask2; // presumably the replacement t_infomask2 bits -- TODO confirm
uint16 t_infomask; // presumably the replacement t_infomask bits -- TODO confirm
uint8 frzflags; // presumably a combination of the XLH_* flags above -- TODO confirm
} xl_heap_freeze_tuple;
二、源码解读
lazy_scan_heap扫描已打开的heap relation,清理堆中的每个页面,具体工作包括:
1.将DEAD元组截断为DEAD行指针
2.整理页面碎片
3.设置提交状态位(参见heap_page_prune)
4.构建DEAD元组列表以及含有空闲空间的页面列表
5.计算堆中存活元组数量的统计信息,并在合适的情况下将页标记为all-visible
6.执行index vacuuming并调用lazy_vacuum_heap回收DEAD行指针
其处理流程如下:
1.初始化相关变量
2.获取总块数(nblocks)
3.初始化统计信息和相关数组(vacrelstats/frozen)
4.计算下一个不能跳过的block(next_unskippable_block)
5.遍历每个block
5.1如已达next_unskippable_block块,计算下一个不能跳过的block
否则,如skipping_blocks为T,并且没有强制执行页面检查,则跳到下一个block
5.2如即将超出DEAD元组tid的可用空间,那么在处理此页面之前,执行vacuuming
5.2.1遍历index relation,调用lazy_vacuum_index执行vacuum
5.2.2调用lazy_vacuum_heap清理heap relation中的元组
5.2.3重置vacrelstats->num_dead_tuples计数器为0
5.2.4Vacuum FSM以使新释放的空间在上层FSM pages中可见
5.3以扩展方式读取buffer
5.4获取buffer cleanup lock但不成功,则
A.aggressive为F并且非强制检查页面,则处理下一个block;
B.aggressive为T或者要求强制检查页面,如不需要冻结元组,则跳过该block;
C.需要冻结元组但aggressive为F(即仅因要求强制检查页面才走到这里),更新统计信息,跳过该block;
D.需要冻结元组且aggressive为T,调用LockBufferForCleanup等待并锁定buf,进入常规流程
5.5如为新页,执行相关处理逻辑(重新初始化或者标记buffer为脏),继续下一个block;
5.6如为空页,执行相关逻辑(设置all-visible标记等),继续下一个block;
5.7调用heap_page_prune清理该page中的所有HOT-update链
5.8遍历page中的行指针
5.8.1行指针未使用,继续下一个tuple
5.8.2行指针是重定向指针,继续下一个tuple
5.8.3行指针已废弃,调用lazy_record_dead_tuple记录需删除的tuple,设置all_visible,继续下一个tuple
5.8.4初始化tuple变量
5.8.5调用HeapTupleSatisfiesVacuum函数确定元组状态,根据元组状态执行相关标记处理
5.8.6如tupgone标记为T,记录需删除的tuple;否则调用heap_prepare_freeze_tuple判断是否需要冻结,如需冻结则记录偏移
5.9如冻结统计数>0,遍历需冻结的行指针,执行冻结;如需记录日志,则写WAL Record
5.10如果没有索引,那么执行vacuum page,而不需要二次扫描了.
5.11通过all_visible和all_visible_according_to_vm标记同步vm
5.12释放frozen
5.13更新统计信息
5.14为最后一批dead tuples执行清理
5.15vacuum FSM
5.16执行vacuum收尾工作,为每个索引更新统计信息
5.17记录系统日志
/*
 * lazy_scan_heap -- scan an open heap relation block by block and vacuum it.
 *
 * For every block that is not skipped, the page is pruned with
 * heap_page_prune, removable item pointers are recorded with
 * lazy_record_dead_tuple, tuples for which heap_prepare_freeze_tuple
 * produces a freeze plan are frozen, and the visibility-map bits are
 * reconciled with what was observed on the page.  Whenever the dead-tuple
 * array is nearly full, and once more after the last block, the indexes
 * are vacuumed with lazy_vacuum_index and the heap with lazy_vacuum_heap.
 * At the end lazy_cleanup_index is run on every index and a summary is
 * reported at elevel.
 *
 * onerel      - the heap relation being vacuumed
 * options     - bitmask of VACOPT_* flags; only
 *               VACOPT_DISABLE_PAGE_SKIPPING is examined here
 * vacrelstats - per-relation counters, reset and filled in by this function
 * Irel        - array of nindexes index relations
 * nindexes    - number of entries in Irel
 * aggressive  - when true, only blocks whose visibility-map ALL_FROZEN bit
 *               is set may be skipped, and a page that needs freezing is
 *               waited for if its cleanup lock is not immediately available
 *
 * NOTE(review): elevel, OldestXmin, FreezeLimit, MultiXactCutoff and
 * vac_strategy are not declared in this function; presumably they are
 * file-level variables -- confirm against the full file.
 */
static void
lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
Relation *Irel, int nindexes, bool aggressive)
{
BlockNumber nblocks,// total number of blocks in the relation
blkno;// block currently being processed
HeapTupleData tuple;// scratch tuple header, refilled for each item
char *relname;// relation name, used in log messages
TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid;// relfrozenxid taken from onerel->rd_rel
TransactionId relminmxid = onerel->rd_rel->relminmxid;// relminmxid taken from onerel->rd_rel
BlockNumber empty_pages,// pages counted as empty
vacuumed_pages,// pages vacuumed directly in the no-index path
next_fsm_block_to_vacuum;// start of the next range passed to FreeSpaceMapVacuumRange
// Tuple counters, in declaration order: tuples left in place / live tuples (input to the reltuples estimate) / tuples removed / dead tuples that are kept for now / unused item pointers
double num_tuples,
live_tuples,
tups_vacuumed,
nkeep,
nunused;
// one result slot per index, passed to lazy_vacuum_index and lazy_cleanup_index
IndexBulkDeleteResult **indstats;
int i;// loop index
PGRUsage ru0;
Buffer vmbuffer = InvalidBuffer;// visibility-map buffer currently pinned, if any
BlockNumber next_unskippable_block;// next block that must not be skipped
bool skipping_blocks;// true while inside a run of blocks long enough to skip
xl_heap_freeze_tuple *frozen;// freeze plans collected for the current page
StringInfoData buf;
const int initprog_index[] = {
PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
PROGRESS_VACUUM_MAX_DEAD_TUPLES
};
int64 initprog_val[3];
// start resource-usage measurement; ru0 is shown in the final summary
pg_rusage_init(&ru0);
// fetch the relation name
relname = RelationGetRelationName(onerel);
// log which relation is about to be vacuumed, and in which mode
if (aggressive)
ereport(elevel,
(errmsg("aggressively vacuuming \"%s.%s\"",
get_namespace_name(RelationGetNamespace(onerel)),
relname)));
else
ereport(elevel,
(errmsg("vacuuming \"%s.%s\"",
get_namespace_name(RelationGetNamespace(onerel)),
relname)));
// reset local counters
empty_pages = vacuumed_pages = 0;
next_fsm_block_to_vacuum = (BlockNumber) 0;
num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0;
indstats = (IndexBulkDeleteResult **)
palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
// total number of blocks in the relation
nblocks = RelationGetNumberOfBlocks(onerel);
// reset per-relation statistics
vacrelstats->rel_pages = nblocks;
vacrelstats->scanned_pages = 0;
vacrelstats->tupcount_pages = 0;
vacrelstats->nonempty_pages = 0;
vacrelstats->latestRemovedXid = InvalidTransactionId;
// allocate dead-tuple tracking space; presumably this sets vacrelstats->max_dead_tuples, which is read below
lazy_space_alloc(vacrelstats, nblocks);
// room for one freeze plan per possible tuple on a page
frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
// report that the heap scan is starting, together with the total
// block count and the dead-tuple capacity
initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
initprog_val[1] = nblocks;// total blocks
initprog_val[2] = vacrelstats->max_dead_tuples;// dead-tuple capacity
pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
// find the first block that cannot be skipped
next_unskippable_block = 0;
if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
{
// page skipping has not been disabled
while (next_unskippable_block < nblocks)// advance past skippable blocks
{
uint8 vmstatus;// visibility-map status bits
vmstatus = visibilitymap_get_status(onerel, next_unskippable_block,
&vmbuffer);
if (aggressive)
{
if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
break;// not all-frozen: an aggressive scan must visit this block
}
else
{
if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
break;// not all-visible: this block must be visited
}
vacuum_delay_point();
next_unskippable_block++;
}
}
if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
skipping_blocks = true;// the initial skippable run reaches the threshold
else
skipping_blocks = false;// run too short: do not skip
for (blkno = 0; blkno < nblocks; blkno++)
{
// one iteration per heap block
Buffer buf;// buffer holding the current heap page (shadows the function-level StringInfoData buf inside the loop)
Page page;// the heap page itself
OffsetNumber offnum,// current item offset
maxoff;
bool tupgone,
hastup;
int prev_dead_count;// num_dead_tuples before this page was scanned
int nfrozen;// number of freeze plans collected for this page
Size freespace;// free space on the page
bool all_visible_according_to_vm = false;// what the visibility map says about this block
bool all_visible;// whether the page was observed to be all-visible
bool all_frozen = true;
bool has_dead_tuples;// set when a removable tuple was recorded on this page
TransactionId visibility_cutoff_xid = InvalidTransactionId;// newest xmin seen while the page still looked all-visible
// True only for the last block, and only when truncation should be
// attempted; such a block is never skipped below.
#define FORCE_CHECK_PAGE() \
(blkno == nblocks - 1 && should_attempt_truncation(vacrelstats))
// report progress: number of blocks scanned so far
pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
if (blkno == next_unskippable_block)
{
// reached the block recorded as unskippable;
// time to look for the following one
next_unskippable_block++;
// advance to the next block that cannot be skipped
if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
{
while (next_unskippable_block < nblocks)
{
uint8 vmskipflags;
vmskipflags = visibilitymap_get_status(onerel,
next_unskippable_block,
&vmbuffer);
if (aggressive)
{
if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
break;
}
else
{
if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
break;
}
vacuum_delay_point();
next_unskippable_block++;
}
}
// skip the upcoming run only if it is longer than the threshold
if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
skipping_blocks = true;
else
skipping_blocks = false;
// in an aggressive scan the block is only known not to be all-frozen;
// it may still be all-visible according to the map
if (aggressive && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
all_visible_according_to_vm = true;
}
else
{
// this block lies before next_unskippable_block, so it is potentially skippable
if (skipping_blocks && !FORCE_CHECK_PAGE())
{
if (aggressive || VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
vacrelstats->frozenskipped_pages++;// count it as a skipped all-frozen page
continue;// move on to the next block
}
all_visible_according_to_vm = true;
}
vacuum_delay_point();
if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
vacrelstats->num_dead_tuples > 0)
{
// Dead tuples are pending and the remaining capacity is smaller than
// MaxHeapTuplesPerPage: run an index/heap vacuum cycle before scanning this page.
const int hvp_index[] = {
PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_NUM_INDEX_VACUUMS
};
int64 hvp_val[2];
if (BufferIsValid(vmbuffer))
{
ReleaseBuffer(vmbuffer);
vmbuffer = InvalidBuffer;
}
// log cleanup info before touching the indexes
vacuum_log_cleanup_info(onerel, vacrelstats);
// report that indexes are now being vacuumed
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
// run lazy_vacuum_index on every index; presumably it removes index
// entries pointing at the recorded dead tuples and updates indstats[i]
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
&indstats[i],
vacrelstats);
// switch the reported phase to heap vacuuming and bump the reported index-vacuum count in one update
hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
hvp_val[1] = vacrelstats->num_index_scans + 1;
pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
// remove the recorded tuples from the heap
lazy_vacuum_heap(onerel, vacrelstats);
vacrelstats->num_dead_tuples = 0;// reset the pending count
vacrelstats->num_index_scans++;// one more index scan completed
// vacuum the free space map for the range from next_fsm_block_to_vacuum up to blkno (blkno itself is not processed yet)
FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno);
next_fsm_block_to_vacuum = blkno;
// report that the heap scan resumes
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_SCAN_HEAP);
}
// pin the visibility-map page covering this block into vmbuffer
visibilitymap_pin(onerel, blkno, &vmbuffer);
// read the block (main fork, RBM_NORMAL) using vac_strategy
buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
RBM_NORMAL, vac_strategy);
// A buffer cleanup lock is needed in order to prune HOT chains.
// ConditionalLockBufferForCleanup is like LockBufferForCleanup but does not wait for the lock.
if (!ConditionalLockBufferForCleanup(buf))
{
// ----------- cleanup lock not obtained
if (!aggressive && !FORCE_CHECK_PAGE())
{
// non-aggressive scan and no forced check:
// drop the pin, count the page as pin-skipped and move on
ReleaseBuffer(buf);
vacrelstats->pinskipped_pages++;
continue;
}
// take a share lock so the page can be inspected
LockBuffer(buf, BUFFER_LOCK_SHARE);
// lazy_check_needs_freeze --> presumably scans the page for tuples that must be frozen to avoid wraparound; it also sets hastup
if (!lazy_check_needs_freeze(buf, &hastup))
{
// nothing needs freezing: count the page as scanned and pin-skipped
UnlockReleaseBuffer(buf);
vacrelstats->scanned_pages++;
vacrelstats->pinskipped_pages++;
if (hastup)
vacrelstats->nonempty_pages = blkno + 1;
// skip this block
continue;
}
// freezing is needed, but this is not an aggressive scan (we got here
// only because the check was forced): give up on the page
if (!aggressive)
{
UnlockReleaseBuffer(buf);
vacrelstats->pinskipped_pages++;
if (hastup)
vacrelstats->nonempty_pages = blkno + 1;
continue;
}
// aggressive scan: drop the share lock and wait for the cleanup lock,
// then fall through to normal processing
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf);
}
// the page will be fully processed: update counters
vacrelstats->scanned_pages++;
vacrelstats->tupcount_pages++;
// get the page
page = BufferGetPage(buf);
if (PageIsNew(page))
{
// -------------- page has never been initialised
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
// take the relation extension lock ...
// NOTE(review): presumably this waits out a backend that is still extending the relation -- confirm
LockRelationForExtension(onerel, ExclusiveLock);
// ... and release it again immediately
UnlockRelationForExtension(onerel, ExclusiveLock);
// re-acquire the cleanup lock
LockBufferForCleanup(buf);
// re-check whether the page is still new
if (PageIsNew(page))
{
// still new: warn and initialise it
ereport(WARNING,
(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
relname, blkno)));
PageInit(page, BufferGetPageSize(buf), 0);
empty_pages++;
}
// measure free space
freespace = PageGetHeapFreeSpace(page);
// mark the buffer dirty
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
// record the page's free space
RecordPageWithFreeSpace(onerel, blkno, freespace);
// next block
continue;
}
if (PageIsEmpty(page))
{
// ----------------- empty page
empty_pages++;
freespace = PageGetHeapFreeSpace(page);
// an empty page is treated as all-visible and all-frozen
if (!PageIsAllVisible(page))
{
// the page-level all-visible flag is not set yet:
// set it together with both visibility-map bits
START_CRIT_SECTION();
// mark the buffer dirty before writing a WAL record
MarkBufferDirty(buf);
if (RelationNeedsWAL(onerel) &&
PageGetLSN(page) == InvalidXLogRecPtr)
// relation is WAL-logged but the page has no valid LSN: log the page
log_newpage_buffer(buf, true);
// set the page-level all-visible flag
PageSetAllVisible(page);
// set the visibility-map bits
visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
vmbuffer, InvalidTransactionId,
VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
END_CRIT_SECTION();
}
UnlockReleaseBuffer(buf);
RecordPageWithFreeSpace(onerel, blkno, freespace);
// next block
continue;
}
// prune the page; the return value is added to the removed-tuple count
tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
&vacrelstats->latestRemovedXid);
all_visible = true;
has_dead_tuples = false;
nfrozen = 0;
hastup = false;
prev_dead_count = vacrelstats->num_dead_tuples;
maxoff = PageGetMaxOffsetNumber(page);// highest item offset on the page
for (offnum = FirstOffsetNumber;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
ItemId itemid;
itemid = PageGetItemId(page, offnum);
// unused item pointers need no processing, but are counted
if (!ItemIdIsUsed(itemid))
{
// unused: count and skip
nunused += 1;
continue;
}
// redirect items must not be touched
if (ItemIdIsRedirected(itemid))
{
// a redirect item means
// the page cannot be truncated away
hastup = true;
continue;
}
// set the tuple's TID to (blkno, offnum)
ItemPointerSet(&(tuple.t_self), blkno, offnum);
if (ItemIdIsDead(itemid))
{
// dead item pointer: remember its TID for later removal;
// presumably lazy_record_dead_tuple appends the TID to
// vacrelstats->dead_tuples and increments num_dead_tuples
lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
all_visible = false;
continue;
}
Assert(ItemIdIsNormal(itemid));
// fill in the remaining tuple fields
tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
tuple.t_len = ItemIdGetLength(itemid);
tuple.t_tableOid = RelationGetRelid(onerel);
tupgone = false;
// Determine the tuple's state for VACUUM purposes.
// The key question is whether the tuple could still be visible to any running transaction.
switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
{
case HEAPTUPLE_DEAD:
// HOT-updated or heap-only tuples are not removed here, only counted in nkeep
if (HeapTupleIsHotUpdated(&tuple) ||
HeapTupleIsHeapOnly(&tuple))
nkeep += 1;
else
// the tuple can be removed
tupgone = true;
// a dead tuple is present, so the page is not all-visible
all_visible = false;
break;
case HEAPTUPLE_LIVE:
live_tuples += 1;
if (all_visible)
{
// page still looks all-visible: check this tuple too
TransactionId xmin;
if (!HeapTupleHeaderXminCommitted(tuple.t_data))
{
// xmin is not marked committed: not all-visible
all_visible = false;
break;
}
xmin = HeapTupleHeaderGetXmin(tuple.t_data);
if (!TransactionIdPrecedes(xmin, OldestXmin))
{
// xmin does not precede OldestXmin: not all-visible
all_visible = false;
break;
}
// track the newest xmin on the page;
// TransactionIdFollows(a, b) is presumably true when a is logically later than b
if (TransactionIdFollows(xmin, visibility_cutoff_xid))
visibility_cutoff_xid = xmin;
}
break;
case HEAPTUPLE_RECENTLY_DEAD:
nkeep += 1;
all_visible = false;
break;
case HEAPTUPLE_INSERT_IN_PROGRESS:
all_visible = false;
break;
case HEAPTUPLE_DELETE_IN_PROGRESS:
// an expected case while vacuum runs concurrently with other sessions; the tuple is still counted as live
all_visible = false;
live_tuples += 1;
break;
default:
// no other result is expected
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
break;
}
if (tupgone)
{
// removable tuple: remember its TID for later removal
// (see the note on lazy_record_dead_tuple above) and
// advance latestRemovedXid for it
lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
&vacrelstats->latestRemovedXid);
tups_vacuumed += 1;
has_dead_tuples = true;
}
else
{
bool tuple_totally_frozen;// output of heap_prepare_freeze_tuple
num_tuples += 1;
hastup = true;
// if a freeze plan was produced, store the tuple's offset in it and keep the slot
if (heap_prepare_freeze_tuple(tuple.t_data,
relfrozenxid, relminmxid,
FreezeLimit, MultiXactCutoff,
&frozen[nfrozen],
&tuple_totally_frozen))
frozen[nfrozen++].offset = offnum;
if (!tuple_totally_frozen)
all_frozen = false;
}
}
if (nfrozen > 0)
{
// at least one freeze plan was collected: apply them
START_CRIT_SECTION();
// mark the buffer dirty
MarkBufferDirty(buf);
// execute each freeze plan
for (i = 0; i < nfrozen; i++)
{
ItemId itemid;
HeapTupleHeader htup;
itemid = PageGetItemId(page, frozen[i].offset);
htup = (HeapTupleHeader) PageGetItem(page, itemid);
// freeze this tuple
heap_execute_freeze_tuple(htup, &frozen[i]);
}
// WAL-log the freeze if the relation needs WAL
if (RelationNeedsWAL(onerel))
{
XLogRecPtr recptr;
recptr = log_heap_freeze(onerel, buf, FreezeLimit,
frozen, nfrozen);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
if (nindexes == 0 &&
vacrelstats->num_dead_tuples > 0)
{
// ------------- no indexes and dead tuples were recorded:
// vacuum the page right away, no second heap pass is needed
lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
has_dead_tuples = false;
vacrelstats->num_dead_tuples = 0;// reset the pending count
vacuumed_pages++;// one more page vacuumed
if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
{
// vacuum the FSM in batches of VACUUM_FSM_EVERY_PAGES blocks
FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum,
blkno);
next_fsm_block_to_vacuum = blkno;
}
}
// measure free space
freespace = PageGetHeapFreeSpace(page);
// The if/else chain below reconciles visibility-map state with what was seen on the page.
// Case 1: the page is all-visible but the map did not say so: set the flags.
if (all_visible && !all_visible_according_to_vm)
{
// also set the all-frozen bit when every remaining tuple is frozen
uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
if (all_frozen)
flags |= VISIBILITYMAP_ALL_FROZEN;
PageSetAllVisible(page);
MarkBufferDirty(buf);
visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
vmbuffer, visibility_cutoff_xid, flags);
}
// Case 2: the map bit is set but the page-level flag is not: warn and clear the map bits.
else if (all_visible_according_to_vm && !PageIsAllVisible(page)
&& VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
{
elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
relname, blkno);
visibilitymap_clear(onerel, blkno, vmbuffer,
VISIBILITYMAP_VALID_BITS);
}
// Case 3: the page-level flag is set although dead tuples remain: warn and clear both.
else if (PageIsAllVisible(page) && has_dead_tuples)
{
elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
relname, blkno);
PageClearAllVisible(page);
MarkBufferDirty(buf);
visibilitymap_clear(onerel, blkno, vmbuffer,
VISIBILITYMAP_VALID_BITS);
}
// Case 4: already all-visible in the map and now also all-frozen: set the all-frozen bit.
else if (all_visible_according_to_vm && all_visible && all_frozen &&
!VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
{
visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
vmbuffer, InvalidTransactionId,
VISIBILITYMAP_ALL_FROZEN);
}
UnlockReleaseBuffer(buf);
// remember the last page that still holds tuples
if (hastup)
vacrelstats->nonempty_pages = blkno + 1;
// no new dead tuples are pending for this page, so its free space can be recorded now
if (vacrelstats->num_dead_tuples == prev_dead_count)
RecordPageWithFreeSpace(onerel, blkno, freespace);
} // end of the per-block loop
// report that all blocks have been scanned
pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
pfree(frozen);
// save statistics for later use
vacrelstats->tuples_deleted = tups_vacuumed;
vacrelstats->new_dead_tuples = nkeep;
// compute the new value for pg_class.reltuples
vacrelstats->new_live_tuples = vac_estimate_reltuples(onerel,
nblocks,
vacrelstats->tupcount_pages,
live_tuples);
// also compute the total number of surviving heap entries
vacrelstats->new_rel_tuples =
vacrelstats->new_live_tuples + vacrelstats->new_dead_tuples;
// release any remaining visibility-map pin
if (BufferIsValid(vmbuffer))
{
ReleaseBuffer(vmbuffer);
vmbuffer = InvalidBuffer;
}
// if dead tuples are still pending, run a final vacuum cycle
// XXX put a threshold on the minimum number of tuples here?
if (vacrelstats->num_dead_tuples > 0)
{
const int hvp_index[] = {
PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_NUM_INDEX_VACUUMS
};
int64 hvp_val[2];
// log cleanup info before touching the indexes
vacuum_log_cleanup_info(onerel, vacrelstats);
// report that indexes are being vacuumed
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
// remove index entries
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
&indstats[i],
vacrelstats);
// report that the heap is being vacuumed
hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
hvp_val[1] = vacrelstats->num_index_scans + 1;
pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
// remove tuples from the heap
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
lazy_vacuum_heap(onerel, vacrelstats);
vacrelstats->num_index_scans++;
}
// vacuum whatever part of the FSM range has not been covered yet
if (blkno > next_fsm_block_to_vacuum)
FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno);
// report that all blocks are vacuumed and switch to the index-cleanup phase
pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
// final cleanup pass over every index, passing the collected indstats
for (i = 0; i < nindexes; i++)
lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
// vacuumed_pages is only incremented in the no-index path above; report those removals here
if (vacuumed_pages)
ereport(elevel,
(errmsg("\"%s\": removed %.0f row versions in %u pages",
RelationGetRelationName(onerel),
tups_vacuumed, vacuumed_pages)));
// build the detail text for the final summary (buf is the function-level StringInfoData here)
initStringInfo(&buf);
appendStringInfo(&buf,
_("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"),
nkeep, OldestXmin);
appendStringInfo(&buf, _("There were %.0f unused item pointers.\n"),
nunused);
appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
"Skipped %u pages due to buffer pins, ",
vacrelstats->pinskipped_pages),
vacrelstats->pinskipped_pages);
appendStringInfo(&buf, ngettext("%u frozen page.\n",
"%u frozen pages.\n",
vacrelstats->frozenskipped_pages),
vacrelstats->frozenskipped_pages);
appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
"%u pages are entirely empty.\n",
empty_pages),
empty_pages);
appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
ereport(elevel,
(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
RelationGetRelationName(onerel),
tups_vacuumed, num_tuples,
vacrelstats->scanned_pages, nblocks),
errdetail_internal("%s", buf.data)));
pfree(buf.data);
}
三、跟踪分析
测试脚本,执行压力测试的同时,执行vacuum
-- session 1
pgbench -c 2 -C -f ./update.sql -j 1 -n -T 600 -U xdb testdb
-- session 2
17:52:59 (xdb@[local]:5432)testdb=# vacuum verbose t1;
启动gdb,设置断点
(gdb) b lazy_scan_heap
Breakpoint 1 at 0x6bc38a: file vacuumlazy.c, line 470.
(gdb) c
Continuing.
Breakpoint 1, lazy_scan_heap (onerel=0x7f224a197788, options=5, vacrelstats=0x296d7b8, Irel=0x296d8b0, nindexes=1,
aggressive=false) at vacuumlazy.c:470
470 TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid;
(gdb)
输入参数
1-relation
(gdb) p *onerel
$1 = {rd_node = {spcNode = 1663, dbNode = 16402, relNode = 50820}, rd_smgr = 0x2930270, rd_refcnt = 1, rd_backend = -1,
rd_islocaltemp = false, rd_isnailed = false, rd_isvalid = true, rd_indexvalid = 1 '\001', rd_statvalid = false,
rd_createSubid = 0, rd_newRelfilenodeSubid = 0, rd_rel = 0x7f224a197bb8, rd_att = 0x7f224a0d8050, rd_id = 50820,
rd_lockInfo = {lockRelId = {relId = 50820, dbId = 16402}}, rd_rules = 0x0, rd_rulescxt = 0x0, trigdesc = 0x0,
rd_rsdesc = 0x0, rd_fkeylist = 0x0, rd_fkeyvalid = false, rd_partkeycxt = 0x0, rd_partkey = 0x0, rd_pdcxt = 0x0,
rd_partdesc = 0x0, rd_partcheck = 0x0, rd_indexlist = 0x7f224a198fe8, rd_oidindex = 0, rd_pkindex = 0,
rd_replidindex = 0, rd_statlist = 0x0, rd_indexattr = 0x0, rd_projindexattr = 0x0, rd_keyattr = 0x0, rd_pkattr = 0x0,
rd_idattr = 0x0, rd_projidx = 0x0, rd_pubactions = 0x0, rd_options = 0x0, rd_index = 0x0, rd_indextuple = 0x0,
rd_amhandler = 0, rd_indexcxt = 0x0, rd_amroutine = 0x0, rd_opfamily = 0x0, rd_opcintype = 0x0, rd_support = 0x0,
rd_supportinfo = 0x0, rd_indoption = 0x0, rd_indexprs = 0x0, rd_indpred = 0x0, rd_exclops = 0x0, rd_exclprocs = 0x0,
rd_exclstrats = 0x0, rd_amcache = 0x0, rd_indcollation = 0x0, rd_fdwroutine = 0x0, rd_toastoid = 0,
pgstat_info = 0x2923e50}
(gdb)
2-options=5,即VACOPT_VACUUM | VACOPT_VERBOSE
3-vacrelstats
(gdb) p *vacrelstats
$2 = {hasindex = true, old_rel_pages = 75, rel_pages = 0, scanned_pages = 0, pinskipped_pages = 0, frozenskipped_pages = 0,
tupcount_pages = 0, old_live_tuples = 10000, new_rel_tuples = 0, new_live_tuples = 0, new_dead_tuples = 0,
pages_removed = 0, tuples_deleted = 0, nonempty_pages = 0, num_dead_tuples = 0, max_dead_tuples = 0, dead_tuples = 0x0,
num_index_scans = 0, latestRemovedXid = 0, lock_waiter_detected = false}
(gdb)
4-Irel
(gdb) p *Irel
$3 = (Relation) 0x7f224a198688
(gdb) p **Irel
$4 = {rd_node = {spcNode = 1663, dbNode = 16402, relNode = 50823}, rd_smgr = 0x29302e0, rd_refcnt = 1, rd_backend = -1,
rd_islocaltemp = false, rd_isnailed = false, rd_isvalid = true, rd_indexvalid = 0 '\000', rd_statvalid = false,
rd_createSubid = 0, rd_newRelfilenodeSubid = 0, rd_rel = 0x7f224a1988a0, rd_att = 0x7f224a1989b8, rd_id = 50823,
rd_lockInfo = {lockRelId = {relId = 50823, dbId = 16402}}, rd_rules = 0x0, rd_rulescxt = 0x0, trigdesc = 0x0,
rd_rsdesc = 0x0, rd_fkeylist = 0x0, rd_fkeyvalid = false, rd_partkeycxt = 0x0, rd_partkey = 0x0, rd_pdcxt = 0x0,
rd_partdesc = 0x0, rd_partcheck = 0x0, rd_indexlist = 0x0, rd_oidindex = 0, rd_pkindex = 0, rd_replidindex = 0,
rd_statlist = 0x0, rd_indexattr = 0x0, rd_projindexattr = 0x0, rd_keyattr = 0x0, rd_pkattr = 0x0, rd_idattr = 0x0,
rd_projidx = 0x0, rd_pubactions = 0x0, rd_options = 0x0, rd_index = 0x7f224a198d58, rd_indextuple = 0x7f224a198d20,
rd_amhandler = 330, rd_indexcxt = 0x28cb340, rd_amroutine = 0x28cb480, rd_opfamily = 0x28cb598, rd_opcintype = 0x28cb5b8,
rd_support = 0x28cb5d8, rd_supportinfo = 0x28cb600, rd_indoption = 0x28cb738, rd_indexprs = 0x0, rd_indpred = 0x0,
rd_exclops = 0x0, rd_exclprocs = 0x0, rd_exclstrats = 0x0, rd_amcache = 0x0, rd_indcollation = 0x28cb718,
rd_fdwroutine = 0x0, rd_toastoid = 0, pgstat_info = 0x2923ec8}
(gdb)
5-nindexes=1,存在一个索引
6-aggressive=false,无需执行全表扫描
下面开始初始化相关变量
(gdb) n
471 TransactionId relminmxid = onerel->rd_rel->relminmxid;
(gdb)
483 Buffer vmbuffer = InvalidBuffer;
(gdb)
488 const int initprog_index[] = {
(gdb)
495 pg_rusage_init(&ru0);
(gdb)
497 relname = RelationGetRelationName(onerel);
(gdb)
498 if (aggressive)
(gdb)
504 ereport(elevel,
(gdb)
509 empty_pages = vacuumed_pages = 0;
(gdb)
510 next_fsm_block_to_vacuum = (BlockNumber) 0;
(gdb)
511 num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0;
(gdb)
514 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
(gdb)
513 indstats = (IndexBulkDeleteResult **)
(gdb)
516 nblocks = RelationGetNumberOfBlocks(onerel);
(gdb) p relminmxid
$5 = 1
(gdb) p ru0
$6 = {tv = {tv_sec = 1548669429, tv_usec = 578779}, ru = {ru_utime = {tv_sec = 0, tv_usec = 29531}, ru_stime = {tv_sec = 0,
tv_usec = 51407}, {ru_maxrss = 7488, __ru_maxrss_word = 7488}, {ru_ixrss = 0, __ru_ixrss_word = 0}, {ru_idrss = 0,
__ru_idrss_word = 0}, {ru_isrss = 0, __ru_isrss_word = 0}, {ru_minflt = 1819, __ru_minflt_word = 1819}, {
ru_majflt = 0, __ru_majflt_word = 0}, {ru_nswap = 0, __ru_nswap_word = 0}, {ru_inblock = 2664,
__ru_inblock_word = 2664}, {ru_oublock = 328, __ru_oublock_word = 328}, {ru_msgsnd = 0, __ru_msgsnd_word = 0}, {
ru_msgrcv = 0, __ru_msgrcv_word = 0}, {ru_nsignals = 0, __ru_nsignals_word = 0}, {ru_nvcsw = 70,
__ru_nvcsw_word = 70}, {ru_nivcsw = 3, __ru_nivcsw_word = 3}}}
(gdb) p relname
$7 = 0x7f224a197bb8 "t1"
(gdb)
获取总块数
(gdb) n
517 vacrelstats->rel_pages = nblocks;
(gdb) p nblocks
$8 = 75
(gdb)
初始化统计信息和相关数组
(gdb) n
518 vacrelstats->scanned_pages = 0;
(gdb)
519 vacrelstats->tupcount_pages = 0;
(gdb)
520 vacrelstats->nonempty_pages = 0;
(gdb)
521 vacrelstats->latestRemovedXid = InvalidTransactionId;
(gdb)
523 lazy_space_alloc(vacrelstats, nblocks);
(gdb)
524 frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
(gdb)
527 initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
(gdb)
528 initprog_val[1] = nblocks;
(gdb)
529 initprog_val[2] = vacrelstats->max_dead_tuples;
(gdb)
530 pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
(gdb) p *vacrelstats
$9 = {hasindex = true, old_rel_pages = 75, rel_pages = 75, scanned_pages = 0, pinskipped_pages = 0,
frozenskipped_pages = 0, tupcount_pages = 0, old_live_tuples = 10000, new_rel_tuples = 0, new_live_tuples = 0,
new_dead_tuples = 0, pages_removed = 0, tuples_deleted = 0, nonempty_pages = 0, num_dead_tuples = 0,
max_dead_tuples = 21825, dead_tuples = 0x297e820, num_index_scans = 0, latestRemovedXid = 0, lock_waiter_detected = false}
(gdb)
计算下一个不能跳过的block
第0个块就不能跳过(next_unskippable_block = 0 < SKIP_PAGES_THRESHOLD = 32),因此设置标记skipping_blocks为F
(gdb) n
576 next_unskippable_block = 0;
(gdb)
577 if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
(gdb)
579 while (next_unskippable_block < nblocks)
(gdb)
583 vmstatus = visibilitymap_get_status(onerel, next_unskippable_block,
(gdb)
585 if (aggressive)
(gdb) p vmstatus
$10 = 0 '\000'
(gdb) n
592 if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
(gdb)
593 break;
(gdb)
600 if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
(gdb) p next_unskippable_block
$11 = 0
(gdb) p SKIP_PAGES_THRESHOLD
$12 = 32
(gdb) n
603 skipping_blocks = false;
(gdb)
开始遍历每个block
初始化相关变量
(gdb)
605 for (blkno = 0; blkno < nblocks; blkno++)
(gdb)
616 bool all_visible_according_to_vm = false;
(gdb)
618 bool all_frozen = true;
(gdb)
620 TransactionId visibility_cutoff_xid = InvalidTransactionId;
(gdb)
626 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
(gdb)
628 if (blkno == next_unskippable_block)
(gdb)
blkno == next_unskippable_block,获取下一个不可跳过的block
(gdb) p blkno
$13 = 0
(gdb) p next_unskippable_block
$14 = 0
(gdb) n
631 next_unskippable_block++;
(gdb)
632 if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
(gdb)
634 while (next_unskippable_block < nblocks)
(gdb)
638 vmskipflags = visibilitymap_get_status(onerel,
(gdb)
641 if (aggressive)
(gdb) p vmskipflags
$15 = 0 '\000'
(gdb) n
648 if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
(gdb)
649 break;
(gdb)
660 if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
(gdb) p next_unskippable_block
$16 = 1
(gdb) n
1047 if (onerel->rd_rel->relhasoids &&
(gdb)
1132 if (tupgone)
(gdb)
tupgone为F,判断是否需要冻结(F)
获取偏移,遍历元组
(gdb) p tupgone
$17 = false
(gdb) n
1144 num_tuples += 1;
(gdb)
1145 hastup = true;
(gdb)
1151 if (heap_prepare_freeze_tuple(tuple.t_data,
(gdb)
1154 &frozen[nfrozen],
(gdb) p nfrozen
$18 = 0
(gdb) n
1151 if (heap_prepare_freeze_tuple(tuple.t_data,
(gdb)
1158 if (!tuple_totally_frozen)
(gdb)
1159 all_frozen = false;
(gdb)
958 offnum = OffsetNumberNext(offnum))
(gdb)
956 for (offnum = FirstOffsetNumber;
(gdb)
该元组正常
(gdb) p offnum
$19 = 3
(gdb) n
962 itemid = PageGetItemId(page, offnum);
(gdb)
965 if (!ItemIdIsUsed(itemid))
(gdb)
972 if (ItemIdIsRedirected(itemid))
(gdb)
978 ItemPointerSet(&(tuple.t_self), blkno, offnum);
(gdb)
986 if (ItemIdIsDead(itemid))
(gdb)
993 Assert(ItemIdIsNormal(itemid));
(gdb)
995 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
(gdb)
996 tuple.t_len = ItemIdGetLength(itemid);
(gdb)
997 tuple.t_tableOid = RelationGetRelid(onerel);
(gdb)
999 tupgone = false;
(gdb)
调用HeapTupleSatisfiesVacuum确定元组状态,主要目的是判断一个元组是否可能对所有正在运行中的事务可见
该元组是Live tuple
1012 switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
(gdb)
(gdb) n
1047 if (onerel->rd_rel->relhasoids &&
(gdb) n
1056 live_tuples += 1;
(gdb)
1067 if (all_visible)
(gdb) p all_visible
$20 = false
跳出循环
(gdb) b vacuumlazy.c:1168
Breakpoint 2 at 0x6bd4e7: file vacuumlazy.c, line 1168.
(gdb) c
Continuing.
Breakpoint 2, lazy_scan_heap (onerel=0x7f224a197788, options=5, vacrelstats=0x296d7b8, Irel=0x296d8b0, nindexes=1,
aggressive=false) at vacuumlazy.c:1168
1168 if (nfrozen > 0)
(gdb)
更新统计信息
(gdb) n
1203 if (nindexes == 0 &&
(gdb) p nfrozen
$23 = 0
(gdb) n
1232 freespace = PageGetHeapFreeSpace(page);
(gdb)
1235 if (all_visible && !all_visible_according_to_vm)
(gdb)
1268 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
(gdb)
1290 else if (PageIsAllVisible(page) && has_dead_tuples)
(gdb)
1305 else if (all_visible_according_to_vm && all_visible && all_frozen &&
(gdb)
1318 UnlockReleaseBuffer(buf);
(gdb)
1321 if (hastup)
(gdb)
1322 vacrelstats->nonempty_pages = blkno + 1;
(gdb) p hastup
$24 = true
(gdb) n
1331 if (vacrelstats->num_dead_tuples == prev_dead_count)
(gdb)
1332 RecordPageWithFreeSpace(onerel, blkno, freespace);
继续下一个block
(gdb)
605 for (blkno = 0; blkno < nblocks; blkno++)
(gdb) p blkno
$25 = 0
(gdb) n
616 bool all_visible_according_to_vm = false;
(gdb) p blkno
$26 = 1
(gdb)
判断(vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0,不满足,继续执行
...
(gdb)
701 vacuum_delay_point();
(gdb)
707 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
(gdb) p vacrelstats->max_dead_tuples
$27 = 21825
(gdb) p vacrelstats->num_dead_tuples
$28 = 0
(gdb) p MaxHeapTuplesPerPage
No symbol "__builtin_offsetof" in current context.
(gdb)
以扩展方式读取buffer
(gdb) n
783 visibilitymap_pin(onerel, blkno, &vmbuffer);
(gdb)
785 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
(gdb)
789 if (!ConditionalLockBufferForCleanup(buf))
(gdb)
获取buffer cleanup lock,成功!
调用heap_page_prune清理该page中的所有HOT-update链
(gdb) n
847 vacrelstats->scanned_pages++;
(gdb)
848 vacrelstats->tupcount_pages++;
(gdb)
850 page = BufferGetPage(buf);
(gdb)
852 if (PageIsNew(page))
(gdb)
894 if (PageIsEmpty(page))
(gdb)
938 tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
(gdb)
945 all_visible = true;
(gdb)
遍历page中的行指针
956 for (offnum = FirstOffsetNumber;
(gdb) p maxoff
$29 = 291
(gdb)
$30 = 291
(gdb) n
962 itemid = PageGetItemId(page, offnum);
(gdb) n
965 if (!ItemIdIsUsed(itemid))
(gdb)
972 if (ItemIdIsRedirected(itemid))
(gdb)
978 ItemPointerSet(&(tuple.t_self), blkno, offnum);
(gdb)
986 if (ItemIdIsDead(itemid))
(gdb)
993 Assert(ItemIdIsNormal(itemid));
(gdb)
995 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
(gdb)
996 tuple.t_len = ItemIdGetLength(itemid);
(gdb)
997 tuple.t_tableOid = RelationGetRelid(onerel);
(gdb)
999 tupgone = false;
(gdb)
1012 switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
(gdb)
1099 nkeep += 1;
(gdb)
1100 all_visible = false;
(gdb)
1101 break;
(gdb)
1132 if (tupgone)
(gdb)
1144 num_tuples += 1;
跳出循环
(gdb) c
Continuing.
Breakpoint 2, lazy_scan_heap (onerel=0x7f224a197788, options=5, vacrelstats=0x296d7b8, Irel=0x296d8b0, nindexes=1,
aggressive=false) at vacuumlazy.c:1168
1168 if (nfrozen > 0)
(gdb)
DONE!
四、参考资料
PG Source Code
免责声明:
① 本站未注明“稿件来源”的信息均来自网络整理。其文字、图片和音视频稿件的所属权归原作者所有。本站收集整理出于非商业性的教育和科研之目的,并不意味着本站赞同其观点或证实其内容的真实性。仅作为临时的测试数据,供内部测试之用。本站并未授权任何人以任何方式主动获取本站任何信息。
② 本站未注明“稿件来源”的临时测试数据将在测试完成后最终做删除处理。有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341
软考中级精品资料免费领
- 历年真题答案解析
- 备考技巧名师总结
- 高频考点精准押题
- 资料下载
- 历年真题
193.9 KB下载数265
191.63 KB下载数245
143.91 KB下载数1148
183.71 KB下载数642
644.84 KB下载数2756