DML操作的大致流程
在解答上述疑惑之前,我们来梳理一下DML操作的大致流程:
语法解析、语义解析
生成执行计划
事务修改阶段
1) 激活事务,事务状态由not_active变为active
2) 查找定位数据
3) 乐观插入
4) 记录insert相关的undo记录,并将undo记录的变化写入redo log buffer
5) 进行insert 元组插入,即实际的插入操作,并将该变化写入到redo log buffer
6) binlog event 写入到 binlog cache
事务提交阶段
1) 事务prepare
2) redo组提交,redo落盘
3) flush binlog cache到binlog文件,然后fsync binlog文件将它落盘
4) innodb进行提交,事务状态由prepare变为not_active
写了哪些文件?会写UNDO相关的文件吗?
从上述流程中可以看到,主要对redo log file和binlog进行了写入。
那么是否会实时地写入Undo tablespace呢?
我们先来简单地分析一下:
磁盘中的undo segment,不论它是保存在system tablespace中,还是保存在独立的undo tablespace中,根据页的物理结构(参考阿里内核月报)来看,它们是离散地分布在表空间文件中的。因此需要读/写的时候,会产生很多的随机读写io操作,而随机读写的效率是非常低的;
Innodb使用了很多种方法来将磁盘随机读写尽可能地转换成顺序读写,比如change buffer特性、WAL特性、MRR、extent块管理,等等。上述这些都是在尽可能地减少磁盘随机读写。所以Innodb应该不会将undo日志实时地落盘;
在上述流程"事务修改阶段"的第4)步中,已经将Undo的变化写入到redo log buffer了,而redo会在事务提交时落盘。所以即使在事务失败、Undo没有落盘的情况下实例宕机,重新启动实例的时候,也可以先通过redo恢复出Undo,再利用Undo来回滚,从而保证事务的原子性。
综上,可以初步判断Undo不会实时地落盘。但是这只是根据原理来进行分析的,为了确定我的分析是否正确,可以打开源码进行分析验证,或使用strace等工具来验证。
以下是源码浅析:
插入的流程:
1 //trx_undof_page_add_undo_rec_log--记录undo的redo log 入redo buffer
2 > mysqld.exe!trx_undof_page_add_undo_rec_log(unsigned char * undo_page, unsigned __int64 old_free, unsigned __int64 new_free, mtr_t * mtr) 行 74
3 mysqld.exe!trx_undo_page_set_next_prev_and_add(unsigned char * undo_page, unsigned char * ptr, mtr_t * mtr) 行 204
4 //trx_undo_page_report_insert--记录insert的undo记录
5 mysqld.exe!trx_undo_page_report_insert(unsigned char * undo_page, trx_t * trx, dict_index_t * index, const dtuple_t * clust_entry, mtr_t * mtr) 行 537
6 mysqld.exe!trx_undo_report_row_operation(unsigned __int64 flags, unsigned __int64 op_type, que_thr_t * thr, dict_index_t * index, const dtuple_t * clust_entry, const upd_t * update, unsigned __int64 cmpl_info, const unsigned char * rec, const unsigned __int64 * offsets, unsigned __int64 * roll_ptr) 行 1951
7 mysqld.exe!btr_cur_ins_lock_and_undo(unsigned __int64 flags, btr_cur_t * cursor, dtuple_t * entry, que_thr_t * thr, mtr_t * mtr, unsigned __int64 * inherit) 行 2984
8 //btr_cur_optimistic_insert--进行乐观插入
9 mysqld.exe!btr_cur_optimistic_insert(unsigned __int64 flags, btr_cur_t * cursor, unsigned __int64 * * offsets, mem_block_info_t * * heap, dtuple_t * entry, unsigned char * * rec, big_rec_t * * big_rec, unsigned __int64 n_ext, que_thr_t * thr, mtr_t * mtr) 行 3244
10 mysqld.exe!row_ins_clust_index_entry_low(unsigned __int64 flags, unsigned __int64 mode, dict_index_t * index, unsigned __int64 n_uniq, dtuple_t * entry, unsigned __int64 n_ext, que_thr_t * thr, bool dup_chk_only) 行 2447
11 mysqld.exe!row_ins_clust_index_entry(dict_index_t * index, dtuple_t * entry, que_thr_t * thr, unsigned __int64 n_ext, bool dup_chk_only) 行 3162
12 mysqld.exe!row_ins_index_entry(dict_index_t * index, dtuple_t * entry, que_thr_t * thr) 行 3292
13 mysqld.exe!row_ins_index_entry_step(ins_node_t * node, que_thr_t * thr) 行 3442
14 mysqld.exe!row_ins(ins_node_t * node, que_thr_t * thr) 行 3584
15 mysqld.exe!row_ins_step(que_thr_t * thr) 行 3769
16 mysqld.exe!row_insert_for_mysql_using_ins_graph(const unsigned char * mysql_rec, row_prebuilt_t * prebuilt) 行 1734
17 mysqld.exe!row_insert_for_mysql(const unsigned char * mysql_rec, row_prebuilt_t * prebuilt) 行 1853
18 mysqld.exe!ha_innobase::write_row(unsigned char * record) 行 7484
19 mysqld.exe!handler::ha_write_row(unsigned char * buf) 行 7845
20 mysqld.exe!write_record(THD * thd, TABLE * table, COPY_INFO * info, COPY_INFO * update) 行 1860
21 mysqld.exe!Sql_cmd_insert::mysql_insert(THD * thd, TABLE_LIST * table_list) 行 780
22 mysqld.exe!Sql_cmd_insert::execute(THD * thd) 行 3092
23 mysqld.exe!mysql_execute_command(THD * thd, bool first_level) 行 3520
24 mysqld.exe!mysql_parse(THD * thd, Parser_state * parser_state) 行 5519
25 mysqld.exe!dispatch_command(THD * thd, const COM_DATA * com_data, enum_server_command command) 行 1432
26 mysqld.exe!do_command(THD * thd) 行 997
27 mysqld.exe!handle_connection(void * arg) 行 301
28 mysqld.exe!pfs_spawn_thread(void * arg) 行 2190
29 mysqld.exe!win_thread_start(void * p) 行 37
其中,trx_undo_page_report_insert函数的代码如下:
1
4 static
5 ulint
6 trx_undo_page_report_insert(
7
8 page_t* undo_page,
9 trx_t* trx,
10 dict_index_t* index,
11 const dtuple_t* clust_entry,
13 mtr_t* mtr)
14 {
15 ulint first_free;
16 byte* ptr;
17 ulint i;
18
19 //...省略若干内容
20
21
22
23 ptr += 2;
24
25
26 *ptr++ = TRX_UNDO_INSERT_REC;
27 ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
28 ptr += mach_u64_write_much_compressed(ptr, index->table->id);
29
30
31
32 for (i = 0; i < dict_index_get_n_unique(index); i++) {
33
34 const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
35 ulint flen = dfield_get_len(field);
36
37 if (trx_undo_left(undo_page, ptr) < 5) {
38
39 return(0);
40 }
41
42 ptr += mach_write_compressed(ptr, flen);
43
44 if (flen != UNIV_SQL_NULL) {
45 if (trx_undo_left(undo_page, ptr) < flen) {
46
47 return(0);
48 }
49
50 ut_memcpy(ptr, dfield_get_data(field), flen);
51 ptr += flen;
52 }
53 }
54
55 if (index->table->n_v_cols) {
56 if (!trx_undo_report_insert_virtual(
57 undo_page, index->table, clust_entry, &ptr)) {
58 return(0);
59 }
60 }
61
62 return(trx_undo_page_set_next_prev_and_add(undo_page, ptr, mtr));
63 }
trx_undo_page_set_next_prev_and_add函数的代码如下:
1
4 static
5 ulint
6 trx_undo_page_set_next_prev_and_add(
7
8 page_t* undo_page,
9 byte* ptr,
11 mtr_t* mtr)
12 {
13 ulint first_free;
14 ulint end_of_rec;
15 byte* ptr_to_first_free;
16
19
20 //...省略若干代码
21
22 ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE;
23
24 first_free = mach_read_from_2(ptr_to_first_free);
25
26
27 mach_write_to_2(ptr, first_free);
28 ptr += 2;
29
30 end_of_rec = ptr - undo_page;
31
32
33 mach_write_to_2(undo_page + first_free, end_of_rec);
34
35
36 mach_write_to_2(ptr_to_first_free, end_of_rec);
37
38
41 trx_undof_page_add_undo_rec_log(undo_page, first_free,
42 end_of_rec, mtr);
43
44 return(first_free);
45 }
trx_undof_page_add_undo_rec_log函数的代码如下:
1
5 UNIV_INLINE
6 void
7 trx_undof_page_add_undo_rec_log(
8
9 page_t* undo_page,
10 ulint old_free,
11 ulint new_free,
12 mtr_t* mtr)
13 {
14 byte* log_ptr;
15 const byte* log_end;
16 ulint len;
17
18 log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN);
19
20 if (log_ptr == NULL) {
21
22 return;
23 }
24
25 log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN];
26
28 log_ptr = mlog_write_initial_log_record_fast(
29 undo_page, MLOG_UNDO_INSERT, log_ptr, mtr);
30 len = new_free - old_free - 4;
31
32 mach_write_to_2(log_ptr, len);
33 log_ptr += 2;
34
35 if (log_ptr + len <= log_end) {
36 memcpy(log_ptr, undo_page + old_free + 2, len);
37 mlog_close(mtr, log_ptr + len);
38 } else {
39 mlog_close(mtr, log_ptr);
40 mlog_catenate_string(mtr, undo_page + old_free + 2, len);
41 }
42 }
总结
MySQL一条insert操作,会写redo log file和binlog文件,但是不会实时地将Undo落盘。
Undo页缓存在InnoDB Buffer Pool中,由Page Cleaner Thread定时刷到磁盘,由Purge Thread定时回收。