tapdisk vhd block device driver
/*
 * Driver descriptor for the "tapdisk_vhd" disk type: binds the tap_disk
 * hooks to the VHD-specific handlers named below.
 */
struct tap_disk tapdisk_vhd = {
	.disk_type          = "tapdisk_vhd",
	.flags              = 0,
	/* size of the per-image state (struct vhd_state) */
	.private_data_size  = sizeof(struct vhd_state),

	/* image open / close */
	.td_open            = _vhd_open,
	.td_close           = _vhd_close,

	/* read / write request submission */
	.td_queue_read      = vhd_queue_read,
	.td_queue_write     = vhd_queue_write,

	/* parent image hooks */
	.td_get_parent_id   = vhd_get_parent_id,
	.td_validate_parent = vhd_validate_parent,

	/* debug hook */
	.td_debug           = vhd_debug,
};
/* Layout of a dynamic disk:
*
* +-------------------------------------------------+
* | Mirror image of HD footer (hd_ftr) (512 bytes) |
* +-------------------------------------------------+
* | Sparse drive header (dd_hdr) (1024 bytes) |
* +-------------------------------------------------+
* | BAT (Block allocation table) |
* | - Array of absolute sector offsets into the |
* | file (u32). |
* | - Rounded up to a sector boundary. |
* | - Unused entries are marked as 0xFFFFFFFF |
* | - max entries in dd_hdr->max_bat_size |
* +-------------------------------------------------+
* | Data Block 0 |
* | Bitmap (padded to 512 byte sector boundary) |
* | - each bit indicates whether the associated |
* | sector within this block is used. |
* | Data |
* | - power-of-two multiple of sectors. |
* | - default 2MB (4096 * 512) |
* | - Any entries with zero in bitmap should be |
* | zero on disk |
* +-------------------------------------------------+
* | Data Block 1 |
* +-------------------------------------------------+
* | ... |
* +-------------------------------------------------+
* | Data Block n |
* +-------------------------------------------------+
* | HD Footer (511 bytes) |
* +-------------------------------------------------+
*/
_vhd_open: _vhd_open -> __vhd_open -> vhd_open , 其中 vhd_open 通過 open 函數開啟 vhd 檔案同時把 fd 賦值給 vhd_context,之後調用 vhd_read_footer, vhd_read_header 等讀取 vhd 資訊。
_vhd_close: 調用 vhd_write_footer 和 vhd_write_batmap,把 vhd 的 footer 和 batmap 回寫到檔案中。
vhd_debug: VHD 的 bitmap 是存放在 cache 中的,類似於檔案系統的 cache 機制。
vhd_queue_read : 首先調用 read_bitmap_cache,讀取對應 block 的 bitmap。
如果 BAT 為空白 (VHD_BM_BAT_CLEAR),調用 td_forward_request
如果 bitmap 所有位都為空白 (VHD_BM_BIT_CLEAR), 調用 td_forward_request
(這兩種情況可能表示vhd鏡像的資料是無效的,所以不執行真正的IO)
如果 bitmap 不為空白 (VHD_BM_BIT_SET),調用read_bitmap_cache_span,找到bitmap為1的sector個數,然後調用schedule_data_read去讀取這些sector。
schedule_data_read 構造 td_request_t 請求,通過aio_read 發出真正的IO請求。
aio_read 調用 tapdisk_prep_tiocb, tapdisk_queue_tiocb 把 tiocb 結構請求放入tapdisk的隊列中
如果 bitmap 沒有在 cache 中,調用 schedule_bitmap_read 讀取對應 block 中的 bitmap;讀取成功之後會同時把 bitmap 寫入 cache。
schedule_bitmap_read 裡首先構造一個操作符為 VHD_OP_BITMAP_READ 的請求,調用aio_read去讀取block對應的bitmap的內容,可以看到aio_read之後調用了lock_bitmap把bitmap設定為VHD_FLAG_BM_LOCKED狀態,調用install_bitmap把 bitmap置入cache,最後把bitmap設定為VHD_FLAG_BM_READ_PENDING狀態。
schedule_bitmap_read之後調用__vhd_queue_request,構造一個td_request_t的IO請求,並把該請求加入到vhd_bitmap->waiting隊列中,之後再次lock該bitmap。下面就是等待bitmap讀取完成了。
從下面的struct vhd_bitmap的定義可以知道waiting的含義:
/*
 * Cached bitmap of one VHD data block, together with the requests that
 * are queued on it.
 */
struct vhd_bitmap {
	/* block this bitmap belongs to (presumably the BAT index - confirm) */
	u32                    blk;

	/* LRU sequence number */
	u64                    seqno;

	/* state flags of this bitmap */
	vhd_flag_t             status;

	/* committed bitmap; only finish_bitmap_write may modify it */
	char                  *map;

	/*
	 * Working copy: in-memory bitmap changes are applied here first
	 * and copied to map only once they have been flushed to disk.
	 */
	char                  *shadow;

	/* transaction bundling the data, bitmap and BAT writes */
	struct vhd_transaction tx;

	/* data writes held back until the next transaction */
	struct vhd_req_list    queue;

	/* requests that cannot be serviced until this bitmap is read from disk */
	struct vhd_req_list    waiting;

	/* embedded request (presumably for the bitmap's own I/O - confirm) */
	struct vhd_request     req;
};
如果 bitmap 處於 VHD_BM_READ_PENDING 狀態,則只調用 __vhd_queue_request 等待 bitmap 的讀取完畢
vhd_queue_write 的原理和 vhd_queue_read 基本類似,不多說了。
最後提一下 vhd_complete 函數:在 tapdisk queue 的機制中,vhd_complete 被作為 callback 函數傳入 struct tiocb 結構,每次這個 iocb 的 IO 返回,都會調用事先註冊好的 vhd_complete。我們看下這個函數究竟做了什麼:
vhd_complete中最重要的一部分如下:
/*
 * Dispatch on the operation code of the request: each known op has its
 * own finish_* handler; any other op value trips the assertion.
 */
switch (req->op) {
case VHD_OP_DATA_READ:
	finish_data_read(req);
	break;

case VHD_OP_DATA_WRITE:
	finish_data_write(req);
	break;

case VHD_OP_BITMAP_READ:
	finish_bitmap_read(req);
	break;

case VHD_OP_BITMAP_WRITE:
	finish_bitmap_write(req);
	break;

case VHD_OP_ZERO_BM_WRITE:
	finish_zero_bm_write(req);
	break;

case VHD_OP_BAT_WRITE:
	finish_bat_write(req);
	break;

default:
	/* unknown operation code */
	ASSERT(0);
	break;
}
根據request的操作符,執行不同的finish_xxxxx函數