KVM virtIO block源代码分析
源代码:linux-3.16.37-git, qemu-v2.7
1.vm启动时qemu的代码
virtio block的qemu cmd:
x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu host -m 256 -smp 1 -drive file=~/vm/centos6-virtio.qcow2,if=none,id=drive-virtio-disk,format=qcow2 -device virtio-blk-pci,bus=pci.0,drive=drive-virtio-disk,id=virtio-disk
qemu中virtio blk代码所在的重点文件如下:
hw\virtio\virtio.c
hw\virtio\virtio-bus.c
hw\virtio\virtio-rng.c
hw\block\virtio-blk.c
hw\net\virtio-net.c
在type_initialize过程中,virtio的设备都会初始化一遍,有virtio_device_class_init,virtio_rng_class_init,virtio_bus_class_init,virtio_pci_bus_class_init,virtio_blk_class_init。
gdb抓取的信息如下:
#0 0x0000555555804ffc in virtio_device_class_init (klass=0x5555566f0090, data=0x0)
at /home/oenhan/workspace/src/qemu-v2.7.0/hw/virtio/virtio.c:1968
#1 0x0000555555b1e542 in type_initialize (ti=0x5555566c9060) at qom/object.c:328
#2 0x0000555555b1e2ba in type_initialize (ti=0x5555566c5b80) at qom/object.c:280
#3 0x0000555555b1f6f6 in object_class_foreach_tramp (key=0x5555566a9710, value=0x5555566c5b80, opaque=0x7fffffffd870) at qom/object.c:798
#4 0x00007ffff2bd43d0 in g_hash_table_foreach () at /lib64/libglib-2.0.so.0
#5 0x0000555555b1f7cd in object_class_foreach (fn=
0x555555b1f922 <object_class_get_list_tramp>, implements_type=0x555555c4816e "machine", include_abstract=false, opaque=0x7fffffffd8c0)
at qom/object.c:820
#6 0x0000555555b1f99d in object_class_get_list (implements_type=0x555555c4816e "machine", include_abstract=false) at qom/object.c:874
#7 0x00005555558bdf1c in find_default_machine () at vl.c:1470
#8 0x00005555558c20bf in select_machine () at vl.c:2732
#9 0x00005555558c2bc3 in main (argc=12, argv=0x7fffffffdd38, envp=0x7fffffffdda0) at vl.c:3986
对于virtio blk而言,重点看一下virtio_blk_device_realize函数,
virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config))
//virtio_init初始化VirtIODevice结构体
struct VirtIODevice
{
size_t config_len;
void *config;
uint16_t config_vector;
//virtio 虚拟队列
VirtQueue *vq;
uint16_t device_id;
bool vm_running;
VMChangeStateEntry *vmstate;
char *bus_name;
QLIST_HEAD(, VirtQueue) *vector_queues;
};
void virtio_init(VirtIODevice *vdev, const char *name, uint16_t device_id, size_t config_size)
{
//赋值为VIRTIO_ID_BLOCK,靠这个辨别设备类型
vdev->device_id = device_id;
//初始化vq结构体,最大支持VIRTIO_QUEUE_MAX个
vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_QUEUE_MAX);
vdev->name = name;
vdev->config_len = config_size;
vdev->config = g_malloc0(config_size);
//添加vm state变化的回调函数virtio_vmstate_change
vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change, vdev);
}
回到virtio_blk_device_realize,virtio_add_queue_aio负责初始化vq结构体
for (i = 0; i < conf->num_queues; i++) {
virtio_add_queue_aio(vdev, 128, virtio_blk_handle_output);
}
vdev->vq[i].vring.num = queue_size;
vdev->vq[i].vring.num_default = queue_size;
vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
vdev->vq[i].handle_output = handle_output;
vdev->vq[i].handle_aio_output = NULL;
vdev->vq[i].use_aio = use_aio;
此时需要注意到vq的handle_output设置为virtio_blk_handle_output。
virtio_blk_data_plane此刻先假定不支持,后续代码暂时不涉及。
//添加virtio blk的vm state watch,当vm state变化时,负责刷新所有的blk buff到img
//在virtio_blk_dma_restart_cb里面,使用qemu_bh_new添加virtio_blk_dma_restart_bh,
//而virtio_blk_dma_restart_bh函数,基本就是virtio blk提交的过程了,
s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
//填充blk结构体
blk_set_dev_ops(s->blk, &virtio_block_ops, s);
blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);
blk_iostatus_enable(s->blk);
下面是VirtIOBlock的解释:
typedef struct VirtIOBlock {
BlockBackend *blk;
//指向准备提交的VirtIOBlockReq
void *rq;
/*异步线程写入使用的结构,将写入的buff抽象出来,QEMUBH,使用aio_bh_new插入队列
*aio_bh_poll负责flush队列,每个buff都有自己的bh->cb,而virtio_blk_dma_restart_cb处理
*机制与此类似,将没有处理的数据重新执行一遍。*/
QEMUBH *bh;
VirtIOBlkConf conf;
VMChangeStateEntry *change;
bool dataplane_disabled;
bool dataplane_started;
struct VirtIOBlockDataPlane *dataplane;
} VirtIOBlock;
2.guest kernel中virtio modules处理
virtio pv的代码在virtio_blk.c文件,初始化init就三个函数:
virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
major = register_blkdev(0, "virtblk");
error = register_virtio_driver(&virtio_blk);
static struct virtio_driver virtio_blk = {
.id_table= id_table,
.probe= virtblk_probe,
.remove= virtblk_remove,
.config_changed= virtblk_config_changed,
};
重点在virtio_blk,当然driver->driver.bus = &virtio_bus需要留意,直接看virtio_blk,当kernel 探测到设备时调用virtblk_probe:
static int virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
/* priv 指向具体的virtio设备*/
vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
/*初始化virtqueue*/
err = init_vq(vblk);
/*设置virtio磁盘多队列*/
memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
/*mq执行对象函数*/
vblk->tag_set.ops = &virtio_mq_ops;
vblk->tag_set.nr_hw_queues = 1;
vblk->tag_set.queue_depth = virtblk_queue_depth;
vblk->tag_set.numa_node = NUMA_NO_NODE;
vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems;
vblk->tag_set.driver_data = vblk;
err = blk_mq_alloc_tag_set(&vblk->tag_set);
/*设置virtio blk磁盘队列*/
q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
vblk->disk->major = major;
vblk->disk->first_minor = index_to_minor(index);
vblk->disk->private_data = vblk;
vblk->disk->fops = &virtblk_fops;
vblk->disk->driverfs_dev = &vdev->dev;
vblk->index = index;
}
再看init_vq,到virtio_find_single_vq,vdev->config->find_vqs(vdev, 1, &vq, callbacks, names)在virtio_pci_config_ops对象中,即从vp_find_vqs到vp_try_to_find_vqs,注意此处传进来的callback是virtblk_done。在vp_try_to_find_vqs中:
for (i = 0; i < nvqs; ++i) {
vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
}
在setup_vq中
/*通过ioport告诉qemu PV开始设置*/
iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
/*检查qemu对virtio blk vq的初始化状况*/
num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
/*将gpa通知给qemu*/
iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
/*vq的notify是vp_notify*/
vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev, true, info->queue, vp_notify, callback, name);
在guest kernel block层提交,submit_bio,使用generic_make_request提交bio
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
q->make_request_fn(q, bio);
bio = bio_list_pop(current->bio_list);
} while (bio);
回头看virtblk_probe函数,vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set)
blk_mq_init_queue调用blk_queue_make_request给q->make_request_fn赋值为blk_mq_make_request,在blk_mq_make_request中有q->mq_ops->queue_rq(data.hctx, rq),即virtio_mq_ops.queue_rq=virtio_queue_rq。
在virtio_queue_rq中:
/*添加req到ring中*/
err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);
if (notify)
/*对qemu发出通知,bio commit已经完成*/
virtqueue_notify(vblk->vq);
virtqueue_notify直接调用vq->notify,即vp_notify:
iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY)
3.qemu对virtio vring的处理
在virtio_blk_device_realize中,最终调用virtio_add_queue_internal,vdev->vq[i].handle_output = handle_output。而guest的iowrite在qemu侧由virtio_pci_config_write处理,进入virtio_ioport_write,当写入的地址是VIRTIO_PCI_QUEUE_NOTIFY时执行virtio_queue_notify,然后virtio_queue_notify_vq,最终执行vq->handle_output即virtio_blk_handle_output。
在virtio_blk_handle_output中,blk_data_plane机制和普通的统一线程差别不是太大,不再单独表述,在virtio_blk_handle_vq中,virtio_blk_get_request通过virtqueue_pop获取req,其中gpa到hva的转换在virtqueue_map_desc函数完成,virtio_blk_handle_request拿到req后执行virtio_blk_submit_multireq,即使跳出这个函数,外面也是一个virtio_blk_submit_multireq。
virtio_blk_submit_multireq调用submit_requests:
if (is_write) {
blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,
virtio_blk_rw_complete, mrb->reqs[start]);
} else {
blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,
virtio_blk_rw_complete, mrb->reqs[start]);
以blk_aio_preadv为例:
在blk_aio_prwv中,另外创建协程(coroutine)执行blk_aio_read_entry,即blk_co_preadv,再到bdrv_co_preadv,最终到bdrv_aligned_preadv,然后在bdrv_driver_preadv有:
drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov)
此处就算完成了整个IO过程。
剩下的就是qcow2到raw的不同kvm img进行解析了,参考gdb bt栈
#0 0x0000555555b83b05 in raw_co_preadv (bs=0x5555570ee4f0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/raw-posix.c:1274
#1 0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570ee4f0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/io.c:815
#2 0x0000555555b8d441 in bdrv_aligned_preadv (bs=0x5555570ee4f0, req=0x55555b66d240, offset=1175007232, bytes=28672, align=1, qiov=0x55555b66d330, flags=0) at block/io.c:1039
#3 0x0000555555b8d92b in bdrv_co_preadv (child=0x55555708b4a0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/io.c:1131
#4 0x0000555555b54c08 in qcow2_co_preadv (bs=0x5555570e8250, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/qcow2.c:1509
#5 0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570e8250, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/io.c:815
#6 0x0000555555b8d441 in bdrv_aligned_preadv (bs=0x5555570e8250, req=0x55555b66d550, offset=1174548480, bytes=28672, align=1, qiov=0x55555b66d660, flags=0) at block/io.c:1039
#7 0x0000555555b8d92b in bdrv_co_preadv (child=0x5555570c61f0, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/io.c:1131
#8 0x0000555555b549cb in qcow2_co_preadv (bs=0x5555570b9c70, offset=1174548480, bytes=28672, qiov=0x555558153fb0, flags=0) at block/qcow2.c:1446
#9 0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570b9c70, offset=1174548480, bytes=28672, qiov=0x555558153fb0, flags=0) at block/io.c:815
对于qcow2如果有backing file,qcow2_co_preadv第一遍失败后会递归,再执行一次qcow2_co_preadv,然后解析成raw,使用raw_co_prw。
然后从raw_co_prw到paio_submit_co:
thread_pool_submit_co(pool, aio_worker, acb)
aio_worker调用handle_aiocb_rw,继续handle_aiocb_rw_linear
if (aiocb->aio_type & QEMU_AIO_WRITE) {
len = pwrite(aiocb->aio_fildes,
(const char *)buf + offset,
aiocb->aio_nbytes - offset,
aiocb->aio_offset + offset);
} else {
len = pread(aiocb->aio_fildes,
buf + offset,
aiocb->aio_nbytes - offset,
aiocb->aio_offset + offset);
}
至此到达整个IO路径的终点。
关于读写request的设定
在guest中:
virtio_queue_rq
num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
if (num) {
if (rq_data_dir(vbr->req) == WRITE)
vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
else
vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
}
写磁盘的head放在第一个out_iovc中,读磁盘的head放在最后一个in_iovc中.
vring满就会kick到qemu,一次pop全出。
4.guest区分读写request
1.virtio_queue_rq根据req->cmd_type区分,在vbr->out_hdr.type中添加VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, VIRTIO_BLK_T_SCSI_CMD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID,每一个req都有自己的目的,其中IN/OUT优先级最大,可以覆盖其他。根据vbr->out_hdr.type,读写类型让num_out和num_in继承,
通过
for (n = 0; n < out_sgs; n++) {
for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
prev = i;
i = vq->vring.desc[i].next;
}
}
for (; n < (out_sgs + in_sgs); n++) {
for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
prev = i;
i = vq->vring.desc[i].next;
}
}
通过sgs遍历,读写类型由desc[i].flags根据VRING_DESC_F_WRITE继承。
有具体数据的情况下,读写在req中是对立的,但是对于全写的req也会有些非数据的cmd读取。
所以在qemu virtio block中,有读写element,理论上virtqueue_pop中的读写不会都大于1,gdb测试结果也是如此
hw/virtio/virtio.c:615 if in_num>1 && out_num>1
KVM virtIO block源代码分析来自于OENHAN
链接为:https://oenhan.com/kvm-virtio-block-src/
大神有微信吗?或者群,求加入学习
@小亮 有疑问直接在文章下问即可
大神有微信吗?或者群,求加入学习
@小亮 有疑问直接在文章下问即可
内核的gdb调试栈 是用什么工具调试打印的?
内核的gdb调试栈 是用什么工具调试打印的?
请问,guestOS中的virtio虚拟总线是什么时候注册的?我知道是通过bus_unregister(&virtio_bus);这个函数注册。但是不太了解是在哪个流程里面注册的virtio_bus.谢谢。
请问,guestOS中的virtio虚拟总线是什么时候注册的?我知道是通过bus_unregister(&virtio_bus);这个函数注册。但是不太了解是在哪个流程里面注册的virtio_bus.谢谢。
上条评论有误:应该是通过bus_register(&virtio_bus)注册,而不是bus_unregister(&virtio_bus);。我看到代码里面直接通过dev->dev.bus = &virtio_bus将virtio设备注册到virtio总线上了。但是没有看到向注册virtio虚拟总线是在哪个流程里面。是不需要注册virtio总线吗?谢谢。
@小SSS 用gdb断一下virtio_pci_probe,看谁调用了它就知道了
上条评论有误:应该是通过bus_register(&virtio_bus)注册,而不是bus_unregister(&virtio_bus);。我看到代码里面直接通过dev->dev.bus = &virtio_bus将virtio设备注册到virtio总线上了。但是没有看到向注册virtio虚拟总线是在哪个流程里面。是不需要注册virtio总线吗?谢谢。
@小SSS 用gdb断一下virtio_pci_probe,看谁调用了它就知道了
我最近在研究virtio-blk,想搞明白guest中读写/dev/vda后,应该会跳到qemu的kvm_handle_io中吧,如果是这样的话,那kvm_handle_io如何关联到virtio-blk.c中的呢?
@ARTHUR.DAYNE 自己gdb断点一下virtio_blk_handle_output就很清楚了
我最近在研究virtio-blk,想搞明白guest中读写/dev/vda后,应该会跳到qemu的kvm_handle_io中吧,如果是这样的话,那kvm_handle_io如何关联到virtio-blk.c中的呢?
@ARTHUR.DAYNE 自己gdb断点一下virtio_blk_handle_output就很清楚了