KVM virtIO block源代码分析
源代码:linux-3.16.37-git, qemu-v2.7
1.vm启动时qemu的代码
virtio block的qemu cmd:
x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu host -m 256 -smp 1 -drive file=~/vm/centos6-virtio.qcow2,if=none,id=drive-virtio-disk,format=qcow2 -device virtio-blk-pci,bus=pci.0,drive=drive-virtio-disk,id=virtio-disk
qemu中virtio blk代码所在的重点文件如下:
hw\virtio\virtio.c
hw\virtio\virtio-bus.c
hw\virtio\virtio-rng.c
hw\block\virtio-blk.c
hw\net\virtio-net.c
在type_initialize过程中,virtio的设备都会初始化一遍,有virtio_device_class_init,virtio_rng_class_init,virtio_bus_class_init,virtio_pci_bus_class_init,virtio_blk_class_init。
gdb抓取的信息如下:
#0 0x0000555555804ffc in virtio_device_class_init (klass=0x5555566f0090, data=0x0)
at /home/oenhan/workspace/src/qemu-v2.7.0/hw/virtio/virtio.c:1968
#1 0x0000555555b1e542 in type_initialize (ti=0x5555566c9060) at qom/object.c:328
#2 0x0000555555b1e2ba in type_initialize (ti=0x5555566c5b80) at qom/object.c:280
#3 0x0000555555b1f6f6 in object_class_foreach_tramp (key=0x5555566a9710, value=0x5555566c5b80, opaque=0x7fffffffd870) at qom/object.c:798
#4 0x00007ffff2bd43d0 in g_hash_table_foreach () at /lib64/libglib-2.0.so.0
#5 0x0000555555b1f7cd in object_class_foreach (fn=
0x555555b1f922 <object_class_get_list_tramp>, implements_type=0x555555c4816e "machine", include_abstract=false, opaque=0x7fffffffd8c0)
at qom/object.c:820
#6 0x0000555555b1f99d in object_class_get_list (implements_type=0x555555c4816e "machine", include_abstract=false) at qom/object.c:874
#7 0x00005555558bdf1c in find_default_machine () at vl.c:1470
#8 0x00005555558c20bf in select_machine () at vl.c:2732
#9 0x00005555558c2bc3 in main (argc=12, argv=0x7fffffffdd38, envp=0x7fffffffdda0) at vl.c:3986
对于virtio blk而言,重点看一下virtio_blk_device_realize函数,
virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config)) //virtio_init初始化VirtIODevice结构体 struct VirtIODevice { size_t config_len; void *config; uint16_t config_vector; //virtio 虚拟队列 VirtQueue *vq; uint16_t device_id; bool vm_running; VMChangeStateEntry *vmstate; char *bus_name; QLIST_HEAD(, VirtQueue) *vector_queues; }; void virtio_init(VirtIODevice *vdev, const char *name, uint16_t device_id, size_t config_size) { //赋值为VIRTIO_ID_BLOCK,靠这个辨别设备类型 vdev->device_id = device_id; //初始化vq结构体,最大支持VIRTIO_QUEUE_MAX个 vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_QUEUE_MAX); vdev->name = name; vdev->config_len = config_size; vdev->config = g_malloc0(config_size); //添加vmstat变化的回调函数virtio_vmstate_change vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change, vdev); }
回到virtio_blk_device_realize,virtio_add_queue_aio负责初始化vq结构体
for (i = 0; i < conf->num_queues; i++) { virtio_add_queue_aio(vdev, 128, virtio_blk_handle_output); } vdev->vq[i].vring.num = queue_size; vdev->vq[i].vring.num_default = queue_size; vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN; vdev->vq[i].handle_output = handle_output; vdev->vq[i].handle_aio_output = NULL; vdev->vq[i].use_aio = use_aio;
此时需要注意到vq的handle_output设置为virtio_blk_handle_output。
virtio_blk_data_plane此刻先假定不支持,后续代码暂时不涉及。
//添加virtio blk的vmstat watch,当vm stat变化时,负责刷新所有的blk buff到img //在virtio_blk_dma_restart_cb里面,使用qemu_bh_new添加virtio_blk_dma_restart_bh, //而virtio_blk_dma_restart_bh函数,基本就是virtio blk提交的过程了, s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s); //填充blk结构体 blk_set_dev_ops(s->blk, &virtio_block_ops, s); blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size); blk_iostatus_enable(s->blk);
下面是VirtIOBlock的解释:
typedef struct VirtIOBlock { BlockBackend *blk; //指向准备提交的VirtIOBlockReq void *rq; /*异步线程写入使用的结构,将写入的buff抽象出来,QEMUBH,使用aio_bh_new插入队列 *aio_bh_poll负责flush队列,每个buff都有自己的bh->cb,而virtio_blk_dma_restart_cb处理 *机制与此类似,将没有处理的数据重新执行一遍。*/QEMUBH *bh; VirtIOBlkConf conf; VMChangeStateEntry *change; bool dataplane_disabled; bool dataplane_started; struct VirtIOBlockDataPlane *dataplane; } VirtIOBlock;
2.guest kernel中virtio modules处理
virtio pv的代码在virtio_blk.c文件,初始化init就三个函数:
virtblk_wq = alloc_workqueue("virtio-blk", 0, 0); major = register_blkdev(0, "virtblk"); error = register_virtio_driver(&virtio_blk); static struct virtio_driver virtio_blk = { .id_table= id_table, .probe= virtblk_probe, .remove= virtblk_remove, .config_changed= virtblk_config_changed, };
重点在virtio_blk,当然driver->driver.bus = &virtio_bus需要留意,直接看virtio_blk,当kernel 探测到设备时调用virtblk_probe:
static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; /* priv 指向具体的virtio设备*/vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); /*初始化virtqueue*/err = init_vq(vblk); /*设置virtio磁盘多队列*/memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); /*mq执行对象函数*/vblk->tag_set.ops = &virtio_mq_ops; vblk->tag_set.nr_hw_queues = 1; vblk->tag_set.queue_depth = virtblk_queue_depth; vblk->tag_set.numa_node = NUMA_NO_NODE; vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems; vblk->tag_set.driver_data = vblk; err = blk_mq_alloc_tag_set(&vblk->tag_set); /*设置virtio blk磁盘队列*/q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set); vblk->disk->major = major; vblk->disk->first_minor = index_to_minor(index); vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; vblk->disk->driverfs_dev = &vdev->dev; vblk->index = index; }
再看init_vq,到virtio_find_single_vq,vdev->config->find_vqs(vdev, 1, &vq, callbacks, names)在virtio_pci_config_ops对象中,即从vp_find_vqs到vp_try_to_find_vqs,注意此处传进来的callback是virtblk_done。在vp_try_to_find_vqs中:
for (i = 0; i < nvqs; ++i) { vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec); }
在setup_vq中
/*通过ioport告诉qemu PV开始设置*/iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); /*检查qemu对virtio blk vq的初始化状况*/num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM); info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); /*将gpa通知给qemu*/iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); /*vq的notify是vp_notify*/vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev, true, info->queue, vp_notify, callback, name);
在guest kernel block层提交,submit_bio,使用generic_make_request提交bio
do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); q->make_request_fn(q, bio); bio = bio_list_pop(current->bio_list); } while (bio);
回头看virtblk_probe函数,vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set)
blk_mq_init_queue调用blk_queue_make_request给q->make_request_fn赋值为blk_mq_make_request,在blk_mq_make_request中有q->mq_ops->queue_rq(data.hctx, rq),即virtio_mq_ops.queue_rq=virtio_queue_rq。
在virtio_queue_rq中:
/*添加req到ring中*/err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); if (notify) /*对qemu发出通知,bio commit已经完成*/virtqueue_notify(vblk->vq); virtqueue_notify直接调用vq->notify,即vp_notify iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY)
3.qemu对virtio vring的处理
在virtio_blk_device_realize中,最终调用virtio_add_queue_internal,vdev->vq[i].handle_output = handle_output。而当iowrite到qemu实现是virtio_pci_config_write,在virtio_ioport_write中,当满足VIRTIO_PCI_QUEUE_NOTIFY后执行virtio_queue_notify,然后virtio_queue_notify_vq,最终执行vq->handle_output即virtio_blk_handle_output。
在virtio_blk_handle_output中,blk_data_plane机制和普通的统一线程差别不是太大,不再单独表述,在virtio_blk_handle_vq中,virtio_blk_get_request通过virtqueue_pop获取req,其中gpa到hva的转换在virtqueue_map_desc函数完成,virtio_blk_handle_request拿到req后执行virtio_blk_submit_multireq,即使跳出这个函数,外面也还是一个virtio_blk_submit_multireq。
virtio_blk_submit_multireq调用submit_requests:
if (is_write) { blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0, virtio_blk_rw_complete, mrb->reqs[start]); } else { blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0, virtio_blk_rw_complete, mrb->reqs[start]);
以blk_aio_preadv为例:
在blk_aio_prwv中,另外创建线程执行blk_aio_read_entry,即blk_co_preadv,又bdrv_co_preadv,最终到bdrv_aligned_preadv,然后在bdrv_driver_preadv有:
drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov)
此处就算完成了整个IO过程。
剩下的就是针对qcow2、raw等不同的kvm img格式分别进行解析了,参考gdb bt栈
#0 0x0000555555b83b05 in raw_co_preadv (bs=0x5555570ee4f0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/raw-posix.c:1274
#1 0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570ee4f0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/io.c:815
#2 0x0000555555b8d441 in bdrv_aligned_preadv (bs=0x5555570ee4f0, req=0x55555b66d240, offset=1175007232, bytes=28672, align=1, qiov=0x55555b66d330, flags=0) at block/io.c:1039
#3 0x0000555555b8d92b in bdrv_co_preadv (child=0x55555708b4a0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/io.c:1131
#4 0x0000555555b54c08 in qcow2_co_preadv (bs=0x5555570e8250, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/qcow2.c:1509
#5 0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570e8250, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/io.c:815
#6 0x0000555555b8d441 in bdrv_aligned_preadv (bs=0x5555570e8250, req=0x55555b66d550, offset=1174548480, bytes=28672, align=1, qiov=0x55555b66d660, flags=0) at block/io.c:1039
#7 0x0000555555b8d92b in bdrv_co_preadv (child=0x5555570c61f0, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/io.c:1131
#8 0x0000555555b549cb in qcow2_co_preadv (bs=0x5555570b9c70, offset=1174548480, bytes=28672, qiov=0x555558153fb0, flags=0) at block/qcow2.c:1446
#9 0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570b9c70, offset=1174548480, bytes=28672, qiov=0x555558153fb0, flags=0) at block/io.c:815
对于qcow2如果有backing file,qcow2_co_preadv第一遍失败,大递归,再执行qcow2_co_preadv,然后解析成raw,使用raw_co_prw。
然后从raw_co_prw到paio_submit_co:
thread_pool_submit_co(pool, aio_worker, acb)
aio_worker调用handle_aiocb_rw,继续handle_aiocb_rw_linear
if (aiocb->aio_type & QEMU_AIO_WRITE) { len = pwrite(aiocb->aio_fildes, (const char *)buf + offset, aiocb->aio_nbytes - offset, aiocb->aio_offset + offset); } else { len = pread(aiocb->aio_fildes, buf + offset, aiocb->aio_nbytes - offset, aiocb->aio_offset + offset); }
最终到达终点,整个IO过程完成。
关于读写request的设定
在guest中:
virtio_queue_rq num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); if (num) { if (rq_data_dir(vbr->req) == WRITE) vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); else vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN); }
写磁盘的head放在第一个out_iovc中,读磁盘的head放在最后一个in_iovc中.
vring满了就会kick到qemu,一次性全部pop出来。
4.guest区分读写request
1.virtio_queue_rq根据req->cmd_type区分,在vbr->out_hdr.type中添加VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, VIRTIO_BLK_T_SCSI_CMD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID,每一个req都有自己的目的,其中IN/OUT优先级最大,可以覆盖其他。根据vbr->out_hdr.type,读写类型由num_out和num_in继承。
通过
for (n = 0; n < out_sgs; n++) { for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { vq->vring.desc[i].flags = VRING_DESC_F_NEXT; vq->vring.desc[i].addr = sg_phys(sg); vq->vring.desc[i].len = sg->length; prev = i; i = vq->vring.desc[i].next; } } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; vq->vring.desc[i].addr = sg_phys(sg); vq->vring.desc[i].len = sg->length; prev = i; i = vq->vring.desc[i].next; } }
通过sgs遍历,读写类型由desc[i].flags根据VRING_DESC_F_WRITE继承。
有具体数据的情况下,读写在req中是对立的,但是对于全写的req也会有些非数据的cmd读取。
所以在qemu virtio block中,有读写element,理论上virtqueue_pop中的读写不会都大于1,gdb测试结果也是如此
hw/virtio/virtio.c:615 if in_num>1 && out_num>1
KVM virtIO block源代码分析来自于OenHan
链接为:https://oenhan.com/kvm-virtio-block-src
大神有微信吗?或者群,求加入学习
@小亮 有疑问直接在文章下问即可
内核的gdb调试栈 是用什么工具调试打印的?
请问,guestOS中的virtio虚拟总线是什么时候注册的?我知道是通过bus_unregister(&virtio_bus);这个函数注册。但是不太了解是在哪个流程里面注册的virtio_bus.谢谢。
上条评论有误:应该是通过bus_register(&virtio_bus)注册,而不是bus_unregister(&virtio_bus);。我看到代码里面直接通过dev->dev.bus = &virtio_bus将virtio设备注册到virtio总线上了。但是没有看到向注册virtio虚拟总线是在哪个流程里面。是不需要注册virtio总线吗?谢谢。
@小SSS 用gdb断一下virtio_pci_probe,看谁调用了它就知道了
我最近在研究virtio-blk,想搞明白guest中读写/dev/vda后,应该会跳到qemu的kvm_handle_io中吧,如果是这样的话,那kvm_handle_io如何关联到virtio-blk.c中的呢?
@ARTHUR.DAYNE 自己gdb断点一下virtio_blk_handle_output就很清楚了