首页 > Virtualization > KVM virtIO block源代码分析

KVM virtIO block源代码分析

Virtualization 2016-09-16

源代码:linux-3.16.37-git, qemu-v2.7

1.vm启动时qemu的代码

virtio block的qemu cmd:

x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu host -m 256 -smp 1 -drive file=~/vm/centos6-virtio.qcow2,if=none,id=drive-virtio-disk,format=qcow2 -device virtio-blk-pci,bus=pci.0,drive=drive-virtio-disk,id=virtio-disk

qemu中virtio blk代码所在的重点文件如下:

hw\virtio\virtio.c

hw\virtio\virtio-bus.c

hw\virtio\virtio-rng.c

hw\block\virtio-blk.c

hw\net\virtio-net.c

在type_initialize过程中,virtio的设备都会初始化一遍,有virtio_device_class_init,virtio_rng_class_init,virtio_bus_class_init,virtio_pci_bus_class_init,virtio_blk_class_init。

gdb抓取的信息如下:

#0  0x0000555555804ffc in virtio_device_class_init (klass=0x5555566f0090, data=0x0)

at /home/oenhan/workspace/src/qemu-v2.7.0/hw/virtio/virtio.c:1968

#1  0x0000555555b1e542 in type_initialize (ti=0x5555566c9060) at qom/object.c:328

#2  0x0000555555b1e2ba in type_initialize (ti=0x5555566c5b80) at qom/object.c:280

#3  0x0000555555b1f6f6 in object_class_foreach_tramp (key=0x5555566a9710, value=0x5555566c5b80, opaque=0x7fffffffd870) at qom/object.c:798

#4  0x00007ffff2bd43d0 in g_hash_table_foreach () at /lib64/libglib-2.0.so.0

#5  0x0000555555b1f7cd in object_class_foreach (fn=

0x555555b1f922 <object_class_get_list_tramp>, implements_type=0x555555c4816e "machine", include_abstract=false, opaque=0x7fffffffd8c0)

at qom/object.c:820

#6  0x0000555555b1f99d in object_class_get_list (implements_type=0x555555c4816e "machine", include_abstract=false) at qom/object.c:874

#7  0x00005555558bdf1c in find_default_machine () at vl.c:1470

#8  0x00005555558c20bf in select_machine () at vl.c:2732

#9  0x00005555558c2bc3 in main (argc=12, argv=0x7fffffffdd38, envp=0x7fffffffdda0) at vl.c:3986

对于virtio blk而言,重点看一下virtio_blk_device_realize函数,

virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config))

virtio_init初始化VirtIODevice结构体

struct VirtIODevice

{

size_t config_len;

void *config;

uint16_t config_vector;

//virtio 虚拟队列

VirtQueue *vq;

uint16_t device_id;

bool vm_running;

VMChangeStateEntry *vmstate;

char *bus_name;

QLIST_HEAD(, VirtQueue) *vector_queues;

};

void virtio_init(VirtIODevice *vdev, const char *name,  uint16_t device_id, size_t config_size)

{

//赋值为VIRTIO_ID_BLOCK,靠这个辨别设备类型

vdev->device_id = device_id;

//初始化vq结构体,最大支持VIRTIO_QUEUE_MAX个

vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_QUEUE_MAX);

vdev->name = name;

vdev->config_len = config_size;

vdev->config = g_malloc0(config_size);

//添加vmstat变化的回调函数virtio_vmstate_change

vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change, vdev);

}

回到virtio_blk_device_realize,virtio_add_queue_aio负责初始化vq结构体

for (i = 0; i < conf->num_queues; i++) {

virtio_add_queue_aio(vdev, 128, virtio_blk_handle_output);

}

vdev->vq[i].vring.num = queue_size;

vdev->vq[i].vring.num_default = queue_size;

vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;

vdev->vq[i].handle_output = handle_output;

vdev->vq[i].handle_aio_output = NULL;

vdev->vq[i].use_aio = use_aio;

此时需要注意到vq的handle_output设置为virtio_blk_handle_output。

virtio_blk_data_plane此刻先假定不支持,后续代码暂时不涉及。

//添加virtio blk的vmstat watch,当vm stat变化时,负责刷新所有的blk buff到img

//在virtio_blk_dma_restart_cb里面,使用qemu_bh_new添加virtio_blk_dma_restart_bh,

//而virtio_blk_dma_restart_bh函数,基本就是virtio blk提交的过程了,

s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);

//填充blk结构体

blk_set_dev_ops(s->blk, &virtio_block_ops, s);

blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);

blk_iostatus_enable(s->blk);

下面是VirtIOBlock的解释:

typedef struct VirtIOBlock {

BlockBackend *blk;

//指向准备提交的VirtIOBlockReq

void *rq;

/*异步线程写入使用的结构,将写入的buff抽象出来,QEMUBH,使用aio_bh_new插入队列

*aio_bh_poll负责flush队列,每个buff都有自己的bh->cb,而virtio_blk_dma_restart_cb处理

*机制与此类似,将没有处理的数据重新执行一遍。*/

QEMUBH *bh;

VirtIOBlkConf conf;

VMChangeStateEntry *change;

bool dataplane_disabled;

bool dataplane_started;

struct VirtIOBlockDataPlane *dataplane;

} VirtIOBlock;

2.guest kernel中virtio modules处理

virtio pv的代码在virtio_blk.c文件,初始化init就三个函数:

virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);

major = register_blkdev(0, "virtblk");

error = register_virtio_driver(&virtio_blk);

static struct virtio_driver virtio_blk = {

.id_table= id_table,

.probe= virtblk_probe,

.remove= virtblk_remove,

.config_changed= virtblk_config_changed,

};

重点在virtio_blk,当然driver->driver.bus = &virtio_bus需要留意,直接看virtio_blk,当kernel 探测到设备时调用virtblk_probe:

static int virtblk_probe(struct virtio_device *vdev)

{

struct virtio_blk *vblk;

/* priv 指向具体的virtio设备*/

vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);

/*初始化virtqueue*/

err = init_vq(vblk);

/*设置virtio磁盘多队列*/

memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));

/*mq执行对象函数*/

vblk->tag_set.ops = &virtio_mq_ops;

vblk->tag_set.nr_hw_queues = 1;

vblk->tag_set.queue_depth = virtblk_queue_depth;

vblk->tag_set.numa_node = NUMA_NO_NODE;

vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;

vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems;

vblk->tag_set.driver_data = vblk;

err = blk_mq_alloc_tag_set(&vblk->tag_set);

/*设置virtio blk磁盘队列*/

q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);

vblk->disk->major = major;

vblk->disk->first_minor = index_to_minor(index);

vblk->disk->private_data = vblk;

vblk->disk->fops = &virtblk_fops;

vblk->disk->driverfs_dev = &vdev->dev;

vblk->index = index;

}

再看init_vq,到virtio_find_single_vq,vdev->config->find_vqs(vdev, 1, &vq, callbacks, names)在virtio_pci_config_ops对象中,即从vp_find_vqs到vp_try_to_find_vqs,注意此处传进来的callback是virtblk_done。在vp_try_to_find_vqs中:

for (i = 0; i < nvqs; ++i) {

vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec);

}

在setup_vq中

/*通过ioport告诉qemu PV开始设置*/

iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);

/*检查qemu对virtio blk vq的初始化状况*/

num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);

info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);

/*将gpa通知给qemu*/

iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,

vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);

/*vq的notify是vp_notify*/

vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,

true, info->queue, vp_notify, callback, name);

在guest kernel block层提交,submit_bio,使用generic_make_request提交bio

do {

struct request_queue *q = bdev_get_queue(bio->bi_bdev);

q->make_request_fn(q, bio);

bio = bio_list_pop(current->bio_list);

} while (bio);

回头看virtblk_probe函数,vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set)

blk_mq_init_queue调用blk_queue_make_request给q->make_request_fn赋值为blk_mq_make_request,在blk_mq_make_request中有q->mq_ops->queue_rq(data.hctx, rq),即virtio_mq_ops.queue_rq=virtio_queue_rq。

在virtio_queue_rq中:

/*添加req到ring中*/

err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);

if (notify)

/*对qemu发出通知,bio commit已经完成*/

virtqueue_notify(vblk->vq);

virtqueue_notify直接调用vq->notify,即vp_notify

iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY)

3.qemu对virtio vring的处理

在virtio_blk_device_realize中,最终调用virtio_add_queue_internal,vdev->vq[i].handle_output = handle_output。而guest的iowrite在qemu中的实现是virtio_pci_config_write,在virtio_ioport_write中,当满足VIRTIO_PCI_QUEUE_NOTIFY后执行virtio_queue_notify,然后virtio_queue_notify_vq,最终执行vq->handle_output即virtio_blk_handle_output。

在virtio_blk_handle_output中,blk_data_plane机制和普通的统一线程差别不是太大,不再单独表述,在virtio_blk_handle_vq中,virtio_blk_get_request通过virtqueue_pop获取req,其中gpa到hva的转换在virtqueue_map_desc函数完成,virtio_blk_handle_request拿到req后执行virtio_blk_submit_multireq,即使跳出这个函数外面也是一个virtio_blk_submit_multireq。

virtio_blk_submit_multireq调用submit_requests:

if (is_write) {

blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,

virtio_blk_rw_complete, mrb->reqs[start]);

} else {

blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,

virtio_blk_rw_complete, mrb->reqs[start]);

以blk_aio_preadv为例:

在blk_aio_prwv中,另外创建协程(coroutine)执行blk_aio_read_entry,即blk_co_preadv,又bdrv_co_preadv,最终到bdrv_aligned_preadv,然后在bdrv_driver_preadv有:

drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov)

此处就算完成了整个IO过程

剩下的就是qcow2到raw的不同kvm img进行解析了,参考gdb bt栈

#0  0x0000555555b83b05 in raw_co_preadv (bs=0x5555570ee4f0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/raw-posix.c:1274

#1  0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570ee4f0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/io.c:815

#2  0x0000555555b8d441 in bdrv_aligned_preadv (bs=0x5555570ee4f0, req=0x55555b66d240, offset=1175007232, bytes=28672, align=1, qiov=0x55555b66d330, flags=0) at block/io.c:1039

#3  0x0000555555b8d92b in bdrv_co_preadv (child=0x55555708b4a0, offset=1175007232, bytes=28672, qiov=0x55555b66d330, flags=0) at block/io.c:1131

#4  0x0000555555b54c08 in qcow2_co_preadv (bs=0x5555570e8250, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/qcow2.c:1509

#5  0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570e8250, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/io.c:815

#6  0x0000555555b8d441 in bdrv_aligned_preadv (bs=0x5555570e8250, req=0x55555b66d550, offset=1174548480, bytes=28672, align=1, qiov=0x55555b66d660, flags=0) at block/io.c:1039

#7  0x0000555555b8d92b in bdrv_co_preadv (child=0x5555570c61f0, offset=1174548480, bytes=28672, qiov=0x55555b66d660, flags=0) at block/io.c:1131

#8  0x0000555555b549cb in qcow2_co_preadv (bs=0x5555570b9c70, offset=1174548480, bytes=28672, qiov=0x555558153fb0, flags=0) at block/qcow2.c:1446

#9  0x0000555555b8cb0b in bdrv_driver_preadv (bs=0x5555570b9c70, offset=1174548480, bytes=28672, qiov=0x555558153fb0, flags=0) at block/io.c:815

对于qcow2如果有backing file,qcow2_co_preadv第一遍失败,大递归,再执行qcow2_co_preadv,然后解析成raw,使用raw_co_prw。

然后从raw_co_prw到paio_submit_co:

thread_pool_submit_co(pool, aio_worker, acb)

aio_worker调用handle_aiocb_rw,继续handle_aiocb_rw_linear

if (aiocb->aio_type & QEMU_AIO_WRITE) {

len = pwrite(aiocb->aio_fildes,

(const char *)buf + offset,

aiocb->aio_nbytes - offset,

aiocb->aio_offset + offset);

} else {

len = pread(aiocb->aio_fildes,

buf + offset,

aiocb->aio_nbytes - offset,

aiocb->aio_offset + offset);

}

最终完成终点。

关于读写request的设定

在guest中:

virtio_queue_rq

num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);

if (num) {

if (rq_data_dir(vbr->req) == WRITE)

vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);

else

vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);

}

写磁盘的head放在第一个out_iovc中,读磁盘的head放在最后一个in_iovc中.

vring满就会kick到qemu,一次pop全出。

4.guest区分读写request

1.virtio_queue_rq根据req->cmd_type区分,在vbr->out_hdr.type中添加VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, VIRTIO_BLK_T_SCSI_CMD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID,每一个req都有自己的目的,其中IN/OUT优先级最大,可以覆盖其他。根据vbr->out_hdr.type,读写类型让num_out和num_in继承,

通过

for (n = 0; n < out_sgs; n++) {

for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {

vq->vring.desc[i].flags = VRING_DESC_F_NEXT;

vq->vring.desc[i].addr = sg_phys(sg);

vq->vring.desc[i].len = sg->length;

prev = i;

i = vq->vring.desc[i].next;

}

}
for (; n < (out_sgs + in_sgs); n++) {

for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {

vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;

vq->vring.desc[i].addr = sg_phys(sg);

vq->vring.desc[i].len = sg->length;

prev = i;

i = vq->vring.desc[i].next;

}

}

通过sgs遍历,读写类型由desc[i].flags根据VRING_DESC_F_WRITE继承。

有具体数据的情况下,读写在req中是对立的,但是对于全写的req也会有些非数据的cmd读取。

所以在qemu virtio block中,有读写element,理论上virtqueue_pop中的读写不会都大于1,gdb测试结果也是如此

b hw/virtio/virtio.c:615 if in_num>1 && out_num>1

KVM virtIO block源代码分析来自于OenHan,链接为:http://oenhan.com/kvm-virtio-block-src
更多阅读