virtIO vring工作机制分析
http://git.qemu.org/git/qemu.git v2.8.0
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git 4.4.70
1.VRing的初始化
QEMU下的VRing
typedef struct VRing {//vring中最多有多少个request unsigned int num; unsigned int num_default; //数据对齐比例尺 unsigned int align; //VRingDesc结构存储对应的gpa hwaddr desc; hwaddr avail; hwaddr used; } VRing;
VirtQueue在QEMU端的初始化,即virtio_add_queue, virtio最多有1024个虚拟队列,且每个队列最多容纳1024个request单位
if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) abort(); vdev->vq[i].vring.num = queue_size; vdev->vq[i].vring.num_default = queue_size; vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN; vdev->vq[i].handle_output = handle_output; vdev->vq[i].handle_aio_output = NULL;
QEMU初始化了vring.num和handle_output.
回到guest中的virtio驱动, vm_setup_vq负责建立和qemu对应的virtqueue,
info->num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
上条代码帮助guest获取了vring.num, vring的内存从guest memory中申请:
info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
vring_new_virtqueue初始化了guest的结构体vring_virtqueue
struct vring_virtqueue { struct virtqueue vq; /* Actual memory layout for this queue */ struct vring vring; /* Can we use weak barriers? */ bool weak_barriers; /* Other side has made a mess, don't try any more. */ bool broken; /* Host supports indirect buffers */ bool indirect; /* Host publishes avail event idx */ bool event; /* Head of free buffer list. */ unsigned int free_head; /* Number we've added since last sync. */ unsigned int num_added; /* Last used index we've seen. */ u16 last_used_idx; /* Last written value to avail->flags */ u16 avail_flags_shadow; /* Last written value to avail->idx in guest byte order */ u16 avail_idx_shadow; /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); /* Tokens for callbacks. 比如virtblk_req */ void *data[]; }; size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); //guest分配内存给vring info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
其中vring_size可以看出vring的组成形式
static inline unsigned vring_size(unsigned int num, unsigned long align) {return ((sizeof(struct vring_desc) * num + sizeof(__virtio16) * (3 + num) + align - 1) & ~(align - 1)) + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num;} //每一个request在vring中的表现形式,和qemu下结构体对应 struct vring_desc { /* Address (guest-physical). */ __virtio64 addr; /* Length. */ __virtio32 len; /* The flags as indicated above. */ __virtio16 flags; /* We chain unused descriptors via this, too */ __virtio16 next; };
//再次通知QEMU queue中的request数目 writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); //将guest vring的gpa传递给qemu writel(virt_to_phys(info->queue) >> PAGE_SHIFT, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
再次回到QEMU,在virtio_mmio_write下,
case VIRTIO_MMIO_QUEUEPFN: if (value == 0) virtio_reset(vdev); else virtio_queue_set_addr(vdev, vdev->queue_sel,value << proxy->guest_page_shift);
在virtio_queue_set_addr中, vdev->vq[n].vring.desc = addr,看到vring.desc是vring的desc部分的头指针也即是vring的gpa头指针.
在virtio_queue_update_rings中则分别更新了vring->avail和vring->used的gpa指针
vring->avail = vring->desc + vring->num * sizeof(VRingDesc); vring->used = vring_align(vring->avail + offsetof(VRingAvail, ring[vring->num]), vring->align);
vring的布局如上,此刻基本的初始化已经完成.
2.Guest对vring的写
guest添加request到vring上的common函数是virtqueue_add, 前后的START_USE(vq)和END_USE(vq)是debug用的并发使用检测(通过in_use标志实现, 并非真正的引用计数), 即同一个vq不可同时被两个调用者操作.
vq->indirect是由VIRTIO_RING_F_INDIRECT_DESC决定的, 是指是否可以在guest memory中另外申请一块内存来存放request的desc表(间接描述符表), 然后将这块内存的指针传递给vring中的desc单元:
//当前desc还有下一级索引,由VRING_DESC_F_INDIRECT决定 vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT); vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc)); vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
如果vring空闲的desc不满足申请的数目, flush virtqueue之后报错.
if (vq->vq.num_free < descs_used) if (out_sgs) vq->notify(&vq->vq); return -ENOSPC;
将sgs中的数据copy到desc空间中
vq->vq.num_free -= descs_used; for (n = 0; n < out_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg)); desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); prev = i; i = virtio16_to_cpu(_vq->vdev, desc[i].next); } } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg)); desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); prev = i; i = virtio16_to_cpu(_vq->vdev, desc[i].next); } }
desc读写的流向是由desc[i].flags & VRING_DESC_F_WRITE决定的
desc是否结束是通过desc[i].flags & VRING_DESC_F_NEXT决定的, 有值则继续.
//最后一个desc进行处理 desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
本次request的desc链头index(head)会被写入vring_avail的ring数组中, 随后更新avail->idx:
avail = vq->avail_idx_shadow & (vq->vring.num - 1); vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); vq->avail_idx_shadow++; vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); struct vring_avail { __virtio16 flags; //index是根据avail_idx_shadow顺序增长的,那么desc[ring[idx-1]]的值就是上一次操作vring单元 //或者表示virtio前端更新了新的request后,就会更新avail结构,对应到QEMU的值是shadow_avail_idx __virtio16 idx; __virtio16 ring[]; };
当num_added累计到65535(即(1 << 16) - 1, 16位的index即将回绕)时, 主动kick一次, 通知QEMU处理vring内容
if (unlikely(vq->num_added == (1 << 16) - 1)) virtqueue_kick(_vq);
前面提到的返回-ENOSPC的路径, 在out_sgs非0时也会直接调用vq->notify通知后端(相当于kick中的notify部分).
在virtqueue_kick下的virtqueue_kick_prepare中,
//old是上次的kick时avail ring元素的index位置 old = vq->avail_idx_shadow - vq->num_added; //new是当前的avail ring元素的index位置 new = vq->avail_idx_shadow; //积攒的num_added清零 vq->num_added = 0;
当前环境下vq->event为1(其取值取决于是否协商了VIRTIO_RING_F_EVENT_IDX特性), 即支持virtio速度控制VIRTIO_RING_F_EVENT_IDX功能
needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)), new, old);
先看 vring_avail_event(&vq->vring)
#define vring_avail_event(vr) (*(__virtio16 *)&(vr)->used->ring[(vr)->num])
直接读取了used ring数组之后紧跟的那个单元(ring[num], 即avail_event字段)的值, 而在QEMU内, 这个单元填充的是当前正在处理的last_avail_idx
if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { vring_set_avail_event(vq, vq->last_avail_idx); } static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val) { hwaddr pa = vq->vring.used + offsetof(VRingUsed, ring[vq->vring.num]); virtio_stw_phys(vq->vdev, pa, val); }
vring_avail_event返回的即是virtio后端正在pop的last_avail_idx
int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old) return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
上图满足条件为真,表示virtio后端已经处理完上一次kick提交的request, 处理速度还可以,此刻前端不需要等待直接kick即可.
参考刘峰同学:http://blog.csdn.net/leoufung/article/details/53584970
3. QEMU对vring的出栈
回到QEMU
struct VirtQueue { VRing vring; /* Next head to pop, 是avail vring的开头*/ uint16_t last_avail_idx; /* Last avail_idx read from VQ. 是avail vring的末尾,*/ /* 如果和last_avail_idx相等,则说明前端没有提交任何request或前端队列为空 */ uint16_t shadow_avail_idx; uint16_t used_idx; /* Last used index value we have signalled on */ uint16_t signalled_used; /* Last used index value we have signalled on */ bool signalled_used_valid; /* Notification enabled? */ bool notification; uint16_t queue_index; //当前vring中正在被使用单元个数 int inuse; uint16_t vector; };
virtqueue_pop中, 先判断vq->inuse >= vq->vring.num(若成立, 说明正在使用的单元数已超出队列大小, 属于异常情况), 再从avail ring里面获取desc的index
virtqueue_get_head(vq, vq->last_avail_idx++, &head)
如果支持VIRTIO_RING_F_EVENT_IDX,则将last_avail_idx保存到VRingUsed末尾,上节已经提到.
vring_desc_read读取head对应的desc并解析, 并处理是否是间接索引VRING_DESC_F_INDIRECT.
virtqueue_map_desc将desc.addr和desc.len映射给VirtQueueElement, 此处有elem->index = head, elem->index是取自vring的desc的index. 然后vq->inuse++;
当virtio device完成具体任务时,virtqueue_push会被调用,
virtqueue_fill(vq, elem, len, 0); virtqueue_flush(vq, 1);
在virtqueue_fill下,virtqueue_unmap_sg解除了virtqueue_map_desc做的映射关系, 将elem->index和len填写到VRingUsedElem, 最终使用vring_used_write写入到VRingUsed.
typedef struct VRingUsed { uint16_t flags; //对应QEMU下的used_idx uint16_t idx; VRingUsedElem ring[0]; } VRingUsed;
然后回到virtqueue_flush,更新了VRingUsed idx:
old = vq->used_idx; new = old + count; vring_used_idx_set(vq, new); vq->inuse -= count;
最终virtio_notify发送一个虚拟中断给guest进行通知.
4. guest 对virtio 中断的处理
guest kernel在vp_try_to_find_vqs中选择调用vp_request_intx或vp_request_msix_vectors, 在vp_request_intx中会使用request_irq注册中断,中断处理函数就是vp_interrupt.
从vp_interrupt->vp_vring_interrupt->vring_interrupt一层层调用,最终执行vq->vq.callback(&vq->vq).
而vq->vq.callback是在vring_new_virtqueue中初始化的:vq->vq.callback = callback.
从vring_new_virtqueue<-setup_vq<-vp_setup_vq<-vp_try_to_find_vqs<-vp_find_vqs<-vp_modern_find_vqs<-virtio_config_ops.find_vqs依次往上走
就会看到调用find_vqs的函数下callbacks对应的有virtblk_done, balloon_ack, control_intr, 或者virtscsi_req_done.
virtIO vring工作机制分析来自于OenHan
链接为:https://oenhan.com/virtio-vring