1. Inflight机制存在的原因

QEMU/KVM/SPDK/DPDK这种架构下,vhost相关进程可能因为各种异常而crash。在DPDK这种网络架构下,最多是产生丢包,一旦vhost进程拉起重连后,DPDK网络架构仍可以正常工作;而对于SPDK则不可行,因为存储系统中丢失的任一块请求都可能承载着文件系统的元数据,进而导致文件系统异常。对于用户而言,QEMU VM正常运行,而SPDK vhost crash后再恢复,文件系统出现异常尚可接受,用户数据发生无感知丢失则不可忍受。QEMU/SPDK的inflight机制就是用于解决这个问题的。

2. 代码版本

QEMU代码:https://github.com/qemu/qemu.git v5.1.0

SPDK代码:https://github.com/spdk/spdk v20.07.x

3. 核心原理

SPDK创建一个本地文件,作为位图标记vhost正在处理的virtio io reqs,如果vhost异常crash后拉起,则读取这个本地文件处理未完成的io reqs。

4. 创建INFLIGHT文件

4.1. QEMU INFLIGHT API

VhostUserRequest有两个API负责处理INFLIGHT_FD

typedef enum VhostUserRequest {
    VHOST_USER_GET_INFLIGHT_FD = 31,
    VHOST_USER_SET_INFLIGHT_FD = 32,
}

在QEMU侧,VHostUserBlk结构体下有vhost_inflight,SPDK本地文件就挂在vhost_inflight.fd下

struct vhost_inflight {
    //本地文件fd
    int fd;
    //fd mmap addr
    void *addr;
    uint64_t size;
    uint64_t offset;
    uint16_t queue_size;
};
typedef struct VHostUserBlk {
    struct vhost_inflight *inflight;
} VHostUserBlk;

vhost_inflight在vhost_user_blk_start函数下进行初始化操作

    if (!s->inflight->addr) {
        ret = vhost_dev_get_inflight(&s->dev, s->queue_size, s->inflight);
        if (ret < 0) {
            error_report("Error get inflight: %d", -ret);
            goto err_guest_notifiers;
        }
    }

    ret = vhost_dev_set_inflight(&s->dev, s->inflight);
    if (ret < 0) {
        error_report("Error set inflight: %d", -ret);
        goto err_guest_notifiers;
    }

即如果当前vhost blk的inflight->addr为NULL,就通过vhost_dev_get_inflight通知SPDK建立文件,并将相关信息传递给QEMU。如果inflight->addr不为NULL,则说明当前的SPDK是crash后再拉起的,需要将之前的本地文件fd传递给SPDK,让其处理未完成的IO reqs。

以上就可以知道,inflight文件是SPDK创建,用fd的方式存储在QEMU的,当新vhost拉起时,再将fd信息传递给vhost。

具体再说vhost_dev_get_inflight和vhost_dev_set_inflight,在vhost_user_get_inflight_fd中

    VhostUserMsg msg = {
        .hdr.request = VHOST_USER_GET_INFLIGHT_FD,
        .hdr.flags = VHOST_USER_VERSION,
        .payload.inflight.num_queues = dev->nvqs,
        .payload.inflight.queue_size = queue_size,
        .hdr.size = sizeof(msg.payload.inflight),
    };
    vhost_user_write(dev, &msg, NULL, 0);
    addr = mmap(0, msg.payload.inflight.mmap_size, PROT_READ | PROT_WRITE,
                MAP_SHARED, fd, msg.payload.inflight.mmap_offset);
    //QEMU迁移的时候使用
    inflight->addr = addr;
    inflight->fd = fd;
    //QEMU迁移的时候使用
    inflight->size = msg.payload.inflight.mmap_size;
    //上面QEMU mmap使用
    inflight->offset = msg.payload.inflight.mmap_offset;
    //确定pervq_inflight_size大小
    inflight->queue_size = queue_size;

一般QEMU只需要保存inflight->fd即可,但涉及到QEMU迁移等一系列问题,故vhost_inflight的其他成员也各有作用,如上面代码注释所写。
vhost_dev_set_inflight的大致代码和vhost_dev_get_inflight类似,不再赘述。

4.2. SPDK inflight 文件创建

在vhost_message_handlers有对应API的回调函数

static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
    [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
    [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
}

QEMU的vhost_dev_get_inflight和vhost_dev_set_inflight通过vhost_user_write发出消息后,SPDK侧就会进入这两个回调函数,这里单说vhost_user_get_inflight_fd

static int vhost_user_get_inflight_fd(struct virtio_net **pdev,
               VhostUserMsg *msg,int main_fd __rte_unused)
{
//专门处理vring VIRTIO_F_RING_PACKED情况,和非packed机制无差别
//后面只提非VIRTIO_F_RING_PACKED的情况
    if (vq_is_packed(dev))
        pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
    else
        pervq_inflight_size = get_pervq_shm_size_split(queue_size);
//mmap_size就是inflight文件大小,每个vq需要的size*queue_num
    mmap_size = num_queues * pervq_inflight_size;
//inflight文件在inflight_mem_alloc中创建,即"/tmp/memfd-XXXXXX"
//通过open后unlink的方式打开,只能通过ll /proc/`pidof qemu or vhost`/fd/看到文件句柄
    addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
    if (!addr) {
        VHOST_LOG_CONFIG(ERR,
            "failed to alloc vhost inflight area\n");
            msg->payload.inflight.mmap_size = 0;
        return RTE_VHOST_MSG_RESULT_ERR;
    }
    memset(addr, 0, mmap_size);

    dev->inflight_info->addr = addr;
    dev->inflight_info->size = msg->payload.inflight.mmap_size = mmap_size;
    //将fd传递给QEMU
    dev->inflight_info->fd = msg->fds[0] = fd;
    msg->payload.inflight.mmap_offset = 0;
    msg->fd_num = 1;

    return RTE_VHOST_MSG_RESULT_REPLY;
}

在QEMU VHOST_USER_GET_INFLIGHT_FD后就进行set操作,在vhost_user_set_inflight_fd下

static int vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
               int main_fd __rte_unused)
{
    fd = msg->fds[0];
    mmap_size = msg->payload.inflight.mmap_size;
    mmap_offset = msg->payload.inflight.mmap_offset;
    num_queues = msg->payload.inflight.num_queues;
    queue_size = msg->payload.inflight.queue_size;

    if (vq_is_packed(dev))
        pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
    else
        pervq_inflight_size = get_pervq_shm_size_split(queue_size);
//重新mmap文件内容
    addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, mmap_offset);

    dev->inflight_info->fd = fd;
    dev->inflight_info->addr = addr;
    dev->inflight_info->size = mmap_size;

    for (i = 0; i < num_queues; i++) {
        vq = dev->virtqueue[i];
        if (vq_is_packed(dev)) {
            vq->inflight_packed = addr;
            vq->inflight_packed->desc_num = queue_size;
        } else {
            //将vq对应的地址赋值
            vq->inflight_split = addr;
            vq->inflight_split->desc_num = queue_size;
        }
        //为每个vq计算mmap中对应的地址
        addr = (void *)((char *)addr + pervq_inflight_size);
    }

    return RTE_VHOST_MSG_RESULT_OK;
}

5. SPDK中inflight记录reqs

接着上面,回到vq->inflight_split即struct rte_vhost_inflight_info_split *inflight_split,在process_vq函数下,每准备处理一个vring desc就执行rte_vhost_set_inflight_desc_split标记该index,如下,即可看出inflight_split本质是vring desc是否正被处理的位图。

int rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx, uint16_t idx)
{
    vq->inflight_split->desc[idx].counter = vq->global_counter++;
    vq->inflight_split->desc[idx].inflight = 1;
    return 0;
}

而清除位图则使用的是rte_vhost_clr_inflight_desc_split

int rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
                  uint16_t last_used_idx, uint16_t idx)
{
    vq->inflight_split->desc[idx].inflight = 0;
    rte_smp_mb();
    vq->inflight_split->used_idx = last_used_idx;
    return 0;
}

从process_blk_request -> blk_request_complete_cb -> blk_request_finish -> blk_task_enqueue -> vhost_vq_used_ring_enqueue -> rte_vhost_clr_inflight_desc_split可以看出,当vhost完成io并更新virtio vring的used_idx之后才清理位图。

6. SPDK中inflight恢复reqs

从vhost日志得知

VHOST_CONFIG: read message VHOST_USER_SET_VRING_NUM
VHOST_CONFIG: read message VHOST_USER_SET_VRING_BASE
VHOST_CONFIG: read message VHOST_USER_SET_VRING_ADDR
VHOST_CONFIG: read message VHOST_USER_SET_VRING_KICK
VHOST_CONFIG: vring kick idx:3 file:79
VHOST_CONFIG: reallocate vq from 0 to 1 node
VHOST_CONFIG: virtio is now ready for processing.

VHOST_USER_SET_VRING_KICK是vhost virtio ready前的最后一个请求,reqs的恢复就放在vhost_user_set_vring_kick中完成,其执行了vhost_check_queue_inflights_split函数

static int vhost_check_queue_inflights_split(struct virtio_net *dev,
                  struct vhost_virtqueue *vq)
{
    //计算总数,更新last_avail_idx
    for (i = 0; i < inflight_split->desc_num; i++) {
        if (inflight_split->desc[i].inflight == 1)
            resubmit_num++;
    }
    vq->last_avail_idx += resubmit_num;

    if (resubmit_num) {
        resubmit  = calloc(1, sizeof(struct rte_vhost_resubmit_info));
        if (!resubmit) {
            VHOST_LOG_CONFIG(ERR,
                "failed to allocate memory for resubmit info.\n");
            return RTE_VHOST_MSG_RESULT_ERR;
        }

        resubmit->resubmit_list = calloc(resubmit_num,
            sizeof(struct rte_vhost_resubmit_desc));

        num = 0;
        for (i = 0; i < vq->inflight_split->desc_num; i++) {
            if (vq->inflight_split->desc[i].inflight == 1) {
                //将inflight信息更新到resubmit
                resubmit->resubmit_list[num].index = i;
                resubmit->resubmit_list[num].counter = inflight_split->desc[i].counter;
                num++;
            }
        }
        resubmit->resubmit_num = num;

        if (resubmit->resubmit_num > 1)
            qsort(resubmit->resubmit_list, resubmit->resubmit_num,
                  sizeof(struct rte_vhost_resubmit_desc),
                  resubmit_desc_compare);

        vq->global_counter = resubmit->resubmit_list[0].counter + 1;
        //赋值给vq->resubmit_inflight
        vq->resubmit_inflight = resubmit;
    }
}

而在vhost_start_device_cb中,rte_vhost_get_vhost_ring_inflight将vq->resubmit_inflight赋值给vring->resubmit_inflight,那么回到process_vq,一开始就有submit_inflight_desc(bvsession, vq)

static void submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
             struct spdk_vhost_virtqueue *vq)
{
    resubmit_list = resubmit->resubmit_list;
    while (resubmit->resubmit_num-- > 0) {
        req_idx = resubmit_list[resubmit->resubmit_num].index;
        process_blk_task(vq, req_idx);
    }

    free(resubmit_list);
    resubmit->resubmit_list = NULL;
}

submit_inflight_desc就是获取到index,提交给process_blk_task处理,处理完成之后会调用rte_vhost_clr_inflight_desc_split清除位图。

7. 其他应用场景

除了解决SPDK vhost crash问题,还可以应用于解决DPDK/SPDK VHOST热升级过程中virtio req丢失的问题。

---end---


QEMU SPDK/DPDK inflight机制来自于OenHan

链接为:https://oenhan.com/qemu-spdk-dpdk-inflight

发表评论