源代码版本(git tag):kernel v3.16.37,qemu v2.7。上一篇:KVM源代码分析4:内存虚拟化(OenHan)

1. PIO指令介绍

80386的I/O指令使得处理器可以访问I/O端口,以便从外设输入数据,或者向外设发送数据。这些指令有一个指定I/O空间端口地址的操作数。有两类的I/O指令:

1、 在寄存器指定的地址传送一个数据(字节、字、双字)。

2、 传送指定内存中的一串数据(字节串、字串、双字串)。这些被称作为“串 I/O指令”或者说“块I/O指令”。

对应的指令分别是IN/OUT和INS/OUTS。

2. PIO运行在KVM

当guest执行PIO指令时,会触发VM exit进入vmx_handle_exit,并根据EXIT_REASON_IO_INSTRUCTION调用handle_io函数。exit qualification各位的含义见Intel SDM卷3表27-5(Exit Qualification for I/O Instructions):

Bit Position(s) Contents
2:0Size of access:
0 = 1-byte 1 = 2-byte 3 = 4-byte
Other values not used
3Direction of the attempted access (0 = OUT, 1 = IN)
4String instruction (0 = not string; 1 = string)
5REP prefixed (0 = not REP; 1 = REP)
6Operand encoding (0 = DX, 1 = immediate)
15:7Reserved (cleared to 0)
31:16Port number (as specified in DX or in an immediate operand)
63:32Reserved (cleared to 0). These bits exist only on processors that support Intel 64 architecture.

在handle_io中,处理逻辑如下:

string = (exit_qualification & 16) != 0;

in = (exit_qualification & 8) != 0;

// string串指令或者IO读的指令进行处理

if (string || in)

return emulate_instruction(vcpu, 0) == EMULATE_DONE;

port = exit_qualification >> 16;

size = (exit_qualification & 7) + 1;

//跳过模拟指令,将非串写指令参数保存

skip_emulated_instruction(vcpu);

return kvm_fast_pio_out(vcpu, size, port);

先看kvm_fast_pio_out函数

unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);

int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,

   size, port, &val, 1);

OUT指令要写出的数据放在eax中(即上面读取的VCPU_REGS_RAX),在emulator_pio_out_emulated中:

memcpy(vcpu->arch.pio_data, val, size * count);

vcpu->arch.pio.port = port;

vcpu->arch.pio.in = in;

vcpu->arch.pio.count  = count;

vcpu->arch.pio.size = size;

if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {

vcpu->arch.pio.count = 0;

return 1;

}

vcpu->run->exit_reason = KVM_EXIT_IO;

vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;

vcpu->run->io.size = size;

vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;

vcpu->run->io.count = count;

vcpu->run->io.port = port;

此处就将pio保存到vcpu->arch.pio中了,注意exit_reason赋值为KVM_EXIT_IO

比较麻烦的是x86_emulate_instruction函数,先看它使用的结构体:

/*
 * Emulation context for the x86 instruction emulator. A pointer to this
 * structure is what the execute/check_perm callbacks below receive.
 * It groups: the ops callback table, register/flag state, exception state,
 * the decode cache for the current instruction, and the fetch/read caches.
 * Field order matters: the comments below describe ranges of fields that
 * are cleared together.
 */
struct x86_emulate_ctxt {

        /* operations (callback) table */
const struct x86_emulate_ops *ops;

/* Register state before/after emulation. */
unsigned long eflags;

unsigned long eip;  /* eip before instruction emulation */
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
enum x86emul_mode mode;

/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;

int emul_flags;

bool perm_ok; /* do not check permissions if true */
bool ud;/* inject an #UD if host doesn't support insn */
bool have_exception;

struct x86_exception exception;

/*

* decode cache

*/
/* current opcode length in bytes */
u8 opcode_len;

u8 b;

u8 intercept;

u8 op_bytes;

u8 ad_bytes;

struct operand src;

struct operand src2;

struct operand dst;

int (*execute)(struct x86_emulate_ctxt *ctxt);

int (*check_perm)(struct x86_emulate_ctxt *ctxt);

/*

* The following six fields are cleared together,

* the rest are initialized unconditionally in x86_decode_insn

* or elsewhere

*/
bool rip_relative;

u8 rex_prefix;

u8 lock_prefix;

u8 rep_prefix;

/* bitmaps of registers in _regs[] that can be read */
u32 regs_valid;

/* bitmaps of registers in _regs[] that have been written */
u32 regs_dirty;

/* modrm */
u8 modrm;

u8 modrm_mod;

u8 modrm_reg;

u8 modrm_rm;

u8 modrm_seg;

u8 seg_override;

u64 d;

unsigned long _eip;

struct operand memop;

/* Fields above regs are cleared together. */
unsigned long _regs[NR_VCPU_REGS];

struct operand *memopp;

struct fetch_cache fetch;

struct read_cache io_read;

struct read_cache mem_read;

};

init_emulate_ctxt负责初始化这个结构体;随后在x86_decode_insn中,先把指令内容取到fetch缓存里:

ctxt->fetch.ptr = ctxt->fetch.data;
ctxt->fetch.end = ctxt->fetch.data + insn_len;
if (insn_len > 0)
         /*所有指令内容存放在data中*/memcpy(ctxt->fetch.data, insn, insn_len);
else {
        /*如果没有指定指令的内容,就从当前eip读取一个指令*/rc = __do_insn_fetch_bytes(ctxt, 1);
if (rc != X86EMUL_CONTINUE)
return rc;
}

都是将指令读取出来放到ctxt->fetch.data中而已,没有执行。

看__do_insn_fetch_bytes函数,

int cur_size = ctxt->fetch.end - ctxt->fetch.data;

/*下面的ea事实上应该说是指令decode时真正的eip,因为指令还没有被执行,eip没有

 * 更新,所以每次decode计算eip都要加上已经decode的代码长度,就是cur_size */
struct segmented_address addr = { .seg = VCPU_SREG_CS,
  .ea = ctxt->eip + cur_size };

/*__linearize就是获取线性地址*/ la = seg_base(ctxt, addr.seg) + addr.ea;

后面的即是一些处理,如不满足权限的就要模拟gp错误,emulate_gp。

然后是:

rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception);

即kvm_fetch_guest_virt

先通过vcpu->arch.walk_mmu->gva_to_gpa获取gpa的值,然后用kvm_vcpu_read_guest_page获取gpa对应的内存值,这两个函数不再展开。

最终效果就是将linear对应的内存中大小为size的内容,写入ctxt->fetch.end指针指向的缓存。因为ctxt->fetch.end = ctxt->fetch.data + insn_len,而此时insn_len为0,所以实际写入的就是ctxt->fetch.data。也就是说,在x86_decode_insn中,__do_insn_fetch_bytes只是把eip处的指令内容拷贝到ctxt->fetch.data。后面的x86_emulate_insn是对指令的解释执行,这里跳过不展开。

x86_emulate_instruction后面一段代码:

/*exception在decode中产生*/if (ctxt->have_exception) {
r = EMULATE_DONE;
if (inject_emulated_exception(vcpu))
return r;
/*decode会处理,pio的执行次数,针对串指令*/} else if (vcpu->arch.pio.count) {
/*写入io的不特殊处理,只需完成写入即可*/if (!vcpu->arch.pio.in) {
/* FIXME: return into emulator if single-stepping.  */vcpu->arch.pio.count = 0;
} else {
writeback = false;
/*读取io的则需要回头处理读的值的流程*/vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
r = EMULATE_USER_EXIT;
} else if (vcpu->mmio_needed) {
/*同上*/if (!vcpu->mmio_is_write)
writeback = false;
r= EMULATE_USER_EXIT;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = EMULATE_DONE;

handle_io的返回值是emulate_instruction(vcpu, 0) == EMULATE_DONE,它也就是kvm_x86_ops->handle_exit(vcpu)的返回值。需要用户态处理时,上面的代码把r置为EMULATE_USER_EXIT,比较结果为0,即返回值不大于0,于是从vcpu_run循环中跳出,回到qemu mode。

if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
r = vcpu_block(kvm, vcpu);
}
if (r <= 0)
break;

3. PIO运行在QEMU

在qemu kvm_cpu_exec函数中:

switch (run->exit_reason) {
/*KVM_EXIT_IO是在emulator_pio_in_out标记的*/        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                         run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;

/*
 * Carry out a port I/O request: perform `count` accesses of `size` bytes
 * each on `port` in address_space_io, stepping through the `data` buffer by
 * `size` bytes per access. The last argument passed to address_space_rw is
 * (direction == KVM_EXIT_IO_OUT), presumably its is-write flag.
 */
static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,

                          int size, uint32_t count)
{
    int i;
    uint8_t *ptr = data;
    for (i = 0; i < count; i++) {
        /* read or write the data through QEMU's I/O address space; in effect it is handled like an MMIO access */        address_space_rw(&address_space_io, port, attrs, ptr, size, direction == KVM_EXIT_IO_OUT);
        ptr += size;
    }
}

如果是写IO,到此处就算完成了;如果是读IO,此时QEMU只是把数据读到了data指向的缓冲区,还需要回到KVM后继续处理,把数据交给guest。

处理在kvm_arch_vcpu_ioctl_run中的vcpu_run之前,

if (unlikely(vcpu->arch.complete_userspace_io)) {
int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
vcpu->arch.complete_userspace_io = NULL;
r = cui(vcpu);
if (r <= 0)
goto out;
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);

本质执行了complete_userspace_io函数,即是complete_emulated_pio。

也就是r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE):不再重新解码,直接再次模拟该IO指令,此时QEMU读到的数据已经准备好,模拟过程中把它交给指令的目的操作数,模拟完成即可。


KVM源代码分析5:IO虚拟化之PIO来自于OenHan

链接为:https://oenhan.com/kvm-src-5-io-pio

6 thoughts on “KVM源代码分析5:IO虚拟化之PIO”

  1. 博主大侠,你好,什么样的IO会在KVM中进行模拟,什么样的IO需要在QEMU中进行模拟呢?

  2. 外设不都是在QEMU中模拟么?除了外设还有哪些会用PIO?哪些是在KVM中模拟的呢?

  3. 博主,你好
    请问对于out操作的串指令,为什么不需要通过complete_userspace_io再次调用complete_emulated_pio?
    else if (vcpu->arch.pio.count) {
    if (!vcpu->arch.pio.in)
    vcpu->arch.pio.count = 0;
    else {
    writeback = false;
    vcpu->arch.complete_userspace_io = complete_emulated_pio;
    }
    r = EMULATE_DO_MMIO;
    }

    1. @SOHU2000000 1.这个完全看不同情况,比如virtio是在qemu中处理IO的,但是vhost则是在hypervisor中处理IO的.
      2.Virtio原始的vring notify用的模拟的PCI portIO,当然也是QEMU模拟的.具体真正在hypervisor中处理的PIO没注意,可以看一下emulate_instruction,参考EMULATE_USER_EXIT的引用情况,貌似真正使用的时候都在qemu中.
      3.out串指令写内存就可以完成,不需要给guest反馈

  4. 模拟in指令的第二阶段写数据到虚拟机中, in指令会把端口的数据读入rax
    in的第二阶段如下:
    complete_emulated_io->emulate_instruction(vcpu, EMULTYPE_NO_DECODE)->
    x86_emulate_instruction,在这里面不会进行解码,
    x86_emulate_insn->ctxt->execute->em_in->pio_in_emulated->ctxt->ops->pio_in_emulated->
    emulator_pio_in_emulated,最后这个函数,这个时候vcpu->arch.pio.count不为0了(第一次返回的时候设置了),
    所以会到data_avail,然后把val就有了in的数据。
    但是我看不到如何写到客户机rax的代码,
    static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
    int size, unsigned short port, void *val,
    unsigned int count)
    {
    struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
    int ret;

    if (vcpu->arch.pio.count)
    goto data_avail;

    ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
    if (ret) {
    data_avail:
    memcpy(val, vcpu->arch.pio_data, size * count);
    trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
    vcpu->arch.pio.count = 0;
    return 1;
    }

    因为在调用 emulator_pio_in_emulated之前,val为rc->data,但是没有看到哪里取了这个值。
    static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
    unsigned int size, unsigned short port,
    void *dest)
    {
    struct read_cache *rc = &ctxt->io_read;

    if (rc->pos == rc->end) { /* refill pio read ahead */
    unsigned int in_page, n;
    unsigned int count = ctxt->rep_prefix ?
    address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
    in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
    offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
    PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
    n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
    if (n == 0)
    n = 1;
    rc->pos = rc->end = 0;
    if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
    return 0;
    rc->end = n * size;
    }

    有没有什么思路提示提示啊?

发表回复