KVM源代码分析5:IO虚拟化之PIO
源代码:git tag, kernel v3.16.37 qemu v2.7 ,上一篇:KVM源代码分析4:内存虚拟化–OenHan
1. PIO指令介绍
80386的I/O指令使得处理器可以访问I/O端口,以便从外设输入数据,或者向外设发送数据。这些指令有一个指定I/O空间端口地址的操作数。有两类的I/O指令:
1、 在寄存器指定的地址传送一个数据(字节、字、双字)。
2、 传送指定内存中的一串数据(字节串、字串、双字串)。这些被称作为“串 I/O指令”或者说“块I/O指令”。
对应的指令分别是IN/OUT(单个数据传送)和INS/OUTS(串I/O)。
2. PIO运行在KVM
当guest执行PIO指令时,会触发VM exit并进入vmx_handle_exit,根据退出原因EXIT_REASON_IO_INSTRUCTION执行handle_io函数。exit qualification各位的含义见Intel SDM卷3表27-5:
Bit Position(s) | Contents |
2:0 | Size of access: 0 = 1-byte 1 = 2-byte 3 = 4-byte Other values not used |
3 | Direction of the attempted access (0 = OUT, 1 = IN) |
4 | String instruction (0 = not string; 1 = string) |
5 | REP prefixed (0 = not REP; 1 = REP) |
6 | Operand encoding (0 = DX, 1 = immediate) |
15:7 | Reserved (cleared to 0) |
31:16 | Port number (as specified in DX or in an immediate operand) |
63:32 | Reserved (cleared to 0). These bits exist only on processors that support Intel 64 architecture. |
在handle_io中,如果是串指令或者IN指令,就交给emulate_instruction进行模拟;否则(非串的OUT指令)走kvm_fast_pio_out快速路径:
string = (exit_qualification & 16) != 0; in = (exit_qualification & 8) != 0; // string串指令或者IO读的指令进行处理 if (string || in) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; //跳过模拟指令,将非串写指令参数保存 skip_emulated_instruction(vcpu); return kvm_fast_pio_out(vcpu, size, port);
先看kvm_fast_pio_out函数
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1);
pio指令要输出的数据放在eax里面(即上面读取的VCPU_REGS_RAX),在emulator_pio_out_emulated及其调用的emulator_pio_in_out中
memcpy(vcpu->arch.pio_data, val, size * count); vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { vcpu->arch.pio.count = 0; return 1; } vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port;
此处就将pio保存到vcpu->arch.pio中了,注意exit_reason赋值为KVM_EXIT_IO
麻烦点在x86_emulate_instruction函数,具体看
struct x86_emulate_ctxt { /*操作对象*/ const struct x86_emulate_ops *ops; /* Register state before/after emulation. */ unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ enum x86emul_mode mode; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud;/* inject an #UD if host doesn't support insn */ bool have_exception; struct x86_exception exception; /* * decode cache */ /* current opcode length in bytes */ u8 opcode_len; u8 b; u8 intercept; u8 op_bytes; u8 ad_bytes; struct operand src; struct operand src2; struct operand dst; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); /* * The following six fields are cleared together, * the rest are initialized unconditionally in x86_decode_insn * or elsewhere */ bool rip_relative; u8 rex_prefix; u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ u32 regs_valid; /* bitmaps of registers in _regs[] that have been written */ u32 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; u8 seg_override; u64 d; unsigned long _eip; struct operand memop; /* Fields above regs are cleared together. */ unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; }; //init_emulate_ctxt负责初始化这个结构体, ctxt->fetch.ptr = ctxt->fetch.data; ctxt->fetch.end = ctxt->fetch.data + insn_len; if (insn_len > 0) /*所有指令内容存放在data中*/memcpy(ctxt->fetch.data, insn, insn_len); else { /*如果没有指定指令的内容,就从当前eip读取一个指令*/rc = __do_insn_fetch_bytes(ctxt, 1); if (rc != X86EMUL_CONTINUE) return rc; }
都是将指令读取出来放到ctxt->fetch.data中而已,没有执行。
看__do_insn_fetch_bytes函数,
int cur_size = ctxt->fetch.end - ctxt->fetch.data; /*下面的ea事实上应该说是指令decode时真正的eip,因为指令还没有被执行,eip没有 * 更新,所以每次decode计算eip都要加上已经decode的代码长度,就是cur_size */ struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; /*__linearize就是获取线性地址*/ la = seg_base(ctxt, addr.seg) + addr.ea;
后面的即是一些处理,如不满足权限的就要模拟gp错误,emulate_gp。
然后是:
rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception);
即kvm_fetch_guest_virt
先通过vcpu->arch.walk_mmu->gva_to_gpa获取gpa的值,然后用kvm_vcpu_read_guest_page获取gpa对应的内存值,这两个函数不再展开。
最终效果就是将linear对应的mem中大小为size的内容写入到ctxt->fetch.end指针对应的缓存中。因为ctxt->fetch.end = ctxt->fetch.data + insn_len,而insn_len为0,所以写入的位置就是ctxt->fetch.data。也就是说,在x86_decode_insn中,__do_insn_fetch_bytes就是从eip指向的地址copy了一个指令的内容到ctxt->fetch.data。随后的x86_emulate_insn负责指令的解释执行,这里跳过不展开。
x86_emulate_instruction后面一段代码:
/*exception在decode中产生*/if (ctxt->have_exception) { r = EMULATE_DONE; if (inject_emulated_exception(vcpu)) return r; /*decode会处理,pio的执行次数,针对串指令*/} else if (vcpu->arch.pio.count) { /*写入io的不特殊处理,只需完成写入即可*/if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */vcpu->arch.pio.count = 0; } else { writeback = false; /*读取io的则需要回头处理读的值的流程*/vcpu->arch.complete_userspace_io = complete_emulated_pio; } r = EMULATE_USER_EXIT; } else if (vcpu->mmio_needed) { /*同上*/if (!vcpu->mmio_is_write) writeback = false; r= EMULATE_USER_EXIT; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else r = EMULATE_DONE;
handle_io返回的是emulate_instruction(vcpu, 0) == EMULATE_DONE的比较结果,它也就是kvm_x86_ops->handle_exit(vcpu)的返回值。由于上面的代码中r被置为EMULATE_USER_EXIT,比较结果为0;返回值不大于0,则从vcpu_run循环中跳出,回到qemu mode。
if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); } else { r = vcpu_block(kvm, vcpu); } if (r <= 0) break;
3. PIO运行在QEMU
在qemu kvm_cpu_exec函数中:
switch (run->exit_reason) { /*KVM_EXIT_IO是在emulator_pio_in_out标记的*/ case KVM_EXIT_IO: DPRINTF("handle_io\n"); /* Called outside BQL */ kvm_handle_io(run->io.port, attrs, (uint8_t *)run + run->io.data_offset, run->io.direction, run->io.size, run->io.count); ret = 0; break; case KVM_EXIT_MMIO: DPRINTF("handle_mmio\n"); /* Called outside BQL */ address_space_rw(&address_space_memory, run->mmio.phys_addr, attrs, run->mmio.data, run->mmio.len, run->mmio.is_write); ret = 0; break; static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, int size, uint32_t count) { int i; uint8_t *ptr = data; for (i = 0; i < count; i++) { /*从qemu内存中读写数据,本质就成了mmio操作 */ address_space_rw(&address_space_io, port, attrs, ptr, size, direction == KVM_EXIT_IO_OUT); ptr += size; } }
如果写IO此处就算完成了,如果读取IO,此时读取完成,还需要后面处理。
处理在kvm_arch_vcpu_ioctl_run中的vcpu_run之前,
if (unlikely(vcpu->arch.complete_userspace_io)) { int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; vcpu->arch.complete_userspace_io = NULL; r = cui(vcpu); if (r <= 0) goto out; } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
本质执行了complete_userspace_io函数,即是complete_emulated_pio。
就是r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE),即不再解码、再次模拟执行这条IO指令,此时把QEMU填入vcpu->arch.pio_data的数据取出,写回到guest指令的目的操作数中,模拟完成即可。
KVM源代码分析5:IO虚拟化之PIO来自于OenHan
链接为:https://oenhan.com/kvm-src-5-io-pio
博主大侠,你好,什么样的IO会在KVM中进行模拟,什么样的IO需要在QEMU中进行模拟呢?
外设不都是在QEMU中模拟么?除了外设还有哪些会用PIO?哪些是在KVM中模拟的呢?
博主,你好
请问对于out操作的串指令,为什么不需要通过complete_userspace_io再次调用complete_emulated_pio?
else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
else {
writeback = false;
vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
r = EMULATE_DO_MMIO;
}
@SOHU2000000 1.这个完全看不同情况,比如virtio是在qemu中处理IO的,但是vhost则是在hypervisor中处理IO的.
2.Virtio原始的vring notify用的模拟的PCI portIO,当然也是QEMU模拟的.具体真正在hypervisor中处理的PIO没注意,可以看一下emulate_instruction,参考EMULATE_USER_EXIT的引用情况,貌似真正使用的时候都在qemu中.
3.out串指令写内存就可以完成,不需要给guest反馈
@OENHAN 感谢博主的回答,又仔细看了一下,PIC的IO操作是在KVM完成的
模拟in指令的第二阶段写数据到虚拟机中, in指令会把端口的数据读入rax
in的第二阶段如下:
complete_emulated_io->emulate_instruction(vcpu, EMULTYPE_NO_DECODE)->
x86_emulate_instruction,在这里面不会进行解码,
x86_emulate_insn->ctxt->execute->em_in->pio_in_emulated->ctxt->ops->pio_in_emulated->
emulator_pio_in_emulated,在最后这个函数里,这个时候vcpu->arch.pio.count不为0了(第一次返回的时候设置了),
所以会跳到data_avail,然后val里就有了in的数据。
但是我看不到如何写到客户机rax的代码,
static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
int size, unsigned short port, void *val,
unsigned int count)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int ret;
if (vcpu->arch.pio.count)
goto data_avail;
ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
if (ret) {
data_avail:
memcpy(val, vcpu->arch.pio_data, size * count);
trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
vcpu->arch.pio.count = 0;
return 1;
}
因为在调用 emulator_pio_in_emulated之前,val为rc->data,但是没有看到哪里取了这个值。
static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
unsigned int size, unsigned short port,
void *dest)
{
struct read_cache *rc = &ctxt->io_read;
if (rc->pos == rc->end) { /* refill pio read ahead */
unsigned int in_page, n;
unsigned int count = ctxt->rep_prefix ?
address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
if (n == 0)
n = 1;
rc->pos = rc->end = 0;
if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
return 0;
rc->end = n * size;
}
有没有什么思路提示提示啊?