KVM源代码分析5:IO虚拟化之PIO
源代码:git tag, kernel v3.16.37 qemu v2.7 ,上一篇:KVM源代码分析4:内存虚拟化–OenHan
1. PIO指令介绍
80386的I/O指令使得处理器可以访问I/O端口,以便从外设输入数据,或者向外设发送数据。这些指令有一个指定I/O空间端口地址的操作数。有两类的I/O指令:
1、 在寄存器指定的地址传送一个数据(字节、字、双字)。
2、 传送指定内存中的一串数据(字节串、字串、双字串)。这些被称作为“串 I/O指令”或者说“块I/O指令”。
有IN/OUT INS/OUTS指令
2. PIO运行在KVM
当guest执行PIO指令时,触发vmx_handle_exit,根据EXIT_REASON_IO_INSTRUCTION执行handle_io函数。handle_io依据Exit Qualification判断指令类型,其各位的含义见Intel SDM卷3表27-5:
Bit Position(s) | Contents |
2:0 | Size of access: 0 = 1-byte; 1 = 2-byte; 3 = 4-byte; other values not used |
3 | Direction of the attempted access (0 = OUT, 1 = IN) |
4 | String instruction (0 = not string; 1 = string) |
5 | REP prefixed (0 = not REP; 1 = REP) |
6 | Operand encoding (0 = DX, 1 = immediate) |
15:7 | Reserved (cleared to 0) |
31:16 | Port number (as specified in DX or in an immediate operand) |
63:32 | Reserved (cleared to 0). These bits exist only on processors that support Intel 64 architecture. |
在handle_io中,如果是串指令或者IN指令,则交给emulate_instruction进行模拟;否则(非串的OUT指令)跳过该指令并调用kvm_fast_pio_out:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | string = (exit_qualification & 16) != 0; in = (exit_qualification & 8) != 0; // string串指令或者IO读的指令进行处理 if (string || in) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; //跳过模拟指令,将非串写指令参数保存 skip_emulated_instruction(vcpu); return kvm_fast_pio_out(vcpu, size, port); |
先看kvm_fast_pio_out函数
1 2 3 4 5 | unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); |
OUT指令要写出的数据放在eax(RAX)里面,在emulator_pio_out_emulated中
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | memcpy(vcpu->arch.pio_data, val, size * count); vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { vcpu->arch.pio.count = 0; return 1; } vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; |
此处就将pio保存到vcpu->arch.pio中了,注意exit_reason赋值为KVM_EXIT_IO
麻烦点在x86_emulate_instruction函数,具体看
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | struct x86_emulate_ctxt { /*操作对象*/ const struct x86_emulate_ops *ops; /* Register state before/after emulation. */ unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ enum x86emul_mode mode; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud;/* inject an #UD if host doesn't support insn */ bool have_exception; struct x86_exception exception; /* * decode cache */ /* current opcode length in bytes */ u8 opcode_len; u8 b; u8 intercept; u8 op_bytes; u8 ad_bytes; struct operand src; struct operand src2; struct operand dst; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); /* * The following six fields are cleared together, * the rest are initialized unconditionally in x86_decode_insn * or elsewhere */ bool rip_relative; u8 rex_prefix; u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ u32 regs_valid; /* bitmaps of registers in _regs[] that have been written */ u32 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; u8 seg_override; u64 d; unsigned long _eip; struct operand memop; /* Fields above regs are cleared together. 
*/ unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; }; //init_emulate_ctxt负责初始化这个结构体, ctxt->fetch.ptr = ctxt->fetch.data; ctxt->fetch.end = ctxt->fetch.data + insn_len; if (insn_len > 0) /*所有指令内容存放在data中*/ memcpy(ctxt->fetch.data, insn, insn_len); else { /*如果没有指定指令的内容,就从当前eip读取一个指令*/ rc = __do_insn_fetch_bytes(ctxt, 1); if (rc != X86EMUL_CONTINUE) return rc; } |
都是将指令读取出来放到ctxt->fetch.data中而已,没有执行。
看__do_insn_fetch_bytes函数,
1 2 3 4 5 6 7 8 9 10 11 | int cur_size = ctxt->fetch.end - ctxt->fetch.data; /*下面的ea事实上应该说是指令decode时真正的eip,因为指令还没有被执行,eip没有 * 更新,所以每次decode计算eip都要加上已经decode的代码长度,就是cur_size */ struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; /*__linearize就是获取线性地址*/ la = seg_base(ctxt, addr.seg) + addr.ea; |
后面的即是一些处理,如不满足权限的就要模拟gp错误,emulate_gp。
然后是:
1 | rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception); |
即kvm_fetch_guest_virt
先通过vcpu->arch.walk_mmu->gva_to_gpa获取gpa的值,然后用kvm_vcpu_read_guest_page获取gpa对应的内存值,这两个函数不再展开。
最终效果就是将linear对应的mem中大小为size的内容写入到ctxt->fetch.end指针对应的缓存中。因为ctxt->fetch.end = ctxt->fetch.data + insn_len,而insn_len为0,所以写入的位置就是ctxt->fetch.data。也就是说,在x86_decode_insn中,__do_insn_fetch_bytes只是把eip指向的指令内容copy到ctxt->fetch.data。之后的x86_emulate_insn负责指令的解释执行,这里跳过不展开。
x86_emulate_instruction后面一段代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | /*exception在decode中产生*/ if (ctxt->have_exception) { r = EMULATE_DONE; if (inject_emulated_exception(vcpu)) return r; /*decode会处理,pio的执行次数,针对串指令*/ } else if (vcpu->arch.pio.count) { /*写入io的不特殊处理,只需完成写入即可*/ if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ vcpu->arch.pio.count = 0; } else { writeback = false; /*读取io的则需要回头处理读的值的流程*/ vcpu->arch.complete_userspace_io = complete_emulated_pio; } r = EMULATE_USER_EXIT; } else if (vcpu->mmio_needed) { /*同上*/ if (!vcpu->mmio_is_write) writeback = false; r= EMULATE_USER_EXIT; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else r = EMULATE_DONE; |
handle_io的返回值是emulate_instruction(vcpu, 0) == EMULATE_DONE,它同时也是kvm_x86_ops->handle_exit(vcpu)的返回值;返回值不大于0,则从vcpu_run循环中跳出到qemu mode。
1 2 3 4 5 6 7 | if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); } else { r = vcpu_block(kvm, vcpu); } if (r <= 0) break; |
3. PIO运行在QEMU
在qemu kvm_cpu_exec函数中:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | switch (run->exit_reason) { /*KVM_EXIT_IO是在emulator_pio_in_out标记的*/ case KVM_EXIT_IO: DPRINTF("handle_io\n"); /* Called outside BQL */ kvm_handle_io(run->io.port, attrs, (uint8_t *)run + run->io.data_offset, run->io.direction, run->io.size, run->io.count); ret = 0; break; case KVM_EXIT_MMIO: DPRINTF("handle_mmio\n"); /* Called outside BQL */ address_space_rw(&address_space_memory, run->mmio.phys_addr, attrs, run->mmio.data, run->mmio.len, run->mmio.is_write); ret = 0; break; static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, int size, uint32_t count) { int i; uint8_t *ptr = data; for (i = 0; i < count; i++) { /*从qemu内存中读写数据,本质就成了mmio操作 */ address_space_rw(&address_space_io, port, attrs, ptr, size, direction == KVM_EXIT_IO_OUT); ptr += size; } } |
如果写IO此处就算完成了,如果读取IO,此时读取完成,还需要后面处理。
处理在kvm_arch_vcpu_ioctl_run中的vcpu_run之前,
1 2 3 4 5 6 7 8 | if (unlikely(vcpu->arch.complete_userspace_io)) { int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; vcpu->arch.complete_userspace_io = NULL; r = cui(vcpu); if (r <= 0) goto out; } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); |
本质执行了complete_userspace_io函数,即是complete_emulated_pio。
就是r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE),再次模拟IO指令,此时将数据读取到模拟的IO端口上,模拟完成即可。
KVM源代码分析5:IO虚拟化之PIO来自于OenHan
链接为:http://oenhan.com/kvm-src-5-io-pio
博主大侠,你好,什么样的IO会在KVM中进行模拟,什么样的IO需要在QEMU中进行模拟呢?
外设不都是在QEMU中模拟么?除了外设还有哪些会用PIO?哪些是在KVM中模拟的呢?
博主,你好
请问对于out操作的串指令,为什么不需要通过complete_userspace_io再次调用complete_emulated_pio?
else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
else {
writeback = false;
vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
r = EMULATE_DO_MMIO;
}
@SOHU2000000 1.这个完全看不同情况,比如virtio是在qemu中处理IO的,但是vhost则是在hypervisor中处理IO的.
2.Virtio原始的vring notify用的模拟的PCI portIO,当然也是QEMU模拟的.具体真正在hypervisor中处理的PIO没注意,可以看一下emulate_instruction,参考EMULATE_USER_EXIT的引用情况,貌似真正使用的时候都在qemu中.
3.out串指令写内存就可以完成,不需要给guest反馈
@OENHAN 感谢博主的回答,又仔细看了一下,PIC的IO操作是在KVM完成的
模拟in指令的第二阶段写数据到虚拟机中, in指令会把端口的数据读入rax
in的第二阶段如下:
complete_emulated_io->emulate_instruction(vcpu, EMULTYPE_NO_DECODE)->
x86_emulate_instruction,在这里面不会进行解码,
x86_emulate_insn->ctxt->execute->em_in->pio_in_emulated->ctxt->ops->pio_in_emulated->
emulator_pio_in_emulated,在最后这个函数里,这个时候vcpu->arch.pio.count不为0了(第一次返回的时候设置了),
所以会跳到data_avail,然后val里就有了in的数据。
但是我看不到如何写到客户机rax的代码,
static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
int size, unsigned short port, void *val,
unsigned int count)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int ret;
if (vcpu->arch.pio.count)
goto data_avail;
ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
if (ret) {
data_avail:
memcpy(val, vcpu->arch.pio_data, size * count);
trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
vcpu->arch.pio.count = 0;
return 1;
}
因为在调用 emulator_pio_in_emulated之前,val为rc->data,但是没有看到哪里取了这个值。
static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
unsigned int size, unsigned short port,
void *dest)
{
struct read_cache *rc = &ctxt->io_read;
if (rc->pos == rc->end) { /* refill pio read ahead */
unsigned int in_page, n;
unsigned int count = ctxt->rep_prefix ?
address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
if (n == 0)
n = 1;
rc->pos = rc->end = 0;
if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
return 0;
rc->end = n * size;
}
有没有什么思路提示提示啊?