QEMU虚拟网络E1000源代码分析
代码版本:QEMU git release v2.8
x86_64-softmmu/qemu-system-x86_64 -enable-kvm -M q35 -m 256 -smp 1 -drive file=/workspace/kvm_auto_test/kvm_test.qcow2,format=qcow2 -netdev tap,fd=26,id=hostnet0 -device e1000,netdev=hostnet0,id=net0,mac=52:54:00:f1:ec:ba,bus=pcie.0
e1000_class_init是网络设备初始化的源头,然后看pci_e1000_realize,
pci_dev->config_write = e1000_write_config; //配置PCI config pci_conf[PCI_CACHE_LINE_SIZE] = 0x10; pci_conf[PCI_INTERRUPT_PIN] = 1; //配置mmio e1000_mmio_setup(d); //注册bar空间 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &d->mmio); pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &d->io); //初始化nic信息 d->nic = qemu_new_nic(&net_e1000_info, &d->conf, object_get_typename(OBJECT(d)), dev->id, d); static void e1000_mmio_setup(E1000State *d) { int i; const uint32_t excluded_regs[] = { E1000_MDIC, E1000_ICR, E1000_ICS, E1000_IMS, E1000_IMC, E1000_TCTL, E1000_TDT, PNPMMIO_SIZE }; //初始化了mmio和pio,但是只是基本信息,比如addr地址就没有赋值 memory_region_init_io(&d->mmio, OBJECT(d), &e1000_mmio_ops, d, "e1000-mmio", PNPMMIO_SIZE); memory_region_add_coalescing(&d->mmio, 0, excluded_regs[0]); for (i = 0; excluded_regs[i] != PNPMMIO_SIZE; i++) memory_region_add_coalescing(&d->mmio, excluded_regs[i] + 4, excluded_regs[i+1] - excluded_regs[i] - 4); memory_region_init_io(&d->io, OBJECT(d), &e1000_io_ops, d, "e1000-io", IOPORT_SIZE); }
对于e1000_write_config,除了按基本的PCI规范调用pci_default_write_config之外,还会在满足条件时刷新排队的包:
pci_default_write_config(pci_dev, address, val, len); //当config写入范围覆盖了PCI_COMMAND且PCI_COMMAND_MASTER位已置位时, //就将排队的包继续刷新,类似PCI下写同步的本质是读一次。 if (range_covers_byte(address, len, PCI_COMMAND) && (pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) { qemu_flush_queued_packets(qemu_get_queue(s->nic)); } static const MemoryRegionOps e1000_mmio_ops = { .read = e1000_mmio_read, .write = e1000_mmio_write, .endianness = DEVICE_LITTLE_ENDIAN, .impl = { .min_access_size = 4, .max_access_size = 4, }, };
e1000_mmio_write和e1000_mmio_read都是分别调用macreg_writeops和macreg_readops而已
#define putreg(x) [x] = mac_writereg static void (*macreg_writeops[])(E1000State *, int, uint32_t) = { putreg(PBA), putreg(EERD), putreg(SWSM), putreg(WUFC), putreg(TDBAL), putreg(TDBAH), putreg(TXDCTL), putreg(RDBAH), putreg(RDBAL), putreg(LEDCTL), putreg(VET), putreg(FCRUC), putreg(TDFH), putreg(TDFT), putreg(TDFHS), putreg(TDFTS), putreg(TDFPC), putreg(RDFH), putreg(RDFT), putreg(RDFHS), putreg(RDFTS), putreg(RDFPC), putreg(IPAV), putreg(WUC), putreg(WUS), putreg(AIT), [TDLEN] = set_dlen, [RDLEN] = set_dlen, [TCTL] = set_tctl, [TDT] = set_tctl, [MDIC] = set_mdic, [ICS] = set_ics, [TDH] = set_16bit, [RDH] = set_16bit, [RDT] = set_rdt, [IMC] = set_imc, [IMS] = set_ims, [ICR] = set_icr, [EECD] = set_eecd, [RCTL] = set_rx_control, [CTRL] = set_ctrl, [RDTR] = set_16bit, [RADV] = set_16bit, [TADV] = set_16bit, [ITR] = set_16bit, [IP6AT ... IP6AT+3] = &mac_writereg, [IP4AT ... IP4AT+6] = &mac_writereg, [FFLT ... FFLT+6] = &mac_writereg, [RA ... RA+31] = &mac_writereg, [WUPM ... WUPM+31] = &mac_writereg, [MTA ... MTA+127] = &mac_writereg, [VFTA ... VFTA+127] = &mac_writereg, [FFMT ... FFMT+254] = &mac_writereg, [FFVT ... FFVT+254] = &mac_writereg, [PBM ... PBM+16383] = &mac_writereg, };
macreg_writeops顾名思义,就是针对各个寄存器写操作的处理函数表,下面逐个来看:
mac_writereg(E1000State *s, int index, uint32_t val) { uint32_t macaddr[2]; //单纯的赋值 s->mac_reg[index] = val; if (index == RA + 1) { macaddr[0] = cpu_to_le32(s->mac_reg[RA]); macaddr[1] = cpu_to_le32(s->mac_reg[RA + 1]); qemu_format_nic_info_str(qemu_get_queue(s->nic), (uint8_t *)macaddr); } } typedef struct E1000State_st { uint32_t mac_reg[0x8000];
mac_reg本身就是E1000State结构体中预先分配好的数组。
set_dlen,set_mdic也是mac赋值而已,
s->mac_reg[index] = val & 0xfff80; s->mac_reg[MDIC] = val | E1000_MDIC_READY;
set_mdic会配置s->phy_reg[addr] = data,同时通过set_ics发送虚拟中断
set_interrupt_cause(s, 0, val | s->mac_reg[ICR]);
set_16bit,set_rdt,set_imc,set_ims,set_icr都是设置mac寄存器后最多再加一个中断。
set_rx_control配置接收缓存,并使用qemu_flush_queued_packets刷新队列中的数据包(packet)。
最终看一下set_tctl函数,
set_tctl(E1000State *s, int index, uint32_t val) { s->mac_reg[index] = val; s->mac_reg[TDT] &= 0xffff; start_xmit(s); }
start_xmit负责发包
while (s->mac_reg[TDH] != s->mac_reg[TDT]) { //获取发送ring的首地址gpa base = tx_desc_base(s) + sizeof(struct e1000_tx_desc) * s->mac_reg[TDH]; //从guest中读取desc的内容 pci_dma_read(d, base, &desc, sizeof(desc));
process_tx_desc中开始拼装网络包
addr = le64_to_cpu(dp->buffer_addr); //从内存读取网络包的data数据 pci_dma_read(d, addr, tp->data + tp->size, bytes);
xmit_seg继续组装IP协议头部,使用e1000_send_packet进行发包
if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { nc->info->receive(nc, buf, size); 在net_tap_fd_init下, nc = qemu_new_net_client(&net_tap_info, peer, model, name); s = DO_UPCAST(TAPState, nc, nc); static NetClientInfo net_tap_info = { .type = NET_CLIENT_DRIVER_TAP, .size = sizeof(TAPState), .receive = tap_receive, .receive_raw = tap_receive_raw, .receive_iov = tap_receive_iov, .poll = tap_poll, .cleanup = tap_cleanup, .has_ufo = tap_has_ufo, .has_vnet_hdr = tap_has_vnet_hdr, .has_vnet_hdr_len = tap_has_vnet_hdr_len, .using_vnet_hdr = tap_using_vnet_hdr, .set_offload = tap_set_offload, .set_vnet_hdr_len = tap_set_vnet_hdr_len, .set_vnet_le = tap_set_vnet_le, .set_vnet_be = tap_set_vnet_be, };
最终使用tap_receive进行发包,其实就是len = writev(s->fd, iov, iovcnt)向/dev/net/tun进行写入。
在tap_update_fd_handler下,当tun收包可读时,调用tap_send。
qemu_set_fd_handler(s->fd, s->read_poll && s->enabled ? tap_send : NULL, s->write_poll && s->enabled ? tap_writable : NULL, s); //从tun中读取数据 size = tap_read_packet(s->fd, s->buf, sizeof(s->buf)); //向qemu发送数据 qemu_send_packet_async(&s->nc, buf, size, tap_send_completed);
qemu_send_packet_async_with_flags继续调用qemu_net_queue_send
//如果qemu不能接收包, qemu_can_send_packet, //则将其放到队列中等待, qemu_net_queue_append(queue, sender, flags, data, size, sent_cb); //qemu可以接收包 qemu_net_queue_deliver(queue, sender, flags, data, size) ret = queue->deliver(sender, flags, &iov, 1, queue->opaque);
在qemu_net_client_setup下有
nc->incoming_queue = qemu_new_net_queue(qemu_deliver_packet_iov, nc);
而在qemu_deliver_packet_iov下,
NetClientState *nc = opaque; //即queue->opaque
调用nc->info->receive_iov(nc, iov, iovcnt),即net_e1000_info下的e1000_receive_iov。
//先将iov下的数据拷贝到本函数下的临时空间 if (size < sizeof(min_buf)) { iov_to_buf(iov, iovcnt, 0, min_buf, size); memset(&min_buf[size], 0, sizeof(min_buf) - size); e1000x_inc_reg_if_not_full(s->mac_reg, RUC); min_iov.iov_base = filter_buf = min_buf; min_iov.iov_len = size = sizeof(min_buf); iovcnt = 1; iov = &min_iov; } else if (iov->iov_len < MAXIMUM_ETHERNET_HDR_LEN) { /* This is very unlikely, but may happen. */ iov_to_buf(iov, iovcnt, 0, min_buf, MAXIMUM_ETHERNET_HDR_LEN); filter_buf = min_buf; } //读取guest下的描述符 base = rx_desc_base(s) + sizeof(desc) * s->mac_reg[RDH]; pci_dma_read(d, base, &desc, sizeof(desc)); //将内容写入到guest的接收队列中 iov_copy = MIN(copy_size, iov->iov_len - iov_ofs); pci_dma_write(d, ba, iov->iov_base + iov_ofs, iov_copy);
QEMU虚拟网络E1000源代码分析来自于OenHan
链接为:https://oenhan.com/qemu-virtual-network-e1000