首页 > Virtualization > QEMU虚拟网络E1000源代码分析

QEMU虚拟网络E1000源代码分析

Virtualization 2016-12-25

代码版本:QEMU git release v2.8

x86_64-softmmu/qemu-system-x86_64 -enable-kvm -M q35  -m 256 -smp 1 -drive file=/workspace/kvm_auto_test/kvm_test.qcow2,format=qcow2 -netdev tap,fd=26,id=hostnet0 -device e1000,netdev=hostnet0,id=net0,mac=52:54:00:f1:ec:ba,bus=pcie.0

e1000_class_init是网络设备初始化的源头,然后看pci_e1000_realize,

pci_dev->config_write = e1000_write_config;

//配置PCI config

pci_conf[PCI_CACHE_LINE_SIZE] = 0x10;

pci_conf[PCI_INTERRUPT_PIN] = 1;

//配置mmio

e1000_mmio_setup(d);

//注册bar空间

pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &d->mmio);

pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &d->io);

//初始化nic信息

d->nic = qemu_new_nic(&net_e1000_info, &d->conf,

object_get_typename(OBJECT(d)), dev->id, d);

/*
 * Initialise the two memory regions of the device: the MMIO region
 * ("e1000-mmio", PNPMMIO_SIZE bytes) and the port I/O region
 * ("e1000-io", IOPORT_SIZE bytes).
 *
 * Only the regions themselves are initialised here; no address is
 * assigned to them in this function.
 */
static void e1000_mmio_setup(E1000State *d)
{
    /*
     * Register offsets that are left out of the coalesced ranges.
     * PNPMMIO_SIZE is the terminating sentinel of the list.
     */
    const uint32_t excluded_regs[] = {
        E1000_MDIC, E1000_ICR, E1000_ICS, E1000_IMS,
        E1000_IMC, E1000_TCTL, E1000_TDT, PNPMMIO_SIZE
    };
    int idx = 0;

    memory_region_init_io(&d->mmio, OBJECT(d), &e1000_mmio_ops, d,
                          "e1000-mmio", PNPMMIO_SIZE);

    /* Everything below the first excluded register is coalesced. */
    memory_region_add_coalescing(&d->mmio, 0, excluded_regs[0]);

    /*
     * Coalesce each gap between two consecutive excluded registers,
     * skipping the 4 bytes occupied by the excluded register itself.
     */
    while (excluded_regs[idx] != PNPMMIO_SIZE) {
        uint32_t gap_start = excluded_regs[idx] + 4;
        uint32_t gap_len = excluded_regs[idx + 1] - excluded_regs[idx] - 4;

        memory_region_add_coalescing(&d->mmio, gap_start, gap_len);
        idx++;
    }

    memory_region_init_io(&d->io, OBJECT(d), &e1000_io_ops, d,
                          "e1000-io", IOPORT_SIZE);
}

对于e1000_write_config,除了按基本的PCI规范调用默认的配置空间写处理之外,还多做了一次队列刷新:

pci_default_write_config(pci_dev, address, val, len);

//当本次config写入的范围覆盖了PCI_COMMAND,且PCI_COMMAND_MASTER位处于置位状态时,

//就把之前排队的包刷新出去(类似于PCI下写同步的本质是再读一次)。

if (range_covers_byte(address, len, PCI_COMMAND) &&

(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {

qemu_flush_queued_packets(qemu_get_queue(s->nic));

}

/*
 * Access callbacks for the e1000 MMIO region. e1000_mmio_setup passes
 * this table to memory_region_init_io() when creating "e1000-mmio".
 */
static const MemoryRegionOps e1000_mmio_ops = {

.read = e1000_mmio_read,

.write = e1000_mmio_write,

/* The region is declared as a little-endian device. */
.endianness = DEVICE_LITTLE_ENDIAN,

/* The callbacks are implemented for 4-byte accesses only (min == max). */
.impl = {

.min_access_size = 4,

.max_access_size = 4,

},

};

e1000_mmio_write和e1000_mmio_read都是分别调用macreg_writeops和macreg_readops而已

/* putreg(x) expands to a designated initializer mapping x to mac_writereg. */
#define putreg(x)    [x] = mac_writereg

/*
 * Table of register write handlers, indexed by register index.
 * Each handler takes (device state, register index, value to write).
 */
static void (*macreg_writeops[])(E1000State *, int, uint32_t) = {

/* Registers whose write handler is the plain mac_writereg. */
putreg(PBA),      putreg(EERD),     putreg(SWSM),     putreg(WUFC),

putreg(TDBAL),    putreg(TDBAH),    putreg(TXDCTL),   putreg(RDBAH),

putreg(RDBAL),    putreg(LEDCTL),   putreg(VET),      putreg(FCRUC),

putreg(TDFH),     putreg(TDFT),     putreg(TDFHS),    putreg(TDFTS),

putreg(TDFPC),    putreg(RDFH),     putreg(RDFT),     putreg(RDFHS),

putreg(RDFTS),    putreg(RDFPC),    putreg(IPAV),     putreg(WUC),

putreg(WUS),      putreg(AIT),

/* Registers with a dedicated handler; note TCTL and TDT share set_tctl. */
[TDLEN]  = set_dlen,   [RDLEN]  = set_dlen,       [TCTL] = set_tctl,

[TDT]    = set_tctl,   [MDIC]   = set_mdic,       [ICS]  = set_ics,

[TDH]    = set_16bit,  [RDH]    = set_16bit,      [RDT]  = set_rdt,

[IMC]    = set_imc,    [IMS]    = set_ims,        [ICR]  = set_icr,

[EECD]   = set_eecd,   [RCTL]   = set_rx_control, [CTRL] = set_ctrl,

[RDTR]   = set_16bit,  [RADV]   = set_16bit,      [TADV] = set_16bit,

[ITR]    = set_16bit,

/*
 * Register arrays: every index in each range gets mac_writereg.
 * The "[a ... b]" range designator is a GNU C extension.
 */
[IP6AT ... IP6AT+3] = &mac_writereg, [IP4AT ... IP4AT+6] = &mac_writereg,

[FFLT ... FFLT+6]   = &mac_writereg, [RA ... RA+31]      = &mac_writereg,

[WUPM ... WUPM+31]  = &mac_writereg, [MTA ... MTA+127]   = &mac_writereg,

[VFTA ... VFTA+127] = &mac_writereg, [FFMT ... FFMT+254] = &mac_writereg,

[FFVT ... FFVT+254] = &mac_writereg, [PBM ... PBM+16383] = &mac_writereg,

};

macreg_writeops顾名思义,就是针对各个寄存器写操作的处理函数表,下面逐个来看

/*
 * Default register write handler: store val into s->mac_reg[index].
 *
 * When the register written is RA + 1, the values of RA and RA + 1 are
 * converted with cpu_to_le32() and handed, as a byte buffer, to
 * qemu_format_nic_info_str() for the NIC's queue.
 *
 * The explicit "static void" return type matches the handler type used
 * by macreg_writeops: void (*)(E1000State *, int, uint32_t).
 */
static void
mac_writereg(E1000State *s, int index, uint32_t val)
{
    /* Plain store of the written value. */
    s->mac_reg[index] = val;

    if (index == RA + 1) {
        uint32_t macaddr[2];

        macaddr[0] = cpu_to_le32(s->mac_reg[RA]);
        macaddr[1] = cpu_to_le32(s->mac_reg[RA + 1]);
        qemu_format_nic_info_str(qemu_get_queue(s->nic), (uint8_t *)macaddr);
    }
}

typedef struct E1000State_st {

uint32_t mac_reg[0x8000];

mac_reg本身就是E1000State结构体里预先分配好的数组。

set_dlen,set_mdic也是mac赋值而已,

s->mac_reg[index] = val & 0xfff80;

s->mac_reg[MDIC] = val | E1000_MDIC_READY;

set_mdic会配置s->phy_reg[addr] = data,同时通过set_ics发送虚拟中断

set_interrupt_cause(s, 0, val | s->mac_reg[ICR]);

set_16bit,set_rdt,set_imc,set_ims,set_icr都是设置mac寄存器后,最多再加一个中断。

set_rx_control配置接收缓存,并负责使用qemu_flush_queued_packets刷新队列中的packet。

最终看一下set_tctl函数,

/*
 * Write handler installed for both TCTL and TDT in macreg_writeops.
 *
 * Stores val into s->mac_reg[index], masks TDT down to its low 16 bits,
 * then calls start_xmit() to kick off transmission.
 *
 * The explicit "static void" return type matches the handler type used
 * by macreg_writeops: void (*)(E1000State *, int, uint32_t).
 */
static void
set_tctl(E1000State *s, int index, uint32_t val)
{
    s->mac_reg[index] = val;

    /* Only the low 16 bits of TDT are kept. */
    s->mac_reg[TDT] &= 0xffff;

    start_xmit(s);
}

start_xmit负责发包

while (s->mac_reg[TDH] != s->mac_reg[TDT]) {

//获取发送ring的首地址gpa

base = tx_desc_base(s) + sizeof(struct e1000_tx_desc) * s->mac_reg[TDH];

//从guest中读取desc的内容

pci_dma_read(d, base, &desc, sizeof(desc));

process_tx_desc中开始拼装网络包

addr = le64_to_cpu(dp->buffer_addr);

//从内存读取网络包的data数据

pci_dma_read(d, addr, tp->data + tp->size, bytes);

xmit_seg继续组装IP协议头部,使用e1000_send_packet进行发包

if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {

nc->info->receive(nc, buf, size);

在net_tap_fd_init下,

nc = qemu_new_net_client(&net_tap_info, peer, model, name);

s = DO_UPCAST(TAPState, nc, nc);

/*
 * NetClientInfo describing the tap backend: its client type, the size of
 * the per-client state (TAPState), and the callbacks used for it.
 */
static NetClientInfo net_tap_info = {

.type = NET_CLIENT_DRIVER_TAP,

.size = sizeof(TAPState),

/* Packet delivery callbacks (single buffer, raw, and iovec variants). */
.receive = tap_receive,

.receive_raw = tap_receive_raw,

.receive_iov = tap_receive_iov,

.poll = tap_poll,

.cleanup = tap_cleanup,

/* Capability queries and setters (UFO, vnet header, offload, endianness). */
.has_ufo = tap_has_ufo,

.has_vnet_hdr = tap_has_vnet_hdr,

.has_vnet_hdr_len = tap_has_vnet_hdr_len,

.using_vnet_hdr = tap_using_vnet_hdr,

.set_offload = tap_set_offload,

.set_vnet_hdr_len = tap_set_vnet_hdr_len,

.set_vnet_le = tap_set_vnet_le,

.set_vnet_be = tap_set_vnet_be,

};

最终使用tap_receive进行发包,其实就是len = writev(s->fd, iov, iovcnt)向/dev/net/tun进行写入。

在tap_update_fd_handler下,当tun收包可读时,调用tap_send。

qemu_set_fd_handler(s->fd,

s->read_poll && s->enabled ? tap_send : NULL,

s->write_poll && s->enabled ? tap_writable : NULL,

s);

//从tun中读取数据

size = tap_read_packet(s->fd, s->buf, sizeof(s->buf));

//向qemu发送数据

qemu_send_packet_async(&s->nc, buf, size, tap_send_completed);

qemu_send_packet_async_with_flags继续调用qemu_net_queue_send

//如果qemu不能接收包,

qemu_can_send_packet,

//则将其放到队列中等待,

qemu_net_queue_append(queue, sender, flags, data, size, sent_cb);

//qemu可以接收包

qemu_net_queue_deliver(queue, sender, flags, data, size)

ret = queue->deliver(sender, flags, &iov, 1, queue->opaque);

在qemu_net_client_setup下有

nc->incoming_queue = qemu_new_net_queue(qemu_deliver_packet_iov, nc);

而在qemu_deliver_packet_iov下,

NetClientState *nc = opaque; //即queue->opaque

调用nc->info->receive_iov(nc, iov, iovcnt),即net_e1000_info下的e1000_receive_iov。

//先将iov下的数据拷贝到本函数下的临时空间

if (size < sizeof(min_buf)) {

iov_to_buf(iov, iovcnt, 0, min_buf, size);

memset(&min_buf[size], 0, sizeof(min_buf) - size);

e1000x_inc_reg_if_not_full(s->mac_reg, RUC);

min_iov.iov_base = filter_buf = min_buf;

min_iov.iov_len = size = sizeof(min_buf);

iovcnt = 1;

iov = &min_iov;

} else if (iov->iov_len < MAXIMUM_ETHERNET_HDR_LEN) {

/* This is very unlikely, but may happen. */

iov_to_buf(iov, iovcnt, 0, min_buf, MAXIMUM_ETHERNET_HDR_LEN);

filter_buf = min_buf;

}
//读取guest下的描述符

base = rx_desc_base(s) + sizeof(desc) * s->mac_reg[RDH];

pci_dma_read(d, base, &desc, sizeof(desc));

//将内容写入到guest的接收队列中

iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);

pci_dma_write(d, ba, iov->iov_base + iov_ofs, iov_copy);

QEMU虚拟网络E1000源代码分析来自于OenHan,链接为:http://oenhan.com/qemu-virtual-network-e1000
更多阅读