Linux网络协议栈之驱动框架

作者:bullbat

        网卡驱动既可以以模块的方式加载,也可以在内核初始化的时候加载。下面以e100系列网卡为例,说明网卡驱动的一般框架。

网卡设备通用数据结构:

/*
 * struct net_device - generic per-interface descriptor shared by the network
 * core and the device drivers.  The fields are grouped by cache line
 * according to the path (receive, queueing, transmit) that mostly uses them.
 */
struct net_device
{
         /*
          * This is the first field of the "visible" part of this structure
          * (i.e. as seen by users in the "Space.c" file).  It is the name
          * the interface.
          */
          /* Network device name. */
         char                    name[IFNAMSIZ];
         /* device name hash chain */
         /*
          * Links the device into the dev_name_head hash table, keyed by
          * device name, so that a device can be located quickly by name.
          */
         struct hlist_node       name_hlist;
         /*
          *     I/O specific fields
          *     FIXME: Merge these and struct ifmap into one
          */
          /* End and start addresses of the device's shared memory. */
         unsigned long            mem_end;          /* shared mem end    */
         unsigned long            mem_start;        /* shared mem start   */
         /*
          * I/O base address of the interface.  Initialized when the device
          * is probed; ifconfig can display and modify it.
          */
         unsigned long            base_addr;        /* device I/O address         */
         /* IRQ number assigned to the device, normally set at device init. */
         unsigned int               irq;             /* device IRQ number        */
         /*
          *     Some hardware also needs these fields, but they are not
          *     part of the usual set specified in Space.c.
          */
         /* Selects which port to use on a multi-port device. */
         unsigned char            if_port;     /* Selectable AUI, TP,..*/
         /* DMA channel allocated to the device. */
         unsigned char            dma;          /* DMA channel                 */
         /* Device state bits. */
         unsigned long            state;
         /* Next device in the list of network devices. */
         struct net_device      *next;
         /* Driver-supplied initialization function. */
         /* The device initialization function. Called only once. */
         int                       (*init)(struct net_device *dev);
         /* ------- Fields preinitialized in Space.c finish here ------- */
         /* Net device features */
         /* Feature bits supported by the interface. */
         unsigned long            features;
#define NETIF_F_SG                  1       /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM              2       /* Can checksum only TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM            4       /* Does not require checksum. F.e. loopback. */
#define NETIF_F_HW_CSUM           8       /* Can checksum all the packets. */
#define NETIF_F_HIGHDMA            32     /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST   64     /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX   128   /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX   256   /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER     512   /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO               2048 /* Enable software GSO. */
#define NETIF_F_LLTX             4096 /* LockLess TX */
         /* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
#define NETIF_F_GSO_MASK         0xffff0000
#define NETIF_F_TSO               (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
#define NETIF_F_UFO               (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
#define NETIF_F_GSO_ROBUST      (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO_ECN              (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO6             (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
         /* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE        (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)

#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
         /*
          * Chains together the devices that have been scheduled for
          * packet output.
          */
         struct net_device      *next_sched;
         /* Interface index. Unique device identifier         */
         /* Index number of the network device. */
         int                       ifindex;
         /*
          * Identifier of the network device, mainly used by virtual tunnel
          * devices.  register_netdevice() sets it to ifindex when it is
          * still -1 after the init hook has run.
          */
         int                       iflink;
         /* Hook through which interface statistics are obtained. */
         struct net_device_stats* (*get_stats)(struct net_device *dev);
         /* List of functions to handle Wireless Extensions (instead of ioctl).
          * See <net/iw_handler.h> for details. Jean II */
         /* Wireless-related. */
         const struct iw_handler_def *   wireless_handlers;
         /* Instance data managed by the core of Wireless Extensions. */
         struct iw_public_data *     wireless_data;

         const struct ethtool_ops *ethtool_ops;
         /*
          * This marks the end of the "visible" part of the structure. All
          * fields hereafter are internal to the system, and may change at
          * will (read: may be cleaned up at will).
          */
         unsigned int               flags;         /* interface flags (a la BSD)        */
         /*
          * Records the current IFF_PROMISC and IFF_ALLMULTI state of the
          * device; used together with the flags field.
          */
         unsigned short          gflags;
        unsigned short          priv_flags; /* Like 'flags' but invisible to userspace. */
         unsigned short          padded;    /* How much padding added by alloc_netdev() */

         unsigned char            operstate; /* RFC2863 operstate */
         unsigned char            link_mode; /* mapping policy to operstate */

         unsigned           mtu; /* interface MTU value               */
         unsigned short          type;         /* interface hardware type          */
         unsigned short          hard_header_len;      /* hardware hdr length      */

         struct net_device      *master; /* Pointer to master device of a group,
                                                 * which this device is member of.
                                                 */
         /* Interface address info. */
         /* MAC address, normally read from the hardware at initialization. */
         unsigned char            perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
         unsigned char            addr_len;  /* hardware address length        */
         unsigned short          dev_id;              /* for shared network cards */

         struct dev_mc_list    *mc_list;  /* Multicast mac addresses       */
         int                       mc_count;         /* Number of installed mcasts   */
         /* Counter of promiscuous-mode requests on the device. */
         int                       promiscuity;
         /*
          * Counter of "receive all multicast" requests: it goes up by one on
          * every enable and down by one on every disable, and the device
          * only really stops receiving all multicast once it reaches 0.
          */
         int                       allmulti;
         /* Protocol specific pointers */

         void                            *atalk_ptr;         /* AppleTalk link      */
         void                    *ip_ptr;    /* IPv4 specific data */
         void                    *dn_ptr;        /* DECnet specific data */
         void                    *ip6_ptr;       /* IPv6 specific data */
         void                    *ec_ptr;    /* Econet specific data      */
         void                    *ax25_ptr;         /* AX.25 specific data */
/*
 * Cache line mostly used on receive path (including eth_type_trans())
 */
        /* Links this device onto the poll_list member of softnet_data. */
         struct list_head          poll_list ____cacheline_aligned_in_smp;
                                               /* Link to poll list      */
         /* Polling-mode receive hook. */
         int                       (*poll) (struct net_device *dev, int *quota);
         /*
          * Packet-read quota; it changes dynamically.  Every time packets
          * are read from the device the number read is subtracted from it,
          * and once it is zero or negative the current poll ends and the
          * device waits for the next polling round.  This keeps one busy
          * device from starving the others: on input the poll queue is
          * walked, packets are read from the selected device, and as soon
          * as the number read exceeds the quota the read stops and the
          * device is moved to the tail of the poll queue.
          * NOTE(review): the article says the initial value comes from
          * netdev_budget - confirm against net_rx_action().
          */
         int                       quota;
         /* Per-device packet quota within the packet-input softirq. */
         int                       weight;
         unsigned long            last_rx;      /* Time of last Rx       */
         /* Interface address info used in eth_type_trans() */
         unsigned char            dev_addr[MAX_ADDR_LEN];          /* hw address, (before bcast
                                                                 because most packets are unicast) */
         unsigned char            broadcast[MAX_ADDR_LEN];         /* hw bcast add         */
/*
 * Cache line mostly used on queue transmit path (qdisc)
 */
         /* device queue lock */
         spinlock_t                   queue_lock ____cacheline_aligned_in_smp;
         /*
          * Root queueing discipline currently in use; taken from
          * qdisc_sleeping when the configured discipline takes effect.
          */
         struct Qdisc                *qdisc;
         /*
          * Currently configured queueing discipline; installed into qdisc
          * when it takes effect.
          */
         struct Qdisc                *qdisc_sleeping;
         /* List of all queueing disciplines configured on this device. */
         struct list_head          qdisc_list;
         /* Maximum number of packets that may sit in the transmit queue. */
         unsigned long            tx_queue_len;   /* Max frames per queue allowed */
         /* Partially transmitted GSO packet. */
         struct sk_buff            *gso_skb;
         /* ingress path synchronizer */
         spinlock_t                   ingress_lock;
         /* Queueing discipline for incoming packets. */
         struct Qdisc                *qdisc_ingress;
/*
 * One part is mostly used on xmit path (device)
 */
         /* hard_start_xmit synchronizer */
         spinlock_t                   _xmit_lock ____cacheline_aligned_in_smp;
         /* cpu id of processor entered to hard_start_xmit or -1,
            if nobody entered there.
          */
         int                       xmit_lock_owner;
         void                    *priv;        /* pointer to private data   */
         /*
          * Transmit hook the driver offers to the layer above; it is
          * always called when a packet is sent.
          */
         int                       (*hard_start_xmit) (struct sk_buff *skb,
                                                            struct net_device *dev);
         /* These may be needed for future network-power-down code. */
         unsigned long            trans_start;       /* Time (in jiffies) of last Tx        */
         /*
          * Minimum time that must pass before the network layer decides a
          * transmit has timed out and calls the driver's tx_timeout hook.
          */
         int                       watchdog_timeo; /* used by dev_watchdog() */
         /*
          * Timer used to detect, while the device is otherwise working
          * normally, a transmit timeout caused by the queue being stopped;
          * when that happens the driver's tx_timeout hook is called.
          */
         struct timer_list          watchdog_timer;
/*
 * refcnt is a very hot point, so align it on SMP
 */
         /* Number of references to this device */
         atomic_t             refcnt ____cacheline_aligned_in_smp;
         /* delayed register/unregister */
         /*
          * Links the device into net_todo_list, which holds devices that
          * have been unregistered and are about to be torn down.
          */
         struct list_head          todo_list;
         /* device index hash chain */
         /* Links the device into the hash table keyed by interface index. */
         struct hlist_node       index_hlist;
         /* register/unregister state machine */
         enum { NETREG_UNINITIALIZED=0,
                NETREG_REGISTERED,        /* completed register_netdevice */
                NETREG_UNREGISTERING,          /* called unregister_netdevice */
                NETREG_UNREGISTERED,  /* completed unregister todo */
                NETREG_RELEASED,            /* called free_netdev */
         } reg_state;
         /* Called after device is detached from network. */
         void                    (*uninit)(struct net_device *dev);
         /* Called after last user reference disappears. */
         void                    (*destructor)(struct net_device *dev);
         /* Pointers to interface service routines.    */
         /*
          * Brings the device up: acquires the system resources it needs
          * and turns on the hardware.
          */
         int                       (*open)(struct net_device *dev);
         int                       (*stop)(struct net_device *dev);
#define HAVE_NETDEV_POLL
/* Builds the hardware header from the source and destination hardware addresses. */
         int                       (*hard_header) (struct sk_buff *skb,
                                                        struct net_device *dev,
                                                        unsigned short type,
                                                        void *daddr,
                                                        void *saddr,
                                                        unsigned len);
/* Rebuilds the hardware header after ARP resolution, before the packet is sent. */
         int                       (*rebuild_header)(struct sk_buff *skb);
#define HAVE_MULTICAST
         /* Pushes the multicast address list down to the device. */
         void                    (*set_multicast_list)(struct net_device *dev);
#define HAVE_SET_MAC_ADDR
         /* Changes the hardware address; the device has to support this. */
         int                       (*set_mac_address)(struct net_device *dev,
                                                           void *addr);
#define HAVE_PRIVATE_IOCTL
         int                       (*do_ioctl)(struct net_device *dev,
                                                   struct ifreq *ifr, int cmd);
#define HAVE_SET_CONFIG
         int                       (*set_config)(struct net_device *dev,
                                                     struct ifmap *map);
#define HAVE_HEADER_CACHE
         /* Fills in an hh_cache structure from the result of an ARP lookup. */
         int                       (*hard_header_cache)(struct neighbour *neigh,
                                                             struct hh_cache *hh);
         void                    (*header_cache_update)(struct hh_cache *hh,
                                                               struct net_device *dev,
                                                               unsigned char *  haddr);
#define HAVE_CHANGE_MTU
         int                       (*change_mtu)(struct net_device *dev, int new_mtu);

#define HAVE_TX_TIMEOUT
         void                    (*tx_timeout) (struct net_device *dev);

         void                    (*vlan_rx_register)(struct net_device *dev,
                                                            struct vlan_group *grp);
         void                    (*vlan_rx_add_vid)(struct net_device *dev,
                                                           unsigned short vid);
         void                    (*vlan_rx_kill_vid)(struct net_device *dev,
                                                            unsigned short vid);

         int                       (*hard_header_parse)(struct sk_buff *skb,
                                                             unsigned char *haddr);
         /* Sets parameters related to the neighbour subsystem. */
         int                       (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
#ifdef CONFIG_NETPOLL
         /* netpoll information block of the device. */
         struct netpoll_info    *npinfo;
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
         /*
          * Asks the driver to poll the interface for events while
          * interrupts are disabled.
          */
         void                    (*poll_controller)(struct net_device *dev);
#endif
         /* bridge stuff */
         struct net_bridge_port      *br_port;

         /* class/net/name entry */
         struct class_device   class_dev;
         /* space for optional statistics and wireless sysfs groups */
         struct attribute_group  *sysfs_groups[3];
};

网卡驱动的注册是在e100_init_module中完成的:

/*
 * Module entry point: print the driver banner when driver-level messages
 * are enabled by the "debug" level, then register the e100 PCI driver.
 *
 * Returns whatever pci_register_driver() returns.
 */
static int __init e100_init_module(void)
{
	/* Bitmask with one bit set for every message level below "debug". */
	const int msg_mask = (1 << debug) - 1;

	if (msg_mask & NETIF_MSG_DRV) {
		printk(KERN_INFO PFX "%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
		printk(KERN_INFO PFX "%s\n", DRV_COPYRIGHT);
	}

	return pci_register_driver(&e100_driver);
}

可见,网卡驱动的编写方式和一般的PCI驱动是一样的。

/*
 * PCI driver descriptor for the e100.  e100_init_module() passes it to
 * pci_register_driver(); the callbacks below are the driver's entry points.
 */
static struct pci_driver e100_driver = {
         .name =         DRV_NAME,
         /* Device ID table the driver is registered with. */
         .id_table =     e100_id_table,
         /* Per-device setup: allocates and registers the net_device. */
         .probe =        e100_probe,
         .remove =       __devexit_p(e100_remove),
#ifdef CONFIG_PM
         /* Power Management hooks */
         .suspend =      e100_suspend,
         .resume =       e100_resume,
#endif
         .shutdown =     e100_shutdown,
         .err_handler = &e100_err_handler,
};

       如果网络设备驱动程序被编译进内核,则在系统启动时被初始化;否则在运行时作为模块被加载。无论初始化在何时发生,由驱动程序控制的网络设备都会被注册。这种情形适用于所有的总线类型:无论是由总线体系结构还是由模块初始化代码调用注册函数,结果都是一样的。PCI设备驱动程序加载后,最终会执行pci_driver->probe()函数。我们看看e100网卡的驱动注册过程:

/*
 * e100_probe - PCI probe callback of the e100 driver.
 * @pdev: PCI device being probed
 * @ent:  matching entry of the driver's ID table; a non-zero driver_data
 *        sets the "ich" flag on the NIC
 *
 * Allocates the net_device together with its private struct nic, fills in
 * the net_device callbacks, enables and maps the PCI device, resets the
 * hardware, loads the MAC address from the EEPROM and finally registers
 * the network device.
 *
 * Returns 0 on success or a negative errno; on failure everything acquired
 * so far is released through the err_out_* labels in reverse order.
 */
static int __devinit e100_probe(struct pci_dev *pdev,
         const struct pci_device_id *ent)
{
         struct net_device *netdev;
         struct nic *nic;
         int err;
         /* Allocate the net_device plus room for the private struct nic. */
         if(!(netdev = alloc_etherdev(sizeof(struct nic)))) {
                   if(((1 << debug) - 1) & NETIF_MSG_PROBE)
                            printk(KERN_ERR PFX "Etherdev alloc failed, abort.\n");
                   return -ENOMEM;
         }
         /* Fill in the device callbacks. */
         netdev->open = e100_open;
         netdev->stop = e100_close;
         /*
          * e100 implementation of the hard_start_xmit hook; this is what
          * finally hands a packet to the hardware.
          */
         netdev->hard_start_xmit = e100_xmit_frame;
         netdev->get_stats = e100_get_stats;
         netdev->set_multicast_list = e100_set_multicast_list;
         netdev->set_mac_address = e100_set_mac_address;
         netdev->change_mtu = e100_change_mtu;
         netdev->do_ioctl = e100_do_ioctl;
         SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops);
         netdev->tx_timeout = e100_tx_timeout;
         netdev->watchdog_timeo = E100_WATCHDOG_PERIOD;
         netdev->poll = e100_poll;
         netdev->weight = E100_NAPI_WEIGHT;
#ifdef CONFIG_NET_POLL_CONTROLLER
/*
 * Needed for netpoll receive support: this hook simulates a device
 * interrupt and runs the interrupt handling.
 */
         netdev->poll_controller = e100_netpoll;
#endif
         /*
          * Temporary name taken from the PCI device; it is replaced by
          * "eth%d" just before registration below.
          */
         strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);
         /* Fetch the private area (struct nic) set up by alloc_etherdev(). */
         nic = netdev_priv(netdev);
         /* Initialize the nic. */
         nic->netdev = netdev;
         nic->pdev = pdev;
         nic->msg_enable = (1 << debug) - 1;
         /* Store the net_device as the PCI device's driver data. */
         pci_set_drvdata(pdev, netdev);

         /* Initialize device before it's used by a driver. Ask low-level code
 *  to enable I/O and memory. Wake up the device if it was suspended.
 *  Beware, this function can fail.*/
         if((err = pci_enable_device(pdev))) {
                   DPRINTK(PROBE, ERR, "Cannot enable PCI device, aborting.\n");
                   goto err_out_free_dev;
         }
         /* Resource 0 must be a memory resource. */
         if(!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
                   DPRINTK(PROBE, ERR, "Cannot find proper PCI device "
                            "base address, aborting.\n");
                   err = -ENODEV;
                   goto err_out_disable_pdev;
         }
         /* Reserve the device's resources, both I/O and memory. */
         if((err = pci_request_regions(pdev, DRV_NAME))) {
                   DPRINTK(PROBE, ERR, "Cannot obtain PCI resources, aborting.\n");
                   goto err_out_disable_pdev;
         }
         /*
          * DMA setup: request a 32-bit DMA mask; a return value of 0 means
          * the configuration is usable.
          */
         if((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) {
                   DPRINTK(PROBE, ERR, "No usable DMA configuration, aborting.\n");
                   goto err_out_free_res;
         }
         SET_MODULE_OWNER(netdev);
         SET_NETDEV_DEV(netdev, &pdev->dev);
         /* Map the control/status registers (struct csr) of resource 0. */
         nic->csr = ioremap(pci_resource_start(pdev, 0), sizeof(struct csr));
         if(!nic->csr) {
                   DPRINTK(PROBE, ERR, "Cannot map device registers, aborting.\n");
                   err = -ENOMEM;
                   goto err_out_free_res;
         }
         if(ent->driver_data)
                   nic->flags |= ich;
         else
                   nic->flags &= ~ich;
         /* Initialize the remaining nic fields to their defaults. */
         e100_get_defaults(nic);
         /* locks must be initialized before calling hw_reset */
         spin_lock_init(&nic->cb_lock);
         spin_lock_init(&nic->cmd_lock);
         spin_lock_init(&nic->mdio_lock);
         /* Reset the device before pci_set_master() in case device is in some
          * funky state and has an interrupt pending - hint: we don't have the
          * interrupt handler registered yet. */
          /* Reset the hardware (presumably by writing the device registers). */
         e100_hw_reset(nic);
         /* Enable bus mastering for the device. */
         pci_set_master(pdev);
         /* Initialize the two software timers. */
         init_timer(&nic->watchdog);
         nic->watchdog.function = e100_watchdog;
         nic->watchdog.data = (unsigned long)nic;
         init_timer(&nic->blink_timer);
         nic->blink_timer.function = e100_blink_led;
         nic->blink_timer.data = (unsigned long)nic;
         /* Initialize the work item used for transmit timeouts. */
         INIT_WORK(&nic->tx_timeout_task, e100_tx_timeout_task);
         /* Allocate the driver's DMA memory. */
         if((err = e100_alloc(nic))) {
                   DPRINTK(PROBE, ERR, "Cannot alloc driver memory, aborting.\n");
                   goto err_out_iounmap;
         }
         /* Read the NIC's EEPROM, which holds the MAC address. */
         if((err = e100_eeprom_load(nic)))
                   goto err_out_free;
         /* Initialize the PHY-related state of the nic. */
         e100_phy_init(nic);
         memcpy(netdev->dev_addr, nic->eeprom, ETH_ALEN);
         memcpy(netdev->perm_addr, nic->eeprom, ETH_ALEN);
         /* Check that the MAC address read from the EEPROM is well formed. */
         if(!is_valid_ether_addr(netdev->perm_addr)) {
                   DPRINTK(PROBE, ERR, "Invalid MAC address from "
                            "EEPROM, aborting.\n");
                   err = -EAGAIN;
                   goto err_out_free;
         }
         /* Wol magic packet can be enabled from eeprom */
         if((nic->mac >= mac_82558_D101_A4) &&
            (nic->eeprom[eeprom_id] & eeprom_id_wol))
                   nic->flags |= wol_magic;
         /* ack any pending wake events, disable PME */
         /*
          * The second argument of pci_enable_wake() is a power state and
          * the third says whether to enable (1) or disable (0).  PME#
          * (Power Management Event) is the signal a device raises when it
          * wants its power state changed; whether a device may raise it is
          * switchable, with one switch per power state.
          */
         err = pci_enable_wake(pdev, 0, 0);
         if (err)
                   DPRINTK(PROBE, ERR, "Error clearing wake event\n");
         /*
          * Switch the name to the "eth%d" template (presumably expanded to
          * a concrete ethN name during registration).
          */
         strcpy(netdev->name, "eth%d");
         /* Register the network device. */
         if((err = register_netdev(netdev))) {
                   DPRINTK(PROBE, ERR, "Cannot register net device, aborting.\n");
                   goto err_out_free;
         }
         DPRINTK(PROBE, INFO, "addr 0x%llx, irq %d, "
                   "MAC addr %02X:%02X:%02X:%02X:%02X:%02X\n",
                   (unsigned long long)pci_resource_start(pdev, 0), pdev->irq,
                   netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2],
                   netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5]);
         return 0;
/* Error unwinding: each label undoes one acquisition, newest first. */
err_out_free:
         e100_free(nic);
err_out_iounmap:
         iounmap(nic->csr);
err_out_free_res:
         pci_release_regions(pdev);
err_out_disable_pdev:
         pci_disable_device(pdev);
err_out_free_dev:
         pci_set_drvdata(pdev, NULL);
         free_netdev(netdev);
         return err;
}

其辅助函数:

分配网络设备结构

/*
 * alloc_etherdev - allocate a net_device set up for Ethernet.
 * @sizeof_priv: size in bytes of the driver-private area to reserve
 *               (e100_probe passes sizeof(struct nic))
 *
 * Thin wrapper around alloc_netdev() that supplies the "eth%d" name
 * template and ether_setup() as the setup callback.  Returns whatever
 * alloc_netdev() returns (NULL on allocation failure).
 */
struct net_device *alloc_etherdev(int sizeof_priv)
{
	struct net_device *dev;

	dev = alloc_netdev(sizeof_priv, "eth%d", ether_setup);
	return dev;
}

struct net_device *alloc_netdev(int sizeof_priv, const char *name,

                   void (*setup)(struct net_device *))

{

         void *p;

         struct net_device *dev;

         int alloc_size;

         BUG_ON(strlen(name) >= sizeof(dev->name));

         /* ensure 32-byte alignment of both the device and private area */

         /*计算分配的大小为设备结构大小加上nic结构大小*/

         alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;

         alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;

         /*分配空间*/

         p = kzalloc(alloc_size, GFP_KERNEL);

         if (!p) {

                   printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");

                   return NULL;

         }

         dev = (struct net_device *)

                   (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);

         /*计算padd大小为结构大小减去对其的数据大小*/

         dev->padded = (char *)dev - (char *)p;

         if (sizeof_priv)

                   /*私有数据为nic结构的起始地址*/

                   dev->priv = netdev_priv(dev);

         /*调用参数中的函数指针,初始化设备结构*/

         setup(dev);

         strcpy(dev->name, name);

         return dev;

}

/*分配设备结构时调用,用于初始化该设备结构*/

void ether_setup(struct net_device *dev)

{

         dev->change_mtu              = eth_change_mtu;

         dev->hard_header     = eth_header;

         dev->rebuild_header         = eth_rebuild_header;

         dev->set_mac_address    = eth_mac_addr;

         dev->hard_header_cache = eth_header_cache;

         dev->header_cache_update= eth_header_cache_update;

         dev->hard_header_parse  = eth_header_parse;

         dev->type                   = ARPHRD_ETHER;

         dev->hard_header_len     = ETH_HLEN;

         dev->mtu           = ETH_DATA_LEN;

         dev->addr_len           = ETH_ALEN;

         dev->tx_queue_len   = 1000;      /* Ethernet wants good queues */    

         dev->flags                  = IFF_BROADCAST|IFF_MULTICAST;

        

         memset(dev->broadcast, 0xFF, ETH_ALEN);

}

注册网络设备的实际操作由register_netdev(netdev)调用register_netdevice()完成

int register_netdevice(struct net_device *dev)

{

         struct hlist_head *head;

         struct hlist_node *p;

         int ret;

         BUG_ON(dev_boot_phase);

         ASSERT_RTNL();

         /*2.6内核支持内核抢占,该函数检查是否需要从新调度

         如果是,则进行调度,无论此时进行执行在内核空间还是

         用户空间*/

         might_sleep();

         /*初始化设备的各个字段*/

         /* When net_device's are persistent, this will be fatal. */

         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);

         spin_lock_init(&dev->queue_lock);

         spin_lock_init(&dev->_xmit_lock);

         dev->xmit_lock_owner = -1;

#ifdef CONFIG_NET_CLS_ACT

         spin_lock_init(&dev->ingress_lock);

#endif

         dev->iflink = -1;

         /* Init, if this function is available */

         /*如果有init函数,调用该函数进行初始化*/

         if (dev->init) {

                   ret = dev->init(dev);

                   if (ret) {

                            if (ret > 0)

                                     ret = -EIO;

                            goto out;

                   }

         }

        /*检测待注册的网络设备名是否有效*/

         if (!dev_valid_name(dev->name)) {

                   ret = -EINVAL;

                   goto out;

         }

         /*为设备分配一个唯一索引号和一个用于虚拟隧道设备

         的唯一标识。*/       

         dev->ifindex = dev_new_index();

         if (dev->iflink == -1)

                   dev->iflink = dev->ifindex;

         /* Check for existence of name */

         /*将网络设备添加到dev_name_head散列表中,并检测是否

         存在同名的网络设备*/

         head = dev_name_hash(dev->name);

         hlist_for_each(p, head) {

                   struct net_device *d

                            = hlist_entry(p, struct net_device, name_hlist);

                   if (!strncmp(d->name, dev->name, IFNAMSIZ)) {

                            ret = -EEXIST;

                           goto out;

                   }

        }

         /* Fix illegal SG+CSUM combinations. */

         /*只有在网络设备支持校验和计算的情况下,网络设备才能支持SG类型的聚合分散I/O

         因为SG类型的聚合分散I/O特性没有传输层硬件检验和支持是无用的*/

         if ((dev->features & NETIF_F_SG) &&

             !(dev->features & NETIF_F_ALL_CSUM)) {

                   printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",

                          dev->name);

                   dev->features &= ~NETIF_F_SG;

         }

         /* TSO requires that SG is present as well. */

         /*TSO需要SG类型的聚合分散性I/O的支持,因此在后者不被支持时也将被禁用*/

         if ((dev->features & NETIF_F_TSO) &&

             !(dev->features & NETIF_F_SG)) {

                   printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",

                          dev->name);

                   dev->features &= ~NETIF_F_TSO;

         }

         /*UFO需要NETIF_F_HW_CSUMSG类型的聚合分散I/O的支持,因此在后者不被支持的情况下

         也将被禁用*/

         if (dev->features & NETIF_F_UFO) {

                   if (!(dev->features & NETIF_F_HW_CSUM)) {

                            printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "

                                               "NETIF_F_HW_CSUM feature.\n",

                                                                 dev->name);

                            dev->features &= ~NETIF_F_UFO;

                   }

                   if (!(dev->features & NETIF_F_SG)) {

                            printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "

                                               "NETIF_F_SG feature.\n",

                                               dev->name);

                            dev->features &= ~NETIF_F_UFO;

                   }

         }

         /*

          *     nil rebuild_header routine,

          *     that should be never called and used as just bug trap.

          */

         /*初始化网络设备用于重建硬件首部的rebuild_header接口

         */

         if (!dev->rebuild_header)

                   dev->rebuild_header = default_rebuild_header;

         /*将网络设备的注册信息注册到sysfs文件系统中*/

         ret = netdev_register_sysfs(dev);

         if (ret)

                   goto out;

         /*设置网络设备的状态,表示注册已经完成*/

         dev->reg_state = NETREG_REGISTERED;

         /*

          *     Default initial state at registry is that the

          *     device is present.

          */

         /*设置相应位,表示设备对系统是可用的*/

         set_bit(__LINK_STATE_PRESENT, &dev->state);

         /*下面为初始化网络设备排队规则,并注册到网络设备的

         链表和相关散列表中*/

         dev->next = NULL;

         dev_init_scheduler(dev);

         write_lock_bh(&dev_base_lock);

         *dev_tail = dev;

         dev_tail = &dev->next;

         hlist_add_head(&dev->name_hlist, head);

         hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));

         dev_hold(dev);

         write_unlock_bh(&dev_base_lock);

         /* Notify protocols, that a new device appeared. */

         /*通知所有对设备注册感兴趣的其他内核模块*/

         raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);

         ret = 0;

out:

         return ret;

}

PCI驱动中的其他函数意义同其名,我们看看挂起操作e100_suspend

/*
 * e100_suspend - PCI power-management suspend hook.
 * @pdev:  PCI device being suspended
 * @state: requested PM message (not used here)
 *
 * Quiesces the interface, saves the PCI state, arms or disarms wake-up
 * from D3hot/D3cold, then disables the device, frees its interrupt and
 * puts it into D3hot.  Always returns 0.
 */
static int e100_suspend(struct pci_dev *pdev, pm_message_t state)
{
	struct net_device *netdev = pci_get_drvdata(pdev);
	struct nic *nic = netdev_priv(netdev);
	int wake;

	/* If the interface is up, wait for any polling receive to finish. */
	if (netif_running(netdev))
		netif_poll_disable(nic->netdev);

	/* Stop the timer that watches the device's working state. */
	del_timer_sync(&nic->watchdog);

	/* Report the carrier as off and detach the device. */
	netif_carrier_off(nic->netdev);
	netif_device_detach(netdev);

	pci_save_state(pdev);

	/*
	 * pci_enable_wake() takes a power state and an enable flag (1 to
	 * enable, 0 to disable).  Here the flag is set when wake-on-LAN
	 * magic packets are enabled or e100_asf() reports non-zero, and the
	 * same setting is applied to both D3hot and D3cold.  PME# (Power
	 * Management Event) is the signal a device raises to request a
	 * power-state change; whether it may raise it is switchable
	 * separately for each power state.
	 */
	wake = ((nic->flags & wol_magic) | e100_asf(nic)) ? 1 : 0;
	pci_enable_wake(pdev, PCI_D3hot, wake);
	pci_enable_wake(pdev, PCI_D3cold, wake);

	/* Disable the device, release its interrupt, then power it down. */
	pci_disable_device(pdev);
	free_irq(pdev->irq, netdev);
	pci_set_power_state(pdev, PCI_D3hot);

	return 0;
}

       这样,网络设备的驱动框架就搭建起来了:驱动程序在模块初始化函数中注册网卡的PCI驱动,在probe函数中注册网络设备,并初始化相关数据结构和函数指针。对于特定的网卡需要特定的数据结构来保存信息,硬件相关的操作需要按照对应网卡的约定来实现。对于e100系列网卡,数据结构nic保存了该网卡的所有信息。另外net_device中提供的函数指针在e100_probe中做了初始化,如e100_open,依据它们的名字我们可以猜到它们的意思和用途(e100_open做网卡的打开、启动、中断的注册等操作)。这里就不再深入了,如果对它们的实现细节感兴趣,需要参看该网卡的硬件手册。

后面我们在分析上层代码中会遇到一些操作特定网卡的函数指针,在这里就能找到其实现。

GitHub 加速计划 / li / linux-dash
10.39 K
1.2 K
下载
A beautiful web dashboard for Linux
最近提交(Master分支:2 个月前 )
186a802e added ecosystem file for PM2 4 年前
5def40a3 Add host customization support for the NodeJS version 4 年前
Logo

旨在为数千万中国开发者提供一个无缝且高效的云端环境,以支持学习、使用和贡献开源项目。

更多推荐