linux核心網路通訊協定棧學習筆記(6)

來源:互聯網
上載者:User

本篇討論IP包的收發(暫不包括路由)

先來看inet_init,

首先是調用proto_register,註冊了tcp_prot, udp_prot, raw_prot,其中proto_register前半部分是初始化各種slab_cache,後半部分把這些struct proto結構鏈到proto_list裡

其次調用sock_register,核心有一個全域的net_proto_family結構的net_families數組,inet_init調用sock_register就是把inet_family_ops加到net_families[PF_INET]中,inet_family_ops結構如下

/*
 * Address-family descriptor for PF_INET.
 * inet_init() passes it to sock_register(), which places it in the
 * global net_families[] array; .create names the socket-creation hook.
 */
static struct net_proto_family inet_family_ops = {
    .family = PF_INET,
    .create = inet_create,
    .owner  = THIS_MODULE,
};

接著調用inet_add_protocol,去填充inet_protos數組,inet_protos是一個全域的指標數組,其定義如下:

/* Global array of net_protocol pointers; entries are filled in by inet_add_protocol(). */
const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;

可以看出數組最大長度MAX_INET_PROTOS為256,in.h裡對所有的協議做了定義

/*
 * Standard well-defined IP protocols.
 *
 * Every enumerator except IPPROTO_MAX has an explicit value, so the
 * entries are simply listed in ascending numeric order.  IPPROTO_MAX
 * has no initializer and must stay directly after IPPROTO_RAW so that
 * it evaluates to IPPROTO_RAW + 1.
 */
enum {
    IPPROTO_IP      = 0,    /* Dummy protocol for TCP */
    IPPROTO_ICMP    = 1,    /* Internet Control Message Protocol */
    IPPROTO_IGMP    = 2,    /* Internet Group Management Protocol */
    IPPROTO_IPIP    = 4,    /* IPIP tunnels (older KA9Q tunnels use 94) */
    IPPROTO_TCP     = 6,    /* Transmission Control Protocol */
    IPPROTO_EGP     = 8,    /* Exterior Gateway Protocol */
    IPPROTO_PUP     = 12,   /* PUP protocol */
    IPPROTO_UDP     = 17,   /* User Datagram Protocol */
    IPPROTO_IDP     = 22,   /* XNS IDP protocol */
    IPPROTO_DCCP    = 33,   /* Datagram Congestion Control Protocol */
    IPPROTO_IPV6    = 41,   /* IPv6-in-IPv4 tunnelling */
    IPPROTO_RSVP    = 46,   /* RSVP protocol */
    IPPROTO_GRE     = 47,   /* Cisco GRE tunnels (rfc 1701,1702) */
    IPPROTO_ESP     = 50,   /* Encapsulation Security Payload protocol */
    IPPROTO_AH      = 51,   /* Authentication Header protocol */
    IPPROTO_BEETPH  = 94,   /* IP option pseudo header for BEET */
    IPPROTO_PIM     = 103,  /* Protocol Independent Multicast */
    IPPROTO_COMP    = 108,  /* Compression Header protocol */
    IPPROTO_SCTP    = 132,  /* Stream Control Transport Protocol */
    IPPROTO_UDPLITE = 136,  /* UDP-Lite (RFC 3828) */
    IPPROTO_RAW     = 255,  /* Raw IP packets */
    IPPROTO_MAX             /* implicit value: IPPROTO_RAW + 1 */
};

inet_init裡只往inet_protos中註冊了ICMP, IGMP, TCP, UDP這幾種協議,以TCP為例,其net_protocol定義為

/*
 * net_protocol descriptor for TCP.
 * When the IP layer passes a packet upward (e.g. in
 * ip_local_deliver_finish) it looks up the matching net_protocol in
 * inet_protos[] and calls ->handler, which for TCP is tcp_v4_rcv.
 */
static const struct net_protocol tcp_protocol = {
    .handler =  tcp_v4_rcv,         /* receive entry point called by the IP layer */
    .err_handler =  tcp_v4_err,
    .gso_send_check = tcp_v4_gso_send_check,
    .gso_segment =  tcp_tso_segment,
    .gro_receive =  tcp4_gro_receive,
    .gro_complete = tcp4_gro_complete,
    .no_policy =    1,
    .netns_ok = 1,
};

IP層在把報文往上送的時候,e.g. ip_local_deliver_finish,實際上就是根據skb的protocol在inet_protos裡找到對應的net_protocol結構,然後調用net_protocol->handler函數,e.g. 如果是TCP協議的skb,這時就調用tcp_v4_rcv

下面開始初始化inetsw數組以及inetsw_array數組,inetsw是個list_head數組,每個索引代表一種socket類型(由四層決定的),如 SOCK_STREAM, SOCK_DGRAM, SOCK_RAW等,定義如下

/*
 * Socket types.  According to the accompanying notes, each value is
 * also used as an index into the inetsw[] array of list heads.
 * Values 7-9 are not assigned in this excerpt.
 */
enum sock_type {
    SOCK_STREAM    = 1,
    SOCK_DGRAM     = 2,
    SOCK_RAW       = 3,
    SOCK_RDM       = 4,
    SOCK_SEQPACKET = 5,
    SOCK_DCCP      = 6,
    SOCK_PACKET    = 10,
};

inetsw_array數組是一個inet_protosw類型的數組,定義如下

static struct inet_protosw inetsw_array[] =
{       
    {
        .type =       SOCK_STREAM,
        .protocol =   IPPROTO_TCP,
        .prot =       &tcp_prot,
        .ops =        &inet_stream_ops,
        .no_check =   0,
        .flags =      INET_PROTOSW_PERMANENT |
                  INET_PROTOSW_ICSK,
    },
    
    {   
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_UDP,
        .prot =       &udp_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_PERMANENT,
       },
    
       {
           .type =       SOCK_RAW,
           .protocol =   IPPROTO_IP,    /* wild card */
           .prot =       &raw_prot,
           .ops =        &inet_sockraw_ops,
           .no_check =   UDP_CSUM_DEFAULT,
           .flags =      INET_PROTOSW_REUSE,
       }
};

而inet_protosw定義如下

/*
 * This is used to register socket interfaces for IP protocols.
 * (type, protocol) is the lookup key; prot and ops point at the
 * protocol-specific objects (e.g. &tcp_prot / &inet_stream_ops in
 * inetsw_array[]).
 */
struct inet_protosw {  
    struct list_head list;        /* list linkage; per the notes, chained from inetsw[] */
    /* These two fields form the lookup key.  */
    unsigned short   type;     /* This is the 2nd argument to socket(2). */
    unsigned short   protocol; /* This is the L4 protocol number.  */        
    struct proto     *prot;
    const struct proto_ops *ops;
    char             no_check;   /* checksum on rcv/xmit/none? */
    unsigned char    flags;      /* See INET_PROTOSW_* below.  */
};         
/* Bit flags for inet_protosw.flags. */
#define INET_PROTOSW_REUSE 0x01      /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT 0x02  /* Permanent protocols are unremovable. */
#define INET_PROTOSW_ICSK      0x04  /* Is this an inet_connection_sock? */

可以看出inet_protosw的list成員就是一個鏈表節點,用來把該結構掛到inetsw數組中對應type的那個list_head鏈表上

最後是分別調用 arp_init, ip_init, tcp_v4_init, tcp_init, udp_init 等,這裡略過了

下面來談IP協議,這裡我們略過IP option部分,因為實際應用的網路幾乎不會有IP option出現,先看IP頭部

/*
 * IPv4 header (fixed part; any options follow the struct).
 * ihl and version are two 4-bit fields packed into a single __u8; the
 * order in which they are declared depends on the bitfield endianness
 * macros, and the build fails if neither macro is defined.
 */
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
    __u8    ihl:4,          /* header length in 4-byte units (5 = 20 bytes) */
        version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
    __u8    version:4,
        ihl:4;              /* header length in 4-byte units (5 = 20 bytes) */
#else
#error  "Please fix <asm/byteorder.h>"
#endif
    __u8    tos;
    __be16  tot_len;        /* total length in bytes */
    __be16  id;             /* identical for all fragments of one IP packet */
    __be16  frag_off;
    __u8    ttl;
    __u8    protocol;       /* L4 protocol number */
    __sum16 check;          /* checksum over the IP header */
    __be32  saddr;
    __be32  daddr;
    /*The options start here. */
};

ihl的單位是4位元組,一般而言(不帶IP option時)IP頭部長度是20位元組,因此這個值是5

tot_len單位是位元組

id一般用於IP的分段/組合,同一IP包的所有分段其ID值是相同的

protocol表示4層協議值

check是IP首部的校驗和

sk_buff 結構中,skb->csum儲存了L4的校驗和,skb->ip_summed表示校驗和的狀態

CHECKSUM_NONE,表示L4校驗和無效(網卡沒有做校驗),需要由軟體重新計算

CHECKSUM_HW(較新的核心中已拆分,接收方向改稱CHECKSUM_COMPLETE),表示網卡已經計算了報文的校驗和並存入skb->csum,但程式仍需要再次驗證L4校驗和

CHECKSUM_UNNECESSARY,表示L4校驗和無需驗證

/*
 * Packet-type descriptor for ETH_P_IP frames.
 * Per the accompanying notes, L2 uses this entry to find ip_rcv (.func)
 * and hand the packet up to L3.
 */
static struct packet_type ip_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),  /* converted with cpu_to_be16() */
    .func = ip_rcv,                 /* L3 receive entry point */
    .gso_send_check = inet_gso_send_check,
    .gso_segment = inet_gso_segment,
    .gro_receive = inet_gro_receive,
    .gro_complete = inet_gro_complete,
};

L2層通過ip_packet_type找到ip_rcv函數,從而把報文傳到L3,下面分析下ip_rcv 函數:

    /* When the interface is in promisc. mode, drop all the crap
     * that it receives, do not try to analyse it.
     */
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop; 
        
    IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto out;
    }   
        
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;
    
    iph = ip_hdr(skb);

如果skb是通過混雜模式擷取的且不是發往本機的,直接丟棄;如果skb是share的,調用skb_share_check複製一份出來處理;pskb_may_pull這個函數比較複雜,其目的是,確保在skb->data開始的線性記憶體裡面至少有 iphdr 的內容(這裡要提下sk_buff這個結構的複雜性就在於:真正的報文內容很多情況下是不存在skb所在的線性記憶體中的,通常情況下,sk_buff的線性記憶體空間後面會跟著一個skb_shared_info結構,如果IP包沒有分片的話,這裡會儲存scatter-gather的報文內容,這些內容是分散在各個不同的記憶體頁中的,用一個skb_frag_t 數組frags表示,nr_frags裡儲存了數組中元素的個數;如果IP包存在分片的話,可以看到有個sk_buff的鏈表frag_list,裡面就是分片的skb咯),如果skb->data後續的記憶體不夠,pskb_may_pull會擴充這個skb結構,然後把frags或者frag_list裡的IP頭內容拷出來填到skb線性記憶體裡

    if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error;

    if (!pskb_may_pull(skb, iph->ihl*4))
        goto inhdr_error;

    iph = ip_hdr(skb);

    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto inhdr_error;

    len = ntohs(iph->tot_len);
    if (skb->len < len) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
        goto drop;
    } else if (len < (iph->ihl*4))
        goto inhdr_error;

這段代碼基本都是做一些check,略過了

    /* Our transport medium may have padded the buffer out. Now we know it
     * is IP we can trim to the true length of the frame.
     * Note this now means skb->len holds ntohs(iph->tot_len).
     */
    if (pskb_trim_rcsum(skb, len)) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto drop;
    }

pskb_trim_rcsum用於去掉L2用來padding的部分,並重新計算checksum,瞭解下就行了

return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
               ip_rcv_finish);

最後走一遍netfilter,如果不被DROP或啥的,進入ip_rcv_finish

static int ip_rcv_finish(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct rtable *rt;

    /*
     *  Initialise the virtual path cache for the packet. It describes
     *  how the packet travels inside Linux networking.
     */
    if (skb_dst(skb) == NULL) {
        int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
                     skb->dev);
        if (unlikely(err)) {
            if (err == -EHOSTUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                        IPSTATS_MIB_INADDRERRORS);
            else if (err == -ENETUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                        IPSTATS_MIB_INNOROUTES);
            goto drop;
        }   
    }

    if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

    rt = skb_rtable(skb);
    if (rt->rt_type == RTN_MULTICAST) {
        IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
                skb->len);
    } else if (rt->rt_type == RTN_BROADCAST)
        IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
                skb->len);

    return dst_input(skb);

drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}

ip_rcv_finish首先調用ip_route_input擷取目的地路由,關於路由的部分放到以後說,這裡通過本地路由表,會得知這個包究竟是應該本地接收還是給轉發出去,ip_route_input會把路由資訊存到 (struct dst_entry *)skb->_skb_dst 中,而這個dst_entry->input 的函數指標究竟指向ip_local_deliver還是ip_forward是在ip_route_input_slow裡決定的(路由快取未命中時,ip_route_input_slow由ip_route_input調用)

ip_route_input_slow中,先調用fib_lookup查路由表:如果查到的是本地路由或廣播路由,就跳到local_input/brd_input按本地接收處理;否則(需要轉發的單播路由)才調用ip_mkroute_input。ip_mkroute_input會調用__mkroute_input,裡面會調用dst_alloc建立一個rtable,並設定rth->u.dst.input = ip_forward,程式碼片段如下:

    rth = dst_alloc(&ipv4_dst_ops);
    if (!rth) {
        err = -ENOBUFS;
        goto cleanup;
    }
    atomic_set(&rth->u.dst.__refcnt, 1);
    rth->u.dst.flags= DST_HOST;
    if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
        rth->u.dst.flags |= DST_NOPOLICY;
    if (IN_DEV_CONF_GET(out_dev, NOXFRM))
        rth->u.dst.flags |= DST_NOXFRM;
    rth->fl.fl4_dst = daddr;
    rth->rt_dst = daddr;
    rth->fl.fl4_tos = tos;
    rth->fl.mark    = skb->mark;
    rth->fl.fl4_src = saddr;
    rth->rt_src = saddr;
    rth->rt_gateway = daddr;
    rth->rt_iif     =
        rth->fl.iif = in_dev->dev->ifindex;
    rth->u.dst.dev  = (out_dev)->dev;
    dev_hold(rth->u.dst.dev);
    rth->idev   = in_dev_get(rth->u.dst.dev);
    rth->fl.oif     = 0;
    rth->rt_spec_dst= spec_dst;
    rth->u.dst.input = ip_forward;
    rth->u.dst.output = ip_output;
    rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
    rt_set_nexthop(rth, res, itag);
    rth->rt_flags = flags;

如果是broadcast input, 或者local_input,會走進如下程式碼片段:

local_input:
    rth = dst_alloc(&ipv4_dst_ops);
    if (!rth)
        goto e_nobufs;

    rth->u.dst.output= ip_rt_bug;
    rth->rt_genid = rt_genid(net);

    atomic_set(&rth->u.dst.__refcnt, 1);
    rth->u.dst.flags= DST_HOST;
    if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
        rth->u.dst.flags |= DST_NOPOLICY;
    rth->fl.fl4_dst = daddr;
    rth->rt_dst = daddr;
    rth->fl.fl4_tos = tos;
    rth->fl.mark    = skb->mark;
    rth->fl.fl4_src = saddr;
    rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
    rth->u.dst.tclassid = itag;
#endif
    rth->rt_iif =
    rth->fl.iif = dev->ifindex;
    rth->u.dst.dev  = net->loopback_dev;
    dev_hold(rth->u.dst.dev);
    rth->idev   = in_dev_get(rth->u.dst.dev);
    rth->rt_gateway = daddr;
    rth->rt_spec_dst= spec_dst;
    rth->u.dst.input= ip_local_deliver;
    rth->rt_flags   = flags|RTCF_LOCAL;
    if (res.type == RTN_UNREACHABLE) {
        rth->u.dst.input= ip_error;
        rth->u.dst.error= -err;
        rth->rt_flags   &= ~RTCF_LOCAL;
    }
    rth->rt_type    = res.type;
    hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
    err = rt_intern_hash(hash, rth, NULL, skb);
    goto done;

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.