本篇討論IP包的收發(暫不包括路由)
先來看inet_init,
首先是調用proto_register,註冊了tcp_prot, udp_prot, raw_prot,其中proto_register前半部分是初始化各種slab_cache,後半部分把這些struct proto結構鏈到proto_list裡
其次調用sock_register,核心有一個全域的net_proto_family結構的net_families數組,inet_init調用sock_register就是把inet_family_ops加到net_families[PF_NET]中,inet_family_ops結構如下
static struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
接著調用inet_add_protocol,去填充inet_protos數組,inet_protos是一個全域的指標數組,其定義如下:
const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;
可以看出數組最大長度MAX_INET_PROTOS為256,in.h裡對所有的協議做了定義
/* Standard well-defined IP protocols. */
enum {
IPPROTO_IP = 0, /* Dummy protocol for TCP */
IPPROTO_ICMP = 1, /* Internet Control Message Protocol */
IPPROTO_IGMP = 2, /* Internet Group Management Protocol */
IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */
IPPROTO_TCP = 6, /* Transmission Control Protocol */
IPPROTO_EGP = 8, /* Exterior Gateway Protocol */
IPPROTO_PUP = 12, /* PUP protocol */
IPPROTO_UDP = 17, /* User Datagram Protocol */
IPPROTO_IDP = 22, /* XNS IDP protocol */
IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */
IPPROTO_RSVP = 46, /* RSVP protocol */
IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */
IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */
IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */
IPPROTO_AH = 51, /* Authentication Header protocol */
IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */
IPPROTO_PIM = 103, /* Protocol Independent Multicast */
IPPROTO_COMP = 108, /* Compression Header protocol */
IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */
IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */
IPPROTO_RAW = 255, /* Raw IP packets */
IPPROTO_MAX
};
inet_init裡對inet_protos裡只定義了ICMP, IGMP, TCP, UDP,以TCP為例,其net_protocol定義為
static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
.no_policy = 1,
.netns_ok = 1,
};
IP層在把報文往上送的時候,e.g. ip_local_deliver_finish,實際上就是根據skb的protocol在inet_protos裡找到對應的net_protocol結構,然後調用net_protocol->handler函數,e.g. 如果是TCP協議的skb,這時就調用tcp_v4_rcv
下面開始初始化inetsw數組以及inetsw_arry數組,inetsw是個list_head數組,每個索引代表了IP報的一種類型(由四層決定的),如 SOCK_STREAM, SOCK_DGRAM, SOCK_RAW等,定義如下
enum sock_type {
SOCK_STREAM = 1,
SOCK_DGRAM = 2,
SOCK_RAW = 3,
SOCK_RDM = 4,
SOCK_SEQPACKET = 5,
SOCK_DCCP = 6,
SOCK_PACKET = 10,
};
inetsw_array數組是一個inet_protosw類型的數組,定義如下
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
而inet_protosw定義如下
/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2). */
unsigned short protocol; /* This is the L4 protocol number. */
struct proto *prot;
const struct proto_ops *ops;
char no_check; /* checksum on rcv/xmit/none? */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
#define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */
可以看出inet_protosw的list就是inetsw指向的list_head指標
最後是分別調用 arp_init, ip_init, tcp_v4_init, tcp_init, udp_init 等,這裡略過了
下面來談IP協議,這裡我們略過IP option部分,因為實際應用的網路幾乎不會有IP option出現,先看IP頭部
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 ihl:4,
version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u8 version:4,
ihl:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
__u8 tos;
__be16 tot_len;
__be16 id;
__be16 frag_off;
__u8 ttl;
__u8 protocol;
__sum16 check;
__be32 saddr;
__be32 daddr;
/*The options start here. */
};
ihl單位是4位元組,一般而言ihl長度是20位元組因此是這個值是5
tot_len單位是位元組
id一般用於IP的分段/組合,同一IP包的所有分段其ID值是相同的
protocol表示4層協議值
check是IP首部的校檢和
sk_buff 結構中,skb->csum儲存了L4的校正和,skb->ip_summed表示校正和的狀態
CHECKSUM_NONE,表示L4校正和無效,需要重新計算
CHECKSUM_HW,表示網卡已經正確計算了L4校正和,但程式需要再次驗證L4校正和
CHECKSUM_UNNECESSARY,表示L4校正和無需驗證
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
};
L2層通過ip_packet_type找到ip_rcv函數,從而把報文傳到L3,下面分析下ip_rcv 函數:
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
如果skb是通過混雜模式擷取的且不是發往原生,直接丟棄;如果skb是share的,調用skb_share_check複製一份出來處理;pskb_may_pull這個函數比較複雜,其目的是,確保在skb->data開始的線性記憶體裡面至少有 iphdr 的內容(這裡要提下sk_buff這個結構的複雜性就在於:真正的報文內容很多情況下是不存在skb所在的線性記憶體中的,通常情況下,sk_buff後面會跟著一塊線性記憶體空間,用skb_shared_info來表示,如果IP包沒有分區的話,這裡會儲存scatter-gather的報文內容,這些內容是分散在各個不同的記憶體頁中的,用一個
skb_frag_t 數組frags表示,nrfrags裡儲存了數組中元素的個數;如果IP包存在分區的話,可以看到有個sk_buff的數組frag_list,裡面就是分區的skb咯),如果skb->data後續的記憶體不夠,pskb_may_pull會擴充這個skb結構,然後把frags或者frag_list裡的IP頭內容拷出來填到skb線性記憶體裡
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto inhdr_error;
len = ntohs(iph->tot_len);
if (skb->len < len) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
這段代碼基本都是做一些check,略過了
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}
pskb_trim_rcsum用於去掉L2用來padding的部分,並重新計算checksum,瞭解下就行了
return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
最後走一遍netfilter,如果不被DROP或啥的,進入ip_rcv_finish
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb_dst(skb) == NULL) {
int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INNOROUTES);
goto drop;
}
}
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
skb->len);
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
ip_rcv_finish首先調用ip_route_input擷取目的地路由,關於路由的部分放到以後說,這裡通過本地路由表,會得知這個包究竟是應該本地接收還是給轉寄出去,ip_route_input會把路由資訊存到 (struct dst_entry *)skb->_skb_dst 中,而這個dst_entry->input 的函數指標究竟指向ip_local_deliver還是ip_forward是在ip_route_input_slow裡決定的(ip_route_input_slow由ip_route_input調用)
ip_route_input_slow中,先調用ip_mkroute_input,查看是否有轉寄路由表項,如果沒有則返錯表示是本地接收。ip_mkroute_input會調用__mkroute_input,裡面會調用dst_alloc建立一個rtable,並設定rth->u.dst.input = ip_forward,程式碼片段如下:
rth = dst_alloc(&ipv4_dst_ops);
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
rth->u.dst.flags |= DST_NOPOLICY;
if (IN_DEV_CONF_GET(out_dev, NOXFRM))
rth->u.dst.flags |= DST_NOXFRM;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
rth->rt_gateway = daddr;
rth->rt_iif =
rth->fl.iif = in_dev->dev->ifindex;
rth->u.dst.dev = (out_dev)->dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
rt_set_nexthop(rth, res, itag);
rth->rt_flags = flags;
如果是broadcast input, 或者local_input,會走進如下程式碼片段:
local_input:
rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
goto e_nobufs;
rth->u.dst.output= ip_rt_bug;
rth->rt_genid = rt_genid(net);
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
rth->u.dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
rth->u.dst.tclassid = itag;
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
rth->u.dst.dev = net->loopback_dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
rth->u.dst.input= ip_local_deliver;
rth->rt_flags = flags|RTCF_LOCAL;
if (res.type == RTN_UNREACHABLE) {
rth->u.dst.input= ip_error;
rth->u.dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb);
goto done;