Linux核心–網路通訊協定棧深入分析(四)–通訊端核心初始化和建立過程

來源:互聯網
上載者:User

本文分析基於Linux Kernel 3.2.1

原創作品,轉載請標明http://blog.csdn.net/yming0221/article/details/7984238

更多請查看專欄http://blog.csdn.net/column/details/linux-kernel-net.html

作者:閆明

1、系統初始化過程中會調用sock_init函數進行通訊端的初始化,主要是進行緩衝的初始化

/*
 * sock_init - initialise the socket layer during system start-up.
 *
 * Initialises the caches used by the socket code, then registers and
 * mounts the socket filesystem type.
 *
 * Returns err: the non-zero result of register_filesystem() if it fails,
 * or PTR_ERR(sock_mnt) if kern_mount() fails.  The steps omitted from this
 * excerpt run before the "out" label.
 */
static int __init sock_init(void)
{
	int err;

	/* Initialise the sock cache. */
	sk_init();

	/* Initialise the sk_buff cache. */
	skb_init();

	/* Initialise the protocol module cache. */
	init_inodecache();

	/* Register the filesystem type. */
	err = register_filesystem(&sock_fs_type);
	if (err)
		goto out_fs;
	sock_mnt = kern_mount(&sock_fs_type);
	if (IS_ERR(sock_mnt)) {
		err = PTR_ERR(sock_mnt);
		goto out_mount;
	}

	/* ... (further initialisation omitted in this excerpt) ... */

out:
	return err;

out_mount:
	/* Mount failed: undo the filesystem registration. */
	unregister_filesystem(&sock_fs_type);
out_fs:
	goto out;
}

2、INET協議族的初始化函數

/*
 * inet_init - initialisation routine of the INET protocol family.
 *
 * Allocates the reserved-ports bitmap, registers the transport-layer
 * struct proto instances (TCP, UDP, RAW, PING), registers the INET family
 * with the socket layer, adds the receive handlers of the base protocols,
 * fills the inetsw[] lists from inetsw_array[] and then initialises the
 * individual modules (ARP, IP, TCP, UDP, UDP-Lite, ping, ICMP, ...).
 *
 * Returns 0 on success.  If the bitmap allocation fails, rc still holds its
 * initial -EINVAL; if a proto_register() call fails, its error is returned
 * after the already registered protos are unregistered and the bitmap is
 * freed.
 */
static int __init inet_init(void)
{
	struct sk_buff *dummy_skb;
	struct inet_protosw *q;
	struct list_head *r;
	int rc = -EINVAL;

	/* Compile-time check: inet_skb_parm must fit into skb->cb. */
	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));

	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
	if (!sysctl_local_reserved_ports)
		goto out;

	/* Register the transport-layer protocol operation sets. */
	rc = proto_register(&tcp_prot, 1);
	if (rc)
		goto out_free_reserved_ports;

	rc = proto_register(&udp_prot, 1);
	if (rc)
		goto out_unregister_tcp_proto;

	rc = proto_register(&raw_prot, 1);
	if (rc)
		goto out_unregister_udp_proto;

	rc = proto_register(&ping_prot, 1);
	if (rc)
		goto out_unregister_raw_proto;

	/* Register the handler of the INET protocol family. */
	(void)sock_register(&inet_family_ops);

	/* ... (omitted in this excerpt) ... */

	/*
	 *	Add all the base protocols.
	 */

	/* Add the packet receive functions of the INET protocols to the system. */
	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
#ifdef CONFIG_IP_MULTICAST
	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
#endif

	/* Register the socket-side information for inet_create. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	/*
	 * Register every element of inetsw_array into the inetsw[] array of
	 * lists, indexed by socket type.
	 */
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

	/*
	 *	Set the ARP module up
	 */
	arp_init();

	/*
	 *	Set the IP module up
	 */
	ip_init();

	tcp_v4_init();

	/* Setup TCP slab cache for open requests. */
	tcp_init();

	/* Setup UDP memory threshold */
	udp_init();

	/* Add UDP-Lite (RFC 3828) */
	udplite4_register();

	ping_init();

	/*
	 *	Set the ICMP layer up
	 */
	if (icmp_init() < 0)
		panic("Failed to create the ICMP control socket.\n");

	/* ... (omitted in this excerpt) ... */

	if (init_ipv4_mibs())
		printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");

	ipv4_proc_init();

	ipfrag_init();

	dev_add_pack(&ip_packet_type);

	rc = 0;
out:
	return rc;

	/* Error unwinding: undo the registrations in reverse order. */
out_unregister_raw_proto:
	proto_unregister(&raw_prot);
out_unregister_udp_proto:
	proto_unregister(&udp_prot);
out_unregister_tcp_proto:
	proto_unregister(&tcp_prot);
out_free_reserved_ports:
	kfree(sysctl_local_reserved_ports);
	goto out;
}

上面函數中的inetsw_array的定義中有四個元素:

static struct inet_protosw inetsw_array[] ={{.type =       SOCK_STREAM,.protocol =   IPPROTO_TCP,.prot =       &tcp_prot,.ops =        &inet_stream_ops,.no_check =   0,.flags =      INET_PROTOSW_PERMANENT |      INET_PROTOSW_ICSK,},{.type =       SOCK_DGRAM,.protocol =   IPPROTO_UDP,.prot =       &udp_prot,.ops =        &inet_dgram_ops,.no_check =   UDP_CSUM_DEFAULT,.flags =      INET_PROTOSW_PERMANENT,       },       {.type =       SOCK_DGRAM,.protocol =   IPPROTO_ICMP,.prot =       &ping_prot,.ops =        &inet_dgram_ops,.no_check =   UDP_CSUM_DEFAULT,.flags =      INET_PROTOSW_REUSE,       },       {       .type =       SOCK_RAW,       .protocol =   IPPROTO_IP,/* wild card */       .prot =       &raw_prot,       .ops =        &inet_sockraw_ops,       .no_check =   UDP_CSUM_DEFAULT,       .flags =      INET_PROTOSW_REUSE,       }};

上面的函數會將這個數組中的元素按照type為索引註冊到inetsw指標數組中。

函數2中調用的sock_register函數就是向協議族數組net_families中添加inet協議族的net_proto_family的資料定義,主要是協議族的建立方法inet_create。下面是它的實現:

int sock_register(const struct net_proto_family *ops){int err;if (ops->family >= NPROTO) {printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,       NPROTO);return -ENOBUFS;}spin_lock(&net_family_lock);if (rcu_dereference_protected(net_families[ops->family],      lockdep_is_held(&net_family_lock)))err = -EEXIST;else {RCU_INIT_POINTER(net_families[ops->family], ops);//這裡就相當於將ops賦予net_families[ops->families]err = 0;}spin_unlock(&net_family_lock);printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);return err;}

3、通訊端的建立

通訊端分為BSD socket和傳輸層的socket(struct sock結構,與具體的傳輸層協議有關)。

3.1、BSD socket的建立

應用程式使用函數socket會產生系統調用,調用sys_socket函數來建立BSD socket:

/*
 * socket(2) system call entry point: creates a BSD socket and maps it to a
 * file descriptor.
 *
 * @family:   protocol family number
 * @type:     socket type, optionally OR-ed with SOCK_CLOEXEC / SOCK_NONBLOCK
 * @protocol: transport-layer protocol to use
 *
 * Returns the result of sock_map_fd() on success.  Returns -EINVAL when
 * @type carries flag bits other than SOCK_CLOEXEC / SOCK_NONBLOCK, or the
 * negative error from sock_create() / sock_map_fd(); when sock_map_fd()
 * fails the freshly created socket is released again.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	int retval;
	struct socket *sock;
	int flags;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Split @type into the flag bits and the real socket type. */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Translate SOCK_NONBLOCK to O_NONBLOCK where the two values differ. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/*
	 * Create the socket via sock_create(); the arguments are the protocol
	 * family number, the socket type, the transport-layer protocol and
	 * the address of the pointer that will refer to the new socket.
	 */
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

函數sock_create會調用__sock_create函數進行通訊端的建立:

int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern){int err;struct socket *sock;const struct net_proto_family *pf;/* *      合法性檢查 */if (family < 0 || family >= NPROTO)return -EAFNOSUPPORT;if (type < 0 || type >= SOCK_MAX)return -EINVAL;/* Compatibility.   This uglymoron is moved from INET layer to here to avoid   deadlock in module load. */if (family == PF_INET && type == SOCK_PACKET) {static int warned;if (!warned) {warned = 1;printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",       current->comm);}family = PF_PACKET;}err = security_socket_create(family, type, protocol, kern);if (err)return err;sock = sock_alloc();//分配inode結構並獲得對應的socket結構if (!sock) {if (net_ratelimit())printk(KERN_WARNING "socket: no more sockets\n");return -ENFILE;/* Not exactly a match, but its the   closest posix thing */}sock->type = type;rcu_read_lock();pf = rcu_dereference(net_families[family]);err = -EAFNOSUPPORT;if (!pf)goto out_release;/* * We will call the ->create function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */if (!try_module_get(pf->owner))//模組檢測goto out_release;/* Now protected by module ref count */rcu_read_unlock();//這裡調用inet_create函數對INET協議族進行建立err = pf->create(net, sock, protocol, kern);if (err < 0)goto out_module_put;/* * Now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. */if (!try_module_get(sock->ops->owner))goto out_module_busy;/* * Now that we're done with the ->create function, the [loadable] * module can have its refcnt decremented */module_put(pf->owner);err = security_socket_post_create(sock, family, type, protocol, kern);if (err)goto out_sock_release;*res = sock;return 0;out_module_busy:err = -EAFNOSUPPORT;out_module_put:sock->ops = NULL;module_put(pf->owner);out_sock_release:sock_release(sock);return err;out_release:rcu_read_unlock();goto out_sock_release;}

其中的參數protocol的取值如下:

/*
 * Standard well-defined IP protocols.
 *
 * Numeric values accepted as the "protocol" argument; IPPROTO_MAX is the
 * value following IPPROTO_RAW.
 */
enum {
	IPPROTO_IP = 0,		/* Dummy protocol for TCP */
	IPPROTO_ICMP = 1,	/* Internet Control Message Protocol */
	IPPROTO_IGMP = 2,	/* Internet Group Management Protocol */
	IPPROTO_IPIP = 4,	/* IPIP tunnels (older KA9Q tunnels use 94) */
	IPPROTO_TCP = 6,	/* Transmission Control Protocol */
	IPPROTO_EGP = 8,	/* Exterior Gateway Protocol */
	IPPROTO_PUP = 12,	/* PUP protocol */
	IPPROTO_UDP = 17,	/* User Datagram Protocol */
	IPPROTO_IDP = 22,	/* XNS IDP protocol */
	IPPROTO_DCCP = 33,	/* Datagram Congestion Control Protocol */
	IPPROTO_RSVP = 46,	/* RSVP protocol */
	IPPROTO_GRE = 47,	/* Cisco GRE tunnels (rfc 1701,1702) */
	IPPROTO_IPV6 = 41,	/* IPv6-in-IPv4 tunnelling */
	IPPROTO_ESP = 50,	/* Encapsulation Security Payload protocol */
	IPPROTO_AH = 51,	/* Authentication Header protocol */
	IPPROTO_BEETPH = 94,	/* IP option pseudo header for BEET */
	IPPROTO_PIM = 103,	/* Protocol Independent Multicast */
	IPPROTO_COMP = 108,	/* Compression Header protocol */
	IPPROTO_SCTP = 132,	/* Stream Control Transport Protocol */
	IPPROTO_UDPLITE = 136,	/* UDP-Lite (RFC 3828) */
	IPPROTO_RAW = 255,	/* Raw IP packets */
	IPPROTO_MAX
};

3.2、INET層socket(inet_socket)和傳輸層socket(struct sock)建立

函數inet_create完成了上述功能,並初始化了sock的屬性值,將socket的sk屬性指向sock結構

static int inet_create(struct net *net, struct socket *sock, int protocol,       int kern){struct sock *sk;struct inet_protosw *answer;struct inet_sock *inet;struct proto *answer_prot;unsigned char answer_flags;char answer_no_check;int try_loading_module = 0;int err;if (unlikely(!inet_ehash_secret))if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)build_ehash_secret();sock->state = SS_UNCONNECTED;/* Look for the requested type/protocol pair. */lookup_protocol:err = -ESOCKTNOSUPPORT;rcu_read_lock();//根據傳輸層協議的類型建立sock結構//遍曆inetsw鏈表list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {err = 0;/* Check the non-wild match. */if (protocol == answer->protocol) {if (protocol != IPPROTO_IP)break;//找到了適配的inetsw[]元素} else {/* Check for the two wild cases. */if (IPPROTO_IP == protocol) {protocol = answer->protocol;break;}if (IPPROTO_IP == answer->protocol)break;}err = -EPROTONOSUPPORT;}//到這裡answer指向了合適的inetsw結構,若是TCP協議,answer指向內容如下/**.type =       SOCK_STREAM,*.protocol =   IPPROTO_TCP,*.prot =       &tcp_prot,*.ops =        &inet_stream_ops,*.no_check =   0,*.flags =      INET_PROTOSW_PERMANENT |*      INET_PROTOSW_ICSK,*/if (unlikely(err)) {if (try_loading_module < 2) {rcu_read_unlock();/* * Be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */if (++try_loading_module == 1)request_module("net-pf-%d-proto-%d-type-%d",       PF_INET, protocol, sock->type);/* * Fall back to generic, e.g. 
net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */elserequest_module("net-pf-%d-proto-%d",       PF_INET, protocol);goto lookup_protocol;} elsegoto out_rcu_unlock;}err = -EPERM;if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))goto out_rcu_unlock;err = -EAFNOSUPPORT;if (!inet_netns_ok(net, protocol))goto out_rcu_unlock;sock->ops = answer->ops;answer_prot = answer->prot;answer_no_check = answer->no_check;answer_flags = answer->flags;rcu_read_unlock();WARN_ON(answer_prot->slab == NULL);err = -ENOBUFS;//分配sock結構體記憶體,這裡在inet_init函數初始化好的高速緩衝區中分配記憶體,然後做一些初始化工作。後面有進一步分析。sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);if (sk == NULL)goto out;err = 0;sk->sk_no_check = answer_no_check;if (INET_PROTOSW_REUSE & answer_flags)sk->sk_reuse = 1;inet = inet_sk(sk);//後面有進一步分析,為何可以強制轉換?!!inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;inet->nodefrag = 0;if (SOCK_RAW == sock->type) {inet->inet_num = protocol;if (IPPROTO_RAW == protocol)inet->hdrincl = 1;}if (ipv4_config.no_pmtu_disc)inet->pmtudisc = IP_PMTUDISC_DONT;elseinet->pmtudisc = IP_PMTUDISC_WANT;inet->inet_id = 0;//對sk進行初始化設定並將sock中的sk指標指向sk結構sock_init_data(sock, sk);//進一步設定sk的其他屬性資訊sk->sk_destruct   = inet_sock_destruct;sk->sk_protocol   = protocol;sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;inet->uc_ttl= -1;inet->mc_loop= 1;inet->mc_ttl= 1;inet->mc_all= 1;inet->mc_index= 0;inet->mc_list= NULL;sk_refcnt_debug_inc(sk);if (inet->inet_num) {/* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */inet->inet_sport = htons(inet->inet_num);/* Add to protocol hash chains. */sk->sk_prot->hash(sk);//調用inet_hash函數}if (sk->sk_prot->init) {err = sk->sk_prot->init(sk);//調用tcp_v4_init_sock函數進行進一步的初始化,由於在函數sk_alloc中一些屬性被設定成0了,所以在此調用進行初始化if (err)sk_common_release(sk);}out:return err;out_rcu_unlock:rcu_read_unlock();goto out;}

關於通訊端struct sock與struct inet_sock、struct tcp_sock、struct inet_connection_sock等結構之間的關係有待進一步瞭解。

上篇中已經寫過,核心中通訊端struct socket、struct sock、struct inet_sock、struct tcp_sock、struct raw_sock、struct udp_sock、struct inet_connection_sock、struct inet_timewait_sock和struct tcp_timewait_sock的關係是:

*struct socket這個是BSD層的socket,應用程式會通過系統調用首先建立該類型通訊端,它和具體協議無關。

*struct inet_sock是INET協議族使用的socket結構,可以看成位於INET層,是struct sock的一個擴充。它的第一個屬性就是struct sock結構。

*struct sock是與具體傳輸層協議相關的通訊端,所有核心的操作都基於這個通訊端。

*struct tcp_sock是TCP協議的通訊端表示,它是對struct inet_connection_sock的擴充,其第一個屬性就是struct inet_connection_sock inet_conn。

*struct raw_sock是原始類型的通訊端表示,ICMP協議就使用這種通訊端,其是對struct inet_sock的擴充。

*struct udp_sock是UDP協議通訊端表示,其是對struct inet_sock通訊端的擴充。

*struct inet_connection_sock是所有連線導向協議的通訊端,是對struct inet_sock通訊端的擴充。

最後兩個(struct inet_timewait_sock和struct tcp_timewait_sock)是用於控制逾時的通訊端。

就拿struct inet_sock和struct sock為例來說明,為什麼核心中可以直接將sock結構體首地址強制轉換成inet_sock的首地址?並且inet_sock的大小要大於sock,直接進行如下強制轉換

inet = inet_sk(sk);

/*
 * inet_sk - view a struct sock pointer as a struct inet_sock pointer.
 *
 * Pure pointer cast: the returned pointer has the same address as @sk.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	struct inet_sock *inet = (struct inet_sock *)sk;

	return inet;
}

不會發生記憶體非法訪問嗎?!那就是在分配的時候並不只是分配的struct sock結構體大小的儲存空間!

可以細看sock結構體分配的代碼:

/*
 * sk_alloc - allocate and minimally initialise a struct sock.
 * @net:      network namespace the sock is attached to
 * @family:   protocol family stored in sk->sk_family
 * @priority: allocation flags; __GFP_ZERO is always added
 * @prot:     struct proto stored as both sk_prot and sk_prot_creator
 *
 * Returns the new sock, or the failed (null) result of sk_prot_alloc().
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

	/* Allocation failed: hand the null pointer straight back. */
	if (!sk)
		return sk;

	sk->sk_family = family;
	sk->sk_prot_creator = prot;
	sk->sk_prot = prot;
	sock_lock_init(sk);
	sock_net_set(sk, get_net(net));
	atomic_set(&sk->sk_wmem_alloc, 1);
	sock_update_classid(sk);

	return sk;
}

緊接著調用sk_prot_alloc函數分配:

/*
 * sk_prot_alloc - allocate the memory for a sock of protocol @prot.
 * @prot:     protocol description; supplies the slab cache and obj_size
 * @priority: allocation flags
 * @family:   protocol family (not used in the part shown here)
 *
 * If @prot has its own slab cache the object comes from that cache, with
 * __GFP_ZERO masked out of the flags; otherwise prot->obj_size bytes are
 * taken from kmalloc().  Several parts of the function are omitted in this
 * excerpt and marked below.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		/* ... (omitted in this excerpt) ... */
	} else
		sk = kmalloc(prot->obj_size, priority);

	/* ... (omitted in this excerpt) ... */

	return sk;

	/* ... (omitted in this excerpt) ... */
}

上面的代碼中首先判斷prot->slab高速緩衝是否可用:如果可用,就從該高速緩衝中分配;如果不可用,則直接用kmalloc分配記憶體,不過大小都是prot->obj_size。

如果是TCP協議,tcp_prot中指明該屬性的大小為.obj_size = sizeof(struct tcp_sock)。

所以,程式中給struct sock指標分配的不是該結構體的實際大小,而是大於其實際大小,以便其擴充通訊端的屬性佔用。
以圖例說明tcp_sock是如何從sock強制轉換來的:

下篇將分析通訊端的綁定、串連等一系列操作的實現。

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.