Based on Linux kernel 3.2.1
Original work; if you reprint it, please cite the source: http://blog.csdn.net/yming0221/article/details/7984238
For more information, see column http://blog.csdn.net/column/details/linux-kernel-net.html
Author: Yan Ming
1. During system initialization, the sock_init function is called to initialize the socket layer; its main job is to set up the caches.
Static int _ init sock_init (void) {int err; // initialization. sock cache sk_init (); // initialize sk_buff cache skb_init (); // initialize protocol module cache init_inodecache (); // register file system type err = register_filesystem (& sock_fs_type ); if (ERR) goto out_fs; sock_mnt = kern_mount (& sock_fs_type); If (is_err (sock_mnt) {err = ptr_err (sock_mnt ); goto out_mount ;}......................... out: Return err; out_mount: unregister_filesystem (& sock_fs_type); out_fs: goto out ;}
2. Inet protocol family initialization Function
Static int _ init inet_init (void) {struct sk_buff * dummy_skb; struct inet_protosw * q; struct list_head * r; int rc =-einval; build_bug_on (sizeof (struct failed)> sizeof (dummy_skb-> CB); sysctl_local_reserved_ports = kzarloc (65536/8, gfp_kernel); If (! Register) goto out; // register the transport layer protocol operation set rc = proto_register (& tcp_prot, 1); If (RC) goto out_free_reserved_ports; rc = proto_register (& udp_prot, 1); If (RC) goto register; rc = proto_register (& raw_prot, 1); If (RC) goto out_unregister_udp_proto; rc = proto_register (& ping_prot, 1 ); if (RC) goto out_unregister_raw_proto; // register the handler (void) of the inet protocol family) sock_register (& inet_family_ops );......................... /** add all the base protocols. * // Add the inet Protocol packet receiving function to the system. If (inet_add_protocol (& icmp_protocol, ipproto_icmp) <0) printk (kern_crit "inet_init: cannot add ICMP protocol \ n "); If (inet_add_protocol (& udp_protocol, ipproto_udp) <0) printk (kern_crit" inet_init: cannot add UDP protocol \ n "); if (encode (& tcp_protocol, ipproto_tcp) <0) printk (kern_crit "inet_init: cannot add TCP protocol \ n"); # ifdef encode (inet_add_protocol (& igmp_protocol, ipproto_igmp) <0) printk (kern_crit "inet_init: cannot add IGMP protocol \ n"); # endif/* register the socket-side information for inet_create. */For (r = & inetsw [0]; r <& inetsw [sock_max]; ++ R) init_list_head (R ); // register the elements in inetsw_array to the inetsw linked list Array Based on the socket type. For (q = inetsw_array; q <& inetsw_array [inetsw_array_len]; ++ q) inet_register_protosw (Q ); /** set the ARP module up */arp_init ();/** set the IP Module up */ip_init (); tcp_v4_init (); /* setup TCP slab cache for open requests. 
*/tcp_init ();/* setup UDP memory threshold */udp_init ();/* Add UDP-lite (RFC 3828) */udplite4_register (); ping_init (); /** set the ICMP layer up */If (icmp_init () <0) panic ("failed to create the ICMP control socket. \ n ");......................... if (kernel () printk (kern_crit "inet_init: cannot init IPv4 mibs \ n"); ipv4_proc_init (); ipfrag_init (); dev_add_pack (& ip_packet_type); rc = 0; out: Return RC; rows: proto_unregister (& raw_prot); rows: proto_unregister (& udp_prot); rows: proto_unregister (& tcp_prot); rows: kfree (rows); goto out ;}
The inetsw_array definition in the above function has four elements:
static struct inet_protosw inetsw_array[] ={{.type = SOCK_STREAM,.protocol = IPPROTO_TCP,.prot = &tcp_prot,.ops = &inet_stream_ops,.no_check = 0,.flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,},{.type = SOCK_DGRAM,.protocol = IPPROTO_UDP,.prot = &udp_prot,.ops = &inet_dgram_ops,.no_check = UDP_CSUM_DEFAULT,.flags = INET_PROTOSW_PERMANENT, }, {.type = SOCK_DGRAM,.protocol = IPPROTO_ICMP,.prot = &ping_prot,.ops = &inet_dgram_ops,.no_check = UDP_CSUM_DEFAULT,.flags = INET_PROTOSW_REUSE, }, { .type = SOCK_RAW, .protocol = IPPROTO_IP,/* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }};
The above function registers the elements in the array to the inetsw pointer array according to the type index.
The sock_register function called in section 2 adds the inet protocol family's net_proto_family definition to the protocol family array net_families. The most important member of that definition is the family's create method, which for the inet family is inet_create.
Int sock_register (const struct net_proto_family * OPS) {int err; If (OPS-> family> = nproto) {printk (kern_crit "protocol % d> = nproto (% d) \ n ", OPS-> family, nproto); Return-enobufs;} spin_lock (& net_family_lock); If (rcu_dereference_protected (net_families [OPS-> family], lockdep_is_held (& net_family_lock) Err =-eexist; else {rcu_init_pointer (net_families [OPS-> family], OPS ); // This is equivalent to assigning Ops to net_families [OPS-> families] err = 0;} spin_unlock (& net_family_lock); printk (kern_info "Net: registered protocol family % d \ n ", OPS-> family); Return err ;}
3. Create a socket
Sockets come in two kinds: the BSD socket (struct socket) and the transport-layer socket (struct sock, which is tied to a specific transport layer protocol).
3.1 create BSD socket
An application calls the socket() function, which issues a system call; the kernel then runs sys_socket to create the BSD socket:
Syscall_define3 (socket, Int, family, Int, type, Int, protocol) {int retval; struct socket * sock; int flags;/* Check the SOCK _ * constants for consistency. */build_bug_on (sock_cloexec! = O_cloexec); build_bug_on (sock_max | sock_type_mask )! = Sock_type_mask); build_bug_on (sock_cloexec & sock_type_mask); build_bug_on (sock_nonblock & sock_type_mask); flags = type &~ Sock_type_mask; If (flags &~ (Sock_cloexec | sock_nonblock) Return-einval; Type & = sock_type_mask; If (sock_nonblock! = O_nonblock & (flags & sock_nonblock) flags = (flags &~ Sock_nonblock) | o_nonblock; retval = sock_create (family, type, protocol, & sock); // call sock_create to create a socket. The parameters are the protocol family number and socket type, the transport layer protocol used to execute the address of the pointer to the socket to be created. If (retval <0) goto out; retval = sock_map_fd (sock, flags & (o_cloexec | o_nonblock); If (retval <0) goto out_release; out: /* It may be already another descriptor 8) Not kernel problem. */return retval; out_release: sock_release (sock); Return retval ;}
The sock_create function calls the __sock_create function to create the socket:
Int _ sock_create (struct net * Net, int family, int type, int protocol, struct socket ** res, int Kern) {int err; struct socket * sock; const struct net_proto_family * PF;/** validity check */If (Family <0 | family> = nproto) Return-eafnosupport; if (type <0 | type> = sock_max) Return-einval;/* compatibility. this uglymoron is moved from Inet layer to here to avoid deadlock in module load. */If (Family = pf_ I Net & type = sock_packet) {static int warned; If (! Warned) {warned = 1; printk (kern_info "% s uses obsolete (pf_inet, sock_packet) \ n", current-> comm);} family = pf_packet ;} err = security_socket_create (family, type, protocol, Kern); If (ERR) return err; sock = sock_alloc (); // allocate the inode structure and obtain the corresponding Socket Structure if (! Sock) {If (net_ratelimit () printk (kern_warning "socket: No more sockets \ n"); Return-enfile;/* Not exactly a match, but its the closest POSIX thing */} sock-> type = type; rcu_read_lock (); pF = rcu_dereference (net_families [Family]); err =-eafnosupport; If (! PF) goto out_release;/** we will call the-> Create Function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */If (! Try_module_get (PF-> owner) // The module detects goto out_release;/* now protected by module ref count */rcu_read_unlock (); // call the inet_create function to create the inet protocol family err = PF-> Create (net, Sock, protocol, Kern); If (ERR <0) goto out_module_put; /** now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. */If (! 
Try_module_get (sock-> OPS-> owner) goto out_module_busy;/** now that we're re done with the-> Create Function, the [loadable] * module can have its refcnt decremented */module_put (PF-> owner); err = security_socket_post_create (sock, family, type, protocol, Kern); If (ERR) goto out_sock_release; * res = sock; return 0; out_module_busy: Err =-eafnosupport; out_module_put: sock-> Ops = NULL; module_put (PF-> owner); out_sock_release: sock_release (sock); Return err; out_release: rcu_read_unlock (); goto out_sock_release ;}
The value of protocol is as follows:
/*
 * Standard well-defined IP protocols.
 *
 * The enumerators carry the protocol numbers used in the "protocol"
 * argument of socket creation; IPPROTO_MAX is one past IPPROTO_RAW.
 */
enum {
	IPPROTO_IP      = 0,	/* Dummy protocol for TCP */
	IPPROTO_ICMP    = 1,	/* Internet Control Message Protocol */
	IPPROTO_IGMP    = 2,	/* Internet Group Management Protocol */
	IPPROTO_IPIP    = 4,	/* IPIP tunnels (older KA9Q tunnels use 94) */
	IPPROTO_TCP     = 6,	/* Transmission Control Protocol */
	IPPROTO_EGP     = 8,	/* Exterior Gateway Protocol */
	IPPROTO_PUP     = 12,	/* PUP protocol */
	IPPROTO_UDP     = 17,	/* User Datagram Protocol */
	IPPROTO_IDP     = 22,	/* XNS IDP protocol */
	IPPROTO_DCCP    = 33,	/* Datagram Congestion Control Protocol */
	IPPROTO_RSVP    = 46,	/* RSVP protocol */
	IPPROTO_GRE     = 47,	/* Cisco GRE tunnels (rfc 1701,1702) */
	IPPROTO_IPV6    = 41,	/* IPv6-in-IPv4 tunnelling */
	IPPROTO_ESP     = 50,	/* Encapsulation Security Payload protocol */
	IPPROTO_AH      = 51,	/* Authentication Header protocol */
	IPPROTO_BEETPH  = 94,	/* IP option pseudo header for BEET */
	IPPROTO_PIM     = 103,	/* Protocol Independent Multicast */
	IPPROTO_COMP    = 108,	/* Compression Header protocol */
	IPPROTO_SCTP    = 132,	/* Stream Control Transport Protocol */
	IPPROTO_UDPLITE = 136,	/* UDP-Lite (RFC 3828) */
	IPPROTO_RAW     = 255,	/* Raw IP packets */
	IPPROTO_MAX
};
3.2 Creating the inet layer socket (struct inet_sock) and the transport layer socket (struct sock)
The inet_create function does both of these jobs: it creates the sock, initializes its attributes, and points the sk member of the BSD socket at the new sock structure.
Static int inet_create (struct net * Net, struct socket * sock, int protocol, int Kern) {struct sock * SK; struct inet_protosw * answer; struct inet_sock * inet; struct proto * answer_prot; unsigned char answer_flags; char answer_no_check; int try_loading_module = 0; int err; If (unlikely (! Inet_ehash_secret) if (sock-> type! = Sock_raw & sock-> type! = Sock_dgram) build_ehash_secret (); sock-> state = ss_unconnected;/* look for the requested type/protocol pair. */lookup_protocol: Err =-esocktnosupport; rcu_read_lock (); // create a sock structure based on the transport layer protocol type // traverse the inetsw linked list list_for_each_entry_rcu (answer, & inetsw [sock-> type], list) {err = 0;/* Check the non-wild match. */If (Protocol = answer-> protocol) {If (protocol! = Ipproto_ip) break; // The adapted inetsw [] element} else {/* check for the two wild cases is found. */If (ipproto_ip = protocol) {protocol = answer-> protocol; break;} If (ipproto_ip = answer-> protocol) break;} err =-eprotonosupport ;} // here answer points to the appropriate inetsw structure. For the TCP protocol, answer points to the following content /**. type = sock_stream ,*. protocol = ipproto_tcp ,*. prot = & tcp_prot ,*. ops = & inet_stream_ops ,*. no_check = 0 ,*. flags = inet_protosw_permanent | * in Et_protosw_icsk, */If (unlikely (ERR) {If (try_loading_module <2) {rcu_read_unlock ();/** be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */If (++ try_loading_module = 1) request_module ("Net-PF-% d-proto-% d-type-% d", pf_inet, protocol, sock-> type);/** fall back to generic, e.g. net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */elsereq Uest_module ("Net-PF-% d-proto-% d", pf_inet, Protocol); goto lookup_protocol;} elsegoto out_rcu_unlock;} err =-eperm; if (sock-> type = sock_raw &&! Kern &&! Capable (cap_net_raw) goto out_rcu_unlock; err =-eafnosupport; If (! 
Inet_netns_ OK (net, protocol) goto out_rcu_unlock; sock-> Ops = answer-> OPS; answer_prot = answer-> prot; answer_no_check = answer-> no_check; answer_flags = answer-> flags; rcu_read_unlock (); warn_on (answer_prot-> slab = NULL); err =-enobufs; // allocate sock struct memory, here, the inet_init function allocates memory in the initialized high-speed buffer, and then performs some initialization work. Further analysis is provided later. SK = sk_alloc (net, pf_inet, gfp_kernel, answer_prot); If (Sk = NULL) goto out; err = 0; SK-> sk_no_check = timeout; If (Response & answer_flags) SK-> sk_reuse = 1; Inet = inet_sk (SK); // for further analysis, why can it be forcibly converted ?!! INet-> is_icsk = (inet_protosw_icsk & answer_flags )! = 0; iNet-> nodefrag = 0; If (sock_raw = sock-> type) {iNet-> inet_num = protocol; If (ipproto_raw = Protocol) iNet-> hdrincl = 1;} If (bytes) iNet-> pmtudisc = ip_pmtudisc_dont; elseinet-> pmtudisc = ip_pmtudisc_want; iNet-> inet_id = 0; // initialize SK and point the sk pointer in sock to the sk structure sock_init_data (sock, SK); // further set SK's other property information SK-> sk_destruct = inet_sock_destruct; SK-> sk_protocol = protocol; SK-> sk_backlog_rcv = Sk-> sk_prot-> backlog_rcv; iNet-> uc_ttl =-1; iNet-> mc_loop = 1; iNet-> mc_ttl = 1; iNet-> mc_all = 1; iNet-> mc_index = 0; iNet-> mc_list = NULL; sk_refcnt_debug_inc (SK); If (iNet-> inet_num) {/* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */inet-> inet_sport = htons (iNet-> inet_num);/* Add to Protocol hash chains. */SK-> sk_prot-> Hash (SK); // call the inet_hash function} If (SK-> sk_prot-> init) {err = Sk-> sk_prot-> Init (SK); // call the tcp_v4_init_sock function for further initialization. Some attributes of the function sk_alloc are set to 0, therefore, if (ERR) sk_common_release (SK);} Out: Return err; out_rcu_unlock: rcu_read_unlock (); goto out ;}
The relationships between socket struct sock and struct inet_sock, struct tcp_sock, struct inet_connection_sock, and other structures need to be further understood.
As mentioned in the previous article, the kernel structures struct socket, struct sock, struct inet_sock, struct tcp_sock, struct raw_sock, struct udp_sock, struct inet_connection_sock, struct inet_timewait_sock and struct tcp_timewait_sock are related as follows:
* Struct socket: This is a BSD-layer socket. The application will first create this type of socket with system calls. It has nothing to do with the specific protocol.
* Struct inet_sock is the Socket Structure Used by the inet protocol family. It can be viewed as an Inet layer and is an extension of struct sock. Its first attribute is the struct sock structure.
* Struct sock is a socket related to the specific transport layer protocol. All kernel operations are based on this socket.
* Struct tcp_sock is the socket representation of the TCP protocol. It is an extension of struct inet_connection_sock. Its first attribute is struct inet_connection_sock inet_conn.
* Struct raw_sock represents the raw socket type. ICMP uses this socket type. It is an extension of struct inet_sock, and therefore ultimately of struct sock.
* Struct udp_sock indicates a UDP socket, which is an extension of struct inet_sock socket.
* Struct inet_connection_sock is the socket used by all connection-oriented protocols and is an extension of struct inet_sock.
The remaining two socket types (struct inet_timewait_sock and struct tcp_timewait_sock) are used to handle timeouts.
Take struct inet_sock and struct sock as an example: why can the kernel cast the address of a struct sock directly to the address of a struct inet_sock? After all, struct inet_sock is necessarily larger than struct sock, yet the following cast is performed directly:
inet = inet_sk(sk);
/*
 * inet_sk - view a struct sock as the struct inet_sock that contains it.
 *
 * This is a plain pointer reinterpretation: the returned pointer has the
 * same address as @sk and no checking is performed.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	const void *base = sk;

	return (struct inet_sock *)base;
}
Does this not cause an illegal memory access? It does not, because the memory that is allocated is not merely the size of struct sock.
Let's take a closer look at the code for Sock struct allocation:
/*
 * sk_alloc - allocate a sock for the given protocol and set its basic fields.
 * @net:      network namespace the sock is attached to
 * @family:   protocol family stored in sk_family
 * @priority: allocation flags; __GFP_ZERO is always added
 * @prot:     protocol descriptor stored in sk_prot and sk_prot_creator
 *
 * Returns the new sock, or NULL if sk_prot_alloc() failed.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *new_sk;

	new_sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (!new_sk)
		return new_sk;

	new_sk->sk_family = family;
	new_sk->sk_prot_creator = prot;
	new_sk->sk_prot = prot;
	sock_lock_init(new_sk);
	sock_net_set(new_sk, get_net(net));
	/* The write-memory counter starts at 1, not 0. */
	atomic_set(&new_sk->sk_wmem_alloc, 1);
	sock_update_classid(new_sk);

	return new_sk;
}
Call the sk_prot_alloc function to allocate the resource:
/*
 * sk_prot_alloc - allocate the memory for a protocol's sock (abridged).
 * @prot:     protocol descriptor; supplies the slab cache and obj_size
 * @priority: allocation flags
 * @family:   protocol family (only used in code omitted from this excerpt)
 *
 * Allocates from prot->slab when the protocol has one, otherwise falls
 * back to kmalloc() of prot->obj_size bytes. In the kmalloc() path the size
 * is therefore that of the protocol's own sock type, not sizeof(struct sock).
 *
 * The "..." comments mark code that the article left out.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		/* __GFP_ZERO is masked off for the slab allocation. */
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		/* ... (rest of this branch omitted in the article) ... */
	} else {
		sk = kmalloc(prot->obj_size, priority);
	}

	/* ... (code omitted in the article) ... */

	return sk;

	/* ... (error-handling labels omitted in the article) ... */
}
The code above first checks whether the protocol has a slab cache. If it does not, the memory is allocated directly with kmalloc, and the size requested is prot->obj_size.
For TCP, tcp_prot sets this attribute to .obj_size = sizeof(struct tcp_sock).
Therefore, the memory behind the struct sock pointer is not just sizeof(struct sock) but something larger, which leaves room for the extra attributes of the extended socket types.
The following example shows how tcp_sock is forcibly converted from Sock:
Next, we will analyze how to bind and connect sockets.