Kernel listen backlog and a simple three-way handshake analysis


1. Background:

Following the earlier discussion of how the listen backlog affects the full-connection and half-connection queues, this article works through the kernel source (kernel 2.6.32) to build a simple picture of the server side of the three-way handshake and the role the backlog plays in it.

2. Three-way handshake:

2.1 Server-side listening:

After the system call is entered, the kernel looks up the socket for the given fd, caps the backlog at its maximum value (net.core.somaxconn), and then hands off to inet_listen for processing.
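The capping happens in the listen() system call itself. Roughly, the 2.6.32 entry point in net/socket.c looks like the following sketch (abridged here; comments added):

SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
    struct socket *sock;
    int err, fput_needed;
    int somaxconn;

    /* Resolve the fd to its socket. */
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (sock) {
        /* Cap the user-supplied backlog at net.core.somaxconn. */
        somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
        if ((unsigned)backlog > somaxconn)
            backlog = somaxconn;

        err = security_socket_listen(sock, backlog);
        if (!err)
            err = sock->ops->listen(sock, backlog);

        fput_light(sock->file, fput_needed);
    }
    return err;
}

For a TCP socket, sock->ops->listen points at inet_listen, shown next.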

int inet_listen(struct socket *sock, int backlog)
{
    struct sock *sk = sock->sk;
    unsigned char old_state;
    int err;

    lock_sock(sk);

    err = -EINVAL;
    if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
        goto out;

    old_state = sk->sk_state;
    if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
        goto out;

    /* Really, if the socket is already in listen state
     * we can only allow the backlog to be adjusted.
     */
    if (old_state != TCP_LISTEN) {
        err = inet_csk_listen_start(sk, backlog);
        if (err)
            goto out;
    }
    sk->sk_max_ack_backlog = backlog;
    err = 0;

out:
    release_sock(sk);
    return err;
}
This function performs a simple state check and then calls inet_csk_listen_start to initialize the listening socket. Note at the end that sk->sk_max_ack_backlog is set to the backlog value; this field is what sk_acceptq_is_full uses to decide whether the accept (completed-connection) queue is full, and, as we will see in tcp_v4_conn_request, that check also affects how incoming half-open connections are treated, so the backlog matters for the half-connection path as well.

static inline int sk_acceptq_is_full(struct sock *sk)
{
    return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}
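For context, sk_ack_backlog is the current length of the accept (completed-connection) queue, maintained by two tiny helpers in include/net/sock.h (reproduced here for reference):

static inline void sk_acceptq_removed(struct sock *sk)
{
    sk->sk_ack_backlog--;
}

static inline void sk_acceptq_added(struct sock *sk)
{
    sk->sk_ack_backlog++;
}

inet_csk_listen_start, called from inet_listen above, does the rest of the listen-time initialization: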
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
    struct inet_sock *inet = inet_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

    if (rc != 0)
        return rc;

    /* Initialized to 0 here; the caller (inet_listen) assigns the real
     * backlog after this function returns, as noted earlier. */
    sk->sk_max_ack_backlog = 0;
    sk->sk_ack_backlog = 0;
    inet_csk_delack_init(sk);

    /* There is race window here: we announce ourselves listening,
     * but this transition is still not validated by get_port().
     * It is OK, because this socket enters to hash table only
     * after validation is complete.
     */
    /* Check whether the port is already taken. */
    sk->sk_state = TCP_LISTEN;
    if (!sk->sk_prot->get_port(sk, inet->num)) {
        inet->sport = htons(inet->num);

        sk_dst_reset(sk);
        sk->sk_prot->hash(sk);

        return 0;
    }

    sk->sk_state = TCP_CLOSE;
    __reqsk_queue_destroy(&icsk->icsk_accept_queue);
    return -EADDRINUSE;
}
The initialization mainly zeroes the backlog fields (they are re-assigned to the real backlog by the caller, as shown above) and checks whether the listening port is already in use. Also note the reqsk_queue_alloc call, which actually allocates the listen_sock. The listen_sock structure is shown in the code below; it essentially sets up a hash table whose buckets record connections in the half-open (SYN_RECV) state.
/** struct listen_sock - listen state
 *
 * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
 */
struct listen_sock {
    /* log2 of the hash table size. */
    u8                  max_qlen_log;
    /* 3 bytes hole, try to use */
    /* Number of half-open connections, and number that have not yet
     * been retransmitted ("young"). */
    int                 qlen;
    int                 qlen_young;
    /* Used by the timeout/retransmission scan of the half-open table. */
    int                 clock_hand;
    /* Random seed used to compute the hash value. */
    u32                 hash_rnd;
    u32                 nr_table_entries;
    /* Flexible array holding the half-connection hash table; its size is
     * the backlog rounded up to a power of two. */
    struct request_sock *syn_table[0];
};

int reqsk_queue_alloc(struct request_sock_queue *queue,
                      unsigned int nr_table_entries)
{
    size_t lopt_size = sizeof(struct listen_sock);
    struct listen_sock *lopt;

    /* Table size calculation: clamp the backlog to
     * [8, sysctl_max_syn_backlog] and round up to a power of two. */
    nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
    nr_table_entries = max_t(u32, nr_table_entries, 8);
    nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
    lopt_size += nr_table_entries * sizeof(struct request_sock *);
    if (lopt_size > PAGE_SIZE)
        lopt = __vmalloc(lopt_size,
                         GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
                         PAGE_KERNEL);
    else
        lopt = kzalloc(lopt_size, GFP_KERNEL);
    if (lopt == NULL)
        return -ENOMEM;

    /* The hash table has at least 8 entries. */
    for (lopt->max_qlen_log = 3;
         (1 << lopt->max_qlen_log) < nr_table_entries;
         lopt->max_qlen_log++);

    get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
    rwlock_init(&queue->syn_wait_lock);
    queue->rskq_accept_head = NULL;
    lopt->nr_table_entries = nr_table_entries;

    write_lock_bh(&queue->syn_wait_lock);
    queue->listen_opt = lopt;
    write_unlock_bh(&queue->syn_wait_lock);

    return 0;
}
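As a quick illustration of the sizing arithmetic above, the following stand-alone user-space sketch (not kernel code; it assumes sysctl_max_syn_backlog = 1024, which is just one possible value) reproduces the clamping and rounding done by reqsk_queue_alloc:

#include <stdio.h>

/* User-space stand-in for the kernel's roundup_pow_of_two(). */
static unsigned int roundup_pow_of_two(unsigned int n)
{
    unsigned int p = 1;

    while (p < n)
        p <<= 1;
    return p;
}

int main(void)
{
    const unsigned int sysctl_max_syn_backlog = 1024;  /* assumed value */
    unsigned int backlogs[] = { 1, 5, 8, 100, 511, 2048 };
    unsigned int i;

    for (i = 0; i < sizeof(backlogs) / sizeof(backlogs[0]); i++) {
        unsigned int n = backlogs[i];
        unsigned int max_qlen_log;

        /* Same clamping and rounding as reqsk_queue_alloc(). */
        if (n > sysctl_max_syn_backlog)
            n = sysctl_max_syn_backlog;
        if (n < 8)
            n = 8;
        n = roundup_pow_of_two(n + 1);

        for (max_qlen_log = 3; (1u << max_qlen_log) < n; max_qlen_log++)
            ;

        printf("backlog=%4u -> nr_table_entries=%4u, max_qlen_log=%u\n",
               backlogs[i], n, max_qlen_log);
    }
    return 0;
}

For example, a backlog of 5 yields a 16-entry table with max_qlen_log = 4, while a backlog of 100 yields 128 entries with max_qlen_log = 7.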

2.2 First handshake: the SYN arrives:

When a segment arrives, the IPv4 TCP entry point is tcp_v4_do_rcv. The code below strips out the checksum handling and the ESTABLISHED path and keeps only the listening path. For the first handshake the received packet is a SYN, so after tcp_v4_hnd_req the flow continues into tcp_rcv_state_process to handle the packet that was just received.

int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);

        /* Error: discard the packet. */
        if (!nsk)
            goto discard;

        /* The final ACK was received. */
        if (nsk != sk) {
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    }

    /* Process the segment. */
    TCP_CHECK_TIMER(sk);
    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    TCP_CHECK_TIMER(sk);
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * and you have been warned.
     */
    return 0;
}
The two important functions here are tcp_v4_hnd_req and tcp_rcv_state_process; look at tcp_v4_hnd_req first. If it returns its sk argument, we are in the SYN-receiving case and the listener processes the packet directly; if the return value is non-NULL but different from sk, the final ACK has arrived and a new sock has been created to handle the connection; if it returns NULL, something went wrong and the packet is dropped.

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
    struct tcphdr *th = tcp_hdr(skb);
    const struct iphdr *iph = ip_hdr(skb);
    struct sock *nsk;
    struct request_sock **prev;
    /* Look up the half-connection table first; if nothing is found there,
     * look up the established table; if both fail, fall through to
     * receiving the SYN. */
    struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                                                   iph->saddr, iph->daddr);

    if (req)
        return tcp_check_req(sk, skb, req, prev);

    nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
                                  th->source, iph->daddr, th->dest,
                                  inet_iif(skb));

    if (nsk) {
        if (nsk->sk_state != TCP_TIME_WAIT) {
            bh_lock_sock(nsk);
            return nsk;
        }
        inet_twsk_put(inet_twsk(nsk));
        return NULL;
    }

#ifdef CONFIG_SYN_COOKIES
    if (!th->rst && !th->syn && th->ack)
        sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
    return sk;
}
The function tcp_check_req validates requests in the SYN_RECV state and is not covered here yet. If the half-open connection has not been set up, both lookups fail and the function simply returns sk.

Next, the first-handshake packet is handled by the state machine in tcp_rcv_state_process: in the LISTEN state it dispatches the SYN to the connection-request handler, which for IPv4 is tcp_v4_conn_request.
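The dispatch in the LISTEN state looks roughly like this (an abridged sketch of the switch statement in tcp_rcv_state_process; the other states and most comments are omitted):

    switch (sk->sk_state) {
    case TCP_LISTEN:
        /* An ACK in LISTEN is invalid; tell the caller to send a reset. */
        if (th->ack)
            return 1;

        if (th->rst)
            goto discard;

        if (th->syn) {
            /* For IPv4 this callback is tcp_v4_conn_request(). */
            if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
                return 1;

            kfree_skb(skb);
            return 0;
        }
        goto discard;
    ...
    }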

Simplifying the code and keeping only the main flow, tcp_v4_conn_request does the following:

1. Check whether the half-connection (SYN) queue is full; its capacity is the backlog rounded up to a power of two, as computed in reqsk_queue_alloc. If it is full, the SYN packet is dropped, unless SYN cookies take over (see the helper sketch after this list).

2. Check whether the accept (completed-connection) queue is full while more than one request is still "young"; if so, the SYN is dropped for now. A young request is one whose SYN has only just been answered: its SYN+ACK has not been retransmitted and the final ACK has not arrived yet. qlen_young is decremented either when tcp_check_req receives the ACK and the request moves to the connected state, or at the first SYN+ACK retransmission. My understanding is that dropping here shifts the load from the server to the client, so the retransmission happens on the client side (SYN) rather than the server side (SYN+ACK).

3. If the half-open connection can be created, a request_sock is initialized, a SYN+ACK is sent, and the req is added to the listener's syn_table, so that the next time tcp_v4_hnd_req runs, the connection is found in the half-connection table.

4. Adding the request to the hash table also arms the keepalive timer of the listening socket (used here as the SYN+ACK retransmission timer), which is analyzed later.
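The two queue checks used in steps 1 and 2 boil down to the following helpers (a lightly edited sketch of the inline functions in include/net/inet_connection_sock.h and include/net/request_sock.h):

/* Non-zero once qlen reaches the table size computed in reqsk_queue_alloc. */
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
    return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}

static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
    return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
}

/* Number of requests whose SYN+ACK has not been retransmitted yet. */
static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
{
    return queue->listen_opt->qlen_young;
}

static inline int inet_csk_reqsk_queue_young(const struct sock *sk)
{
    return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue);
}

With those in mind, the abridged tcp_v4_conn_request is: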

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
    ...
    /* TW buckets are converted to open requests without
     * limitations, they conserve resources and peer is
     * evidently real one.
     */
    if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
        if (sysctl_tcp_syncookies) {
            want_cookie = 1;
        } else
#endif
        goto drop;
    }

    /* Accept backlog is full. If we have already queued enough
     * of warm entries in syn queue, drop request. It is better than
     * clogging syn queue with openreqs with exponentially increasing
     * timeout.
     */
    /* When the accept queue is full and more than one request is still
     * "young", the server drops the SYN for now. My understanding: this
     * shifts the load to the client, which retransmits the SYN, instead
     * of the server retransmitting the SYN+ACK. */
    if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
        goto drop;

    req = inet_reqsk_alloc(&tcp_request_sock_ops);
    if (!req)
        goto drop;

#ifdef CONFIG_TCP_MD5SIG
    tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

    tcp_clear_options(&tmp_opt);
    tmp_opt.mss_clamp = 536;
    tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

    tcp_parse_options(skb, &tmp_opt, 0);

    if (want_cookie && !tmp_opt.saw_tstamp)
        tcp_clear_options(&tmp_opt);

    tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

    tcp_openreq_init(req, &tmp_opt, skb);

    ireq = inet_rsk(req);
    ireq->loc_addr = daddr;
    ireq->rmt_addr = saddr;
    ireq->no_srccheck = inet_sk(sk)->transparent;
    ireq->opt = tcp_v4_save_options(sk, skb);

    if (security_inet_conn_request(sk, skb, req))
        goto drop_and_free;

    if (!want_cookie)
        TCP_ECN_create_request(req, tcp_hdr(skb));

    ...

    /* Send the SYN+ACK. */
    if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
        goto drop_and_free;

    /* Add req to the half-connection syn_table and start the timer used
     * for timeout detection. */
    inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
    return 0;

drop_and_release:
    dst_release(dst);
drop_and_free:
    reqsk_free(req);
drop:
    return 0;
}

The key detail of the first handshake, mentioned above, is that when the queues fill up the server simply drops the SYN and leaves retransmission to the client.
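One illustrative way to observe this from user space (a sketch, not part of the original analysis; port 8888 and the backlog of 2 are arbitrary choices, and the exact behaviour also depends on sysctls such as net.ipv4.tcp_max_syn_backlog, net.core.somaxconn and net.ipv4.tcp_abort_on_overflow) is to listen with a tiny backlog and never call accept():

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in addr;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8888);            /* arbitrary test port */

    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(fd, 2) < 0) {                /* deliberately tiny backlog */
        perror("bind/listen");
        return 1;
    }

    /* Never call accept(): completed connections pile up in the accept
     * queue, and once the queues fill up further handshakes are throttled
     * as described above. */
    pause();
    close(fd);
    return 0;
}

With enough concurrent clients, the extra ones should be seen stalling in connect() while their SYNs are retransmitted (or receiving an RST if tcp_abort_on_overflow is enabled), which can be watched with tools such as ss or netstat.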

2.3 Third Handshake:

For the third handshake, the server receives the client's ACK. The packet is still handled in tcp_v4_do_rcv, but this time tcp_v4_hnd_req finds the req in syn_table and enters tcp_check_req to validate the half-open connection (reference [2] walks through the checks in detail). Once the checks pass, a child sock is created to handle the connection; most of that work happens in tcp_v4_syn_recv_sock. If creating the child sock fails, for example because the accept queue is full, the sysctl_tcp_abort_on_overflow flag decides whether to send an RST to the peer or to simply drop the packet and wait for a later retransmission.

If the child sock is created successfully, the request is removed from the hash bucket, the half-connection counters are decremented, the associated timer is stopped, and the request is appended to the accept queue.

    /* (tail of tcp_check_req) */
    child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
    if (child == NULL)
        goto listen_overflow;

    inet_csk_reqsk_queue_unlink(sk, req, prev);
    inet_csk_reqsk_queue_removed(sk, req);

    inet_csk_reqsk_queue_add(sk, req, child);
    return child;

listen_overflow:
    if (!sysctl_tcp_abort_on_overflow) {
        inet_rsk(req)->acked = 1;
        return NULL;
    }

embryonic_reset:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
    if (!(flg & TCP_FLAG_RST))
        req->rsk_ops->send_reset(sk, skb);

    inet_csk_reqsk_queue_drop(sk, req, prev);
    return NULL;
}
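The last call above, inet_csk_reqsk_queue_add, is what finally ties the backlog back to the accept queue: it appends the request to the queue and increments sk_ack_backlog, the counter tested by sk_acceptq_is_full. A sketch of the underlying helper from include/net/request_sock.h:

static inline void reqsk_queue_add(struct request_sock_queue *queue,
                                   struct request_sock *req,
                                   struct sock *parent,
                                   struct sock *child)
{
    req->sk = child;
    /* Bumps parent->sk_ack_backlog, the accept-queue length. */
    sk_acceptq_added(parent);

    if (queue->rskq_accept_head == NULL)
        queue->rskq_accept_head = req;
    else
        queue->rskq_accept_tail->dl_next = req;

    queue->rskq_accept_tail = req;
    req->dl_next = NULL;
}

accept() later pulls requests off rskq_accept_head and decrements the counter again via sk_acceptq_removed.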
2.4 Server-side timeout retransmission:

The common entry point of the timer is tcp_keepalive_timer; for a socket in the LISTEN state it ends up in inet_csk_reqsk_queue_prune.
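Roughly, the dispatch looks like this (an abridged sketch of net/ipv4/tcp_timer.c; TCP_SYNQ_INTERVAL is the polling interval of the SYN queue), with inet_csk_reqsk_queue_prune itself shown right after:

static void tcp_synack_timer(struct sock *sk)
{
    /* Walk the SYN queue, retransmitting or expiring requests. */
    inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
                               TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}

static void tcp_keepalive_timer(unsigned long data)
{
    struct sock *sk = (struct sock *)data;
    ...
    if (sk->sk_state == TCP_LISTEN) {
        /* For a listening socket the "keepalive" timer really drives
         * SYN+ACK retransmission and pruning of the half-open table. */
        tcp_synack_timer(sk);
        goto out;
    }
    ...
}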

void inet_csk_reqsk_queue_prune(struct sock *parent,
                                const unsigned long interval,
                                const unsigned long timeout,
                                const unsigned long max_rto)
{
    struct inet_connection_sock *icsk = inet_csk(parent);
    struct request_sock_queue *queue = &icsk->icsk_accept_queue;
    struct listen_sock *lopt = queue->listen_opt;
    int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
    int thresh = max_retries;
    unsigned long now = jiffies;
    struct request_sock **reqp, *req;
    int i, budget;

    if (lopt == NULL || lopt->qlen == 0)
        return;

    /* Normally all the openreqs are young and become mature
     * (i.e. converted to established socket) for first timeout.
     * If synack was not acknowledged for 1 second, it means
     * one of the following things: synack was lost, ack was lost,
     * rtt is high or nobody planned to ack (i.e. synflood).
     * When server is a bit loaded, queue is populated with old
     * open requests, reducing effective size of queue.
     * When server is well loaded, queue size reduces to zero
     * after several minutes of work. It is not synflood,
     * it is normal operation. The solution is pruning
     * too old entries overriding normal timeout, when
     * situation becomes dangerous.
     *
     * Essentially, we reserve half of room for young
     * embrions; and abort old ones without pity, if old
     * ones are about to clog our table.
     */
    /* When the number of half-open connections grows but qlen_young grows
     * more slowly, thresh gradually shrinks, so half-open connections are
     * more likely to be expired. */
    if (lopt->qlen >> (lopt->max_qlen_log - 1)) {
        int young = (lopt->qlen_young << 1);

        while (thresh > 2) {
            if (lopt->qlen < young)
                break;
            thresh--;
            young <<= 1;
        }
    }

    if (queue->rskq_defer_accept)
        max_retries = queue->rskq_defer_accept;

    budget = 2 * (lopt->nr_table_entries / (timeout / interval));
    i = lopt->clock_hand;

    /* Walk the hash buckets. */
    do {
        reqp = &lopt->syn_table[i];
        while ((req = *reqp) != NULL) {
            /* The request has reached its deadline. */
            if (time_after_eq(now, req->expires)) {
                int expire = 0, resend = 0;

                /* Decide whether to expire and whether to retransmit. */
                syn_ack_recalc(req, thresh, max_retries,
                               queue->rskq_defer_accept,
                               &expire, &resend);
                req->rsk_ops->syn_ack_timeout(parent, req);
                if (!expire &&
                    (!resend ||
                     !inet_rtx_syn_ack(parent, req) ||
                     inet_rsk(req)->acked)) {
                    /* Retransmit the SYN+ACK. */
                    unsigned long timeo;

                    /* After the first timeout the request is no longer
                     * "young". */
                    if (req->num_timeout++ == 0)
                        lopt->qlen_young--;
                    /* Exponential back-off of the timeout. */
                    timeo = min(timeout << req->num_timeout, max_rto);
                    req->expires = now + timeo;
                    reqp = &req->dl_next;
                    continue;
                }

                /* Drop this request: it has expired. */
                inet_csk_reqsk_queue_unlink(parent, req, reqp);
                reqsk_queue_removed(queue, req);
                reqsk_free(req);
                continue;
            }
            reqp = &req->dl_next;
        }

        i = (i + 1) & (lopt->nr_table_entries - 1);

    } while (--budget > 0);

    lopt->clock_hand = i;

    /* Re-arm the timer while half-open connections remain. */
    if (lopt->qlen)
        inet_csk_reset_keepalive_timer(parent, interval);
}

3. Reference documents:

[1]. Function call relationships: http://dedecms.com/knowledge/servers/linux-bsd/2012/1217/17745_3.html

[2]. http://blog.csdn.net/zhangskd/article/details/17923917

[3]. Timer: http://blog.csdn.net/zhangskd/article/details/35281345
