TCP 發送流程

來源:互聯網
上載者:User

socket 編程中 tcp 發送的調用有好幾個:send, sendmsg, sendpage。不過這些調用最終都可以歸結到核心函數

tcp_sendmsg。

/*
 * tcp_sendmsg — entry point for all TCP data transmission from user space
 * (send/sendmsg/sendpage all funnel here).
 *
 * Fragment 1 of the function as quoted in this article: socket locking,
 * connection-state wait, MSS computation and initial error checks.
 * The copy loop and push logic follow in the later fragments below.
 */
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
        size_t size)
{
    struct sock *sk = sock->sk;
    struct iovec *iov;
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    int iovlen, flags;
    int mss_now, size_goal;
    int err, copied;
    long timeo;

    /* This function is reached via system calls, so the socket is locked. */
    lock_sock(sk);
    TCP_CHECK_TIMER(sk);

    flags = msg->msg_flags;
    /* Compute the send timeout (zero if MSG_DONTWAIT is set). */
    timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

    /* Wait for a connection to finish. */
    if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
        if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
            goto out_err;

    /* This should be in poll */
    clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

    /*
     * Compute the current MSS and pick up xmit_size_goal: the goal is
     * normally equal to the MSS, but can be larger when GSO is in use.
     */
    mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
    size_goal = tp->xmit_size_goal;

    /* Ok commence sending. */
    iovlen = msg->msg_iovlen;
    iov = msg->msg_iov;
    copied = 0;

    /* Bail out early if the socket already errored or send side is shut down. */
    err = -EPIPE;
    if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
        goto do_error;

 

下面就進入發送資料流程,主要的工作是把iov中的資料搬移到滿足大小限制的SKB資料包中。

    /*
     * Fragment 2 of tcp_sendmsg: the copy loop. Data is moved from the
     * user iovec into skbs on the write queue. If the tail skb has room
     * (within size_goal), data is appended to it; otherwise a new skb is
     * allocated. For non-SG interfaces data goes into the skb's linear
     * area; with SG it is copied into the skb's page-fragment array.
     */
    while (--iovlen >= 0) {
        int seglen = iov->iov_len;
        unsigned char __user *from = iov->iov_base;

        iov++;

        while (seglen > 0) {
            int copy;

            skb = tcp_write_queue_tail(sk);

            /* No unsent skb, or the tail skb is already full: need a new one. */
            if (!tcp_send_head(sk) ||
                (copy = size_goal - skb->len) <= 0) {

new_segment:
                /* Allocate new segment. If the interface is SG,
                 * allocate skb fitting to single page.
                 */
                if (!sk_stream_memory_free(sk))
                    goto wait_for_sndbuf;

                /*
                 * select_size() decides how much linear memory to
                 * allocate: normally one MSS, but with SG only enough
                 * for the headers (payload goes into page frags).
                 */
                skb = sk_stream_alloc_skb(sk, select_size(sk),
                        sk->sk_allocation);
                if (!skb)
                    goto wait_for_memory;

                /*
                 * Check whether we can use HW checksum.
                 */
                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
                    skb->ip_summed = CHECKSUM_PARTIAL;

                skb_entail(sk, skb);
                /* Upper bound on the data in one skb, normally the MSS. */
                copy = size_goal;
            }

            /* Try to append data to the end of skb. */
            if (copy > seglen)
                copy = seglen;

            /* Where to copy to? */
            if (skb_tailroom(skb) > 0) {
                /* We have some space in skb head. Superb! */
                if (copy > skb_tailroom(skb))
                    copy = skb_tailroom(skb);
                if ((err = skb_add_data(skb, from, copy)) != 0)
                    goto do_fault;
            } else {
                int merge = 0;
                int i = skb_shinfo(skb)->nr_frags;
                struct page *page = TCP_PAGE(sk);
                int off = TCP_OFF(sk);

                if (skb_can_coalesce(skb, i, page, off) &&
                    off != PAGE_SIZE) {
                    /* We can extend the last page
                     * fragment. */
                    merge = 1;
                } else if (i == MAX_SKB_FRAGS ||
                       (!i &&
                       !(sk->sk_route_caps & NETIF_F_SG))) {
                    /* Need to add new fragment and cannot
                     * do this because interface is non-SG,
                     * or because all the page slots are
                     * busy.
                     */
                    tcp_mark_push(tp, skb);
                    goto new_segment;
                } else if (page) {
                    if (off == PAGE_SIZE) {
                        put_page(page);
                        TCP_PAGE(sk) = page = NULL;
                        off = 0;
                    }
                } else
                    off = 0;

                if (copy > PAGE_SIZE - off)
                    copy = PAGE_SIZE - off;

                if (!sk_wmem_schedule(sk, copy))
                    goto wait_for_memory;

                if (!page) {
                    /* Allocate new cache page. */
                    if (!(page = sk_stream_alloc_page(sk)))
                        goto wait_for_memory;
                }

                /* Time to copy data. We are close to
                 * the end! */
                err = skb_copy_to_page(sk, from, skb, page,
                        off, copy);
                if (err) {
                    /* If this page was new, give it to the
                     * socket so it does not get leaked.
                     */
                    if (!TCP_PAGE(sk)) {
                        TCP_PAGE(sk) = page;
                        TCP_OFF(sk) = 0;
                    }
                    goto do_error;
                }

                /* Update the skb. */
                if (merge) {
                    skb_shinfo(skb)->frags[i - 1].size +=
                        copy;
                } else {
                    skb_fill_page_desc(skb, i, page, off, copy);
                    if (TCP_PAGE(sk)) {
                        get_page(page);
                    } else if (off + copy < PAGE_SIZE) {
                        get_page(page);
                        TCP_PAGE(sk) = page;
                    }
                }

                TCP_OFF(sk) = off + copy;
            }

 

對於資料的添加,如果隊列尾的skb有剩餘空間的話,就添加適量的資料到該原有的skb中,否則就構造新的skb。

對於不支援SG機制的網卡來說,資料是添加到skb的線性緩衝區;而支援SG機制下,則copy到skb的page數組中,

不過超過數組的大小後就要重新分配skb了。

 

sk_stream_alloc_skb函數是tcp的緩衝分配函數,實現了緩衝控制機制。

/*
 * sk_stream_alloc_skb — TCP's buffer-allocation helper with send-buffer
 * accounting.
 *
 * Allocates an skb with @size bytes of payload room plus max_header
 * headroom, charging the allocation against the socket's write-memory
 * quota. On allocation failure it enters memory pressure and shrinks the
 * send buffer. Returns the skb, or NULL on failure.
 */
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
    struct sk_buff *skb;

    /* The TCP header must be at least 32-bit aligned. */
    size = ALIGN(size, 4);

    skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
    if (skb) {
        /* Charge the skb against the socket's write-memory quota. */
        if (sk_wmem_schedule(sk, skb->truesize)) {
            /*
             * Make sure that we have exactly size bytes
             * available to the caller, no more, no less.
             */
            skb_reserve(skb, skb_tailroom(skb) - size);
            return skb;
        }
        /* Quota exceeded: give the skb back. */
        __kfree_skb(skb);
    } else {
        /* Allocation failed: signal memory pressure and shrink sndbuf. */
        sk->sk_prot->enter_memory_pressure(sk);
        sk_stream_moderate_sndbuf(sk);
    }
    return NULL;
}

 

 

接下來是資料的發送處理

            /*
             * Fragment 3 of tcp_sendmsg: account the copied data and
             * decide whether to transmit now.
             */
            if (!copied)
                TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

            tp->write_seq += copy;
            /* Advance the sequence number of the last byte in this skb. */
            TCP_SKB_CB(skb)->end_seq += copy;
            skb_shinfo(skb)->gso_segs = 0;

            from += copy;
            copied += copy;
            /* All user data copied: jump out and push via tcp_push(). */
            if ((seglen -= copy) == 0 && iovlen == 0)
                goto out;

            /* skb not yet full (and not OOB): keep filling it. */
            if (skb->len < size_goal || (flags & MSG_OOB))
                continue;

            /*
             * Pending data exceeds half the largest window ever
             * advertised: force a push of all pending frames.
             */
            if (forced_push(tp)) {
                tcp_mark_push(tp, skb);
                __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
            } else if (skb == tcp_send_head(sk))
                /* Only this skb is unsent: send just this one. */
                tcp_push_one(sk, mss_now);
            continue;

 

tcp_push函數也是通過調用__tcp_push_pending_frames來發送資料包。

__tcp_push_pending_frames和tcp_push_one的區別是前者發送多個資料包,後者只發送一個。

這2個函數最終都調用tcp_write_xmit。

下面分析該函數

/*
 * tcp_write_xmit — write as much of the send queue to the network as the
 * congestion window, send window and Nagle/TSO rules allow.
 *
 * Both __tcp_push_pending_frames (many packets) and tcp_push_one
 * (@push_one set, a single packet) end up here. Returns 0 if at least one
 * packet was sent (or cwnd prevented sending); returns nonzero when
 * nothing was sent although the send queue is non-empty.
 */
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
        int push_one, gfp_t gfp)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    unsigned int tso_segs, sent_pkts;
    int cwnd_quota;
    int result;

    sent_pkts = 0;

    /* When sending more than one packet, attempt MTU probing first. */
    if (!push_one) {
        /* Do MTU probing. */
        result = tcp_mtu_probe(sk);
        if (!result) {
            return 0;
        } else if (result > 0) {
            sent_pkts = 1;
        }
    }

    while ((skb = tcp_send_head(sk))) {
        unsigned int limit;

        /*
         * TSO setup: tso_segs is 1 without GSO, otherwise the skb
         * length divided by the MSS.
         */
        tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
        BUG_ON(!tso_segs);

        /* Congestion-window check. */
        cwnd_quota = tcp_cwnd_test(tp, skb);
        if (!cwnd_quota)
            break;

        /* Send-window (receiver window) check. */
        if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
            break;

        if (tso_segs == 1) {
            /* Nagle-algorithm check. */
            if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
                    (tcp_skb_is_last(sk, skb) ?
                     nonagle : TCP_NAGLE_PUSH))))
                break;
        } else {
            if (!push_one && tcp_tso_should_defer(sk, skb))
                break;
        }

        limit = mss_now;
        if (tso_segs > 1 && !tcp_urg_mode(tp))
            limit = tcp_mss_split_point(sk, skb, mss_now,
                    cwnd_quota);

        /* If the skb is longer than the limit it must be segmented. */
        if (skb->len > limit &&
            unlikely(tso_fragment(sk, skb, limit, mss_now)))
            break;

        TCP_SKB_CB(skb)->when = tcp_time_stamp;

        if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
            break;

        /* Advance the send_head. This one is sent out.
         * This call will increment packets_out (the number of
         * packets currently in flight in the network).
         */
        tcp_event_new_data_sent(sk, skb);

        tcp_minshall_update(tp, mss_now, skb);
        sent_pkts++;

        if (push_one)
            break;
    }

    if (likely(sent_pkts)) {
        tcp_cwnd_validate(sk);
        return 0;
    }
    return !tp->packets_out && tcp_send_head(sk);
}

 

如果沒有資料包被發送並且發送隊列不為空,就返回 1;否則返回 0。

 

tcp_cwnd_test函數

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules? If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
        struct sk_buff *skb)
{
    u32 in_flight, cwnd;

    /* Don't be strict about the congestion window for the final FIN. */
    if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
        tcp_skb_pcount(skb) == 1)
        return 1;

    /* Number of packets currently in flight in the network. */
    in_flight = tcp_packets_in_flight(tp);
    cwnd = tp->snd_cwnd;
    if (in_flight < cwnd)
        /*
         * The sendable quota is the remainder of the congestion
         * window after subtracting the in-flight packets.
         */
        return (cwnd - in_flight);

    return 0;
}

 

如果有資料包被發送,就調用函數 tcp_cwnd_validate 來調整擁塞視窗。

/* Congestion window validation. (RFC2861)
 *
 * Called after packets are sent. If the flow is using the whole window,
 * refresh the validation timestamp; if the application is not keeping the
 * window full, track the actual usage and, after an RTO's worth of idle
 * time, shrink the window (when tcp_slow_start_after_idle is enabled).
 */
static void tcp_cwnd_validate(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (tp->packets_out >= tp->snd_cwnd) {
        /* Network is feed fully. */
        tp->snd_cwnd_used = 0;
        tp->snd_cwnd_stamp = tcp_time_stamp;
    } else {
        /* Network starves. */
        if (tp->packets_out > tp->snd_cwnd_used)
            tp->snd_cwnd_used = tp->packets_out;

        /* Window unused for a full RTO: let it decay toward actual usage. */
        if (sysctl_tcp_slow_start_after_idle &&
            (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
            tcp_cwnd_application_limited(sk);
    }
}

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.