TCP 發送流程

來源:互聯網
上載者:User

socket 編程中 tcp 發送的調用有好幾個:send, sendmsg, sendpage。不過這些調用最終都可以歸結到核心函數

tcp_sendmsg。

/*
 * tcp_sendmsg — entry point for all TCP data transmission from user space
 * (send/sendmsg/sendpage all funnel here).
 *
 * Fragment 1 of the function as quoted in this article: socket locking,
 * connection-state wait, MSS computation and initial error checks.
 * The copy loop and push logic follow in the later fragments below.
 */
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
        size_t size)
{
    struct sock *sk = sock->sk;
    struct iovec *iov;
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    int iovlen, flags;
    int mss_now, size_goal;
    int err, copied;
    long timeo;

    /* This function is reached via system calls, so the socket is locked. */
    lock_sock(sk);
    TCP_CHECK_TIMER(sk);

    flags = msg->msg_flags;
    /* Compute the send timeout (zero if MSG_DONTWAIT is set). */
    timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

    /* Wait for a connection to finish. */
    if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
        if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
            goto out_err;

    /* This should be in poll */
    clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

    /*
     * Compute the current MSS and pick up xmit_size_goal: the goal is
     * normally equal to the MSS, but can be larger when GSO is in use.
     */
    mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
    size_goal = tp->xmit_size_goal;

    /* Ok commence sending. */
    iovlen = msg->msg_iovlen;
    iov = msg->msg_iov;
    copied = 0;

    /* Bail out early if the socket already errored or send side is shut down. */
    err = -EPIPE;
    if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
        goto do_error;

 

下面就進入發送資料流程,主要的工作是把iov中的資料搬移到滿足大小限制的SKB資料包中。

    /*
     * Fragment 2 of tcp_sendmsg: the copy loop. Data is moved from the
     * user iovec into skbs on the write queue. If the tail skb has room
     * (within size_goal), data is appended to it; otherwise a new skb is
     * allocated. For non-SG interfaces data goes into the skb's linear
     * area; with SG it is copied into the skb's page-fragment array.
     */
    while (--iovlen >= 0) {
        int seglen = iov->iov_len;
        unsigned char __user *from = iov->iov_base;

        iov++;

        while (seglen > 0) {
            int copy;

            skb = tcp_write_queue_tail(sk);

            /* No unsent skb, or the tail skb is already full: need a new one. */
            if (!tcp_send_head(sk) ||
                (copy = size_goal - skb->len) <= 0) {

new_segment:
                /* Allocate new segment. If the interface is SG,
                 * allocate skb fitting to single page.
                 */
                if (!sk_stream_memory_free(sk))
                    goto wait_for_sndbuf;

                /*
                 * select_size() decides how much linear memory to
                 * allocate: normally one MSS, but with SG only enough
                 * for the headers (payload goes into page frags).
                 */
                skb = sk_stream_alloc_skb(sk, select_size(sk),
                        sk->sk_allocation);
                if (!skb)
                    goto wait_for_memory;

                /*
                 * Check whether we can use HW checksum.
                 */
                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
                    skb->ip_summed = CHECKSUM_PARTIAL;

                skb_entail(sk, skb);
                /* Upper bound on the data in one skb, normally the MSS. */
                copy = size_goal;
            }

            /* Try to append data to the end of skb. */
            if (copy > seglen)
                copy = seglen;

            /* Where to copy to? */
            if (skb_tailroom(skb) > 0) {
                /* We have some space in skb head. Superb! */
                if (copy > skb_tailroom(skb))
                    copy = skb_tailroom(skb);
                if ((err = skb_add_data(skb, from, copy)) != 0)
                    goto do_fault;
            } else {
                int merge = 0;
                int i = skb_shinfo(skb)->nr_frags;
                struct page *page = TCP_PAGE(sk);
                int off = TCP_OFF(sk);

                if (skb_can_coalesce(skb, i, page, off) &&
                    off != PAGE_SIZE) {
                    /* We can extend the last page
                     * fragment. */
                    merge = 1;
                } else if (i == MAX_SKB_FRAGS ||
                       (!i &&
                       !(sk->sk_route_caps & NETIF_F_SG))) {
                    /* Need to add new fragment and cannot
                     * do this because interface is non-SG,
                     * or because all the page slots are
                     * busy.
                     */
                    tcp_mark_push(tp, skb);
                    goto new_segment;
                } else if (page) {
                    if (off == PAGE_SIZE) {
                        put_page(page);
                        TCP_PAGE(sk) = page = NULL;
                        off = 0;
                    }
                } else
                    off = 0;

                if (copy > PAGE_SIZE - off)
                    copy = PAGE_SIZE - off;

                if (!sk_wmem_schedule(sk, copy))
                    goto wait_for_memory;

                if (!page) {
                    /* Allocate new cache page. */
                    if (!(page = sk_stream_alloc_page(sk)))
                        goto wait_for_memory;
                }

                /* Time to copy data. We are close to
                 * the end! */
                err = skb_copy_to_page(sk, from, skb, page,
                        off, copy);
                if (err) {
                    /* If this page was new, give it to the
                     * socket so it does not get leaked.
                     */
                    if (!TCP_PAGE(sk)) {
                        TCP_PAGE(sk) = page;
                        TCP_OFF(sk) = 0;
                    }
                    goto do_error;
                }

                /* Update the skb. */
                if (merge) {
                    skb_shinfo(skb)->frags[i - 1].size +=
                        copy;
                } else {
                    skb_fill_page_desc(skb, i, page, off, copy);
                    if (TCP_PAGE(sk)) {
                        get_page(page);
                    } else if (off + copy < PAGE_SIZE) {
                        get_page(page);
                        TCP_PAGE(sk) = page;
                    }
                }

                TCP_OFF(sk) = off + copy;
            }

 

對於資料的添加,如果隊列尾的skb有剩餘空間的話,就添加適量的資料到該原有的skb中,否則就構造新的skb。

對於不支援SG機制的網卡來說,資料是添加到skb的線性緩衝區;而支援SG機制下,則copy到skb的page數組中,

不過超過數組的大小後就要重新分配skb了。

 

sk_stream_alloc_skb函數是tcp的緩衝分配函數,實現了緩衝控制機制。

/*
 * sk_stream_alloc_skb — TCP's buffer-allocation helper with send-buffer
 * accounting.
 *
 * Allocates an skb with @size bytes of payload room plus max_header
 * headroom, charging the allocation against the socket's write-memory
 * quota. On allocation failure it enters memory pressure and shrinks the
 * send buffer. Returns the skb, or NULL on failure.
 */
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
    struct sk_buff *skb;

    /* The TCP header must be at least 32-bit aligned. */
    size = ALIGN(size, 4);

    skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
    if (skb) {
        /* Charge the skb against the socket's write-memory quota. */
        if (sk_wmem_schedule(sk, skb->truesize)) {
            /*
             * Make sure that we have exactly size bytes
             * available to the caller, no more, no less.
             */
            skb_reserve(skb, skb_tailroom(skb) - size);
            return skb;
        }
        /* Quota exceeded: give the skb back. */
        __kfree_skb(skb);
    } else {
        /* Allocation failed: signal memory pressure and shrink sndbuf. */
        sk->sk_prot->enter_memory_pressure(sk);
        sk_stream_moderate_sndbuf(sk);
    }
    return NULL;
}

 

 

接下來是資料的發送處理

            /*
             * Fragment 3 of tcp_sendmsg: account the copied data and
             * decide whether to transmit now.
             */
            if (!copied)
                TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

            tp->write_seq += copy;
            /* Advance the sequence number of the last byte in this skb. */
            TCP_SKB_CB(skb)->end_seq += copy;
            skb_shinfo(skb)->gso_segs = 0;

            from += copy;
            copied += copy;
            /* All user data copied: jump out and push via tcp_push(). */
            if ((seglen -= copy) == 0 && iovlen == 0)
                goto out;

            /* skb not yet full (and not OOB): keep filling it. */
            if (skb->len < size_goal || (flags & MSG_OOB))
                continue;

            /*
             * Pending data exceeds half the largest window ever
             * advertised: force a push of all pending frames.
             */
            if (forced_push(tp)) {
                tcp_mark_push(tp, skb);
                __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
            } else if (skb == tcp_send_head(sk))
                /* Only this skb is unsent: send just this one. */
                tcp_push_one(sk, mss_now);
            continue;

 

tcp_push函數也是通過調用__tcp_push_pending_frames來發送資料包。

__tcp_push_pending_frames和tcp_push_one的區別是前者發送多個資料包,後者只發送一個。

這2個函數最終都調用tcp_write_xmit。

下面分析該函數

/*
 * tcp_write_xmit — write as much of the send queue to the network as the
 * congestion window, send window and Nagle/TSO rules allow.
 *
 * Both __tcp_push_pending_frames (many packets) and tcp_push_one
 * (@push_one set, a single packet) end up here. Returns 0 if at least one
 * packet was sent (or cwnd prevented sending); returns nonzero when
 * nothing was sent although the send queue is non-empty.
 */
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
        int push_one, gfp_t gfp)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    unsigned int tso_segs, sent_pkts;
    int cwnd_quota;
    int result;

    sent_pkts = 0;

    /* When sending more than one packet, attempt MTU probing first. */
    if (!push_one) {
        /* Do MTU probing. */
        result = tcp_mtu_probe(sk);
        if (!result) {
            return 0;
        } else if (result > 0) {
            sent_pkts = 1;
        }
    }

    while ((skb = tcp_send_head(sk))) {
        unsigned int limit;

        /*
         * TSO setup: tso_segs is 1 without GSO, otherwise the skb
         * length divided by the MSS.
         */
        tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
        BUG_ON(!tso_segs);

        /* Congestion-window check. */
        cwnd_quota = tcp_cwnd_test(tp, skb);
        if (!cwnd_quota)
            break;

        /* Send-window (receiver window) check. */
        if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
            break;

        if (tso_segs == 1) {
            /* Nagle-algorithm check. */
            if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
                    (tcp_skb_is_last(sk, skb) ?
                     nonagle : TCP_NAGLE_PUSH))))
                break;
        } else {
            if (!push_one && tcp_tso_should_defer(sk, skb))
                break;
        }

        limit = mss_now;
        if (tso_segs > 1 && !tcp_urg_mode(tp))
            limit = tcp_mss_split_point(sk, skb, mss_now,
                    cwnd_quota);

        /* If the skb is longer than the limit it must be segmented. */
        if (skb->len > limit &&
            unlikely(tso_fragment(sk, skb, limit, mss_now)))
            break;

        TCP_SKB_CB(skb)->when = tcp_time_stamp;

        if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
            break;

        /* Advance the send_head. This one is sent out.
         * This call will increment packets_out (the number of
         * packets currently in flight in the network).
         */
        tcp_event_new_data_sent(sk, skb);

        tcp_minshall_update(tp, mss_now, skb);
        sent_pkts++;

        if (push_one)
            break;
    }

    if (likely(sent_pkts)) {
        tcp_cwnd_validate(sk);
        return 0;
    }
    return !tp->packets_out && tcp_send_head(sk);
}

 

如果沒有資料包被發送並且發送隊列不為空,就返回 1;否則返回 0。

 

tcp_cwnd_test函數

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules? If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
        struct sk_buff *skb)
{
    u32 in_flight, cwnd;

    /* Don't be strict about the congestion window for the final FIN. */
    if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
        tcp_skb_pcount(skb) == 1)
        return 1;

    /* Number of packets currently in flight in the network. */
    in_flight = tcp_packets_in_flight(tp);
    cwnd = tp->snd_cwnd;
    if (in_flight < cwnd)
        /*
         * The sendable quota is the remainder of the congestion
         * window after subtracting the in-flight packets.
         */
        return (cwnd - in_flight);

    return 0;
}

 

如果有資料包被發送,就調用函數 tcp_cwnd_validate 來調整擁塞視窗。

/* Congestion window validation. (RFC2861)
 *
 * Called after packets are sent. If the flow is using the whole window,
 * refresh the validation timestamp; if the application is not keeping the
 * window full, track the actual usage and, after an RTO's worth of idle
 * time, shrink the window (when tcp_slow_start_after_idle is enabled).
 */
static void tcp_cwnd_validate(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (tp->packets_out >= tp->snd_cwnd) {
        /* Network is feed fully. */
        tp->snd_cwnd_used = 0;
        tp->snd_cwnd_stamp = tcp_time_stamp;
    } else {
        /* Network starves. */
        if (tp->packets_out > tp->snd_cwnd_used)
            tp->snd_cwnd_used = tp->packets_out;

        /* Window unused for a full RTO: let it decay toward actual usage. */
        if (sysctl_tcp_slow_start_after_idle &&
            (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
            tcp_cwnd_application_limited(sk);
    }
}

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.