標籤:linu oop dwz ... turn ast eth ant 記憶體布局
Linux select 機制深入分析 作為 IO 複用的一種實現方式,select 提高了抽象和批次(batch)處理的層級:它不像傳統方式那樣阻塞在真正進行 IO 讀寫的系統調用上,而是阻塞在 select 系統調用上,等待我們關注的描述符就緒。當然,如今更好的方式是 epoll,比如 Java 中的 NIO 底層用的就是 epoll。這篇文章只是為了搞懂 select 機制的原理:不看原始碼就不能說懂這些 IO 複用手法。我在面試過程中也體會到了,不去實踐,知道的就永遠只是皮毛。
面試問題:select 的最大描述符數量限制可以修改嗎?(有待深入)
使用者層 API 語法:
/* According to POSIX.1-2001 */
#include <sys/select.h>

/* According to earlier standards */
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

int select(int nfds, fd_set *readfds, fd_set *writefds,
           fd_set *exceptfds, struct timeval *timeout);

void FD_CLR(int fd, fd_set *set);
int  FD_ISSET(int fd, fd_set *set);
void FD_SET(int fd, fd_set *set);
void FD_ZERO(fd_set *set);

#include <sys/select.h>

int pselect(int nfds, fd_set *readfds, fd_set *writefds,
            fd_set *exceptfds, const struct timespec *timeout,
            const sigset_t *sigmask);
注:這裡的 API 發生了變化(參見 UNPv1 P127):timeout 的值是允許被更新的,這在核心代碼中有體現。
select 系統調用的核心原始碼主要流程是:sys_select() -> core_sys_select() -> do_select() -> poll_select_copy_remaining()。看代碼可以一目瞭然。
/** SYSCALL_DEFINE5宏的作用就是將其轉成系統調用的常見形式,* asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,fd_set __user *exp, struct timeval __user *tvp);*/SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp){ struct timespec end_time, *to = NULL; struct timeval tv; int ret; if (tvp) {//假設設定了逾時閾值 if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; // 從timeval(秒 微秒)轉換為(秒 納秒) 繼而建立逾時 if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } // 核心工作 ret = core_sys_select(n, inp, outp, exp, to); //core_sys_select處理的fd_set 接下來更新timeout的值 ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); return ret;}/** We can actually return ERESTARTSYS instead of EINTR, but I‘d* like to be certain this leads to no problems. So I return* EINTR just for safety.** Update: ERESTARTSYS breaks at least the xview clock binary, so* I‘m trying ERESTARTNOHAND which restart only when you want to.*/int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec *end_time){ // poll.h :fd_set_bits封裝了6個long *,代表三個描寫敘述表集的值-結果 fd_set_bits fds; void *bits; int ret, max_fds; unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster * 先是預分配256B的空間 大多數情況下可以滿足須要 特殊情況在以下會分配空間 */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); // 獲得開啟檔案描寫敘述符表(指標析取) fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds;//參數修正 /* * 如今要監視的描寫敘述符個數個size*8個對於每個都須要6個位來標示 * 它是否可以讀寫異常而且把結果寫在res_in res_out res_exp中 * 所以構成了以下的記憶體布局(見圖1) */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; 
bits = kmalloc(6 * size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; // 從使用者空間得到這些fd sets if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; // 初始化這些結果參數為0 zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); // 到這裡 一切準備工作都就緒了..... ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } // do_select正確返回後 通過copy_to_user將fds中的描寫敘述符就緒結果參數 // 反饋到使用者空間 if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT;out: if (bits != stack_fds) kfree(bits);out_nofds: return ret;}// select 的核心工作int do_select(int n, fd_set_bits *fds, struct timespec *end_time){ ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; unsigned int busy_flag = net_busy_loop_on() ? 
POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; // 得到Select要監測的最大的描寫敘述符值 rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; // 定時器值(秒 納秒)為0的話標示不等待 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); // 以下會用到這個變數統計就緒的描寫敘述符個數 所以先清0 retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, mask, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; // 要一次輪詢這些這些位元影像 定位到某個有我們關心的fd的區間 // 否則以32bits步長前進 if (all_bits == 0) { i += BITS_PER_LONG; continue; } // 當前這個區間有我們關心的fd 所以深入細節追蹤(圖2) for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { struct fd f; if (i >= n) break; if (!(bit & all_bits)) continue; // 假設發現了當前區間的某一個bit為1 則說明相應的fd須要我們處理 // 此時此刻的i正是檔案描寫敘述符值 f = fdget(i); if (f.file) { const struct file_operations *f_op; f_op = f.file->f_op; mask = DEFAULT_POLLMASK; //詳細到檔案操作結果中的poll函數指標 對於 if (f_op->poll) { wait_key_set(wait, in, out, bit, busy_flag); mask = (*f_op->poll)(f.file, wait);// TODO } // 上面的fdget添加了file引用計數 所以這裡恢複 fdput(f); /* 推斷關注的描寫敘述符是否就緒 就緒的話就更新到結果參數中 * 而且添加就緒個數 */ if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling * 停止忙迴圈 */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) 
can_busy_loop = true; } } // 這一輪的區間遍曆完之後 更新結果參數 if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; /* 進行一次調度 同意其它進程執行 * 後面有等待隊列喚醒 */ cond_resched(); } // 一輪輪詢之後 wait->_qproc = NULL; // 假設有描寫敘述符就緒 或者設定了逾時 或者有待處理訊號 則退出這個死迴圈 if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_end) { busy_end = busy_loop_end_time(); continue; } if (!busy_loop_timeout(busy_end)) continue; } busy_flag = 0; /* 假設設定逾時 而且這是首次迴圈(to==NULL) */ if (end_time && !to) { // 從timespec轉化為ktime類型(64位的有符號值) expire = timespec_to_ktime(*end_time); to = &expire; } /*設定該進程狀態TASK_INTERRUPTIBLE 睡眠直到逾時 * 返回到這裡後進程 TASK_RUNNING */ if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } // 釋放該poll wait queue poll_freewait(&table); return retval;}
附圖1:
附圖2:
參考:(1)Linux kernel 3.18 source code (2)Linux man page (3)UNPv1。耗時:3h
Linux select 機制深入分析