In-depth analysis of the Linux select mechanism

Source: Internet
Author: User
Tags: epoll

In-depth analysis of the Linux select mechanism
This article takes an in-depth look at the Linux select mechanism, one implementation of I/O multiplexing. select raises the level of abstraction and batching: instead of blocking in the actual I/O read/write system call, as the traditional approach does, the process blocks in the select system call and waits until one of the descriptors it is interested in becomes ready. Of course, epoll is the better choice; for example, the bottom layer of NIO in Java is epoll. This article is only intended to explain the principles of the select mechanism. I do not think these I/O multiplexing methods can really be understood without reading the source code. I also learned during interviews that, without practice, what you learn only on paper always stays shallow.

Interview question: can the maximum descriptor limit of select be modified? (To be explored further.)
User-space API synopsis:

       /* According to POSIX.1-2001 */
       #include <sys/select.h>

       /* According to earlier standards */
       #include <sys/time.h>
       #include <sys/types.h>
       #include <unistd.h>

       int select(int nfds, fd_set *readfds, fd_set *writefds,
                  fd_set *exceptfds, struct timeval *timeout);

       void FD_CLR(int fd, fd_set *set);
       int  FD_ISSET(int fd, fd_set *set);
       void FD_SET(int fd, fd_set *set);
       void FD_ZERO(fd_set *set);

       #include <sys/select.h>

       int pselect(int nfds, fd_set *readfds, fd_set *writefds,
                   fd_set *exceptfds, const struct timespec *timeout,
                   const sigset_t *sigmask);

Note: the API differs here from the description in UNPv1 (p. 127): select() is allowed to update the timeout value, and this is reflected in the kernel code (see the call to poll_select_copy_remaining).
The main call path of the select system call in the kernel source is sys_select() -> core_sys_select() -> do_select() -> poll_select_copy_remaining() (the last function is called from sys_select() after core_sys_select() returns). The code is clear at a glance.
/** The Role Of The SYSCALL_DEFINE5 macro is to convert it into a common form of system calling. * asmlinkage long sys_select (int n, fd_set _ user * indium, fd_set _ user * outp, fd_set _ user * exp, struct timeval _ user * tvp); */SYSCALL_DEFINE5 (select, int, n, fd_set _ user *, indium, fd_set _ user *, outp, fd_set _ user *, exp, struct timeval _ user *, tvp) {struct timespec end_time, * to = NULL; struct timeval TV; int ret; if (tvp) {// if the timeout threshold is set if (copy_from_user (& TV, tvp, sizeof (TV) return-EFAULT; to = & end_time; // convert from timeval (in seconds) to (in seconds) then create timeout if (poll_select_set_timeout (to, TV. TV _sec + (TV. TV _usec/USEC_PER_SEC), (TV. TV _usec % USEC_PER_SEC) * NSEC_PER_USEC) return-EINVAL;} // core job ret = core_sys_select (n, indium, outp, exp, ); // The fd_set processed by core_sys_select is then updated with the timeout value ret = poll_select_copy_remaining (& end_time, tvp, 1, ret); return ret;}/** We Can actually return ERESTARTSYS instead of EINTR, but I 'd * like to be certain this leads to no problems. so I return * EINTR just for safety. ** Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want. */int core_sys_select (int n, fd_set _ user * indium, fd_set _ user * outp, fd_set _ user * exp, struct timespec * end_time) {// poll. h: fd _ Set_bits is packed with 6 long *, representing the values of the three descriptive table sets-result fd_set_bits fds; void * bits; int ret, max_fds; unsigned int size; struct fdtable * fdt; /* Allocate small arguments on the stack to save memory and be faster * First, 256B space is pre-allocated. In most cases, space can be allocated under special circumstances. 
*/long stack_fds [SELECT_STACK_ALLOC/ sizeof (long)]; ret =-EINVAL; if (n <0) goto out_nofds;/* max_fds can increase, so grab it once to avoid race */rcu_read_lock (); // Obtain the open file descriptor table (pointer extraction) fdt = files_fdtable (current-> files); max_fds = fdt-> max_fds; rcu_read_unlock (); if (n> max_fds) n = max_fds; // parameter correction/** Number of descriptors to be monitored x size x 8 each requires six digits to indicate * whether it can read or write exceptions and write the results in res_in res_out res_exp * The following memory layout is formed (see figure 1) */size = FDS_BYTES (n); bits = stack_fds; if (size> sizeof (stack_fds)/6) {/* Not enough space in on-stack array; must use kmalloc */ret =-ENOMEM; bits = km Alloc (6 * size, GFP_KERNEL); if (! Bits) goto out_nofds;} fds. in = bits; fds. out = bits + size; fds. ex = bits + 2 * size; fds. res_in = bits + 3 * size; fds. res_out = bits + 4 * size; fds. res_ex = bits + 5 * size; // obtain these fd sets from the user space if (ret = get_fd_set (n, indium, fds. in) | (ret = get_fd_set (n, outp, fds. out) | (ret = get_fd_set (n, exp, fds. (ex) goto out; // The initialization result parameters are 0 zero_fd_set (n, fds. res_in); zero_fd_set (n, fds. res_out); zero_fd_se T (n, fds. res_ex); // all preparations are ready here ..... ret = do_select (n, & fds, end_time); if (ret <0) goto out; if (! Ret) {ret =-ERESTARTNOHAND; if (signal_pending (current) goto out; ret = 0 ;} // After do_select is returned correctly, use copy_to_user to send the descriptor ready result parameter in fds to the user space if (set_fd_set (n, indium, fds. res_in) | set_fd_set (n, outp, fds. res_out) | set_fd_set (n, exp, fds. res_ex) ret =-EFAULT; out: if (bits! 
= Stack_fds) kfree (bits); out_nofds: return ret;} // The core work of select: int do_select (int n, fd_set_bits * fds, struct timespec * end_time) {ktime_t expire, * to = NULL; struct poll_wqueues table; poll_table * wait; int retval, I, timed_out = 0; unsigned long slack = 0; unsigned int busy_flag = net_busy_loop_on ()? POLL_BUSY_LOOP: 0; unsigned long busy_end = 0; // obtain the maximum descriptor value rcu_read_lock (); retval = max_select_fd (n, fds); rcu_read_unlock (); if (retval <0) return retval; n = retval; poll_initwait (& table); wait = & table.pt; // The timer value (in seconds) if the value is 0, it indicates not waiting for if (end_time &&! End_time-> TV _sec &&! End_time-> TV _nsec) {wait-> _ qproc = NULL; timed_out = 1;} if (end_time &&! Timed_out) slack = select_estimate_accuracy (end_time); // The number of descriptors that are ready for statistics for this variable is used below. Therefore, the value 0 retval = 0; for (;) {unsigned long * rinp, * routp, * rexp, * indium, * outp, * exp; bool can_busy_loop = false; indium = fds-> in; outp = fds-> out; exp = fds-> ex; rinp = fds-> res_in; routp = fds-> res_out; rexp = fds-> res_ex; for (I = 0; I <n; ++ rinp, ++ routp, ++ rexp) {unsigned long in, out, ex, all_bits, bit = 1, mask, j; u Nsigned long res_in = 0, res_out = 0, res_ex = 0; in = * indium ++; out = * outp ++; ex = * exp ++; all_bits = in | out | ex; // You Need to poll these bitmaps at a time to locate a certain fd range that we care about. // otherwise, move forward with the 32bits step if (all_bits = 0) {I + = BITS_PER_LONG; continue;} // The fd we are concerned about in this interval. Therefore, we need to track the details in depth (Figure 2) for (j = 0; j <BITS_PER_LONG; ++ j, ++ I, bit <= 1) {struct fd f; if (I> = n) break; if (! (Bit & all_bits) continue; // if a bit in the current interval is 1, the corresponding fd needs to be processed. // at this moment, I is the file descriptor value f = fdget (I ); if (f. 
file) {const struct file_operations * f_op; f_op = f. file-> f_op; mask = DEFAULT_POLLMASK; // poll function pointer specific to the file operation result for if (f_op-> poll) {wait_key_set (wait, in, out, bit, busy_flag); mask = (* f_op-> poll) (f. file, wait); // TODO} // The fdget above increases the file reference count, so fdput (f) is restored here ); /* If the attention descriptor is ready, update it to the result parameter * and add Number of ready instances */if (mask & POLLIN_SET) & (in & bit) {res_in | = bit; retval ++; wait-> _ qproc = NULL ;} if (mask & POLLOUT_SET) & (out & bit) {res_out | = bit; retval ++; wait-> _ qproc = NULL ;} if (mask & POLLEX_SET) & (ex & bit) {res_ex | = bit; retval ++; wait-> _ qproc = NULL;}/* got something, stop busy polling * stop a busy cycle */if (retval) {can_busy_loop = false; busy_flag = 0;/** only remember a r Eturned * POLL_BUSY_LOOP if we asked for it */} else if (busy_flag & mask) can_busy_loop = true ;}// after this round of interval traversal, update the result parameter if (res_in) * rinp = res_in; if (res_out) * routp = res_out; if (res_ex) * rexp = res_ex; /* perform one scheduling to allow other processes to run * wait for the queue to wake up */cond_resched ();} // wait-> _ qproc = NULL after one round of polling; // if a descriptor is ready or timeout is set, or the signal to be processed is to exit this endless loop if (retval | timed_out | signal_pending (current) break; if (table. Error) {retval = table. error; break;}/* only if found POLL_BUSY_LOOP sockets & not out of time */if (can_busy_loop &&! Need_resched () {if (! Busy_end) {busy_end = busy_loop_end_time (); continue;} if (! Busy_loop_timeout (busy_end) continue;} busy_flag = 0;/* if timeout is set and this is the first loop (to = NULL) */if (end_time &&! 
To) {// convert from timespec to ktime type (64-Bit Signed value) expire = timespec_to_ktime (* end_time); to = & expire ;} /* set the status of the process to TASK_INTERRUPTIBLE and sleep until the time-out * returns to task _ running */if (! Poll_schedule_timeout (& table, TASK_INTERRUPTIBLE, to, slack) timed_out = 1 ;}// release the poll wait queue poll_freewait (& table); return retval ;}

Figure 1:

Figure 2:




References: (1) Linux kernel 3.18 source code; (2) Linux man pages; (3) UNPv1.

Time spent: 3 h


Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.