1. 相關資料結構
#define EPOLLIN 0x00000001
#define EPOLLPRI 0x00000002
#define EPOLLOUT 0x00000004
#define EPOLLERR 0x00000008
#define EPOLLHUP 0x00000010
#define EPOLLRDNORM 0x00000040
#define EPOLLRDBAND 0x00000080
#define EPOLLWRNORM 0x00000100
#define EPOLLWRBAND 0x00000200
#define EPOLLMSG 0x00000400
#define EPOLLET 0x80000000

#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3

typedef union epoll_data {
    void *ptr;
    int fd;
    unsigned int u32;
    unsigned long long u64;
} epoll_data_t;

struct epoll_event {
    unsigned int events; //如EPOLLIN、EPOLLOUT
    epoll_data_t data;
};

int epoll_create(int size);
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event); //op為:EPOLL_CTL_ADD、EPOLL_CTL_DEL、EPOLL_CTL_MOD
int epoll_wait(int epfd, struct epoll_event *events, int max, int timeout);

常用的事件類型:
EPOLLIN :表示對應的檔案描述符可以讀;
EPOLLOUT:表示對應的檔案描述符可以寫;
EPOLLPRI:表示對應的檔案描述符有緊急的資料可讀
EPOLLERR:表示對應的檔案描述符發生錯誤;
EPOLLHUP:表示對應的檔案描述符被掛斷;
EPOLLET:將對應的檔案描述符設為邊緣觸發(Edge Triggered)模式;未設定時預設為水平觸發(Level Triggered)模式;
2. epoll_create
在Bionic中的實現見:epoll_create.S,它直接進行系統調用,系統調用表見kernel:src/include/linux/syscalls.h,根據規則在Kernel中Search字串“epoll_create”,就能找到對應的實現函數:SYSCALL_DEFINE1(epoll_create, int, size) <在Kernel的實現在檔案eventpoll.c中>
SYSCALL_DEFINE1(epoll_create, int, size)
{
    if (size <= 0)
        return -EINVAL;

    return sys_epoll_create1(0);
}
看仔細了, 只要傳入的參數大於0即可,它並沒有別的用處。此函數功能為:
1)從當前進程的files中尋找一個閒置fd(檔案控制代碼)
2)建立一個struct file執行個體(其fops為eventpoll_fops,priv為剛為其建立的struct eventpoll對象)
3)把當前進程的files->fdt->fd[fd]設為新建立的struct file執行個體
4)返回給使用者態的當然是一個fd(檔案控制代碼)
3. epoll_ctl
使用者態:int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
Kernel態:SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event),它主要實現eventpoll檔案的控制介面,用於插入、刪除、修改檔案集中的檔案描述符。其代碼處理流程為:<epoll_event用於描述感興趣的事件和源fd>
1)擷取eventpoll檔案控制代碼epfd對應的檔案執行個體(struct file)
2)擷取目標檔案控制代碼fd對應的檔案執行個體(struct file)
3)確保目標檔案控制代碼fd對應的檔案執行個體(struct file)支援poll操作(即:(tfile->f_op && tfile->f_op->poll))
4)把eventpoll檔案的私人資料轉換為eventpoll對象,eventpoll紅/黑樹狀結構的key為:epoll_filefd
struct epoll_filefd {
struct file *file;
int fd;
};
5)在紅/黑樹狀結構中尋找要操作的目標fd和file執行個體,從而擷取一個struct epitem(即紅/黑樹狀結構中節點的資料結構)
6)根據op,進行對應的INSERT、REMOVE或MODIFY操作,下面說說INSERT(當操作為EPOLL_CTL_ADD時)
7)調用int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd)
7.1)建立struct epitem對象(每一個檔案描述符增加到eventpoll介面必須有一個epitem對象,且此對象被插入eventpoll的紅/黑樹狀結構中)
7.2)初始化epi中的三個鏈表,儲存eventpoll,目標fd,目標檔案執行個體,epoll_event..
7.3)把回呼函數註冊給目標檔案的f_op->poll,相關代碼如下:
struct ep_pqueue epq;
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
revents = tfile->f_op->poll(tfile, &epq.pt); //詳情可參考pipe_poll處理方式,它最終還是調用ep_ptable_queue_proc函數來處理
ep_ptable_queue_proc: is used to add our wait queue to the target file wakeup lists
7.4)把此對象插入紅/黑樹狀結構中
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;

    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
}
/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the target file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    ...
}
這ep_poll_callback如何被執行的呢?
下面以pipe為例,假設上面的是檢測從pipe中讀取資料,那麼寫資料時將調用此函數。
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);

static inline void init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
{
    q->flags = 0;
    q->private = NULL;
    q->func = func;
}
ep_poll_callback被儲存在q->func中。
下面看此q->func的調用流程:
static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
           unsigned long nr_segs, loff_t ppos)
{
    ...
    if (do_wakeup) {
        wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        do_wakeup = 0;
    }
    ...
}

#define wake_up_interruptible_sync_poll(x, m) \
    __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))

/**
 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
 */
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
{
    ...
    __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
    ...
}

/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                             int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        if (curr->func(curr, mode, wake_flags, key) &&
            (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}
總結一下:目標檔案以pipefd為例。為新的目標檔案產生一個epitem包含 pipe fd和需要監聽的事件,並將epitem與函數ep_ptable_queue_proc的地址捆綁成一個ep_pqueue結構,然後用結構中的函數地址欄位作為參數執行pipe fd對應的poll函數(pipe_poll),在pipe_poll執行時函數ep_ptable_queue_proc被執行,同時函數體中可以根據傳入的函數地址計算位移來得到epitem指標,函數ep_ptable_queue_proc將epoll回呼函數ep_poll_callback函數與epitem指標捆綁成另一個結構eppoll_entry,然後把eppoll_entry中的函數地址產生一個wait_queue_t,插入到目標pipe
fd的wait queue中,當pipe由於狀態改變而觸發喚醒wait_queue時<在pipe_write中調用wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);它將wake up threads blocked on the waitqueue(pipe->wait)>,包含在隊列中的ep_poll_callback函數就會被調用,同時根據傳入的wait_queue_t指標,用位移量(container_of)來得到epitem,回呼函數會把喚醒時傳入的key(此例為POLLIN | POLLRDNORM)與epitem中登記的關注事件比對,來明確是不是指定的關注事件發生(真正再次執行pipe_poll確認事件,是在epoll_wait流程的ep_send_events_proc中),若成立則將
epitem插入到eventpoll中的rdllist,並喚醒在epoll fd上wait的進程,由該進程在epoll_wait中將事件回傳至使用者態.這樣就能實現對目標fd的事件監聽.
4. epoll_wait
從epfd中讀取epoll_event並儲存到events數組中。
使用者態:int epoll_wait(int epfd, struct epoll_event *events, int max, int timeout);
Kernel態:SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout)
1)擷取epfd對應的檔案執行個體
2)把eventpoll檔案的私人資料轉換為eventpoll對象
3)調用int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)擷取epoll_event。此函數擷取已經準備好的事件,然後把它們儲存到調用者提供的events buffer中。
3.1)ep_poll中調用hrtimer來實現其逾時功能
3.2)調用int ep_events_available(struct eventpoll *ep)來check是否有事件
3.3)調用int ep_send_events(struct eventpoll *ep,struct epoll_event __user *events, int maxevents)來真正地擷取事件,並copy到使用者空間的events buffer中
3.3.1)調用ep_scan_ready_list(ep, ep_send_events_proc, &esed);
3.3.2)在回呼函數中,擷取epoll_event,epoll_event兩個域的資料來源如下:
uevent是使用者提供的epoll_event,
revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & epi->event.events;
if (revents) {
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
return eventcnt ? eventcnt : -EFAULT;
}
...
}