標籤:
write對應的系統調用是sys_write,代碼如下:
/*
 * sys_write - system-call entry for write(2).
 * @fd:    descriptor of the already opened file
 * @buf:   data to be written
 * @count: number of bytes requested
 *
 * Returns -EBADF when fget() finds no file or the file lacks FMODE_WRITE,
 * a non-zero result of locks_verify_area() as-is, -EINVAL when the file
 * has no f_op->write method, and otherwise whatever that method returns.
 */
asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
	struct file *filp = fget(fd);
	ssize_t result = -EBADF;

	if (filp == NULL)
		return result;

	if (filp->f_mode & FMODE_WRITE) {
		struct inode *target = filp->f_dentry->d_inode;

		result = locks_verify_area(FLOCK_VERIFY_WRITE, target, filp,
					   filp->f_pos, count);
		if (result == 0) {
			ssize_t (*do_write)(struct file *, const char *, size_t, loff_t *);

			/* Stays -EINVAL unless a write method exists and is called. */
			result = -EINVAL;
			if (filp->f_op != NULL) {
				do_write = filp->f_op->write;
				if (do_write != NULL)
					result = do_write(filp, buf, count, &filp->f_pos);
			}
		}
	}

	/* Only a positive byte count triggers DN_MODIFY on the parent directory. */
	if (result > 0)
		inode_dir_notify(filp->f_dentry->d_parent->d_inode, DN_MODIFY);

	/* Pairs with the fget() above. */
	fput(filp);
	return result;
}
fd假設就是Linux核心原始碼情景分析-檔案的開啟,一文中剛剛開啟檔案/usr/local/hello.c的檔案號。fget(fd),根據開啟檔案號fd找到該已開啟檔案的file結構。代碼如下:
struct file * fget(unsigned int fd){struct file * file;struct files_struct *files = current->files;read_lock(&files->file_lock);file = fcheck(fd);if (file)get_file(file);read_unlock(&files->file_lock);return file;}
/*
 * fcheck - bounds-checked read of the current task's fd array.
 * @fd: open-file number
 *
 * Returns current->files->fd[fd] when fd is below max_fds, NULL otherwise.
 * No locking is done here.
 */
static inline struct file * fcheck(unsigned int fd)
{
	struct files_struct *open_files = current->files;

	if (fd >= open_files->max_fds)
		return NULL;

	return open_files->fd[fd];
}
還記得在開啟檔案的時候,file->f_op被設定為f->f_op = fops_get(inode->i_fop);。對於ext2檔案系統,f->f_op要麼為ext2_file_operations,要麼為ext2_dir_operations,視操作的目標為檔案或目錄而選擇其一。對於檔案來說,file->f_op->write為generic_file_write。
ssize_tgeneric_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)//file是要寫入檔案節點的file結構,buf為要寫入內容的指標,count是數量,ppos是要寫入檔案的位置{struct inode*inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping;unsigned longlimit = current->rlim[RLIMIT_FSIZE].rlim_cur;loff_tpos;struct page*page, *cached_page;unsigned longwritten;longstatus;interr;cached_page = NULL;down(&inode->i_sem);pos = *ppos;err = -EINVAL;if (pos < 0)goto out;err = file->f_error;if (err) {file->f_error = 0;goto out;}written = 0;if (file->f_flags & O_APPEND)pos = inode->i_size;/* * Check whether we‘ve reached the file size limit. */err = -EFBIG;if (limit != RLIM_INFINITY) {if (pos >= limit) {send_sig(SIGXFSZ, current, 0);goto out;}if (count > limit - pos) {send_sig(SIGXFSZ, current, 0);count = limit - pos;}}status = 0;if (count) {remove_suid(inode);inode->i_ctime = inode->i_mtime = CURRENT_TIME;mark_inode_dirty_sync(inode);}while (count) {unsigned long bytes, index, offset;char *kaddr;int deactivate = 1;/* * Try to find the page in the cache. If it isn‘t there, * allocate a free page. */offset = (pos & (PAGE_CACHE_SIZE -1)); //根據當前位置pos計算出本次迴圈中要寫多的緩衝頁面index、在該頁面中的起點offset以及寫入長度bytesindex = pos >> PAGE_CACHE_SHIFT;bytes = PAGE_CACHE_SIZE - offset;if (bytes > count) {bytes = count;deactivate = 0;}/* * Bring in the user page that we will copy from _first_. * Otherwise there‘s a nasty deadlock on copying from the * same page as we‘re writing to, without it being marked * up-to-date. */{ volatile unsigned char dummy;__get_user(dummy, buf);__get_user(dummy, buf+bytes-1);}status = -ENOMEM;/* we‘ll assign it later anyway */page = __grab_cache_page(mapping, index, &cached_page);//在page_hash_table中找到該緩衝頁面,如找不到,就分配、建立一個緩衝頁面if (!page)break;/* We have exclusive IO access to the page.. 
*/if (!PageLocked(page)) {PAGE_BUG(page);}status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);//預寫先讀,先把檔案在裝置上對應位置的資料讀到page中if (status)goto unlock;kaddr = page_address(page);status = copy_from_user(kaddr+offset, buf, bytes);//把資料從使用者空間拷貝到page指向的頁面中,已經放入了緩衝區flush_dcache_page(page);if (status)goto fail_write;status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);//真正的寫,把緩衝區的資料寫到裝置上if (!status)status = bytes;if (status >= 0) {written += status;count -= status;pos += status;buf += status;}unlock:/* Mark it unlocked again and drop the page.. */UnlockPage(page);if (deactivate)deactivate_page(page);page_cache_release(page);if (status < 0)break;}*ppos = pos;if (cached_page)page_cache_free(cached_page);/* For now, when the user asks for O_SYNC, we‘ll actually * provide O_DSYNC. */if ((status >= 0) && (file->f_flags & O_SYNC))status = generic_osync_inode(inode, 1); /* 1 means datasync */err = written ? written : status;out:up(&inode->i_sem);return err;fail_write:status = -EFAULT;ClearPageUptodate(page);kunmap(page);goto unlock;} inode結構中有個指標i_mapping,指向一個address_space資料結構,其定義如下:
struct address_space {struct list_headclean_pages;/* list of clean pages */struct list_headdirty_pages;/* list of dirty pages */struct list_headlocked_pages;/* list of locked pages */unsigned longnrpages;/* number of total pages */struct address_space_operations *a_ops;/* methods */struct inode*host;/* owner: inode, block_device */struct vm_area_struct*i_mmap;/* list of private mappings */struct vm_area_struct*i_mmap_shared; /* list of shared mappings */spinlock_ti_shared_lock; /* and spinlock protecting it */}; 其中a_ops,它指向一個address_space_operations資料結構,就ext2檔案系統來說,這個資料結構為ext2_aops,代碼如下:
struct address_space_operations {int (*writepage)(struct page *);int (*readpage)(struct file *, struct page *);int (*sync_page)(struct page *);int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);int (*commit_write)(struct file *, struct page *, unsigned, unsigned);/* Unfortunately this kludge is needed for FIBMAP. Don‘t use it */int (*bmap)(struct address_space *, long);};
__grab_cache_page,在page_hash_table中找到該緩衝頁面,如找不到,就分配、建立一個緩衝頁面,代碼如下:
static inline struct page * __grab_cache_page(struct address_space *mapping,unsigned long index, struct page **cached_page){struct page *page, **hash = page_hash(mapping, index);repeat:page = __find_lock_page(mapping, index, hash);//在page_hash_table中尋找該緩衝頁面 if (!page) {//如果找不到if (!*cached_page) {//cached_page為NULL*cached_page = page_cache_alloc();//分配一個頁面if (!*cached_page)return NULL;}page = *cached_page;if (add_to_page_cache_unique(page, mapping, index, hash))//加入到page_hash_table中goto repeat;*cached_page = NULL;}return page;}
/* Slot in page_hash_table for (mapping, index): table base plus _page_hashfn(). */
#define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index))
add_to_page_cache_unique,加入到page_hash_table中,代碼如下:
static int add_to_page_cache_unique(struct page * page,struct address_space *mapping, unsigned long offset,struct page **hash){int err;struct page *alias;spin_lock(&pagecache_lock);alias = __find_page_nolock(mapping, offset, *hash);err = 1;if (!alias) {__add_to_page_cache(page,mapping,offset,hash);err = 0;}spin_unlock(&pagecache_lock);return err;}
static inline void __add_to_page_cache(struct page * page,struct address_space *mapping, unsigned long offset,struct page **hash){unsigned long flags;if (PageLocked(page))BUG();flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));page->flags = flags | (1 << PG_locked);page_cache_get(page);page->index = offset;//就是最初傳遞進來的頁面緩衝indexadd_page_to_inode_queue(mapping, page);add_page_to_hash_queue(page, hash);//加入到page_hash_table表中lru_cache_add(page);}
擷取了緩衝頁面後,這個頁面有可能是個新分配的空白頁面。新分配的空白頁面與業已存在的緩衝頁面除了在內容上有根本性的區別外,在結構上也有重要的區別。那就是前面所講的,緩衝頁面一方面與一個page結構相聯絡,另一方面又與若干buffer_head結構相聯絡,而新分配的頁面尚無buffer_head結構與之掛鈎。所以,對於新分配的空白頁面一來要為其配備相應的buffer_head資料結構,二來將目標頁面的內容先從裝置中讀入(因為寫操作未必是整個頁面的寫入)。不僅如此,就是業已存在的老頁面也有個緩衝頁面中的內容是否"up to date",即是否一致的問題。這裡所謂"一致",是指緩衝頁面或者緩衝區內容與裝置上的邏輯內容一致。
mapping->a_ops->prepare_write開始執行,指向了ext2_prepare_write,代碼如下:
/*
 * ext2_prepare_write - ext2's prepare_write method.
 *
 * Forwards to block_prepare_write() with ext2_get_block as the block
 * mapper; @file is not used.  Returns that function's result.
 */
static int ext2_prepare_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	int rc;

	rc = block_prepare_write(page, from, to, ext2_get_block);
	return rc;
}
/*
 * block_prepare_write - prepare the buffers of a page for a partial write.
 * @page:      locked cache page about to be written
 * @from, @to: byte range within the page
 * @get_block: filesystem callback passed through to the worker
 *
 * Wraps __block_prepare_write(); on failure the page's uptodate flag is
 * cleared and kunmap() is called before the error is returned.
 */
int block_prepare_write(struct page *page, unsigned from, unsigned to,
			get_block_t *get_block)
{
	int rc = __block_prepare_write(page->mapping->host, page, from, to,
				       get_block);

	if (rc == 0)
		return 0;

	ClearPageUptodate(page);
	kunmap(page);
	return rc;
}
static int __block_prepare_write(struct inode *inode, struct page *page,unsigned from, unsigned to, get_block_t *get_block){unsigned block_start, block_end;unsigned long block;int err = 0;unsigned blocksize, bbits;struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;char *kaddr = kmap(page);blocksize = inode->i_sb->s_blocksize;if (!page->buffers)//說明是新分配的頁面,沒有buffer_head結構create_empty_buffers(page, inode->i_dev, blocksize);//為該頁面配備好相應的buffer_head結構,並建立起這個隊列head = page->buffers;bbits = inode->i_sb->s_blocksize_bits;block = page->index << (PAGE_CACHE_SHIFT - bbits);//這裡用到了page->indexfor(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) {if (!bh)BUG();block_end = block_start+blocksize;if (block_end <= from)continue;if (block_start >= to)break;if (!buffer_mapped(bh)) {err = get_block(inode, block, bh, 1);//bh中存相關的資訊,為ll_rw_block準備if (err)goto out;if (buffer_new(bh)) {unmap_underlying_metadata(bh);if (Page_Uptodate(page)) {set_bit(BH_Uptodate, &bh->b_state);continue;}if (block_end > to)memset(kaddr+to, 0, block_end-to);if (block_start < from)memset(kaddr+block_start, 0, from-block_start);if (block_end > to || block_start < from)flush_dcache_page(page);continue;}}if (Page_Uptodate(page)) {set_bit(BH_Uptodate, &bh->b_state);continue; }if (!buffer_uptodate(bh) && (block_start < from || block_end > to)) {//如果是新分配的頁面,一定不一致,如果原有的頁面,有可能不一致ll_rw_block(READ, 1, &bh);//如果不一致,就從裝置上讀入資料到page中,bh中已經存好了用於從裝置中讀入資料的相關資訊*wait_bh++=bh;}}/* * If we issued read requests - let them complete. */while(wait_bh > wait) {wait_on_buffer(*--wait_bh);err = -EIO;if (!buffer_uptodate(*wait_bh))goto out;}return 0;out:return err;} create_empty_buffers,為該頁面配備好相應的buffer_head結構,並建立起這個隊列
/*
 * create_empty_buffers - attach a ring of buffer_heads to a page.
 * @page:      page that must not have buffers yet (BUG() otherwise)
 * @dev:       device stored in every buffer's b_dev
 * @blocksize: size of each buffer
 *
 * Takes the NULL-terminated list from create_buffers(), initialises each
 * entry, closes the list into a ring and hangs it off page->buffers.
 */
static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
{
	struct buffer_head *bh, *head, *tail;

	head = create_buffers(page, blocksize, 1);
	if (page->buffers)
		BUG();

	bh = head;
	do {
		bh->b_dev = dev;		/* key point */
		bh->b_blocknr = 0;
		bh->b_end_io = NULL;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	/* Last entry points back at the first: the list becomes a ring. */
	tail->b_this_page = head;
	page->buffers = head;			/* key point */
	page_cache_get(page);
}
static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async){struct buffer_head *bh, *head;long offset;try_again:head = NULL;offset = PAGE_SIZE;while ((offset -= size) >= 0) {bh = get_unused_buffer_head(async);if (!bh)goto no_grow;bh->b_dev = B_FREE; /* Flag as unused */bh->b_this_page = head;head = bh;bh->b_state = 0;bh->b_next_free = NULL;bh->b_pprev = NULL;atomic_set(&bh->b_count, 0);bh->b_size = size;//重要點,block_sizeset_bh_page(bh, page, offset);bh->b_list = BUF_CLEAN;bh->b_end_io = NULL;}return head; ......}
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset){bh->b_page = page;//重要點if (offset >= PAGE_SIZE)BUG();if (PageHighMem(page))/* * This catches illegal uses and preserves the offset: */bh->b_data = (char *)(0 + offset);elsebh->b_data = page_address(page) + offset;//重要點,頁面的實際位置} 返回到generic_file_write,繼續執行mapping->a_ops->commit_write,真正的寫,把緩衝區的資料寫到裝置上,對應的指標是generic_commit_write,代碼如下:
int generic_commit_write(struct file *file, struct page *page,unsigned from, unsigned to){struct inode *inode = page->mapping->host;loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;__block_commit_write(inode,page,from,to);kunmap(page);if (pos > inode->i_size) {inode->i_size = pos;mark_inode_dirty(inode);}return 0;}
static int __block_commit_write(struct inode *inode, struct page *page,unsigned from, unsigned to){unsigned block_start, block_end;int partial = 0, need_balance_dirty = 0;unsigned blocksize;struct buffer_head *bh, *head;blocksize = inode->i_sb->s_blocksize;for(bh = head = page->buffers, block_start = 0;//page->buffers得到buffer_head結構 bh != head || !block_start; block_start=block_end, bh = bh->b_this_page) {block_end = block_start + blocksize;if (block_end <= from || block_start >= to) {if (!buffer_uptodate(bh))partial = 1;} else {set_bit(BH_Uptodate, &bh->b_state);if (!atomic_set_buffer_dirty(bh)) {__mark_dirty(bh);buffer_insert_inode_queue(bh, inode);need_balance_dirty = 1;//只要有記錄塊緩衝區從"乾淨"狀態變成"髒"狀態,need_balance_dirty就置1}}}if (need_balance_dirty)balance_dirty(bh->b_dev);//如果置1,這個函數看看這樣的記錄塊是否已經積累到一定的數量,如果是,就喚醒bdflushin進行一次"沖刷"/* * is this a partial write that happened to make all buffers * uptodate then we can optimize away a bogus readpage() for * the next read(). Here we ‘discover‘ wether the page went * uptodate as a result of this (potentially partial) write. */if (!partial)SetPageUptodate(page);return 0;} 至此,檔案寫就分析完了,page和buffer_head同時管理頁面,page->buffers指向了buffer_head,bh->b_page指向了page。
Linux核心原始碼情景分析-檔案的寫