Linux Kernel Source Code Scenario Analysis: File Write

Source: Internet
Author: User
Tags goto blank page

The write() call corresponds to the system call sys_write, whose code is as follows:

Asmlinkage ssize_t sys_write (unsigned int fd, const char * buf, size_t count) {ssize_t ret;struct file * File;ret =-EBADF;  File = Fget (FD); if (file) {if (File->f_mode & fmode_write) {struct Inode *inode = File->f_dentry->d_inode;ret = Locks_verify_area (Flock_verify_write, Inode, File,file->f_pos, Count), if (!ret) {ssize_t (*write) (struct file *, CO NST char *, size_t, loff_t *); ret =-einval;if (file->f_op && (write = file->f_op->write)! = NULL) ret = W Rite (file, buf, Count, &file->f_pos);}} if (Ret > 0) inode_dir_notify (file->f_dentry->d_parent->d_inode,dn_modify); fput (file);} return ret;}
Assume that fd is the file descriptor of /usr/local/hello.c, the file that was just opened in the companion article "Linux Kernel Source Code Scenario Analysis: File Open". fget(fd) uses the open file descriptor fd to find the file structure of the opened file. The code is as follows:

struct file * fget (unsigned int fd) {struct file * file;struct files_struct *files = Current->files;read_lock (&files ->file_lock); file = Fcheck (FD); if (file) get_file (file); Read_unlock (&files->file_lock); return file;}
static inline struct file * fcheck (unsigned int fd) {struct file * file = null;struct files_struct *files = Current->fil Es;if (FD < Files->max_fds) file = Files->fd[fd];return file;}


Recall that when the file was opened, file->f_op was set by f->f_op = fops_get(inode->i_fop). For ext2, f->f_op is one of two operation tables, ext2_file_operations or ext2_dir_operations, chosen according to whether the target of the operation is a file or a directory. For files, file->f_op->write is generic_file_write.

Ssize_tgeneric_file_write (struct file *file,const char *buf,size_t count,loff_t *ppos)//file is the file structure to be written to the files node. BUF is the pointer to write the content, count is the number, PPOs is the location to write the file {struct Inode*inode = file->f_dentry->d_inode; struct Address_space * mapping = inode->i_mapping;unsigned Longlimit = current->rlim[rlimit_fsize].rlim_cur;loff_tpos;struct page* page, *cached_page;unsigned longwritten;longstatus;interr;cached_page = Null;down (&inode->i_sem);p os = *ppos; Err =-einval;if (pos < 0) goto Out;err = file->f_error;if (err) {file->f_error = 0;goto out;} Written = 0;if (File->f_flags & o_append) pos = inode->i_size;/* * Check Whether we ' ve reached the file size Limi T. */err =-efbig;if (limit! = rlim_infinity) {if (pos >= limit) {Send_sig (Sigxfsz, current, 0); goto out;} if (Count > Limit-pos) {send_sig (Sigxfsz, current, 0); count = Limit-pos;}} Status = 0;if (count) {remove_suid (inode); inode->i_ctime = Inode->i_mtime = Current_time;mark_inode_dirty_sync ( Inode);} while (count) {unsigned long bytes, index, Offset;char *kaddr;int deactivate = 1;/* * Try to find the page in the cache. If it isn ' t there, * allocate a free page. */offset = (pos & (page_cache_size-1)); Calculates the number of buffered pages to write in the current loop based on POS, index, start offset in the page, and write length Bytesindex = pos >> page_cache_shift;bytes = page_cache_ Size-offset;if (Bytes > Count) {bytes = Count;deactivate = 0;} /* * Bring in the user page that we'll copy from _first_. * Otherwise there ' s a nasty deadlock on copying from the * same page as we ' re writing to, without it being marked * up-to- Date. 
*/{volatile unsigned char dummy;__get_user (dummy, buf); __get_user (dummy, buf+bytes-1);} Status =-enomem;/* We ll assign it later anyway */page = __grab_cache_page (mapping, Index, &cached_page);//In Page_hash _table found in the buffer page, if not found, allocate, create a buffer page if (!page) break;/* We have exclusive IO access to the page. */IF (! Pagelocked (page)) {page_bug (page);} Status = Mapping->a_ops->prepare_write (file, page, offset, offset+bytes);//Pre-write first read, first the textThe data in the corresponding location on the device is read into the page if (status) goto UNLOCK;KADDR = page_address (page); status = Copy_from_user (Kaddr+offset, buf, bytes) ;//Copy the data from the user space to the page point, the Buffer Flush_dcache_page (page) has been placed; if (status) goto Fail_write;status = mapping->a_ops- >commit_write (file, page, offset, offset+bytes);//real write, write the buffer data to the device if (!status) status = bytes;if (status >= 0) { Written + = Status;count = Status;pos + status;buf + status;} unlock:/* Mark It unlocked again and drop the page. */unlockpage (page), if (deactivate) deactivate_page (page);p age_cache_release (page), if (status < 0) break;} *ppos = Pos;if (cached_page) page_cache_free (cached_page);/* For today, when the user asks for O_sync, we ' ll actually * provi De O_dsync. */if (Status >= 0) && (file->f_flags & o_sync)) status = Generic_osync_inode (Inode, 1); /* 1 means Datasync */err = written? Written:status;out:up (&inode->i_sem); return err;fail_write:status =-efault; Clearpageuptodate (page); Kunmap (page); goto unlock;}
The inode structure contains a pointer, i_mapping, which points to an address_space data structure, defined as follows:

struct Address_space {struct list_headclean_pages;/* list of clean pages */struct list_headdirty_pages;/* List of Dirty PA GES */struct list_headlocked_pages;/* List of locked pages */unsigned longnrpages;/* number of total pages */struct addres S_space_operations *a_ops;/* methods */struct inode*host;/* owner:inode, Block_device */struct vm_area_struct*i_mmap;/ * List of private mappings */struct vm_area_struct*i_mmap_shared; /* List of shared mappings */spinlock_ti_shared_lock;  /* and spinlock protecting it */};
Its a_ops field points to an address_space_operations data structure; for the ext2 file system, this data structure is ext2_aops. The structure type is declared as follows:

struct Address_space_operations {int (*writepage) (struct page *), int (*readpage) (struct file *, struct page *); Int (*sync_ page) (struct page *); int (*prepare_write) (struct file *, struct page *, unsigned, unsigned); int (*commit_write) (struct fil E *, struct page *, unsigned, unsigned);/* Unfortunately this kludge are needed for fibmap. Don ' t use it */int (*bmap) (struct address_space *, long);};


__grab_cache_page looks for the buffer page in page_hash_table; if the page is not found, it allocates one and creates a new buffer page. The code is as follows:

Static inline struct page * __grab_cache_page (struct address_space *mapping,unsigned long index, struct page **cached_page {struct page *page, **hash = page_hash (mapping, index); repeat:page = __find_lock_page (mapping, index, hash);//In Page_ Hash_table look for the buffer page if (!page) {//If not found if (!*cached_page) {//cached_page = Null*cached_page = Page_cache_alloc ();// Assign a page if (!*cached_page) return NULL;} page = *cached_page;if (add_to_page_cache_unique (page, mapping, index, hash))//Join to page_hash_table Goto repeat;*cached _page = NULL;} return page;}
#define Page_hash (Mapping,index) (PAGE_HASH_TABLE+_PAGE_HASHFN (Mapping,index))
add_to_page_cache_unique adds the page to page_hash_table; the code is as follows:

static int add _to_page_cache_unique (struct page * page,struct address_space *mapping, unsigned long offset,struct page **hash) {int err; struct page *alias;spin_lock (&pagecache_lock); alias = __find_page_nolock (mapping, offset, *hash); err = 1;if (!alias {__add_to_page_cache (page,mapping,offset,hash); err = 0;} Spin_unlock (&pagecache_lock); return err;} 
static inline void __add_to_page_cache (struct page * page,struct address_space *mapping, unsigned long offset,struct page **hash) {unsigned long flags;if (pagelocked (page)) BUG (); flags = Page->flags & ~ ((1 << pg_uptodate) | (1 << pg_error) | (1 << pg_dirty) | (1 << pg_referenced) | (1 << pg_arch_1)); Page->flags = Flags | (1 << pg_locked);p age_cache_get (page);p Age->index = offset;//is the page cache that was originally passed in Indexadd_page_to_inode_queue ( mapping, page); Add_page_to_hash_queue (page, hash);//Add to Page_hash_table Table Lru_cache_add (page);}


The buffer page that has now been obtained may be a newly allocated blank page. Besides the fundamental difference in content, a newly allocated blank page also differs structurally from an existing buffer page in an important way. As mentioned earlier, a buffer page is linked on the one hand to a page structure and on the other hand to buffer_head structures, whereas a newly allocated page has no buffer_head structure linked to it yet. Therefore, a newly allocated blank page must first be equipped with the corresponding buffer_head data structures, and the content of the target page must first be read in from the device (because a write operation does not necessarily cover the entire page). Moreover, even for an existing page there is the question of whether the content of the buffer page is "up to date", that is, whether it is consistent. Here "consistent" means that the content of the record-block buffers in the buffer page matches the logical content on the device.

Execution then continues with mapping->a_ops->prepare_write, which points to ext2_prepare_write. The code is as follows:

static int ext2_prepare_write (struct file *file, struct page *page, unsigned from, unsigned to) {return block_prepare_write (Page,from,to,ext2_get_block);}
int Block_ Prepare_write (struct page *page, unsigned from, unsigned to,get_block_t *get_block) {struct Inode *inode = Page->mappin G->host;int err = __block_prepare_write (inode, page, from, to, Get_block), if (err) {clearpageuptodate (page); Kunmap ( page);} return err;} 
static int __block_prepare_write (struct inode *inode, struct page *page,unsigned from, unsigned to, get_block_t *get_block  ) {unsigned block_start, block_end;unsigned long block;int err = 0;unsigned blocksize, bbits;struct buffer_head *bh, *head, *WAIT[2], **wait_bh=wait;char *kaddr = kmap (page); blocksize = Inode->i_sb->s_blocksize;if (!page->buffers)// The description is a newly assigned page with no buffer_head structure create_empty_buffers (page, Inode->i_dev, blocksize);//For this page is equipped with a corresponding buffer_head structure, and set up this queue head = Page->buffers;bbits = Inode->i_sb->s_blocksize_bits;block = Page->index << (PAGE_    cache_shift-bbits);//use page->indexfor (BH = head, Block_start = 0; BH! = Head | |!block_start; block++, block_start=block_end, BH = bh->b_this_page) {if (!BH) BUG (); block_end = Block_start+blocksize;if (block_end <= from) continue;if (Block_start >= to) break;if (!buffer_mapped (BH)) {err = Get_block (inode, block, BH, 1);//bh in-store related Information for Ll_rw_block Prepare if (err) goto out;if (buffer_new (BH)) {unmap_underlying_metadata (BH), if (Page_uptodate (Page)) {set_bit (bh_uptodate, &bh->b_state); continue;} if (Block_end > To) memset (kaddr+to, 0, block_end-to), if (Block_start < from) memset (Kaddr+block_start, 0, From-block_start); if (Block_end > to | | block_start < from) flush_dcache_page (page); continue;}} if (Page_uptodate (Page)) {set_bit (bh_uptodate, &bh->b_state); continue;} if (!buffer_uptodate (BH) && (Block_start < from | | block_end > To)) {//If it is a newly assigned page, it must be inconsistent, if the original page, it may not be the same Ll_ Rw_block (read, 1, &AMP;BH);//If it is inconsistent, read the data from the device into the page, BH has stored the information used to read data from the device *wait_bh++=bh;}} /* * IF we issued read requests-let them complete. 
*/while (Wait_bh > Wait) {wait_on_buffer (*--WAIT_BH); err =-eio;if (!buffer_uptodate (*WAIT_BH)) goto out;} return 0;out:return err;}
create_empty_buffers equips the page with the appropriate buffer_head structures and links them into a queue:
static void Create_empty_buffers (struct page *page, kdev_t Dev, unsigned long blocksize) {struct Buffer_head *bh, *head, *t Ail;head = create_buffers (page, blocksize, 1); if (page->buffers) BUG (); bh = head;do {Bh->b_dev = dev;//important point Bh->b_ BLOCKNR = 0;bh->b_end_io = Null;tail = BH;BH = Bh->b_this_page;} while (BH); tail->b_this_page = Head;page->buffers = head;//Important point Page_cache_get (page);}
static struct Buffer_head * create_buffers (struct page * page, unsigned long size, int async) {struct Buffer_head *bh, *head;long Offset;try_again:head = Null;offset = Page_size;while ((offset-= SIZE) >= 0) {bh = get_unused_buffer_head (async);  BH) Goto No_grow;bh->b_dev = B_free; /* Flag as unused */bh->b_this_page = Head;head = Bh;bh->b_state = 0;bh->b_next_free = Null;bh->b_pprev = NULL ; Atomic_set (&bh->b_count, 0); bh->b_size = size;//important point, block_sizeset_bh_page (BH, page, offset); bh->b_ List = Buf_clean;bh->b_end_io = NULL;}        return head; ......}
void Set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) {bh->b_page = page;//important point if (offset &G  t;= page_size) BUG (); if (Pagehighmem (PAGE))/* * This catches illegal uses and preserves the offset: */bh->b_data = (char *) (0 + offset); elsebh->b_data = page_address (page) + offset;//important point, the actual location of the page}
Returning to generic_file_write, execution continues with mapping->a_ops->commit_write, which commits the write: the affected buffers are marked dirty so that their data will be written to the device. The corresponding function pointer refers to generic_commit_write. The code is as follows:

int generic_ Commit_write (struct file *file, struct page *page,unsigned from, unsigned to) {struct Inode *inode = page->mapping->h ost;loff_t pos = ((loff_t) Page->index << page_cache_shift) + to;__block_commit_write (inode,page,from,to); Kunmap (page), if (pos > inode->i_size) {inode->i_size = Pos;mark_inode_dirty (inode);} return 0;} 
static int __block_commit_write (struct inode *inode, struct page *page,unsigned from, unsigned to) {unsigned block_start, b Lock_end;int partial = 0, Need_balance_dirty = 0;unsigned blocksize;struct buffer_head *bh, *head;blocksize = Inode->i_ Sb->s_blocksize;for (bh = head = page->buffers, Block_start = 0;//page->buffers get buffer_head structure bh! = Head | |!b    Lock_start; Block_start=block_end, BH = bh->b_this_page) {block_end = Block_start + blocksize;if (block_end <= from | | block_sta RT >= To) {if (!buffer_uptodate (BH)) partial = 1;} else {set_bit (bh_uptodate, &bh->b_state); if (!atomic_set_ Buffer_dirty (BH)) {__mark_dirty (BH), buffer_insert_inode_queue (BH, inode), Need_balance_dirty = 1;//as long as there is a record block buffer from "clean" The state becomes "dirty", Need_balance_dirty 1}}}if (need_balance_dirty) balance_dirty (Bh->b_dev);//If you set 1,  This function to see if such a record block has accumulated to a certain number, if it is, wake Bdflushin to do a "scour"/* * is the "a", "happened" to making all buffers * uptodate Then we can optimize away a bogus readpage () for * thE next read (). Here we ' discover ' wether the page went * uptodate as a result of this (potentially partial) write. */if (!partial) setpageuptodate (page); return 0;}
This completes the analysis of the file write. The page structure and the buffer_head structures manage the page together: page->buffers points to the buffer_head, and bh->b_page points back to the page.

Linux kernel Source-code scenario analysis-File Write

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page do not have any relationship with Alibaba Cloud. If you find the content of the page confusing, please write us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.