Linux file open and read/write paths: a code walkthrough

Source: Internet
Author: User
Tags dio flush

File open flow: the system call

fd = open("/dev/pcie_ssd", O_RDWR);


Code location: fs/open.c

/*
 * open(2) system-call entry point.
 *
 * Adds O_LARGEFILE to the flags when force_o_largefile() says so, then
 * hands off to do_sys_open() with AT_FDCWD as the directory fd.
 * Returns whatever do_sys_open() returns.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	if (force_o_largefile())
		flags |= O_LARGEFILE;

	return do_sys_open(AT_FDCWD, filename, flags, mode);
}


/*
 * do_sys_open - open a file by user-space path name and bind it to a new fd.
 * @dfd:      directory file descriptor the lookup is relative to
 * @filename: user-space pointer to the path name
 * @flags:    open(2) flags
 * @mode:     creation mode
 *
 * Returns the new file descriptor on success. On failure it returns the
 * non-zero result of build_open_flags(), or the error encoded in the
 * pointer returned by getname()/do_filp_open(), or the negative result
 * of get_unused_fd_flags().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_flags op;
	int fd = build_open_flags(flags, mode, &op);
	struct filename *tmp;

	if (fd)
		return fd;

	/*
	 * Get the file name: this allocates space to hold the name and
	 * copies it in from user space.
	 */
	tmp = getname(filename);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	/*
	 * Get an available fd. According to the article this goes through
	 * alloc_fd, which takes a free slot from the fd table and does
	 * minimal initialization. Note that an fd is only valid within this
	 * process: it is visible only here and denotes a completely
	 * different file in other processes.
	 */
	fd = get_unused_fd_flags(flags);
	if (fd >= 0) {
		/* Resolve the path and create the file object. */
		struct file *f = do_filp_open(dfd, tmp, &op);

		if (IS_ERR(f)) {
			/* Lookup/open failed: give the reserved fd back. */
			put_unused_fd(fd);
			fd = PTR_ERR(f);
		} else {
			/*
			 * The file is open at this point (the open method
			 * selected through the inode has been called).
			 * Report it to the file-monitoring subsystem, which
			 * tracks files being opened, created, read, written,
			 * closed, modified, and so on.
			 */
			fsnotify_open(f);
			/*
			 * Store the file pointer in the fd array at index
			 * fd. Later operations on this fd find the struct
			 * file by looking it up in that array.
			 */
			fd_install(fd, f);
		}
	}
	putname(tmp);
	return fd;
}

struct file *do_filp_open (int dfd, struct filename *pathname,
const struct Open_flags *op)
{
struct nameidat a nd;
INT flags = op->lookup_flags;
struct file *filp;

Set_nameidata (&nd, DFD, pathname);
Filp = Path_openat (&nd, OP, Flags | LOOKUP_RCU)///* resolves the road path */
if (unlikely (Filp = = Err_ptr (-echild))
Filp look_up = Path_openat (&nd, OP, Flags);
if (unlikely (Filp = = Err_ptr (-estale))
Filp = Path_openat (&nd, OP, Flags | Lookup_reval);
Restore_nameidata ();
return FILP;
}

int Vfs_open (const struct path *path, struct file *file,
const struct CRED *cred)
{
struct Dentry *dentry = path->dentry;/* resolves dentry*/according to pathname
struct Inode *inode = dentry->d_inode;/* found inode*/according to Dentry


File->f_path = *path;
if (Dentry->d_flags & Dcache_op_select_inode) {
Inode = Dentry->d_op->d_select_inode (Dentry, file->f_flags);
if (Is_err (inode))
Return Ptr_err (Inode);
}


return Do_dentry_open (file, Inode, NULL, cred);
}

static int Do_dentry_open (struct file *f, struct inode *inode,int (*open) (struct inode *, struct file *),

const struct CRED *cred)

{

........

F->f_op = Fops_get (INODE->I_FOP);/key, find file F_op according to the Inode I_FOP

if (F->f_mode & (Fmode_read | Fmode_write)) = = Fmode_read)
I_readcount_inc (Inode);
if ((F->f_mode & Fmode_read) &&
Likely (F->f_op->read | | f->f_op->read_iter))
F->f_mode |= Fmode_can_read;
if ((F->f_mode & Fmode_write) &&
Likely (F->f_op->write | | f->f_op->write_iter))
F->f_mode |= Fmode_can_write;

}

For path lookup (path_lookup), see http://blog.csdn.net/kickxxx/article/details/9529961

The nameidata data structure

The lookup process involves many function calls, and struct nameidata plays an important role in them: (1) it passes parameters to the lookup functions; (2) it stores the lookup result.

[HTML]  View Plain  copy struct nameidata {            struct dentry   *dentry;            struct vfsmount *mnt;           struct qstr      last;           unsigned  int    flags;           int              last_type;            unsigned        depth;            char *saved_names[MAX_NESTED_LINKS + 1];              /* Intent data */            union {                    struct open_intent open;            } intent;  };  

VFS read and write flow: the system calls

retval1 = read(fd, buf, 4096); retval1 = write(fd, buf, 4096);

Code location: fs/read_write.c


/*
 * write(2) system-call entry point.
 *
 * Looks up the file behind @fd, reads its current position, calls
 * vfs_write(), and stores the updated position back when the write did
 * not fail. Returns the vfs_write() result, or -EBADF when @fd does not
 * resolve to a file.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd f = fdget_pos(fd);	/* get the file for this fd */
	ssize_t ret = -EBADF;

	if (f.file) {
		/* Read the file's current read/write position. */
		loff_t pos = file_pos_read(f.file);

		/* Write through the VFS layer. */
		ret = vfs_write(f.file, buf, count, &pos);
		if (ret >= 0)
			/* Write the updated read/write position back. */
			file_pos_write(f.file, pos);
		fdput_pos(f);
	}

	return ret;
}


/*
 * vfs_write - validate a write request and dispatch it to the file.
 * @file:  file to write to
 * @buf:   user-space source buffer
 * @count: number of bytes requested
 * @pos:   in/out file position
 *
 * Returns the result of __vfs_write(), or:
 *   -EBADF  if the file was not opened for writing,
 *   -EINVAL if FMODE_CAN_WRITE is not set (no write method available),
 *   -EFAULT if the user buffer fails access_ok(),
 *   the negative result of rw_verify_area() if that check fails.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* Is the file writable? */
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	/* Is a write method defined for the file? */
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	/* Verify the area to be written. */
	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret >= 0) {
		/* From here on use the count returned by the verification. */
		count = ret;
		file_start_write(file);
		/* Invoke the file's write method. */
		ret = __vfs_write(file, buf, count, pos);
		if (ret > 0) {
			/* Report the modification to fsnotify. */
			fsnotify_modify(file);
			add_wchar(current, ret);
		}
		inc_syscw(current);
		file_end_write(file);
	}

	return ret;
}


/*
 * __vfs_write - call whichever write method the file provides.
 *
 * Uses ->write when present, otherwise falls back to ->write_iter through
 * new_sync_write(). Returns -EINVAL when the file has neither.
 *
 * Note from the article: this shows that the VFS layer itself does not
 * cache the data being written; it passes the request straight to the
 * file's own method.
 */
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
		    loff_t *pos)
{
	if (file->f_op->write)
		/* Invoke the file's own write method. */
		return file->f_op->write(file, p, count, pos);
	else if (file->f_op->write_iter)
		/* Generic file-model write path. */
		return new_sync_write(file, p, count, pos);
	else
		return -EINVAL;
}


Static ssize_t new_sync_write (struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
struct Iovec Iov = {. Iov_base = (void __user *) buf,. Iov_len = Len};
struct KIOCB KIOCB;
struct Iov_iter iter;
ssize_t ret;


INIT_SYNC_KIOCB (&KIOCB, FILP);//initialization of asynchronous I/O structures

/*struct KIOCB {
struct file *ki_filp; File pointers
loff_t ki_pos;//File Location
void (*ki_complete) (struct KIOCB *IOCB, long ret, long Ret2);/callback, representing asynchronous I/O completion
void *private;
int ki_flags;
;*/
Kiocb.ki_pos = *ppos;
Iov_iter_init (&iter, WRITE, &iov, 1, Len);


ret = Filp->f_op->write_iter (&KIOCB, &iter);
bug_on (ret = =-eiocbqueued);
if (Ret > 0)
*ppos = Kiocb.ki_pos;
return ret;
}


/*
 * struct file_operations - the method table a driver or filesystem
 * provides for its open files. The member notes below are the article's
 * descriptions, cleaned up.
 */
struct file_operations {
	/*
	 * Not an operation at all: a pointer to the module that owns this
	 * structure. It prevents the module from being unloaded while its
	 * operations are still in use. Almost always initialized to
	 * THIS_MODULE, a macro defined in <linux/module.h>.
	 */
	struct module *owner;
	/*
	 * Changes the current read/write position in the file; the new
	 * position is the (positive) return value. loff_t is a "long
	 * offset", at least 64 bits wide even on 32-bit platforms. Errors
	 * are signalled by a negative return value. If this pointer is
	 * NULL, seek calls modify the position counter in the file
	 * structure in potentially unpredictable ways (see the "file
	 * structure" section of the source text).
	 */
	loff_t (*llseek) (struct file *, loff_t, int);
	/*
	 * Retrieves data from the device. A NULL pointer here makes the
	 * read system call fail with -EINVAL ("Invalid argument"). A
	 * non-negative return value is the number of bytes successfully
	 * read (the return type is a "signed size" type, usually the native
	 * integer type of the target platform).
	 */
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	/*
	 * Sends data to the device. If NULL, -EINVAL is returned to the
	 * program calling write. A non-negative return value is the number
	 * of bytes successfully written.
	 */
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	/*
	 * read_iter/write_iter take a struct kiocb plus a struct iov_iter
	 * instead of a flat user buffer.
	 *
	 * Synchronous I/O: after starting a read or write the application
	 * waits for it to complete before running the following code.
	 * Asynchronous I/O: the application queues the read or write and
	 * returns immediately, continuing with the following code; when the
	 * operation completes the system notifies the application, which
	 * then collects the result. A driver can implement both the
	 * synchronous and the asynchronous interface, chosen per use case.
	 */
	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
	/*
	 * NOTE(review): the article attached the following remark to
	 * read_iter, but it describes the directory-reading method: this
	 * member should be NULL for device files; it is used to read
	 * directories and is only useful for filesystems.
	 */
	int (*iterate) (struct file *, struct dir_context *);
	/*
	 * Back end of three system calls: poll, epoll and select, all of
	 * which ask whether a read or write on one or more file
	 * descriptors would block. The method should return a bit mask
	 * saying whether non-blocking reads or writes are possible and,
	 * possibly, give the kernel the information needed to put the
	 * caller to sleep until I/O becomes possible. If a driver leaves
	 * poll NULL, the device is assumed to be both readable and writable
	 * without blocking.
	 */
	unsigned int (*poll) (struct file *, struct poll_table_struct *);
	/*
	 * The ioctl system call offers a way to issue device-specific
	 * commands (for example formatting a track of a floppy disk, which
	 * is neither a read nor a write). In addition, a few ioctl commands
	 * are recognized by the kernel without consulting the fops table.
	 * If the device provides no ioctl method, the system call returns
	 * an error for any request that is not predefined (-ENOTTY, "No
	 * such ioctl for device").
	 */
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
	/*
	 * Requests a mapping of device memory into a process's address
	 * space. If NULL, the mmap system call returns -ENODEV.
	 */
	int (*mmap) (struct file *, struct vm_area_struct *);
	/*
	 * Although this is always the first operation performed on a device
	 * file, a driver is not required to declare it. If NULL, opening
	 * the device always succeeds, but the driver is not notified.
	 */
	int (*open) (struct inode *, struct file *);
	/*
	 * Invoked when a process closes its copy of a file descriptor for
	 * the device; it should execute (and wait for) any outstanding
	 * operations on the device. Not to be confused with the fsync
	 * operation requested by user programs. Very few drivers use flush
	 * today; the SCSI tape driver does, for example, to make sure all
	 * data written reaches the tape before the device is closed. If
	 * flush is NULL, the kernel simply ignores the user application's
	 * request.
	 */
	int (*flush) (struct file *, fl_owner_t id);
	/*
	 * Invoked when the file structure is being released. Like open,
	 * release may be NULL.
	 */
	int (*release) (struct inode *, struct file *);
	/*
	 * Back end of the fsync system call, which a user calls to flush
	 * any pending data. If NULL, the system call returns -EINVAL.
	 */
	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
	/* Asynchronous version of the fsync method. */
	int (*aio_fsync) (struct kiocb *, int datasync);
	/*
	 * Notifies the device of a change in its FASYNC flag. Asynchronous
	 * notification is an advanced topic (the source text covers it in
	 * its chapter 6). May be NULL if the driver does not support
	 * asynchronous notification.
	 */
	int (*fasync) (int, struct file *, int);
	/*
	 * Implements file locking. Locking is indispensable for regular
	 * files but is almost never implemented by device drivers.
	 */
	int (*lock) (struct file *, int, struct file_lock *);
	/*
	 * The other half of sendfile: called by the kernel to send data,
	 * one page at a time, to the corresponding file. Device drivers do
	 * not usually implement sendpage.
	 */
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area) (struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	/* Lets a module check the flags passed to an fcntl(F_SETFL...) call. */
	int (*check_flags) (int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write) (struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read) (struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	int (*setlease) (struct file *, long, struct file_lock **, void **);
	long (*fallocate) (struct file *file, int mode, loff_t offset,
			   loff_t len);
	void (*show_fdinfo) (struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
	unsigned (*mmap_capabilities) (struct file *);
#endif
};

ext4: fs/ext4/file.c

const struct File_operations ext4_file_operations = {
. Llseek = Ext4_llseek,
. Read_iter = Generic_file_read_iter,
. Write_iter = Ext4_file_write_iter,
. Unlocked_ioctl = Ext4_ioctl,
#ifdef CONFIG_COMPAT
. Compat_ioctl = Ext4_compat_ioctl,
#endif
. mmap = Ext4_file_mmap,
. open = Ext4_file_open,
. Release = Ext4_release_file,
. Fsync = Ext4_sync_file,
. Splice_read = Generic_file_splice_read,
. Splice_write = Iter_file_splice_write,
. fallocate = Ext4_fallocate,
};


static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -> ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)

The test `iocb->ki_flags & IOCB_DIRECT` decides whether the write takes the direct I/O path.

Direct I/O is an option of the write path under which the data is written straight to disk instead of being cached, so that critical data reaches the disk even if the system later fails; the direct I/O path also runs serially with the file-write call. When the kernel reaches __generic_file_aio_write, it enters the direct I/O branch based on `file->f_flags & O_DIRECT`. Look first at generic_file_direct_write, which mainly calls filemap_write_and_wait_range, invalidate_inode_pages2_range and mapping->a_ops->direct_IO.

filemap_write_and_wait_range flushes the dirty pages of the mapping; if it returns an error, the two subsequent functions are not executed. invalidate_inode_pages2_range checks whether the page cache currently holds pages corresponding to the direct I/O range and, if so, marks them invalid. The reason is that data written by direct I/O is not cached: if the range was cached and clean before the direct I/O write, the cache and the disk would be inconsistent once the write completes, and a read served from the cache without this protection would return data that differs from what is on disk. If invalidating the cached pages fails, the function returns without executing the following step. mapping->a_ops->direct_IO is implemented through __blockdev_direct_IO: the dio structure is assembled in direct_io_worker and then submitted through dio_bio_submit, which in essence hands the request to the block I/O layer via submit_bio(dio->rw, bio). Compared with other reads and writes, direct I/O bypasses the buffer layer and does not wait for the background threads pdflush and kjournald to flush data to the I/O layer periodically. Even so, at this point the data is not necessarily on the disk yet; direct I/O assumes that the I/O device driver does not add a large delay.


generic_file_direct_write(iocb, from, iocb->ki_pos); -> ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos)

-> a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata);

First, a_ops->write_begin is called to allocate the page-cache page that the data will be written to, and a set of buffer_head structures is set up for the page to describe the data blocks that make it up. Next, iov_iter_copy_from_user_atomic copies the user-space data into the page cache. Finally, a_ops->write_end is called to mark each buffer_head in the page as dirty.

file->f_mapping comes from the corresponding inode->i_mapping, and inode->i_mapping->a_ops is assigned by the filesystem type when the inode is created. The file->f_mapping->a_ops->write_begin and file->f_mapping->a_ops->write_end implementations for the ext4 filesystem are in

fs/ext4/inode.c:

static const struct Address_space_operations Ext4_da_aops = {
. Readpage = Ext4_readpage,
. readpages = Ext4_readpages,
. Writepage = Ext4_writepage,
. writepages = Ext4_writepages,
. Write_begin = Ext4_da_write_begin,
. Write_end = Ext4_d

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.