Linux system Calls

Source: Internet
Author: User
Tags goto

User-space programs cannot execute kernel code directly. They cannot call functions in kernel space because the kernel resides in a protected address space. If a process could read and write the kernel's address space directly, system security would be compromised. Therefore, an application must notify the system in some way, telling the kernel that it needs to perform a system call and that it wants the processor to switch to kernel mode so that the kernel can execute the system call on behalf of the application.

The mechanism for notifying the kernel is implemented with a software interrupt. First, the user program sets up the parameters for the system call; one of these parameters is the system call number. Once the parameters are set, the program executes the system call instruction. On x86, the software interrupt is generated by the `int` instruction (traditionally `int $0x80`). This instruction raises an exception: an event that causes the processor to switch to kernel mode, jump to a new address, and start executing the exception handler there. In this case the exception handler is the system call handler, and its implementation is closely tied to the hardware architecture.

The system call procedure is as follows:

Let's take open() as an example and follow its call path through the kernel.

The mapping between system call numbers and system call handlers is defined in include/uapi/asm-generic/unistd.h:

#define __nr_open 1024
__syscall (__nr_open, Sys_open)

include/linux/syscalls.h defines the macros used to declare system calls:

/*
 * SYSCALL_DEFINE0 - start the definition of a system call taking no arguments.
 * @sname: system call name without the "sys_" prefix.
 *
 * Emits the syscall metadata for _<sname> and then the function header
 * "asmlinkage long sys_<sname>(void)"; the function body follows the
 * macro invocation.
 */
#define SYSCALL_DEFINE0(sname)					\
	SYSCALL_METADATA(_##sname, 0);				\
	asmlinkage long sys_##sname(void)

/*
 * SYSCALL_DEFINE<n> - start the definition of a system call with n arguments.
 * @name: system call name without the "sys_" prefix.
 * @...:  argument list given as alternating (type, name) pairs.
 *
 * Each variant forwards to SYSCALL_DEFINEx with its argument count and the
 * name prefixed by an underscore, so that later pasting yields sys_<name>.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

/*
 * SYSCALL_DEFINEx - common expansion behind SYSCALL_DEFINE1..6.
 * @x:     number of system call arguments.
 * @sname: system call name already prefixed with an underscore.
 * @...:   alternating (type, name) argument pairs.
 *
 * Emits the syscall metadata, then hands over to __SYSCALL_DEFINEx, which
 * generates the actual function declarations and wrapper.
 */
#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

/*
 * __SYSCALL_DEFINEx - generate the functions that make up one system call.
 * @x:    number of system call arguments.
 * @name: system call name prefixed with an underscore (e.g. _open).
 * @...:  alternating (type, name) argument pairs.
 *
 * The expansion produces three symbols:
 *   sys<name>  - the public entry point, declared as an alias of SyS<name>;
 *   SyS<name>  - a wrapper whose parameters are declared via __SC_LONG; it
 *                converts them with __SC_CAST, calls SYSC<name>, runs the
 *                __SC_TEST checks and __PROTECT, and returns the result;
 *   SYSC<name> - a static inline function declared with the real argument
 *                types (__SC_DECL).
 *
 * The macro ends with the header of SYSC<name>, so the braces that follow
 * the SYSCALL_DEFINEn(...) invocation become its body.
 * __MAP, the __SC_* helpers and __PROTECT are defined elsewhere in the header.
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

In fs/open.c:

/*
 * open system call entry point.
 *
 * Through the SYSCALL_DEFINE3 macro this expands to the definition of
 * sys_open. It sets O_LARGEFILE in flags when force_o_largefile() is true
 * and delegates the rest to do_sys_open(), resolving the path relative to
 * AT_FDCWD.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	if (force_o_largefile())
		flags |= O_LARGEFILE;

	return do_sys_open(AT_FDCWD, filename, flags, mode);
}

/*
 * do_sys_open - open a file and bind it to a new file descriptor.
 * @dfd:      directory file descriptor passed on to do_filp_open().
 * @filename: user-space pointer to the path name.
 * @flags:    open flags supplied by the caller.
 * @mode:     creation mode supplied by the caller.
 *
 * Returns the new file descriptor on success. On failure returns the
 * non-zero result of build_open_flags(), or the error encoded in the
 * pointer returned by getname() or do_filp_open(), or the negative value
 * returned by get_unused_fd_flags().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_flags op;
	int fd = build_open_flags(flags, mode, &op);
	struct filename *tmp;

	/* A non-zero result from build_open_flags() is returned as-is. */
	if (fd)
		return fd;

	tmp = getname(filename);	/* get the file name */
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	fd = get_unused_fd_flags(flags);	/* reserve an available fd */
	if (fd >= 0) {
		/* Create the struct file and open the file. */
		struct file *f = do_filp_open(dfd, tmp, &op);
		if (IS_ERR(f)) {
			put_unused_fd(fd);	/* open failed: release the fd */
			fd = PTR_ERR(f);
		} else {
			/* Report the open to fsnotify (file event monitoring). */
			fsnotify_open(f);
			/*
			 * Store the struct file pointer in the fd table, indexed
			 * by fd, for subsequent operations on the file.
			 */
			fd_install(fd, f);
		}
	}
	putname(tmp);
	return fd;
}

struct file *do_filp_open (int dfd, struct filename *pathname,
const struct OPEN_FLAGS *op)
{
struct nameidata nd;
int flags = op->lookup_flags;
struct file *filp;

Set_nameidata (&nd, DFD, pathname);
Filp = Path_openat (&nd, OP, Flags | LOOKUP_RCU);//open file.
if (unlikely (Filp = = Err_ptr (-echild)))
Filp = Path_openat (&nd, OP, Flags);
if (unlikely (Filp = = Err_ptr (-estale)))
Filp = Path_openat (&nd, OP, Flags | Lookup_reval);
Restore_nameidata ();
return FILP;
}

static struct file *path_openat (struct Nameidata *nd,
const struct OPEN_FLAGS *op, unsigned flags)
{
const char *s;
struct file *file;
int opened = 0;
int error;

File = Get_empty_filp ();//Assign struct file
if (is_err (file))
return file;

File->f_flags = op->open_flag;

if (Unlikely (File->f_flags & __o_tmpfile)) {
Error = Do_tmpfile (ND, Flags, op, File, &opened);
Goto Out2;
}

if (Unlikely (File->f_flags & O_path)) {
Error = Do_o_path (ND, flags, file);//If the file open flag is O_path, it may be directory. Vfs_open is called in the function
if (!error)
Opened |= file_opened;
Goto Out2;
}

s = path_init (ND, flags);
if (Is_err (s)) {
PUT_FILP (file);
return Err_cast (s);
}
while (!) ( Error = Link_path_walk (s, ND)) &&//parse file name, convert to Dentry
(Error = do_last (nd, File, op, &opened)) > 0) {The last step of//open, finding the inode by Dentry, and finally calling Vfs_open
Nd->flags &= ~ (lookup_open| lookup_create| LOOKUP_EXCL);
s = trailing_symlink (nd);
if (Is_err (s)) {
Error = Ptr_err (s);
Break
}
}
Terminate_walk (ND);
Out2:
if (! ( Opened & file_opened)) {
BUG_ON (!error);
PUT_FILP (file);
}
if (unlikely (error)) {
if (Error = =-eopenstale) {
if (Flags & LOOKUP_RCU)
Error =-echild;
Else
Error =-estale;
}
File = err_ptr (error);
}
return file;
}

/*
 * vfs_open - open @file at the location given by @path.
 * @path: path to open; copied into file->f_path.
 * @file: newly allocated file whose f_flags are already set.
 * @cred: credentials passed through to do_dentry_open().
 *
 * Resolves the dentry with d_real() and opens the file through
 * do_dentry_open() using that dentry's backing inode.
 *
 * Returns the result of do_dentry_open(), or the error encoded in the
 * pointer returned by d_real().
 */
int vfs_open(const struct path *path, struct file *file,
	     const struct cred *cred)
{
	struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags);

	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	file->f_path = *path;
	/* Open the file. */
	return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
}

static int Do_dentry_open (struct file *f,
struct Inode *inode,
Int (*open) (struct inode *, struct file *),
const struct CRED *cred)
{
static const struct File_operations empty_fops = {};
int error;

F->f_mode = Open_fmode (f->f_flags) | Fmode_lseek |
Fmode_pread | Fmode_pwrite;

Path_get (&f->f_path);
F->f_inode = Inode;
f->f_mapping = inode->i_mapping;

if (Unlikely (F->f_flags & O_path)) {//If the flag contains O_path. Then struct file_operations is empty
F->f_mode = Fmode_path;
F->f_op = &empty_fops;
return 0;
}

if (F->f_mode & Fmode_write &&!special_file (Inode->i_mode)) {
Error = get_write_access (inode);
if (unlikely (error))
Goto Cleanup_file;
Error = __mnt_want_write (F->F_PATH.MNT);
if (unlikely (error)) {
Put_write_access (Inode);
Goto Cleanup_file;
}
F->f_mode |= Fmode_writer;
}

/* POSIX.1-2008/SUSV4 Section XSI 2.9.7 */
if (S_isreg (inode->i_mode) | | S_isdir (Inode->i_mode))
F->f_mode |= Fmode_atomic_pos;

F->f_op = Fops_get (INODE->I_FOP);//Gets the struct file_operations structure of the inode corresponding
if (Unlikely (warn_on (!F->F_OP))) {
Error =-enodev;
Goto Cleanup_all;
}

Error = Security_file_open (f, cred);
if (Error)
Goto Cleanup_all;

Error = Break_lease (Inode, f->f_flags);
if (Error)
Goto Cleanup_all;

if (!open)
Open = The Open function for F->f_op->open;//inode, or the driver's open function if it is a device file.
if (open) {
Error = open (Inode, f);//Call the inode corresponding to the open function.
if (Error)
Goto Cleanup_all;
}
if (F->f_mode & (Fmode_read | Fmode_write)) = = Fmode_read)
I_readcount_inc (Inode);
if ((F->f_mode & Fmode_read) &&likely (F->f_op->read | | f->f_op->read_iter))
F->f_mode |= Fmode_can_read;
if ((F->f_mode & Fmode_write) &&likely (F->f_op->write | | f->f_op->write_iter))
F->f_mode |= Fmode_can_write;

F->f_flags &= ~ (o_creat | O_excl | O_noctty | O_TRUNC);

File_ra_state_init (&f->f_ra, f->f_mapping->host->i_mapping);

return 0;

Cleanup_all:
Fops_put (F->F_OP);
if (F->f_mode & Fmode_writer) {
Put_write_access (Inode);
__mnt_drop_write (F->F_PATH.MNT);
}
Cleanup_file:
Path_put (&f->f_path);
F->f_path.mnt = NULL;
F->f_path.dentry = NULL;
F->f_inode = NULL;
return error;
}

Linux system Calls

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.