Linux 2.4 版核心 sys_umount 系統調用的實現註釋

最後更新：2018-12-04 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

部落格已遷移至：http://kulv.sinaapp.com/，這裡不再使用

/* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. * * We now support a flag for forced unmount like the other 'big iron' * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD */ //檔案系統卸載的系統調用 asmlinkage long sys_umount(char * name, int flags) { struct nameidata nd; char *kname; int retval;lock_kernel(); kname = getname(name); retval = PTR_ERR(kname); if (IS_ERR(kname)) goto out; retval = 0; //kulv:得到卸載目錄的nameidata結構。其內有該目錄的dentry , inode if (path_init(kname, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd)) retval = path_walk(kname, &nd); putname(kname); if (retval) goto out; retval = -EINVAL; if (nd.dentry != nd.mnt->mnt_root)//mnt 是代表該安裝點的串連結構，此結構的mnt_root指向了裝置的根節點 goto dput_and_out;//如果該安裝目錄的dentry結構不等於裝置的根節點，則一定有問題。retval = -EPERM; if (!capable(CAP_SYS_ADMIN) && current->uid!=nd.mnt->mnt_owner) goto dput_and_out;//如果不是管理員，且不是擁有者。dput(nd.dentry);//減少使用計數 /* puts nd.mnt */ down(&mount_sem); retval = do_umount(nd.mnt, 0, flags);//真正工作的地方. up(&mount_sem); goto out; dput_and_out: path_release(&nd); out: unlock_kernel(); return retval; }/*kulv: mnt為該安裝點的串連結構。包含了裝置目錄，超級塊，以及安裝樹的支援結構。 umount_root = 0 ； flags:從sys_umount 傳下來 */ static int do_umount(struct vfsmount *mnt, int umount_root, int flags) { struct super_block * sb = mnt->mnt_sb;//得到超級塊/* * No sense to grab the lock for this test, but test itself looks * somewhat bogus. Suggestions for better replacement? * Ho-hum... In principle, we might treat that as umount + switch * to rootfs. GC would eventually take care of the old vfsmount. * The problem being: we have to implement rootfs and GC for that * Actually it makes sense, especially if rootfs would contain a * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ if (mnt == current->fs->rootmnt && !umount_root) { int retval = 0;//如果卸載的目錄為當前進程根目錄， /* * Special case for "unmounting" root ... 
* we just try to remount it readonly. */ mntput(mnt); if (!(sb->s_flags & MS_RDONLY))//因為調用這的時候flag為0，所以下面不成立,使用者進程無法卸載自己的根目錄 retval = do_remount_sb(sb, MS_RDONLY, 0); return retval; }spin_lock(&dcache_lock);//如果一個裝置安裝了多次，即一個超級塊有多個串連結構，那麼 if (mnt->mnt_instances.next != mnt->mnt_instances.prev) {//如果安裝了多次，即其 if (atomic_read(&mnt->mnt_count) > 2) {//引用計數大於2，表面還有其他地方用到這個結構，那麼··· spin_unlock(&dcache_lock); mntput(mnt);//遞減一下計數就可以了 return -EBUSY; } if (sb->s_type->fs_flags & FS_SINGLE)//FS_SINGLE表示超級塊在同種檔案系統中是共用的 put_filesystem(sb->s_type); /* We hold two references, so mntput() is safe */ mntput(mnt); //因為這個超級塊有多次安裝，//即裝置有多次安裝，不能將裝置拆下，只是拆除多次安裝的一次 //下面的函數只是把當前串連件從各種隊列中刪除，然後遞減相關計數，這樣就無法訪問到了 remove_vfsmnt(mnt); return 0; } spin_unlock(&dcache_lock);//下面代表裝置的唯一一次安裝，那就麻煩點了 /* * Before checking whether the filesystem is still busy, * make sure the kernel doesn't hold any quota files open * on the device. If the umount fails, too bad -- there * are no quotas running any more. Just turn them on again. */ DQUOT_OFF(sb);//關於這個磁碟空間的配額。 acct_auto_close(sb->s_dev);/* * If we may have to abort operations to get out of this * mount, and they will themselves hold resources we must * allow the fs to do things. In the Unix tradition of * 'Gee thats tricky lets do it in userspace' the umount_begin * might fail to complete on the first run through as other tasks * must return, and the like. Thats for the mount program to worry * about for the moment. */if( (flags&MNT_FORCE) && sb->s_op->umount_begin) sb->s_op->umount_begin(sb);//讓相應的檔案系統來處理卸載準備工作 //struct file_operations 定義在:include/linux/fs.h裡面/* * Shrink dcache, then fsync. This guarantees that if the * filesystem is quiescent at this point, then (a) only the * root entry should be in use and (b) that root entry is * clean. 
*/ shrink_dcache_sb(sb);//將緩衝在dentry_unused 隊列中的屬於本裝置的dentry結構刪除 fsync_dev(sb->s_dev);//將緩衝在記憶體中，還沒有同步到磁碟裝置中的資料重新整理到裝置上去。if (sb->s_root->d_inode->i_state) { mntput(mnt); return -EBUSY; }/* Something might grab it again - redo checks */spin_lock(&dcache_lock); if (atomic_read(&mnt->mnt_count) > 2) { spin_unlock(&dcache_lock); mntput(mnt); return -EBUSY; }/* OK, that's the point of no return */ mntput(mnt); remove_vfsmnt(mnt); //裝置上的緩衝，inode，髒頁面，鎖住的頁面，睡眠等待的進程·都處理完了，該把 //當前串連件從各種隊列中刪除，然後遞減相關計數，這樣就無法訪問到了. //你懂的，這些隊列什麼的形成了一個掛載點所需要的所有串連//將超級塊的後事處理一下，包括調用其super_operations函數通知相應的檔案系統 //然後刪除相關申請的緩衝什麼的。然後安全退出 kill_super(sb, umount_root); return 0; }/* * Called with spinlock held, releases it. */ static void remove_vfsmnt(struct vfsmount *mnt) {//本函數只是卸載相應裝置的多次安裝中的一次 /* First of all, remove it from all lists */ list_del(&mnt->mnt_instances);//把自己從超級塊的多次安裝中刪除，此成員是掛入超級塊的s_mounts上的。 list_del(&mnt->mnt_clash);//一個安裝點可以安裝多個檔案系統，mnt_clash掛入安裝點的dentry結構的d_vfsmount上 list_del(&mnt->mnt_list);//系統有個全域的vfsmntlist隊列，記錄系統中所有的串連件 list_del(&mnt->mnt_child);//從上一層安裝中刪除，相應的，自己也有一個mnt_mounts隊列表示安裝在我下面的裝置 spin_unlock(&dcache_lock); /* Now we can work safely */ if (mnt->mnt_parent != mnt)//我的上一層裝置為我自己，那就說明這是根節點 mntput(mnt->mnt_parent);dput(mnt->mnt_mountpoint);//指安裝點的dentry dput(mnt->mnt_root);//裝置的根目錄dentry結構 if (mnt->mnt_devname) kfree(mnt->mnt_devname); kfree(mnt); }/* * Shrink the dcache for the specified super block. * This allows us to unmount a device without disturbing * the dcache for the other devices. * * This implementation makes just two traversals of the * unused list. On the first pass we move the selected * dentries to the most recent end, and on the second * pass we free them. The second pass must restart after * each dput(), but since the target dentries are all at * the end, it's really just a single traversal. *//** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. 
This * is used to free the dcache before unmounting a file * system */ //刪除屬於給定超級塊的裝置的所有資料 void shrink_dcache_sb(struct super_block * sb) { struct list_head *tmp, *next; struct dentry *dentry;/* * Pass one ... move the dentries for the specified * superblock to the most recent end of the unused list. */ spin_lock(&dcache_lock); next = dentry_unused.next;//系統全域的邋錮entry_unused資料結構隊列，緩衝所有暫時不用要刪除的資料 while (next != &dentry_unused) { tmp = next; next = tmp->next; dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb)//越過不屬於我們的 continue; list_del(tmp);//從dentry_unused中刪除 list_add(tmp, &dentry_unused);//然後插入到節點之後，即第一個節點 }//經過這輪迭代，所有合格節點都已經相對倒敘的放在dentry_unused的前面部分了/* * Pass two ... free the dentries for this superblock. */ repeat: next = dentry_unused.next; while (next != &dentry_unused) {//再來一次 tmp = next; next = tmp->next; dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue;//這用的著嗎，直接退出就可以呀，因為上一趟已經把所有合格都放到前面了 if (atomic_read(&dentry->d_count)) continue; dentry_stat.nr_unused--;//總數減少 list_del(tmp);//正式刪除了哈 INIT_LIST_HEAD(tmp);//這···用得著嗎? prune_one_dentry(dentry);//把dentry處理一下，其實就是把相關的指標刪除，比如hash等 goto repeat; } spin_unlock(&dcache_lock); }static inline void prune_one_dentry(struct dentry * dentry) { struct dentry * parent;list_del_init(&dentry->d_hash); list_del(&dentry->d_child); dentry_iput(dentry); parent = dentry->d_parent; d_free(dentry); if (parent != dentry)//莫非這裡如果不成立，那就是說這是根目錄了? dput(parent); spin_lock(&dcache_lock); } //KULV: //dev:裝置號 int fsync_dev(kdev_t dev) { //sync_buffers的第一趟，0表示對所有 sync_buffers(dev, 0);//第一趟，重新整理，不等待被鎖住的緩衝，過門不入lock_kernel(); sync_supers(dev);//將超級塊也處理一下，實際上就是在super_blocks全域隊列中找到裝置號相同的 //然後如果相應檔案系統有對應的sb->s_op->write_super(sb)，則調用，更新LRU sync_inodes(dev); DQUOT_SYNC(dev); unlock_kernel();return sync_buffers(dev, 1);//最後一趟，如果buf被鎖住了， //那會進入等待狀態，有可能會引起重新調度 }/* Call sync_buffers with wait!=0 to ensure that the call does not * return until all buffer writes have completed. 
Sync() may return * before the writes have finished; fsync() may not. *//* Godamity-damn. Some buffers (bitmaps for filesystems) * spontaneously dirty themselves without ever brelse being called. * We will ultimately want to put these in a separate list, but for * now we search all of the lists for dirty buffers. */ static int sync_buffers(kdev_t dev, int wait) {//第一趟wait為0，表示不等，第二趟為1，等 int i, retry, pass = 0, err = 0; struct buffer_head * bh, *next;/* One pass for no-wait, three for wait: * 0) write out all dirty, unlocked buffers; * 1) write out all dirty buffers, waiting if locked; * 2) wait for completion by waiting for all buffers to unlock. */ do { retry = 0;/* We search all lists as a failsafe mechanism, not because we expect * there to be dirty buffers on any of the other lists. */ repeat: spin_lock(&lru_list_lock); bh = lru_list[BUF_DIRTY];//取得髒的隊列頭,待會會被移到其他隊列中，比如被鎖定的 if (!bh) goto repeat2;//nr_buffers_type[BUF_DIRTY] 指相對每種buf的總數 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { next = bh->b_next_free;if (!lru_list[BUF_DIRTY])//這···有可能嗎，上面不是檢查過bh了嗎··· break; if (dev && bh->b_dev != dev)//忽略所有不屬於此裝置的。前面的dev用不著了吧? continue; if (buffer_locked(bh)) {//看是否是鎖住的，如果是，那麼進去 /* Buffer is locked; skip it unless wait is * requested AND pass > 0. */ if (!wait || !pass) { retry = 1;//第一趟wait為0，不等，直接跳過，只是鎖住它 continue; } atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); wait_on_buffer (bh); atomic_dec(&bh->b_count); goto repeat; }//以下節點是沒有鎖住的 /* If an unlocked buffer is not uptodate, there has * been an IO error. Skip it. */ if (wait && buffer_req(bh) && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_uptodate(bh)) { err = -EIO; continue; }/* Don't write clean buffers. Don't write ANY buffers * on the third pass. */ if (!buffer_dirty(bh) || pass >= 2) continue;atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); ll_rw_block(WRITE, 1, &bh);//刷到磁碟上面 atomic_dec(&bh->b_count); retry = 1; goto repeat;//成功刷了一次，為何要這樣重來?難道為了怕鏈表變動? 
} repeat2: bh = lru_list[BUF_LOCKED]; if (!bh) { spin_unlock(&lru_list_lock); break; } for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) { next = bh->b_next_free;if (!lru_list[BUF_LOCKED]) break; if (dev && bh->b_dev != dev) continue; if (buffer_locked(bh)) { /* Buffer is locked; skip it unless wait is * requested AND pass > 0. */ if (!wait || !pass) { retry = 1; continue; } atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); wait_on_buffer (bh); //等待buf被解鎖，會引起進程調度 spin_lock(&lru_list_lock); atomic_dec(&bh->b_count); goto repeat2;//這裡重新來迴圈是因為等待了之後list變化了嗎? } } spin_unlock(&lru_list_lock);/* If we are waiting for the sync to succeed, and if any dirty * blocks were written, then repeat; on the second pass, only * wait for buffers being written (do not pass to write any * more buffers on the second pass). */ } while (wait && retry && ++pass<=2); return err; }/* * Note: check the dirty flag before waiting, so we don't * hold up the sync while mounting a device. (The newly * mounted device won't need syncing.) */ void sync_supers(kdev_t dev) { struct super_block * sb;//super_blocks是系統全域的隊列，記錄了所有超級塊 for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { if (!sb->s_dev) continue; if (dev && sb->s_dev != dev)//不是我的，就掠過 continue; if (!sb->s_dirt) continue; lock_super(sb); if (sb->s_dev && sb->s_dirt && (!dev || dev == sb->s_dev)) if (sb->s_op && sb->s_op->write_super) sb->s_op->write_super(sb); //如果相應的裝置提供了write_super的方法，那調用之。 //對於ext2檔案系統來說，這是ext2_write_super函數，其初始化如下 /*其實ext2檔案系統也沒幹啥事，就是更新了一下LRU，時間什麼的 static struct super_operations ext2_sops = { read_inode:ext2_read_inode, write_inode:ext2_write_inode, put_inode:ext2_put_inode, delete_inode:ext2_delete_inode, put_super:ext2_put_super, write_super:ext2_write_super, statfs:ext2_statfs, remount_fs:ext2_remount, };*/ unlock_super(sb); } }/** *sync_inodes *@dev: device to sync the inodes from. 
* *sync_inodes goes through the super block's dirty list, *writes them out, and puts them back on the normal list. */void sync_inodes(kdev_t dev) { struct super_block * sb = sb_entry(super_blocks.next);/* * Search the super_blocks array for the device(s) to sync. */ spin_lock(&inode_lock); for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { if (!sb->s_dev) continue; if (dev && sb->s_dev != dev) continue; //找到了屬於我的超塊 sync_list(&sb->s_dirty);//s_dirty表示屬於此超級塊(裝置)的髒了的inodes，全部寫回去if (dev) break; } spin_unlock(&inode_lock); }static inline void sync_list(struct list_head *head) { struct list_head * tmp;while ((tmp = head->prev) != head)//無非就是得到每一個inode，然後重新整理之 sync_one(list_entry(tmp, struct inode, i_list), 0); /*sync_one會吧給定inode的所有髒頁面，鎖住的頁面都耍到磁碟上去 然後喚醒所有等待在上面的進程 */ }static inline void sync_one(struct inode *inode, int sync) { if (inode->i_state & I_LOCK) {//不明白，鎖住的難道不髒? __iget(inode);//如果被鎖住了，先遞增計數吧，免得待會被別人先登了 spin_unlock(&inode_lock); __wait_on_inode(inode);//這是沒辦法的，必須等 iput(inode);//遞減技術 spin_lock(&inode_lock); } else { unsigned dirty;list_del(&inode->i_list);//從隊列中刪除 list_add(&inode->i_list, atomic_read(&inode->i_count) ? &inode_in_use : &inode_unused);//是放入在用的呢，還是不用的?不過都是緩衝 /* Set I_LOCK, reset I_DIRTY */ dirty = inode->i_state & I_DIRTY;//這是不是髒的呢 inode->i_state |= I_LOCK;//我鎖住了哈，你們別來碰 inode->i_state &= ~I_DIRTY;//去掉髒位 spin_unlock(&inode_lock);filemap_fdatasync(inode->i_mapping);//你懂的，i_mapping 有大文章，緩衝的 //裡面其實是把mapping->dirty_pages，即這個inode的所有緩衝的髒頁面寫入到磁碟。//剛才filemap_fdatasync寫完了所有的髒頁面，那麼，如果真的有髒的，通知一下相應檔案系統吧 /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) write_inode(inode, sync); //其實就是調用相應檔案系統的inode->i_sb->s_op->write_inode(inode, sync); //把這個inode寫進去//這一次，把鎖住的頁面locked_pages也重新整理了，不過會等待的___wait_on_page(page); filemap_fdatawait(inode->i_mapping);spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; wake_up(&inode->i_wait);//有誰再我這上面睡找了，都醒來 } }

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

sys_umount系統調用的實現注釋。2.4版核心

聯繫我們

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support