記憶體管理源碼剖析 Author:錢國正 為了深入記憶體管理的研究,我決定研究它的源碼,但是版本過多,所以選擇 0.12 核心源碼,因為其 簡短,但是無髒俱全。 記憶體管理代碼位於 mm 檔案夾內。含有 Makefile, memory.c, page.s swap.c 四個檔案。 Makefile 是檔案編譯時間用的,主要是管理源檔案。 memory.c 進行記憶體分頁管理。實現對主記憶體區記憶體頁面的動態分配和回收操作。 swap.c 程式主要實現虛擬記憶體交換功能。 page.s 包括異常中斷處理常式,主要分為兩種情況處理:一是由於缺頁引起的頁異常中斷,通過調 用 do_no_page(error_code,address)來處理;二是:由頁防寫保護引起的頁異常,此時調用頁防寫保護處理函數 do_wp_page(error_code,address)進行處理。其中出錯碼 error_code 由 CPU 自動產生並壓入堆棧,出現異 常時訪問的線性地址是從寄存器 CR2 中取得的。CR2 專門用來存放頁出錯時的線性地址。 get_free_page()和 free_page()這兩個函數專門用於管理主記憶體中實體記憶體的佔用和空閑情況,與每 個線程的線性地址無關。 void free_page(unsigned long addr) //memory.c 中 { if (addr < LOW_MEM) return; if (addr >= HIGH_MEMORY) panic("trying to free nonexistent page"); addr -= LOW_MEM; addr >>= 12; if (mem_map[addr]--) return; mem_map[addr]=0; panic("trying to free free page"); } unsigned long get_free_page(void) { register unsigned long __res asm("ax"); //swap.c 中 repeat: __asm__("std ; repne ; scasb/n/t" "jne 1f/n/t" "movb $1,1(%%edi)/n/t" "sall $12,%%ecx/n/t" "addl %2,%%ecx/n/t" "movl %%ecx,%%edx/n/t" "movl $1024,%%ecx/n/t" "leal 4092(%%edx),%%edi/n/t" "rep ; stosl/n/t" "movl %%edx,%%eax/n" "1:" :"=a" (__res) :"0" (0),"i" (LOW_MEM),"c" (PAGING_PAGES), "D" (mem_map+PAGING_PAGES-1) :"di","cx","dx"); if (__res >= HIGH_MEMORY) goto repeat; if (!__res && swap_out()) goto repeat; return __res; } free_page_tables()和 copy_page_tables()這兩個函數則以一個頁表對應的實體記憶體塊為單位,釋放 或複製指定線性地址和長度(頁表個數)對應的實體記憶體頁塊。不僅對管理線性地址的頁目錄和頁表中的對 應項內容進行修改,而且也對每個也表中所有頁表項對應的實體記憶體頁進行釋放或佔用操作。 /* * This function frees a continuos block of page tables, as needed * by 'exit()'. As does copy_page_tables(), this handles only 4Mb blocks. */ int free_page_tables(unsigned long from,unsigned long size) //memory.c 中 { unsigned long *pg_table; unsigned long * dir, nr; if (from & 0x3fffff) panic("free_page_tables called with wrong alignment"); if (!from) panic("Trying to free up swapper memory space"); size = (size + 0x3fffff) >> 22; dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ for ( ; size-->0 ; dir++) { if (!(1 & *dir)) continue; pg_table = (unsigned long *) (0xfffff000 & *dir); for (nr=0 ; nr<1024 ; nr++) { if (*pg_table) { if (1 & *pg_table) free_page(0xfffff000 & *pg_table); else swap_free(*pg_table >> 1); *pg_table = 0; } pg_table++; } free_page(0xfffff000 & *dir); *dir = 0; } invalidate(); return 0; } /* * Well, here is one of the most complicated functions in mm. It * copies a range of linerar addresses by copying only the pages. * Let's hope this is bug-free, 'cause this one I don't want to debug :-) * * Note! We don't copy just any chunks of memory - addresses have to * be divisible by 4Mb (one page-directory entry), as this makes the * function easier. It's used only by fork anyway. * * NOTE 2!! When from==0 we are copying kernel space for the first * fork(). Then we DONT want to copy a full page-directory entry, as * that would lead to some serious memory waste - we just copy the * first 160 pages - 640kB. Even that is more than we need, but it * doesn't take any more memory - we don't copy-on-write in the low * 1 Mb-range, so the pages can be shared with the kernel. Thus the * special case for nr=xxxx. */ int copy_page_tables(unsigned long from,unsigned long to,long size) { unsigned long * from_page_table; unsigned long * to_page_table; unsigned long this_page; unsigned long * from_dir, * to_dir; unsigned long new_page; unsigned long nr; if ((from&0x3fffff) || (to&0x3fffff)) panic("copy_page_tables called with wrong alignment"); from_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ to_dir = (unsigned long *) ((to>>20) & 0xffc); size = ((unsigned) (size+0x3fffff)) >> 22; for( ; size-->0 ; from_dir++,to_dir++) { if (1 & *to_dir) panic("copy_page_tables: already exist"); if (!(1 & *from_dir)) continue; from_page_table = (unsigned long *) (0xfffff000 & *from_dir); if (!(to_page_table = (unsigned long *) get_free_page())) return -1; /* Out of memory, see freeing */ *to_dir = ((unsigned long) to_page_table) | 7; nr = (from==0)?0xA0:1024; for ( ; nr-- > 0 ; from_page_table++,to_page_table++) { this_page = *from_page_table; if (!this_page) continue; if (!(1 & this_page)) { if (!(new_page = get_free_page())) return -1; read_swap_page(this_page>>1, (char *) new_page); *to_page_table = this_page; *from_page_table = new_page | (PAGE_DIRTY | 7); continue; } this_page &= ~2; *to_page_table = this_page; if (this_page > LOW_MEM) { *from_page_table = this_page; this_page -= LOW_MEM; this_page >>= 12; mem_map[this_page]++; } } } invalidate(); return 0; } put_page()用於將一指定的實體記憶體頁面映射到指定的線性地址處。它首先判斷指定的記憶體頁面地址 的有效性,應在 1MB 和系統最高端記憶體位址外,否則發出警告。然後計算該指定線性地址在頁目錄中對應 的目錄項,此時若該目錄項有效(P=1),則取其對應頁表的地址,否則申請空閑頁給頁表使用,並設定該 頁表中對應頁表項的屬性。最後仍返回指定的實體記憶體頁面地址。 /* * This function puts a page in memory at the wanted address. * It returns the physical address of the page gotten, 0 if * out of memory (either when trying to access page-table or * page.) */ static unsigned long put_page(unsigned long page,unsigned long address) { unsigned long tmp, *page_table; /* NOTE !!! This uses the fact that _pg_dir=0 */ if (page < LOW_MEM || page >= HIGH_MEMORY) printk("Trying to put page %p at %p/n",page,address); if (mem_map[(page-LOW_MEM)>>12] != 1) printk("mem_map disagrees with %p at %p/n",page,address); page_table = (unsigned long *) ((address>>20) & 0xffc); if ((*page_table)&1) page_table = (unsigned long *) (0xfffff000 & *page_table); else { if (!(tmp=get_free_page())) return 0; *page_table = tmp | 7; page_table = (unsigned long *) tmp; } page_table[(address>>12) & 0x3ff] = page | 7; /* no need for invalidate */ return page; } do_wp_page()是頁異常中斷過程中調用的頁防寫保護處理函數。它首先判斷地址是否在進程的代碼區 域,若是則終止程式;然後執行寫時複製頁面的操作(copy on write) /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * * If it's in code space we exit with a segment error. */ void do_wp_page(unsigned long error_code,unsigned long address) { if (address < TASK_SIZE) printk("/n/rBAD! KERNEL MEMORY WP-ERR!/n/r"); if (address - current->start_code > TASK_SIZE) { printk("Bad things happen: page error in do_wp_page/n/r"); do_exit(SIGSEGV); } #if 0 /* we cannot do this yet: the estdio library writes to code space */ /* stupid, stupid. I really want the libc.a from GNU */ if (CODE_SPACE(address)) do_exit(SIGSEGV); #endif un_wp_page((unsigned long *) (((address>>10) & 0xffc) + (0xfffff000 & *((unsigned long *) ((address>>20) &0xffc))))); } do_no_page()是頁異常中斷過程中調用的缺頁處理函數。 void do_no_page(unsigned long error_code,unsigned long address) { int nr[4]; unsigned long tmp; unsigned long page; int block,i; struct m_inode * inode; if (address < TASK_SIZE) printk("/n/rBAD!! KERNEL PAGE MISSING/n/r"); if (address - current->start_code > TASK_SIZE) { printk("Bad things happen: nonexistent page error in do_no_page/n/r"); do_exit(SIGSEGV); } page = *(unsigned long *) ((address >> 20) & 0xffc); if (page & 1) { page &= 0xfffff000; page += (address >> 10) & 0xffc; tmp = *(unsigned long *) page; if (tmp && !(1 & tmp)) { swap_in((unsigned long *) page); return; } } address &= 0xfffff000; tmp = address - current->start_code; if (tmp >= LIBRARY_OFFSET ) { inode = current->library; block = 1 + (tmp-LIBRARY_OFFSET) / BLOCK_SIZE; } else if (tmp < current->end_data) { inode = current->executable; block = 1 + tmp / BLOCK_SIZE; } else { inode = NULL; block = 0; } if (!inode) { get_empty_page(address); return; } if (share_page(inode,tmp)) return; if (!(page = get_free_page())) oom(); /* remember that 1 block is used for header */ for (i=0 ; i<4 ; block++,i++) nr[i] = bmap(inode,block); bread_page(page,inode->i_dev,nr); i = tmp + 4096 - current->end_data; if (i>4095) i = 0; tmp = page + 4096; while (i-- > 0) { tmp--; *(char *)tmp = 0; } if (put_page(page,address)) return; free_page(page); oom(); } get_empty_page(0 用也取得一頁空閑實體記憶體並映射到指定線性地址處。主要使用了 get_free_page()和 put_page()函數來實現該功能。 void get_empty_page(unsigned long address) { unsigned long tmp; if (!(tmp=get_free_page()) || !put_page(tmp,address)) { free_page(tmp); /* 0 is ok - ignored */ oom(); } } 以上只列出幾個重要函數,對於源碼的詳細分析可以看同濟大學趙博士的書,再仔細分析估計是一本 書了,呵呵。 本日誌是最不負責任的一篇了,沒有添加每行的注釋,非常抱歉,不過我這周真的是精疲力竭了,沒有太多時間研究核心,不能給大家好的分析結果,再次表示抱歉。