Linux記憶體管理–基本概念

來源:互聯網
上載者:User
文章目錄
  • 2.1 Page Directory (PGD and PMD)
  •  2.2 Page Table Entry
  • 2.3 如何通過3級頁表訪問實體記憶體
1. Linux實體記憶體三級架構

     對於記憶體管理,Linux採用了與具體體系架構不相關的設計模型,實現了良好的延展性。它主要由記憶體節點node、記憶體地區zone和物理頁框page三級架構組成。

    • 記憶體節點node

       記憶體節點node是電腦系統中對實體記憶體的一種描述方法,一個匯流排主裝置訪問位於同一個節點中的任意記憶體單元所花的代價相同,而訪問任意兩個不同節點中的記憶體單元所花的代價不同。在一致性記憶體存取(Uniform Memory Access,簡稱UMA)電腦系統中只有一個節點,而在非一致性記憶體存取(Non-Uniform Memory Access,簡稱NUMA)電腦系統中有多個節點。Linux核心中使用資料結構pg_data_t來表示記憶體節點node。如常用的ARM架構為UMA架構。

    •  記憶體地區zone

       記憶體地區位於同一個記憶體節點之內,由於各種原因它們的用途和使用方法並不一樣。如基於IA32體系結構的個人電腦系統中,由於歷史原因使得ISA裝置只能使用最低16MB來進行DMA傳輸。又如,由於Linux核心採用……(原文此處內容缺失)

 

    •  物理頁框page

 

2. Linux虛擬記憶體三級頁表

      Linux虛擬記憶體三級管理由以下三級組成:

     • PGD: Page Global Directory (頁全域目錄)

     • PMD: Page Middle Directory (頁中間目錄)

     • PTE:  Page Table Entry  (頁表項)

    

     每一級有以下三個關鍵描述宏:

     • SHIFT

     • SIZE

     • MASK

        如頁的對應描述為:

/*
 * Page geometry (asm/page.h).
 *
 * PAGE_SHIFT determines the page size: PAGE_SIZE is 1 shifted left by
 * PAGE_SHIFT, and PAGE_MASK is the complement of (PAGE_SIZE - 1).
 * _AC() is not defined in this snippet and must be provided elsewhere.
 */
#define PAGE_SHIFT	12
#define PAGE_SIZE	(_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE-1))

    資料結構定義如下:

/*
 * Page-table entry types and their accessors (asm/page.h).
 *
 * Every level is a plain unsigned long here, except pgd_t, which is an
 * array of two unsigned longs; pgd_val() reads its first element.
 */
typedef unsigned long pteval_t;
typedef pteval_t pte_t;
typedef unsigned long pmd_t;
typedef unsigned long pgd_t[2];
typedef unsigned long pgprot_t;

/* Extract the raw value from an entry. */
#define pte_val(x)      (x)
#define pmd_val(x)      (x)
#define pgd_val(x)      ((x)[0])
#define pgprot_val(x)   (x)

/* Build an entry from a raw value (identity, since the types are plain). */
#define __pte(x)        (x)
#define __pmd(x)        (x)
#define __pgprot(x)     (x)

 

2.1 Page Directory (PGD and PMD)

     每個進程有它自己的PGD( Page Global Directory),它是一個物理頁,並包含一個pgd_t數組。其定義見<asm/page.h>。 進程的pgd_t資料見 task_struct -> mm_struct -> pgd_t * pgd;    

     ARM架構的PGD和PMD的定義如下<arch/arm/include/asm/pgtable.h>:

/* Number of <u32> pointers a PTE table can hold (21 - 12 = 9 bits). */
#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
/* Number of <u32> pointers the PGD can hold (32 - 21 = 11 bits). */
#define PTRS_PER_PGD  2048

/*
 * PTE_HWTABLE_OFF is PTE_HWTABLE_PTRS entries of sizeof(pte_t) each;
 * PTE_HWTABLE_SIZE is PTRS_PER_PTE entries of sizeof(u32) each.
 */
#define PTE_HWTABLE_PTRS (PTRS_PER_PTE)
#define PTE_HWTABLE_OFF  (PTE_HWTABLE_PTRS * sizeof(pte_t))
#define PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(u32))

/*
 * PMD_SHIFT determines the size of the area a second-level page table can map
 * PGDIR_SHIFT determines what a third-level page table entry can map
 */
#define PMD_SHIFT  21
#define PGDIR_SHIFT  21

      虛擬位址SHIFT宏圖:

     虛擬位址MASK和SIZE宏圖:

 

 

 2.2 Page Table Entry

      PTEs, PMDs和PGDs分別由pte_t, pmd_t 和pgd_t來描述。為了儲存保護位,pgprot_t被定義,它擁有相關的flags並經常被儲存在page table entry低位(lower bits),其具體的儲存方式依賴於CPU架構。

     每個pte_t指向一個物理頁的地址,並且所有的地址都是頁對齊的。因此在32位地址中有PAGE_SHIFT(12)位是空閒的,它們可以用作PTE的狀態位。

     PTE的保護和狀態位如下圖所示:

2.3 如何通過3級頁表訪問實體記憶體

       為了通過PGD、PMD和PTE訪問實體記憶體,其相關宏在asm/pgtable.h中定義。

       • pgd_offset 

       根據當前虛擬位址和當前進程的mm_struct擷取pgd項的宏定義如下: 

/* to find an entry in a page-table-directory */

/* Index of addr within the pgd table. */
#define pgd_index(addr)		((addr) >> PGDIR_SHIFT)

/* Pointer to the pgd entry for addr in mm's page directory. */
#define pgd_offset(mm, addr)	((mm)->pgd + pgd_index(addr))

/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)

         • pmd_offset
             根據通過pgd_offset擷取的pgd 項和虛擬位址,擷取相關的pmd項(即pte表的起始地址) 

/*
 * Find an entry in the second-level page table.
 *
 * The result is just the pgd entry pointer cast to pmd_t *; the addr
 * argument is not used by the expansion.
 */
#define pmd_offset(dir, addr)	((pmd_t *)(dir))

        • pte_offset

      根據通過pmd_offset擷取的pmd項和虛擬位址,擷取相關的pte項(即物理頁的起始地址)

/*
 * Mapping a PTE table so its entries can be accessed.
 *
 * Without CONFIG_HIGHPTE, __pte_map() expands to pmd_page_vaddr() and
 * __pte_unmap() is a no-op. With it, the table is mapped through
 * kmap_atomic() and released through kunmap_atomic().
 */
#ifndef CONFIG_HIGHPTE
#define __pte_map(pmd)		pmd_page_vaddr(*(pmd))
#define __pte_unmap(pte)	do { } while (0)
#else
#define __pte_map(pmd)		(pte_t *)kmap_atomic(pmd_page(*(pmd)))
#define __pte_unmap(pte)	kunmap_atomic(pte)
#endif

/* Index of addr within its PTE table. */
#define pte_index(addr)		(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))

/* Pointer to the PTE for addr under the given pmd entry. */
#define pte_offset_kernel(pmd,addr)	(pmd_page_vaddr(*(pmd)) + pte_index(addr))

#define pte_offset_map(pmd,addr)	(__pte_map(pmd) + pte_index(addr))
#define pte_unmap(pte)			__pte_unmap(pte)

/* Conversions between a PTE, a page frame number and a struct page. */
#define pte_pfn(pte)		(pte_val(pte) >> PAGE_SHIFT)
#define pfn_pte(pfn,prot)	__pte(__pfn_to_phys(pfn) | pgprot_val(prot))

#define pte_page(pte)		pfn_to_page(pte_pfn(pte))
#define mk_pte(page,prot)	pfn_pte(page_to_pfn(page), prot)

#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
/* Clearing a PTE writes a zero entry through set_pte_ext(). */
#define pte_clear(mm,addr,ptep)	set_pte_ext(ptep, __pte(0), 0)

        其過程如下圖所示:

 
  2.4 根據虛擬位址擷取物理頁的範例程式碼

        根據虛擬位址擷取物理頁的範例程式碼詳見<mm/memory.c中的函數follow_page>。

 

/** * follow_page - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * Returns the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */struct page *follow_page(struct vm_area_struct *vma, unsigned long address,unsigned int flags){pgd_t *pgd;pud_t *pud;pmd_t *pmd;pte_t *ptep, pte;spinlock_t *ptl;struct page *page;struct mm_struct *mm = vma->vm_mm;page = follow_huge_addr(mm, address, flags & FOLL_WRITE);if (!IS_ERR(page)) {BUG_ON(flags & FOLL_GET);goto out;}page = NULL;pgd = pgd_offset(mm, address);if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))goto no_page_table;pud = pud_offset(pgd, address);if (pud_none(*pud))goto no_page_table;if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {BUG_ON(flags & FOLL_GET);page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);goto out;}if (unlikely(pud_bad(*pud)))goto no_page_table;pmd = pmd_offset(pud, address);if (pmd_none(*pmd))goto no_page_table;if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {BUG_ON(flags & FOLL_GET);page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);goto out;}if (pmd_trans_huge(*pmd)) {if (flags & FOLL_SPLIT) {split_huge_page_pmd(mm, pmd);goto split_fallthrough;}spin_lock(&mm->page_table_lock);if (likely(pmd_trans_huge(*pmd))) {if (unlikely(pmd_trans_splitting(*pmd))) {spin_unlock(&mm->page_table_lock);wait_split_huge_page(vma->anon_vma, pmd);} else {page = follow_trans_huge_pmd(mm, address,     pmd, flags);spin_unlock(&mm->page_table_lock);goto out;}} elsespin_unlock(&mm->page_table_lock);/* fall through */}split_fallthrough:if (unlikely(pmd_bad(*pmd)))goto no_page_table;ptep = pte_offset_map_lock(mm, pmd, address, &ptl);pte = *ptep;if (!pte_present(pte))goto no_page;if ((flags & 
FOLL_WRITE) && !pte_write(pte))goto unlock;page = vm_normal_page(vma, address, pte);if (unlikely(!page)) {if ((flags & FOLL_DUMP) ||    !is_zero_pfn(pte_pfn(pte)))goto bad_page;page = pte_page(pte);}if (flags & FOLL_GET)get_page(page);if (flags & FOLL_TOUCH) {if ((flags & FOLL_WRITE) &&    !pte_dirty(pte) && !PageDirty(page))set_page_dirty(page);/* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */mark_page_accessed(page);}if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {/* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE * which might bounce very badly if there is contention. * * If the page is already locked, we don't need to * handle it now - vmscan will handle it later if and * when it attempts to reclaim the page. */if (page->mapping && trylock_page(page)) {lru_add_drain();  /* push cached pages to LRU *//* * Because we lock page here and migration is * blocked by the pte's page reference, we need * only check for file-cache page truncation. */if (page->mapping)mlock_vma_page(page);unlock_page(page);}}unlock:pte_unmap_unlock(ptep, ptl);out:return page;bad_page:pte_unmap_unlock(ptep, ptl);return ERR_PTR(-EFAULT);no_page:pte_unmap_unlock(ptep, ptl);if (!pte_none(pte))return page;no_page_table:/* * When core dumping an enormous anonymous area that nobody * has touched so far, we don't want to allocate unnecessary pages or * page tables.  Return error instead of NULL to skip handle_mm_fault, * then get_dump_page() will return NULL to leave a hole in the dump. * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */if ((flags & FOLL_DUMP) &&    (!vma->vm_ops || !vma->vm_ops->fault))return ERR_PTR(-EFAULT);return page;}

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.