標籤:
Linux kernel被bootloader載入到記憶體後,CPU首先執行header.S中的start_of_setup等常式,然後跳轉到main.c中的main();main()中首先執行detect_memory()函數探測記憶體:
/*
 * Probe for memory using all three detection methods.
 *
 * Every probe is always attempted, in the order e820, e801, 88.
 * Returns 0 if at least one probe reported success, -1 otherwise.
 */
int detect_memory(void)
{
	int any_ok = 0;

	/* The e820 probe succeeds when it returns a positive value. */
	if (detect_memory_e820() > 0)
		any_ok = 1;

	/* The e801 and 88 probes succeed when they return 0. */
	if (detect_memory_e801() == 0)
		any_ok = 1;

	if (detect_memory_88() == 0)
		any_ok = 1;

	return any_ok ? 0 : -1;
}
Linux核心通過detect_memory_xxx()來擷取記憶體相關資訊;這幾個函數都是通過觸發int 0x15中斷來擷取,調用前分別把AX寄存器設定為0xe820、0xe801,或把AH寄存器設定為0x88。
對於e820();
struct e820entry {__u64 addr;/* start of memory segment */該記憶體段的起始地址__u64 size;/* size of memory segment */該記憶體段段的大小__u32 type;/* type of memory segment */該記憶體段的類型} __attribute__((packed));struct e820map {<span style="white-space:pre"></span>__u32 nr_map;<span style="white-space:pre"></span>struct e820entry map[E820_X_MAX];};
type:該記憶體段的類型,可分為Usable (normal) RAM、Reserved - unusable、ACPI reclaimable memory、ACPI NVS memory、Area containing bad memory。要擷取所有記憶體段的資訊,detect_memory_e820()通過一個do-while迴圈不斷觸發int 0x15中斷來擷取每個記憶體段的資訊,並且將這些資訊儲存在一個struct e820entry類型的數組中。
static int detect_memory_e820(void){int count = 0;struct biosregs ireg, oreg;struct e820entry *desc = boot_params.e820_map;static struct e820entry buf; /* static so it is zeroed */initregs(&ireg);ireg.ax = 0xe820;ireg.cx = sizeof buf;ireg.edx = SMAP;ireg.di = (size_t)&buf;/* * Note: at least one BIOS is known which assumes that the * buffer pointed to by one e820 call is the same one as * the previous call, and only changes modified fields. Therefore, * we use a temporary buffer and copy the results entry by entry. * * This routine deliberately does not try to account for * ACPI 3+ extended attributes. This is because there are * BIOSes in the field which report zero for the valid bit for * all ranges, and we don't currently make any use of the * other attribute bits. Revisit this if we see the extended * attribute bits deployed in a meaningful way in the future. */do {<span style="white-space:pre"></span> /*在執行這條內聯彙編語句時輸入的參數有: eax寄存器=0xe820 dx寄存器=’SMAP’ edi寄存器=desc ebx寄存器=next ecx寄存器=size 返回給c語言代碼的參數有: id=eax寄存器 rr=edx寄存器 ext=ebx寄存器 size=ecx寄存器 desc指向的記憶體位址在執行0x15中斷調用時被設定 */ <span style="white-space:pre"></span>intcall(0x15, &ireg, &oreg);/*觸發中斷0x15*/ireg.ebx = oreg.ebx; /* for next iteration... *//* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on the final, failing, probe. */if (oreg.eflags & X86_EFLAGS_CF)break;/* Some BIOSes stop returning SMAP in the middle of the search loop. We don't know exactly how the BIOS screwed up the map at that point, we might have a partial map, the full map, or complete garbage, so just return failure. */if (oreg.eax != SMAP) {count = 0;break;}*desc++ = buf;/*儲存擷取的記憶體段資訊*/ count++; /*擷取的記憶體段數目加1*/ } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));<span style="white-space:pre"></span>/*將記憶體塊數保持到變數中*/ return boot_params.e820_entries = count;}
static int detect_memory_e801(void){struct biosregs ireg, oreg;initregs(&ireg);ireg.ax = 0xe801;intcall(0x15, &ireg, &oreg);if (oreg.eflags & X86_EFLAGS_CF)return -1;/* Do we really need to do this? */if (oreg.cx || oreg.dx) {oreg.ax = oreg.cx;oreg.bx = oreg.dx;}if (oreg.ax > 15*1024) {return -1;/* Bogus! */} else if (oreg.ax == 15*1024) {boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;} else {/* * This ignores memory above 16MB if we have a memory * hole there. If someone actually finds a machine * with a memory hole at 16MB and no support for * 0E820h they should probably generate a fake e820 * map. */boot_params.alt_mem_k = oreg.ax;}return 0;}static int detect_memory_88(void){struct biosregs ireg, oreg;initregs(&ireg);ireg.ah = 0x88;intcall(0x15, &ireg, &oreg);boot_params.screen_info.ext_mem_k = oreg.ax;return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */}
對於32位的系統,通過調用鏈arch/x86/boot/main.c:main()--->arch/x86/boot/pm.c:go_to_protected_mode()--->arch/x86/boot/pmjump.S:protected_mode_jump()--->arch/x86/boot/compressed/head_32.S:startup_32()--->arch/x86/kernel/head_32.S:startup_32()--->arch/x86/kernel/head32.c:i386_start_kernel()--->init/main.c:start_kernel(),到達眾所周知的Linux核心啟動函數start_kernel(),這裡會調用setup_arch()完成與體系結構相關的一系列初始化工作,其中就包括各種記憶體的初始化工作,如記憶體布局圖(memory map)的建立、管理區的初始化等等。對x86體系結構,setup_arch()函數在arch/x86/kernel/setup.c中,如下:
/*
 * Abridged excerpt of setup_arch(): only the memory-related steps are shown.
 * Every "......" comment marks code that the excerpt leaves out.
 */
void __init setup_arch(char **cmdline_p)
{
	/* ...... */

	x86_init.oem.arch_setup();

	setup_memory_map();	/* build the memory map */
	e820_reserve_setup_data();

	/* ...... */

	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	max_pfn = e820_end_of_ram_pfn();	/* highest usable page frame number */

	/* ...... */

#ifdef CONFIG_X86_32
	/* max_low_pfn is updated here */
	find_low_pfn_range();	/* find the highest page frame number of low memory */
#else
	num_physpages = max_pfn;

	/* ...... */
	/*
	 * NOTE(review): the excerpt did not show where this conditional
	 * ends; the #endif is placed after the elided part of this branch
	 * so the steps below apply to both configurations - confirm
	 * against the full source.
	 */
#endif

	/* max_pfn_mapped is updated here */
	/* set up the memory mapping */
	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
	max_pfn_mapped = max_low_pfn_mapped;

	/* ...... */

	/* NOTE(review): presumably sets up the boot-time memory allocator - confirm */
	initmem_init(0, max_pfn);

	/* ...... */

	x86_init.paging.pagetable_setup_start(swapper_pg_dir);
	paging_init();	/* build the complete page tables */
	x86_init.paging.pagetable_setup_done(swapper_pg_dir);

	/* ...... */
}
在 start_kernel---->setup_arch()--------------->setup_memory_map;
/*
 * Obtain the memory map through the platform's memory_setup hook, keep a
 * copy of it in e820_saved, and print it.
 */
void __init setup_memory_map(void)
{
	/* The hook returns the string that is handed to e820_print_map(). */
	char *origin = x86_init.resources.memory_setup();

	/* Snapshot the map as it stands right after setup. */
	memcpy(&e820_saved, &e820, sizeof(struct e820map));

	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
	e820_print_map(origin);
}
在x86_init.c中定義了x86下的memory_setup函數:
/*
 * Default platform setup hooks, preset for standard PC hardware.
 * resources.memory_setup is the hook that setup_memory_map() invokes.
 */
struct x86_init_ops x86_init __initdata = {

	.resources = {
		.probe_roms		= probe_roms,
		.reserve_resources	= reserve_standard_io_resources,
		.memory_setup		= default_machine_specific_memory_setup,
	},

	.mpparse = {
		.mpc_record		= x86_init_uint_noop,
		.setup_ioapic_ids	= x86_init_noop,
		.mpc_apic_id		= default_mpc_apic_id,
		.smp_read_mpc_oem	= default_smp_read_mpc_oem,
		.mpc_oem_bus_info	= default_mpc_oem_bus_info,
		.find_smp_config	= default_find_smp_config,
		.get_smp_config		= default_get_smp_config,
	},

	.irqs = {
		.pre_vector_init	= init_ISA_irqs,
		.intr_init		= native_init_IRQ,
		.trap_init		= x86_init_noop,
	},

	.oem = {
		.arch_setup		= x86_init_noop,
		.banner			= default_banner,
	},

	.mapping = {
		.pagetable_reserve	= native_pagetable_reserve,
	},

	.paging = {
		.pagetable_setup_start	= native_pagetable_setup_start,
		.pagetable_setup_done	= native_pagetable_setup_done,
	},

	.timers = {
		.setup_percpu_clockev	= setup_boot_APIC_clock,
		.tsc_pre_init		= x86_init_noop,
		.timer_init		= hpet_time_init,
		.wallclock_init		= x86_init_noop,
	},

	.iommu = {
		.iommu_init		= iommu_init_noop,
	},

	.pci = {
		.init			= x86_default_pci_init,
		.init_irq		= x86_default_pci_init_irq,
		.fixup_irqs		= x86_default_pci_fixup_irqs,
	},
};
可知會回調:default_machine_specific_memory_setup();
/*
 * Default memory_setup hook.
 *
 * Tries to use the BIOS-supplied E820 map held in boot_params.  If that map
 * cannot be appended, a two-region map is faked instead: one section from
 * 0k->640k, the next from 1mb->appropriate_mem_k.
 *
 * Returns a string naming the source that was used.
 */
char *__init default_machine_specific_memory_setup(void)
{
	char *source = "BIOS-e820";
	u32 nr_entries = boot_params.e820_entries;
	u64 ext_kb;

	/* Clean up the BIOS map in place (removes overlapping segments). */
	sanitize_e820_map(boot_params.e820_map,
			  ARRAY_SIZE(boot_params.e820_map),
			  &nr_entries);
	boot_params.e820_entries = nr_entries;

	/* Feed the cleaned-up entries into the e820 map; done if that works. */
	if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) >= 0)
		return source;

	/* compare results from other methods and take the greater */
	if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
		ext_kb = boot_params.screen_info.ext_mem_k;
		source = "BIOS-88";
	} else {
		ext_kb = boot_params.alt_mem_k;
		source = "BIOS-e801";
	}

	/* Discard whatever was appended and build the fake map. */
	e820.nr_map = 0;
	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
	e820_add_region(HIGH_MEMORY, ext_kb << 10, E820_RAM);

	/* In case someone cares... */
	return source;
}
1.消除記憶體段的重疊部分
2.將記憶體布局資訊從boot_params.e820_map拷貝到e820中
append_e820_map(boot_params.e820_map, boot_params.e820_entries)將會調用以下函數:
/*
 * Add nr_map entries, starting at biosmap, to the e820 map.
 *
 * Returns 0 on success, or -1 as soon as an entry whose start + size
 * overflows 64 bits is met (entries before it have already been added).
 */
static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
{
	/* One iteration per entry: nr_map regions are added in total. */
	while (nr_map) {
		u64 start = biosmap->addr;
		u64 size = biosmap->size;
		u64 end = start + size;
		u32 type = biosmap->type;

		/* Overflow in 64 bits? Ignore the memory map. */
		if (start > end)
			return -1;

		e820_add_region(start, size, type);

		biosmap++;
		nr_map--;
	}
	return 0;
}
/*
 * Add the region [start, start + size) of the given type to the global
 * e820 map; thin wrapper around __e820_add_region().
 */
void __init e820_add_region(u64 start, u64 size, int type)
{
	__e820_add_region(&e820, start, size, type);
}
/* Global memory map; e820_add_region() adds regions to it. */
struct e820map e820;
至此,實體記憶體的布局資訊就已經從BIOS中讀出來並存放到全域變數e820中。
建立記憶體後
setup_arch------------->e820_end_of_ram_pfn;
/*
 * partially used pages are not usable - thus
 * we are rounding upwards:
 *
 * NOTE(review): e820_end_pfn(), which e820_end_of_ram_pfn() calls, derives
 * the end page frame with ">> PAGE_SHIFT", i.e. it truncates - confirm that
 * the "rounding upwards" wording still matches the code.
 */
max_pfn = e820_end_of_ram_pfn();
/*
 * Find the highest end page frame number among e820 entries of the given
 * type, capped at limit_pfn and at MAX_ARCH_PFN.
 *
 * Entries starting at or beyond limit_pfn are skipped.  The first entry
 * that extends past limit_pfn pins the result to limit_pfn and ends the
 * scan.  The result is logged before it is returned.
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
	const unsigned long max_arch_pfn = MAX_ARCH_PFN;
	unsigned long highest = 0;
	int i;

	/* Walk every entry of the memory map. */
	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *entry = &e820.map[i];
		unsigned long first_pfn;
		unsigned long past_pfn;

		if (entry->type != type)
			continue;

		first_pfn = entry->addr >> PAGE_SHIFT;
		past_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;

		/* Segment lies entirely at or above the limit: ignore it. */
		if (first_pfn >= limit_pfn)
			continue;

		/* Segment crosses the limit: the answer is the limit itself. */
		if (past_pfn > limit_pfn) {
			highest = limit_pfn;
			break;
		}

		/* Segment ends higher than anything seen so far. */
		if (highest < past_pfn)
			highest = past_pfn;
	}

	/* Never report more than the architecture maximum. */
	if (highest > max_arch_pfn)
		highest = max_arch_pfn;

	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
			 highest, max_arch_pfn);
	return highest;
}
/*
 * Return the last page frame number of E820_RAM memory, using MAX_ARCH_PFN
 * as the limit passed to e820_end_pfn().
 */
unsigned long __init e820_end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
/* Distance from PAGE_OFFSET up to VMALLOC_END, less __VMALLOC_RESERVE. */
#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
其中__VMALLOC_RESERVE為128M,說明了第4GB的記憶體劃分
可知:MAXMEM為一個略小於896M的值(896M-8K-4M-4M)即略小於低端記憶體的上限,高端記憶體的起始地址
setup_arch()-->find_low_pfn_range()。該函數用來劃分低端記憶體和高端記憶體的界限,確定高端記憶體的起始地址
/*
 * max_low_pfn gets updated here: find_low_pfn_range() hands off to
 * lowmem_pfn_init() or highmem_pfn_init(), both of which assign it.
 */
find_low_pfn_range();
/*
 * Determine low and high memory ranges:
 *
 * Dispatches on whether max_pfn fits within MAXMEM_PFN.  Either callee
 * sets max_low_pfn.
 */
void __init find_low_pfn_range(void)
{
	/* it could update max_pfn */

	/* All memory fits at or below the MAXMEM_PFN boundary. */
	if (max_pfn <= MAXMEM_PFN)
		lowmem_pfn_init();
	else
		highmem_pfn_init();
}
/* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: */ /*高端地址空間的頁面數可以在啟動中進行配置; 如果不配置,在這裡進行設定大小*/void __init highmem_pfn_init(void){/*MAXMEM_PFN為最大物理地址-(4M+4M+8K+128M);所以低端記憶體的大小其實比我們說的896M低一些*/max_low_pfn = MAXMEM_PFN;/*設定高端記憶體和低端記憶體的分界線*/ if (highmem_pages == -1)/*高端記憶體頁面數如果在開機沒有設定*/highmem_pages = max_pfn - MAXMEM_PFN;/*總頁面數減去低端頁面數*//*如果highmem_pages變數在啟動項設定了,那麼在這裡就要進行這樣的判斷,因為可能出現不一致的情況*/if (highmem_pages + MAXMEM_PFN < max_pfn)max_pfn = MAXMEM_PFN + highmem_pages;if (highmem_pages + MAXMEM_PFN > max_pfn) {printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,pages_to_mb(max_pfn - MAXMEM_PFN),pages_to_mb(highmem_pages));highmem_pages = 0;}#ifndef CONFIG_HIGHMEM/* Maximum memory usable is what is directly addressable */printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);if (max_pfn > MAX_NONPAE_PFN)printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");elseprintk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");max_pfn = MAXMEM_PFN;#else /* !CONFIG_HIGHMEM *//*存在高端地址情況*/#ifndef CONFIG_HIGHMEM64G/*在沒有配置64G的情況下,記憶體的大小不能超過4G*/if (max_pfn > MAX_NONPAE_PFN) {max_pfn = MAX_NONPAE_PFN;printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);}#endif /* !CONFIG_HIGHMEM64G */#endif /* !CONFIG_HIGHMEM */}當實際記憶體小於896M時
/*
 * All of memory fits into lowmem.  Start with the low/high boundary at
 * max_pfn; if highmem pages were requested on a CONFIG_HIGHMEM kernel,
 * carve them out of the top of that range.
 */
void __init lowmem_pfn_init(void)
{
	/* max_low_pfn is 0, we already have early_res support */
	max_low_pfn = max_pfn;

	/* Nothing requested (-1): no highmem. */
	if (highmem_pages == -1)
		highmem_pages = 0;
#ifdef CONFIG_HIGHMEM
	/* The request is at least as large as all of memory: reject it. */
	if (highmem_pages >= max_pfn) {
		printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
			pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
		highmem_pages = 0;
	}
	if (highmem_pages) {
		/* Low memory must keep at least 64MB. */
		if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
			printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
				pages_to_mb(highmem_pages));
			highmem_pages = 0;
		}
		/* Final boundary between low and high memory. */
		max_low_pfn -= highmem_pages;
	}
#else
	if (highmem_pages)
		printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#endif
}
當實際的實體記憶體大於896M,由highmem_pfn_init()進行分配void __init highmem_pfn_init(void){max_low_pfn = MAXMEM_PFN; /*設定高端記憶體和低端記憶體的分界線*/if (highmem_pages == -1) /*未設定高端記憶體的頁框數*/highmem_pages = max_pfn - MAXMEM_PFN; /*預設為最大頁框數減去MAXMEM_PFN*/if (highmem_pages + MAXMEM_PFN < max_pfn) /*高端記憶體頁框數加上MAXMEM_PFN小於最大頁框數*/max_pfn = MAXMEM_PFN + highmem_pages; /*將最大頁框數下調到前兩者的和*/if (highmem_pages + MAXMEM_PFN > max_pfn){ /*申請的高端記憶體超過範圍則不分配*/printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,pages_to_mb(max_pfn - MAXMEM_PFN),pages_to_mb(highmem_pages));highmem_pages = 0;}#ifndef CONFIG_HIGHMEM/* Maximum memory usable is what is directly addressable */printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);if (max_pfn > MAX_NONPAE_PFN)printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");elseprintk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");max_pfn = MAXMEM_PFN;#else /* !CONFIG_HIGHMEM */#ifndef CONFIG_HIGHMEM64Gif (max_pfn > MAX_NONPAE_PFN) {max_pfn = MAX_NONPAE_PFN;printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);}#endif /* !CONFIG_HIGHMEM64G */#endif /* !CONFIG_HIGHMEM */}
記憶體管理--檢測記憶體