Discussion:
[PATCHv3 03/13] x86/xen: Provide pre-built page tables only for XEN_PV and XEN_PVH
(too old to reply)
Kirill A. Shutemov
2017-08-07 14:20:01 UTC
Permalink
Looks like we only need pre-built page tables for XEN_PV and XEN_PVH
cases. Let's not provide them for other configurations.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
Reviewed-by: Juergen Gross <***@suse.com>
---
arch/x86/kernel/head_64.S | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 513cbb012ecc..2be7d1e7fcf1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,11 +37,12 @@
*
*/

-#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))

+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
+#endif
L3_START_KERNEL = pud_index(__START_KERNEL_map)

.text
@@ -361,10 +362,7 @@ NEXT_PAGE(early_dynamic_pgts)

.data

-#ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
- .fill 512,8,0
-#else
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
@@ -381,6 +379,9 @@ NEXT_PAGE(level2_ident_pgt)
* Don't set NX because code runs from these pages.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#else
+NEXT_PAGE(init_top_pgt)
+ .fill 512,8,0
#endif

#ifdef CONFIG_X86_5LEVEL
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:01 UTC
Permalink
This patch adds detection of 5-level paging at boot-time and adjusts
virtual memory layout and folds p4d page table layer if needed.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
arch/x86/boot/compressed/kaslr.c | 13 +++++--
arch/x86/entry/entry_64.S | 12 +++++++
arch/x86/include/asm/page_64_types.h | 13 +++----
arch/x86/include/asm/pgtable_64_types.h | 33 +++++++++++-------
arch/x86/include/asm/processor.h | 2 +-
arch/x86/include/asm/sparsemem.h | 9 ++---
arch/x86/kernel/head64.c | 62 +++++++++++++++++++++++++--------
arch/x86/kernel/head_64.S | 18 ++++++----
arch/x86/kernel/setup.c | 5 ++-
arch/x86/mm/dump_pagetables.c | 8 +++--
arch/x86/mm/kaslr.c | 13 ++++---
11 files changed, 127 insertions(+), 61 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index afde5eb91358..6eb2fd1732c3 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -44,8 +44,9 @@
#include <linux/decompress/mm.h>

#ifdef CONFIG_X86_5LEVEL
-unsigned int pgdir_shift = 48;
-unsigned int ptrs_per_p4d = 512;
+unsigned int p4d_folded = 1;
+unsigned int pgdir_shift = 39;
+unsigned int ptrs_per_p4d = 1;
#endif

extern unsigned long get_cmd_line_ptr(void);
@@ -642,6 +643,14 @@ void choose_random_location(unsigned long input,
return;
}

+#ifdef CONFIG_X86_5LEVEL
+ if (__read_cr4() & X86_CR4_LA57) {
+ p4d_folded = 0;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ }
+#endif
+
boot_params->hdr.loadflags |= KASLR_FLAG;

/* Prepare to add new identity pagetables on demand. */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index daf8936d0628..077e8b45784c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -273,8 +273,20 @@ return_from_SYSCALL_64:
* Change top bits to match most significant bit (47th or 56th bit
* depending on paging mode) in the address.
*/
+#ifdef CONFIG_X86_5LEVEL
+ testl $1, p4d_folded(%rip)
+ jnz 1f
+ shl $(64 - 57), %rcx
+ sar $(64 - 57), %rcx
+ jmp 2f
+1:
+ shl $(64 - 48), %rcx
+ sar $(64 - 48), %rcx
+2:
+#else
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif

/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 0126d6bc2eb1..26056ef366b8 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -36,24 +36,21 @@
* hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
* what Xen requires.
*/
-#ifdef CONFIG_X86_5LEVEL
-#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL)
-#else
-#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
-#endif
+#define __PAGE_OFFSET_BASE57 _AC(0xff10000000000000, UL)
+#define __PAGE_OFFSET_BASE48 _AC(0xffff880000000000, UL)

#if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
#define __PAGE_OFFSET page_offset_base
#else
-#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE48
#endif /* CONFIG_RANDOMIZE_MEMORY */

#define __START_KERNEL_map _AC(0xffffffff80000000, UL)

/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#ifdef CONFIG_X86_5LEVEL
-#define __PHYSICAL_MASK_SHIFT 52
-#define __VIRTUAL_MASK_SHIFT 56
+#define __PHYSICAL_MASK_SHIFT (p4d_folded ? 52 : 46)
+#define __VIRTUAL_MASK_SHIFT (p4d_folded ? 47 : 56)
#else
#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 47
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index a5338b0936ad..a29cc23e96b8 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,7 @@ typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;

#ifdef CONFIG_X86_5LEVEL
-#define p4d_folded 0
+extern unsigned int p4d_folded;
#else
#define p4d_folded 1
#endif
@@ -87,23 +87,30 @@ extern unsigned int ptrs_per_p4d;

/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(16384, UL)
-#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
-#else
-#define VMALLOC_SIZE_TB _AC(32, UL)
-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
-#endif
+
+#ifndef __ASSEMBLY__
+#define __VMALLOC_BASE48 0xffffc90000000000
+#define __VMALLOC_BASE57 0xff92000000000000
+
+#define VMALLOC_SIZE_TB48 32UL
+#define VMALLOC_SIZE_TB57 16384UL
+
+#define __VMEMMAP_BASE48 0xffffea0000000000
+#define __VMEMMAP_BASE57 0xffd4000000000000
+
#if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
#define VMALLOC_START vmalloc_base
+#define VMALLOC_SIZE_TB (!p4d_folded ? VMALLOC_SIZE_TB57 : VMALLOC_SIZE_TB48)
#define VMEMMAP_START vmemmap_base
#else
-#define VMALLOC_START __VMALLOC_BASE
-#define VMEMMAP_START __VMEMMAP_BASE
+#define VMALLOC_START __VMALLOC_BASE48
+#define VMALLOC_SIZE_TB VMALLOC_SIZE_TB48
+#define VMEMMAP_START __VMEMMAP_BASE48
#endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+
+#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
+#endif
+
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c1352771b2f6..cb2958b88de5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -871,7 +871,7 @@ static inline void spin_lock_prefetch(const void *x)
IA32_PAGE_OFFSET : TASK_SIZE_MAX)

#define STACK_TOP TASK_SIZE_LOW
-#define STACK_TOP_MAX TASK_SIZE_MAX
+#define STACK_TOP_MAX (!p4d_folded ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW)

#define INIT_THREAD { \
.sp0 = TOP_OF_INIT_STACK, \
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 1f5bee2c202f..ba67afd870b7 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -26,13 +26,8 @@
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
-# ifdef CONFIG_X86_5LEVEL
-# define MAX_PHYSADDR_BITS 52
-# define MAX_PHYSMEM_BITS 52
-# else
-# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 46
-# endif
+# define MAX_PHYSADDR_BITS (p4d_folded ? 44 : 52)
+# define MAX_PHYSMEM_BITS (p4d_folded ? 46 : 52)
#endif

#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f81e61ee5e9d..4c9ffb58a3b5 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,20 +39,18 @@ static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);

#ifdef CONFIG_X86_5LEVEL
-unsigned int pgdir_shift = 48;
+unsigned int pgdir_shift = 39;
EXPORT_SYMBOL(pgdir_shift);
-unsigned int ptrs_per_p4d = 512;
+unsigned int ptrs_per_p4d = 1;
EXPORT_SYMBOL(ptrs_per_p4d);
#endif

-#if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
-unsigned long page_offset_base = __PAGE_OFFSET_BASE;
+unsigned long page_offset_base = __PAGE_OFFSET_BASE48;
EXPORT_SYMBOL(page_offset_base);
-unsigned long vmalloc_base = __VMALLOC_BASE;
+unsigned long vmalloc_base = __VMALLOC_BASE48;
EXPORT_SYMBOL(vmalloc_base);
-unsigned long vmemmap_base = __VMEMMAP_BASE;
+unsigned long vmemmap_base = __VMEMMAP_BASE48;
EXPORT_SYMBOL(vmemmap_base);
-#endif

#define __head __section(.head.text)

@@ -61,6 +59,36 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
return ptr - (void *)_text + (void *)physaddr;
}

+static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
+{
+ return fixup_pointer(ptr, physaddr);
+}
+
+#ifdef CONFIG_X86_5LEVEL
+static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
+{
+ return fixup_pointer(ptr, physaddr);
+}
+
+static void __head check_la57_support(unsigned long physaddr)
+{
+ if (native_cpuid_eax(0) < 7)
+ return;
+
+ if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+ return;
+
+ *fixup_int(&p4d_folded, physaddr) = 0;
+ *fixup_int(&pgdir_shift, physaddr) = 48;
+ *fixup_int(&ptrs_per_p4d, physaddr) = 512;
+ *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE57;
+ *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE57;
+ *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE57;
+}
+#else
+static void __head check_la57_support(unsigned long physaddr) {}
+#endif
+
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
@@ -72,6 +100,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmdval_t *pmd, pmd_entry;
int i;

+ check_la57_support(physaddr);
+
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
for (;;);
@@ -95,9 +125,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Fixup the physical addresses in the page table */

pgd = fixup_pointer(&early_top_pgt, physaddr);
- pgd[pgd_index(__START_KERNEL_map)] += load_delta;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p = pgd + pgd_index(__START_KERNEL_map);
+ if (p4d_folded)
+ *p = (unsigned long)level3_kernel_pgt;
+ else
+ *p = (unsigned long)level4_kernel_pgt;
+ *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+
+ if (!p4d_folded) {
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}
@@ -120,7 +155,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();

- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (!p4d_folded) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);

i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
@@ -166,8 +201,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Fixup phys_base - remove the memory encryption mask to obtain
* the true physical address.
*/
- p = fixup_pointer(&phys_base, physaddr);
- *p += load_delta - sme_get_me_mask();
+ *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();

/* Encrypt the kernel (if SME is active) */
sme_encrypt_kernel();
@@ -218,7 +252,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (p4d_folded)
p4d_p = pgd_p;
else if (pgd)
p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2be7d1e7fcf1..ebdcb08a91cb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -40,7 +40,7 @@
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))

#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
+PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE48)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
#endif
L3_START_KERNEL = pud_index(__START_KERNEL_map)
@@ -121,7 +121,10 @@ ENTRY(secondary_startup_64)
/* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
+ testl $1, p4d_folded(%rip)
+ jnz 1f
orl $X86_CR4_LA57, %ecx
+1:
#endif
movq %rcx, %cr4

@@ -350,12 +353,7 @@ GLOBAL(name)

__INITDATA
NEXT_PAGE(early_top_pgt)
- .fill 511,8,0
-#ifdef CONFIG_X86_5LEVEL
- .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
-#else
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
-#endif
+ .fill 512,8,0

NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -434,6 +432,12 @@ ENTRY(phys_base)
.quad 0x0000000000000000
EXPORT_SYMBOL(phys_base)

+#ifdef CONFIG_X86_5LEVEL
+ENTRY(p4d_folded)
+ .word 1
+EXPORT_SYMBOL(p4d_folded)
+#endif
+
#include "../../x86/xen/xen-head.S"

__PAGE_ALIGNED_BSS
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 022ebddb3734..10e6dd1cb948 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -202,9 +202,7 @@ struct ist_info ist_info;
#endif

#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {
- .x86_phys_bits = MAX_PHYSMEM_BITS,
-};
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
#endif

@@ -892,6 +890,7 @@ void __init setup_arch(char **cmdline_p)
__flush_tlb_all();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
+ boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif

/*
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index bad178deffba..acf264ab02f3 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -82,8 +82,8 @@ static struct addr_marker address_markers[] = {
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/* VMEMMAP_START */, "Vmemmap" },
#ifdef CONFIG_KASAN
- { KASAN_SHADOW_START, "KASAN shadow" },
- { KASAN_SHADOW_END, "KASAN shadow end" },
+ { 0/* KASAN_SHADOW_START */, "KASAN shadow" },
+ { 0/* KASAN_SHADOW_END */, "KASAN shadow end" },
#endif
# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
@@ -515,6 +515,10 @@ static int __init pt_dump_init(void)
address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+#ifdef CONFIG_KASAN
+ address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
+ address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
+#endif
#endif
#ifdef CONFIG_X86_32
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index e6420b18f6e0..2f6ba5c72905 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -43,7 +43,6 @@
* before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
* ensure that this order is correct and won't be changed.
*/
-static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;

#if defined(CONFIG_X86_ESPFIX64)
static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
@@ -62,8 +61,8 @@ static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
- { &vmalloc_base, VMALLOC_SIZE_TB },
+ { &page_offset_base, 0 },
+ { &vmalloc_base, 0 },
{ &vmemmap_base, 1 },
};

@@ -86,11 +85,14 @@ static inline bool kaslr_memory_enabled(void)
void __init kernel_randomize_memory(void)
{
size_t i;
- unsigned long vaddr = vaddr_start;
+ unsigned long vaddr_start, vaddr;
unsigned long rand, memory_tb;
struct rnd_state rand_state;
unsigned long remain_entropy;

+ vaddr_start = p4d_folded ? __PAGE_OFFSET_BASE48 : __PAGE_OFFSET_BASE57;
+ vaddr = vaddr_start;
+
/*
* All these BUILD_BUG_ON checks ensures the memory layout is
* consistent with the vaddr_start/vaddr_end variables.
@@ -106,6 +108,9 @@ void __init kernel_randomize_memory(void)
if (!kaslr_memory_enabled())
return;

+ kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
+ kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
+
/*
* Update Physical memory mapping to available and
* add padding if needed (especially for memory hotplug support).
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:01 UTC
Permalink
All pieces of the puzzle are in place and we can now allow to boot with
CONFIG_X86_5LEVEL=y on a machine without la57 support.

Kernel will detect that la57 is missing and fold p4d at runtime.

Update documentation and Kconfig option description to reflect the
change.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
Documentation/x86/x86_64/5level-paging.txt | 9 +++------
arch/x86/Kconfig | 4 ++--
arch/x86/include/asm/required-features.h | 8 +-------
3 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
index 087251a0d99c..2432a5ef86d9 100644
--- a/Documentation/x86/x86_64/5level-paging.txt
+++ b/Documentation/x86/x86_64/5level-paging.txt
@@ -20,12 +20,9 @@ Documentation/x86/x86_64/mm.txt

CONFIG_X86_5LEVEL=y enables the feature.

-So far, a kernel compiled with the option enabled will be able to boot
-only on machines that supports the feature -- see for 'la57' flag in
-/proc/cpuinfo.
-
-The plan is to implement boot-time switching between 4- and 5-level paging
-in the future.
+Kernel with CONFIG_X86_5LEVEL=y still able to boot on 4-level hardware.
+In this case additional page table level -- p4d -- will be folded at
+runtime.

== User-space and large virtual address space ==

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2c9c4899d9ff..4a1a52f064dc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1408,8 +1408,8 @@ config X86_5LEVEL

It will be supported by future Intel CPUs.

- Note: a kernel with this option enabled can only be booted
- on machines that support the feature.
+ A kernel with the option enabled can be booted on machines that
+ support 4- or 5-level paging.

See Documentation/x86/x86_64/5level-paging.txt for more
information.
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index d91ba04dd007..fac9a5c0abe9 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -53,12 +53,6 @@
# define NEED_MOVBE 0
#endif

-#ifdef CONFIG_X86_5LEVEL
-# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
-#else
-# define NEED_LA57 0
-#endif
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
@@ -104,7 +98,7 @@
#define REQUIRED_MASK13 0
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
-#define REQUIRED_MASK16 (NEED_LA57)
+#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:01 UTC
Permalink
By this point we have functioning boot-time switching between 4- and
5-level paging mode. But naive approach comes with cost.

Numbers below are for kernel build, allmodconfig, 5 times.

CONFIG_X86_5LEVEL=n:

Performance counter stats for 'sh -c make -j100 -B -k >/dev/null' (5 runs):

17308719.892691 task-clock:u (msec) # 26.772 CPUs utilized ( +- 0.11% )
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
331,993,164 page-faults:u # 0.019 M/sec ( +- 0.01% )
43,614,978,867,455 cycles:u # 2.520 GHz ( +- 0.01% )
39,371,534,575,126 stalled-cycles-frontend:u # 90.27% frontend cycles idle ( +- 0.09% )
28,363,350,152,428 instructions:u # 0.65 insn per cycle
# 1.39 stalled cycles per insn ( +- 0.00% )
6,316,784,066,413 branches:u # 364.948 M/sec ( +- 0.00% )
250,808,144,781 branch-misses:u # 3.97% of all branches ( +- 0.01% )

646.531974142 seconds time elapsed ( +- 1.15% )

CONFIG_X86_5LEVEL=y:

Performance counter stats for 'sh -c make -j100 -B -k >/dev/null' (5 runs):

17411536.780625 task-clock:u (msec) # 26.426 CPUs utilized ( +- 0.10% )
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
331,868,663 page-faults:u # 0.019 M/sec ( +- 0.01% )
43,865,909,056,301 cycles:u # 2.519 GHz ( +- 0.01% )
39,740,130,365,581 stalled-cycles-frontend:u # 90.59% frontend cycles idle ( +- 0.05% )
28,363,358,997,959 instructions:u # 0.65 insn per cycle
# 1.40 stalled cycles per insn ( +- 0.00% )
6,316,784,937,460 branches:u # 362.793 M/sec ( +- 0.00% )
251,531,919,485 branch-misses:u # 3.98% of all branches ( +- 0.00% )

658.886307752 seconds time elapsed ( +- 0.92% )
The patch tries to fix the performance regression by using

!cpu_feature_enabled(X86_FEATURE_LA57) instead of p4d_folded in most
code paths. These will statically patch the target code for additional
performance.

Notable exception, where cpu_feature_enabled() wasn't used is
return_from_SYSCALL_64(). It's written in assembly and there's no easy
way to use the same approach. We can come back to this later, if
required.

Also, I had to re-write number of static inline helpers as macros.
It was needed to break header dependency loop between cpufeature.h and
pgtable_types.h.

CONFIG_X86_5LEVEL=y + the patch:

Performance counter stats for 'sh -c make -j100 -B -k >/dev/null' (5 runs):

17381990.268506 task-clock:u (msec) # 26.907 CPUs utilized ( +- 0.19% )
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
331,862,625 page-faults:u # 0.019 M/sec ( +- 0.01% )
43,697,726,320,051 cycles:u # 2.514 GHz ( +- 0.03% )
39,480,408,690,401 stalled-cycles-frontend:u # 90.35% frontend cycles idle ( +- 0.05% )
28,363,394,221,388 instructions:u # 0.65 insn per cycle
# 1.39 stalled cycles per insn ( +- 0.00% )
6,316,794,985,573 branches:u # 363.410 M/sec ( +- 0.00% )
251,013,232,547 branch-misses:u # 3.97% of all branches ( +- 0.01% )

645.991174661 seconds time elapsed ( +- 1.19% )

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
arch/x86/boot/compressed/misc.h | 5 +++
arch/x86/entry/entry_64.S | 2 +-
arch/x86/include/asm/paravirt.h | 23 ++++++-----
arch/x86/include/asm/pgtable_64_types.h | 5 ++-
arch/x86/include/asm/pgtable_types.h | 67 ++++++++-------------------------
arch/x86/kernel/head64.c | 5 +++
arch/x86/kernel/head_64.S | 6 +--
arch/x86/mm/kasan_init_64.c | 6 +++
8 files changed, 53 insertions(+), 66 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 766a5211f827..28ac72acaa31 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -11,6 +11,11 @@
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_KASAN

+#ifdef CONFIG_X86_5LEVEL
+/* cpu_feature_enabled() cannot be used that early */
+#define p4d_folded __p4d_folded
+#endif
+
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 077e8b45784c..6f92e61d35ac 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -274,7 +274,7 @@ return_from_SYSCALL_64:
* depending on paging mode) in the address.
*/
#ifdef CONFIG_X86_5LEVEL
- testl $1, p4d_folded(%rip)
+ testl $1, __p4d_folded(%rip)
jnz 1f
shl $(64 - 57), %rcx
sar $(64 - 57), %rcx
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 69c3cb792f34..77d69be89d4b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -604,19 +604,22 @@ static inline p4dval_t p4d_val(p4d_t p4d)
return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
}

-static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- if (p4d_folded)
- set_p4d((p4d_t *)(pgdp), (p4d_t) { pgd.pgd });
- else
- PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
+ PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
}

-static inline void pgd_clear(pgd_t *pgdp)
-{
- if (!p4d_folded)
- set_pgd(pgdp, __pgd(0));
-}
+#define set_pgd(pgdp, pgdval) do { \
+ if (p4d_folded) \
+ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \
+ else \
+ __set_pgd(pgdp, pgdval); \
+} while (0)
+
+#define pgd_clear(pgdp) do { \
+ if (!p4d_folded) \
+ set_pgd(pgdp, __pgd(0)); \
+} while (0)

#endif /* CONFIG_PGTABLE_LEVELS == 5 */

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index a29cc23e96b8..9b50cc0f82e1 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,10 @@ typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;

#ifdef CONFIG_X86_5LEVEL
-extern unsigned int p4d_folded;
+extern unsigned int __p4d_folded;
+#ifndef p4d_folded
+#define p4d_folded (!cpu_feature_enabled(X86_FEATURE_LA57))
+#endif
#else
#define p4d_folded 1
#endif
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 399261ce904c..ece3ad5215ad 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -290,10 +290,7 @@ static inline pgdval_t native_pgd_val(pgd_t pgd)
return pgd.pgd;
}

-static inline pgdval_t pgd_flags(pgd_t pgd)
-{
- return native_pgd_val(pgd) & PTE_FLAGS_MASK;
-}
+#define pgd_flags(pgd) (native_pgd_val(pgd) & PTE_FLAGS_MASK)

#if CONFIG_PGTABLE_LEVELS > 4
typedef struct { p4dval_t p4d; } p4d_t;
@@ -363,57 +360,28 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
}
#endif

-static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
-{
- /* No 512 GiB huge pages yet */
- return PTE_PFN_MASK;
-}
+/* No 512 GiB huge pages yet */
+#define p4d_pfn_mask(p4d) PTE_PFN_MASK

-static inline p4dval_t p4d_flags_mask(p4d_t p4d)
-{
- return ~p4d_pfn_mask(p4d);
-}
+#define p4d_flags_mask(p4d) (~p4d_pfn_mask(p4d))

-static inline p4dval_t p4d_flags(p4d_t p4d)
-{
- return native_p4d_val(p4d) & p4d_flags_mask(p4d);
-}
+#define p4d_flags(p4d) (native_p4d_val(p4d) & p4d_flags_mask(p4d))

-static inline pudval_t pud_pfn_mask(pud_t pud)
-{
- if (native_pud_val(pud) & _PAGE_PSE)
- return PHYSICAL_PUD_PAGE_MASK;
- else
- return PTE_PFN_MASK;
-}
+#define pud_pfn_mask(pud) \
+ (native_pud_val(pud) & _PAGE_PSE ? \
+ PHYSICAL_PUD_PAGE_MASK : PTE_PFN_MASK)

-static inline pudval_t pud_flags_mask(pud_t pud)
-{
- return ~pud_pfn_mask(pud);
-}
+#define pud_flags_mask(pud) (~pud_pfn_mask(pud))

-static inline pudval_t pud_flags(pud_t pud)
-{
- return native_pud_val(pud) & pud_flags_mask(pud);
-}
+#define pud_flags(pud) (native_pud_val(pud) & pud_flags_mask(pud))

-static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
-{
- if (native_pmd_val(pmd) & _PAGE_PSE)
- return PHYSICAL_PMD_PAGE_MASK;
- else
- return PTE_PFN_MASK;
-}
+#define pmd_pfn_mask(pmd) \
+ (native_pmd_val(pmd) & _PAGE_PSE ? \
+ PHYSICAL_PMD_PAGE_MASK : PTE_PFN_MASK)

-static inline pmdval_t pmd_flags_mask(pmd_t pmd)
-{
- return ~pmd_pfn_mask(pmd);
-}
+#define pmd_flags_mask(pmd) (~pmd_pfn_mask(pmd))

-static inline pmdval_t pmd_flags(pmd_t pmd)
-{
- return native_pmd_val(pmd) & pmd_flags_mask(pmd);
-}
+#define pmd_flags(pmd) (native_pmd_val(pmd) & pmd_flags_mask(pmd))

static inline pte_t native_make_pte(pteval_t val)
{
@@ -425,10 +393,7 @@ static inline pteval_t native_pte_val(pte_t pte)
return pte.pte;
}

-static inline pteval_t pte_flags(pte_t pte)
-{
- return native_pte_val(pte) & PTE_FLAGS_MASK;
-}
+#define pte_flags(pte) (native_pte_val(pte) & PTE_FLAGS_MASK)

#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 4c9ffb58a3b5..10736723c4e0 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -31,6 +31,11 @@
#include <asm/microcode.h>
#include <asm/kasan.h>

+#ifdef CONFIG_X86_5LEVEL
+#undef p4d_folded
+#define p4d_folded __p4d_folded
+#endif
+
/*
* Manage page tables very early on.
*/
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9de244aa72fd..805d915f730a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -121,7 +121,7 @@ ENTRY(secondary_startup_64)
/* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
- testl $1, p4d_folded(%rip)
+ testl $1, __p4d_folded(%rip)
jnz 1f
orl $X86_CR4_LA57, %ecx
1:
@@ -433,9 +433,9 @@ ENTRY(phys_base)
EXPORT_SYMBOL(phys_base)

#ifdef CONFIG_X86_5LEVEL
-ENTRY(p4d_folded)
+ENTRY(__p4d_folded)
.word 1
-EXPORT_SYMBOL(p4d_folded)
+EXPORT_SYMBOL(__p4d_folded)
#endif

#include "../../x86/xen/xen-head.S"
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 17256253a887..6dfdfddba09a 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,5 +1,11 @@
#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
+
+#ifdef CONFIG_X86_5LEVEL
+/* Too early to use cpu_feature_enabled() */
+#define p4d_folded __p4d_folded
+#endif
+
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
--
2.13.2
Andi Kleen
2017-08-07 16:00:02 UTC
Permalink
Post by Kirill A. Shutemov
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 077e8b45784c..6f92e61d35ac 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
* depending on paging mode) in the address.
*/
#ifdef CONFIG_X86_5LEVEL
- testl $1, p4d_folded(%rip)
+ testl $1, __p4d_folded(%rip)
jnz 1f
You can use

ALTERNATIVE "", "jmp 1f", X86_FEATURE_LA57

to do the patching.

-Andi

Kirill A. Shutemov
2017-08-07 14:20:02 UTC
Permalink
This patch converts the of CONFIG_X86_5LEVEL check to runtime checks for
p4d folding.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
arch/x86/mm/fault.c | 2 +-
arch/x86/mm/ident_map.c | 2 +-
arch/x86/mm/init_64.c | 30 ++++++++++++++++++------------
arch/x86/mm/kasan_init_64.c | 8 ++++----
arch/x86/mm/kaslr.c | 6 +++---
arch/x86/platform/efi/efi_64.c | 2 +-
arch/x86/power/hibernate_64.c | 6 +++---
7 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..d3d8f10f0c10 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -459,7 +459,7 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
- } else if (CONFIG_PGTABLE_LEVELS > 4) {
+ } else if (!p4d_folded) {
/*
* With folded p4d, pgd_none() is always false, so the pgd may
* point to an empty page table entry and pgd_page_vaddr()
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 31cea988fa36..1ab88f1bec15 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -119,7 +119,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (!p4d_folded) {
set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
} else {
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 649b8df485ad..6b97f6c1bf77 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -88,12 +88,7 @@ static int __init nonx32_setup(char *str)
}
__setup("noexec32=", nonx32_setup);

-/*
- * When memory was added make sure all the processes MM have
- * suitable PGD entries in the local PGD level page.
- */
-#ifdef CONFIG_X86_5LEVEL
-void sync_global_pgds(unsigned long start, unsigned long end)
+static void sync_global_pgds_57(unsigned long start, unsigned long end)
{
unsigned long addr;

@@ -129,8 +124,8 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
-#else
-void sync_global_pgds(unsigned long start, unsigned long end)
+
+static void sync_global_pgds_48(unsigned long start, unsigned long end)
{
unsigned long addr;

@@ -173,7 +168,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
-#endif
+
+/*
+ * When memory was added make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ if (!p4d_folded)
+ sync_global_pgds_57(start, end);
+ else
+ sync_global_pgds_48(start, end);
+}

/*
* NOTE: This function is marked __ref because it calls __init function
@@ -632,7 +638,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
unsigned long vaddr = (unsigned long)__va(paddr);
int i = p4d_index(vaddr);

- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (p4d_folded)
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);

for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
@@ -712,7 +718,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
page_size_mask);

spin_lock(&init_mm.page_table_lock);
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!p4d_folded)
pgd_populate(&init_mm, pgd, p4d);
else
p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
@@ -1078,7 +1084,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
* 5-level case we should free them. This code will have to change
* to adapt for boot-time switching between 4 and 5 level page tables.
*/
- if (CONFIG_PGTABLE_LEVELS == 5)
+ if (!p4d_folded)
free_pud_table(pud_base, p4d);
}

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index e1e2cca88567..17256253a887 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -40,7 +40,7 @@ static void __init clear_pgds(unsigned long start,
* With folded p4d, pgd_clear() is nop, use p4d_clear()
* instead.
*/
- if (CONFIG_PGTABLE_LEVELS < 5)
+ if (p4d_folded)
p4d_clear(p4d_offset(pgd, start));
else
pgd_clear(pgd);
@@ -55,7 +55,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
{
unsigned long p4d;

- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (p4d_folded)
return (p4d_t *)pgd;

p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
@@ -135,7 +135,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);

- for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
+ for (i = 0; !p4d_folded && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);

kasan_map_early_shadow(early_top_pgt);
@@ -152,7 +152,7 @@ void __init kasan_init(void)

memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));

- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (!p4d_folded) {
void *ptr;

ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 2f6ba5c72905..b70f86a2ce6a 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -139,7 +139,7 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!p4d_folded)
entropy = (rand % (entropy + 1)) & P4D_MASK;
else
entropy = (rand % (entropy + 1)) & PUD_MASK;
@@ -151,7 +151,7 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!p4d_folded)
vaddr = round_up(vaddr + 1, P4D_SIZE);
else
vaddr = round_up(vaddr + 1, PUD_SIZE);
@@ -227,7 +227,7 @@ void __meminit init_trampoline(void)
return;
}

- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!p4d_folded)
init_trampoline_p4d();
else
init_trampoline_pud();
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 970a0f5f787d..e8c3fc62ef03 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -219,7 +219,7 @@ int __init efi_alloc_page_tables(void)

pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
if (!pud) {
- if (CONFIG_PGTABLE_LEVELS > 4)
+ if (!p4d_folded)
free_page((unsigned long) pgd_page_vaddr(*pgd));
free_page((unsigned long)efi_pgd);
return -ENOMEM;
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index f2598d81cd55..9b9bc2ef4321 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -50,7 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
{
pmd_t *pmd;
pud_t *pud;
- p4d_t *p4d;
+ p4d_t *p4d = NULL;

/*
* The new mapping only has to cover the page containing the image
@@ -66,7 +66,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
* tables used by the image kernel.
*/

- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (!p4d_folded) {
p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
if (!p4d)
return -ENOMEM;
@@ -84,7 +84,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
__pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
set_pud(pud + pud_index(restore_jump_address),
__pud(__pa(pmd) | _KERNPG_TABLE));
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (p4d) {
set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE));
set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE));
} else {
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:02 UTC
Permalink
We need to be able to adjust virtual memory layout at runtime to be able
to switch between 4- and 5-level paging at boot-time.

KASLR already has movable __VMALLOC_BASE, __VMEMMAP_BASE and __PAGE_OFFSET.
Let's re-use it.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
arch/x86/include/asm/kaslr.h | 4 ----
arch/x86/include/asm/page_64.h | 4 ++++
arch/x86/include/asm/page_64_types.h | 2 +-
arch/x86/include/asm/pgtable_64_types.h | 2 +-
arch/x86/kernel/head64.c | 9 +++++++++
arch/x86/mm/kaslr.c | 8 --------
6 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
index 1052a797d71d..683c9d736314 100644
--- a/arch/x86/include/asm/kaslr.h
+++ b/arch/x86/include/asm/kaslr.h
@@ -4,10 +4,6 @@
unsigned long kaslr_get_random_long(const char *purpose);

#ifdef CONFIG_RANDOMIZE_MEMORY
-extern unsigned long page_offset_base;
-extern unsigned long vmalloc_base;
-extern unsigned long vmemmap_base;
-
void kernel_randomize_memory(void);
#else
static inline void kernel_randomize_memory(void) { }
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b4a0d43248cf..a12fb4dcdd15 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -10,6 +10,10 @@
extern unsigned long max_pfn;
extern unsigned long phys_base;

+extern unsigned long page_offset_base;
+extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
+
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 3f5f08b010d0..0126d6bc2eb1 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -42,7 +42,7 @@
#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
#endif

-#ifdef CONFIG_RANDOMIZE_MEMORY
+#if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
#define __PAGE_OFFSET page_offset_base
#else
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 06470da156ba..a9f77ead7088 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -85,7 +85,7 @@ typedef struct { pteval_t pte; } pte_t;
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
#endif
-#ifdef CONFIG_RANDOMIZE_MEMORY
+#if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
#define VMALLOC_START vmalloc_base
#define VMEMMAP_START vmemmap_base
#else
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 925b2928f377..f0d43da3e14c 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -38,6 +38,15 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);

+#if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
+unsigned long page_offset_base = __PAGE_OFFSET_BASE;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base = __VMALLOC_BASE;
+EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base = __VMEMMAP_BASE;
+EXPORT_SYMBOL(vmemmap_base);
+#endif
+
#define __head __section(.head.text)

static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index af599167fe3c..e6420b18f6e0 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -53,14 +53,6 @@ static const unsigned long vaddr_end = EFI_VA_END;
static const unsigned long vaddr_end = __START_KERNEL_map;
#endif

-/* Default values */
-unsigned long page_offset_base = __PAGE_OFFSET_BASE;
-EXPORT_SYMBOL(page_offset_base);
-unsigned long vmalloc_base = __VMALLOC_BASE;
-EXPORT_SYMBOL(vmalloc_base);
-unsigned long vmemmap_base = __VMEMMAP_BASE;
-EXPORT_SYMBOL(vmemmap_base);
-
/*
* Memory regions randomized by KASLR (except modules that use a separate logic
* earlier during boot). The list is ordered based on virtual addresses. This
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:02 UTC
Permalink
We are going to support boot-time switching between 4- and 5-level
paging. For KASAN it means we cannot have different KASAN_SHADOW_OFFSET
for different paging modes: the constant is passed to gcc to generate
code and cannot be changed at runtime.

This patch changes KASAN code to use 0xdffffc0000000000 as shadow offset
for both 4- and 5-level paging.

For 5-level paging it means that shadow memory region is not aligned to
PGD boundary anymore and we have to handle unaligned parts of the region
properly.

In addition, we have to exclude paravirt code from KASAN instrumentation
as we now use set_pgd() before KASAN is fully ready.

Signed-off-by: Andrey Ryabinin <***@virtuozzo.com>
[***@linux.intel.com: clenaup, changelog message]
Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
arch/x86/Kconfig | 1 -
arch/x86/kernel/Makefile | 3 +-
arch/x86/mm/kasan_init_64.c | 86 ++++++++++++++++++++++++++++++++++-----------
3 files changed, 67 insertions(+), 23 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ce3ed304288d..2c9c4899d9ff 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -299,7 +299,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
- default 0xdff8000000000000 if X86_5LEVEL
default 0xdffffc0000000000

config HAVE_INTEL_TXT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 287eac7d207f..4509140232c3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,7 +24,8 @@ endif
KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
-KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n

OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index bc84b73684b7..f6b4db2647b5 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,8 @@

extern struct range pfn_mapped[E820_MAX_ENTRIES];

+static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
static int __init map_range(struct range *range)
{
unsigned long start;
@@ -30,8 +32,9 @@ static void __init clear_pgds(unsigned long start,
unsigned long end)
{
pgd_t *pgd;
+ unsigned long pgd_end = end & PGDIR_MASK;

- for (; start < end; start += PGDIR_SIZE) {
+ for (; start < pgd_end; start += PGDIR_SIZE) {
pgd = pgd_offset_k(start);
/*
* With folded p4d, pgd_clear() is nop, use p4d_clear()
@@ -42,29 +45,60 @@ static void __init clear_pgds(unsigned long start,
else
pgd_clear(pgd);
}
+
+ pgd = pgd_offset_k(start);
+ for (; start < end; start += P4D_SIZE)
+ p4d_clear(p4d_offset(pgd, start));
+}
+
+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
+{
+ unsigned long p4d;
+
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return (p4d_t *)pgd;
+
+ p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
+ p4d += __START_KERNEL_map - phys_base;
+ return (p4d_t *)p4d + p4d_index(addr);
+}
+
+static void __init kasan_early_p4d_populate(pgd_t *pgd,
+ unsigned long addr,
+ unsigned long end)
+{
+ pgd_t pgd_entry;
+ p4d_t *p4d, p4d_entry;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+ set_pgd(pgd, pgd_entry);
+ }
+
+ p4d = early_p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_none(*p4d))
+ continue;
+
+ p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+ set_p4d(p4d, p4d_entry);
+ } while (p4d++, addr = next, addr != end && p4d_none(*p4d));
}

static void __init kasan_map_early_shadow(pgd_t *pgd)
{
- int i;
- unsigned long start = KASAN_SHADOW_START;
+ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;

- for (i = pgd_index(start); start < end; i++) {
- switch (CONFIG_PGTABLE_LEVELS) {
- case 4:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
- _KERNPG_TABLE);
- break;
- case 5:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
- _KERNPG_TABLE);
- break;
- default:
- BUILD_BUG();
- }
- start += PGDIR_SIZE;
- }
+ pgd += pgd_index(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_early_p4d_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
}

#ifdef CONFIG_KASAN_INLINE
@@ -101,7 +135,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);

- for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+ for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);

kasan_map_early_shadow(early_top_pgt);
@@ -117,12 +151,22 @@ void __init kasan_init(void)
#endif

memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ void *ptr;
+
+ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
+ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
+ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
+ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
+ }
+
load_cr3(early_top_pgt);
__flush_tlb_all();

- clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);

- kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
kasan_mem_to_shadow((void *)PAGE_OFFSET));

for (i = 0; i < E820_MAX_ENTRIES; i++) {
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:02 UTC
Permalink
For boot-time switching between 4- and 5-level paging we need to be able
to fold p4d page table level at runtime. It requires variable
PGDIR_SHIFT and PTRS_PER_P4D.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
arch/x86/boot/compressed/kaslr.c | 5 +++++
arch/x86/include/asm/pgtable_32.h | 2 ++
arch/x86/include/asm/pgtable_32_types.h | 2 ++
arch/x86/include/asm/pgtable_64_types.h | 15 +++++++++++++--
arch/x86/kernel/head64.c | 9 ++++++++-
arch/x86/mm/dump_pagetables.c | 12 +++++-------
arch/x86/mm/init_64.c | 2 +-
arch/x86/mm/kasan_init_64.c | 2 +-
arch/x86/platform/efi/efi_64.c | 4 ++--
include/asm-generic/5level-fixup.h | 1 +
include/asm-generic/pgtable-nop4d.h | 1 +
include/linux/kasan.h | 2 +-
mm/kasan/kasan_init.c | 2 +-
13 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 99c7194f7ea6..afde5eb91358 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -43,6 +43,11 @@
#define STATIC
#include <linux/decompress/mm.h>

+#ifdef CONFIG_X86_5LEVEL
+unsigned int pgdir_shift = 48;
+unsigned int ptrs_per_p4d = 512;
+#endif
+
extern unsigned long get_cmd_line_ptr(void);

/* Simplified build-specific string for starting entropy. */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index bfab55675c16..9c3c811347b0 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -32,6 +32,8 @@ static inline void pgtable_cache_init(void) { }
static inline void check_pgt_cache(void) { }
void paging_init(void);

+static inline int pgd_large(pgd_t pgd) { return 0; }
+
/*
* Define this if things work differently on an i386 and an i486:
* it will (on an i486) warn about kernel memory accesses that are
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 9fb2f2bc8245..8928eac4ef08 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -14,6 +14,8 @@
# include <asm/pgtable-2level_types.h>
#endif

+#define p4d_folded 1
+
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index a9f77ead7088..a5338b0936ad 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -19,6 +19,15 @@ typedef unsigned long pgprotval_t;

typedef struct { pteval_t pte; } pte_t;

+#ifdef CONFIG_X86_5LEVEL
+#define p4d_folded 0
+#else
+#define p4d_folded 1
+#endif
+
+extern unsigned int pgdir_shift;
+extern unsigned int ptrs_per_p4d;
+
#endif /* !__ASSEMBLY__ */

#define SHARED_KERNEL_PMD 0
@@ -28,14 +37,15 @@ typedef struct { pteval_t pte; } pte_t;
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
-#define PGDIR_SHIFT 48
+#define PGDIR_SHIFT pgdir_shift
#define PTRS_PER_PGD 512

/*
* 4th level page in 5-level paging case
*/
#define P4D_SHIFT 39
-#define PTRS_PER_P4D 512
+#define __PTRS_PER_P4D 512
+#define PTRS_PER_P4D ptrs_per_p4d
#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
#define P4D_MASK (~(P4D_SIZE - 1))

@@ -46,6 +56,7 @@ typedef struct { pteval_t pte; } pte_t;
*/
#define PGDIR_SHIFT 39
#define PTRS_PER_PGD 512
+#define __PTRS_PER_P4D 1

#endif /* CONFIG_X86_5LEVEL */

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f0d43da3e14c..f81e61ee5e9d 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -38,6 +38,13 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);

+#ifdef CONFIG_X86_5LEVEL
+unsigned int pgdir_shift = 48;
+EXPORT_SYMBOL(pgdir_shift);
+unsigned int ptrs_per_p4d = 512;
+EXPORT_SYMBOL(ptrs_per_p4d);
+#endif
+
#if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
unsigned long page_offset_base = __PAGE_OFFSET_BASE;
EXPORT_SYMBOL(page_offset_base);
@@ -329,7 +336,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 5e3ac6fe6c9e..bad178deffba 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -399,14 +399,15 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
#define p4d_none(a) pud_none(__pud(p4d_val(a)))
#endif

-#if PTRS_PER_P4D > 1
-
static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
int i;
p4d_t *start, *p4d_start;
pgprotval_t prot;

+ if (PTRS_PER_P4D == 1)
+ return walk_pud_level(m, st, __p4d(pgd_val(addr)), P);
+
p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);

for (i = 0; i < PTRS_PER_P4D; i++) {
@@ -426,11 +427,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}
}

-#else
-#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
-#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
-#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
-#endif
+#define pgd_large(a) (p4d_folded ? p4d_large(__p4d(pgd_val(a))) : pgd_large(a))
+#define pgd_none(a) (p4d_folded ? p4d_none(__p4d(pgd_val(a))) : pgd_none(a))

static inline bool is_hypervisor_range(int idx)
{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..649b8df485ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -143,7 +143,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
* With folded p4d, pgd_none() is always false, we need to
* handle synchonization on p4d level.
*/
- BUILD_BUG_ON(pgd_none(*pgd_ref));
+ MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, addr);

if (p4d_none(*p4d_ref))
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index f6b4db2647b5..e1e2cca88567 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,7 +15,7 @@

extern struct range pfn_mapped[E820_MAX_ENTRIES];

-static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+static p4d_t tmp_p4d_table[__PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);

static int __init map_range(struct range *range)
{
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 12e83888e5b9..970a0f5f787d 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -249,8 +249,8 @@ void efi_sync_low_kernel_mappings(void)
* only span a single PGD entry and that the entry also maps
* other important kernel regions.
*/
- BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
- BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
+ MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
+ MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
(EFI_VA_END & PGDIR_MASK));

pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index b5ca82dc4175..e9fcfc6b2518 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -7,6 +7,7 @@
#define P4D_SHIFT PGDIR_SHIFT
#define P4D_SIZE PGDIR_SIZE
#define P4D_MASK PGDIR_MASK
+#define __PTRS_PER_P4D 1
#define PTRS_PER_P4D 1

#define p4d_t pgd_t
diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
index de364ecb8df6..99cb2fa61cef 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -8,6 +8,7 @@
typedef struct { pgd_t pgd; } p4d_t;

#define P4D_SHIFT PGDIR_SHIFT
+#define __PTRS_PER_P4D 1
#define PTRS_PER_P4D 1
#define P4D_SIZE (1UL << P4D_SHIFT)
#define P4D_MASK (~(P4D_SIZE-1))
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index a5c7046f26b4..d27787ab2b84 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -19,7 +19,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE];
extern pte_t kasan_zero_pte[PTRS_PER_PTE];
extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
extern pud_t kasan_zero_pud[PTRS_PER_PUD];
-extern p4d_t kasan_zero_p4d[PTRS_PER_P4D];
+extern p4d_t kasan_zero_p4d[__PTRS_PER_P4D];

void kasan_populate_zero_shadow(const void *shadow_start,
const void *shadow_end);
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index 554e4c0f23a2..419e0d33f9be 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -31,7 +31,7 @@
unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;

#if CONFIG_PGTABLE_LEVELS > 4
-p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss;
+p4d_t kasan_zero_p4d[__PTRS_PER_P4D] __page_aligned_bss;
#endif
#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
--
2.13.2
Andi Kleen
2017-08-07 15:50:02 UTC
Permalink
Post by Kirill A. Shutemov
+#ifdef CONFIG_X86_5LEVEL
+unsigned int pgdir_shift = 48;
+unsigned int ptrs_per_p4d = 512;
should be both __read_mostly
Post by Kirill A. Shutemov
+#ifdef CONFIG_X86_5LEVEL
+unsigned int pgdir_shift = 48;
+EXPORT_SYMBOL(pgdir_shift);
+unsigned int ptrs_per_p4d = 512;
+EXPORT_SYMBOL(ptrs_per_p4d);
+#endif
Same.

-Andi
Kirill A. Shutemov
2017-08-07 14:20:02 UTC
Permalink
With boot-time switching between paging modes, XEN_PV and XEN_PVH can be
boot into 4-level paging mode.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
Cc: Juergen Gross <***@suse.com>
---
arch/x86/kernel/head_64.S | 12 ++++++------
arch/x86/xen/Kconfig | 5 -----
arch/x86/xen/mmu_pv.c | 21 +++++++++++++++++++++
3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ebdcb08a91cb..9de244aa72fd 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,12 +37,12 @@
*
*/

+#define l4_index(x) (((x) >> 39) & 511)
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))

-#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE48)
-PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
-#endif
+L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE48)
+L4_START_KERNEL = l4_index(__START_KERNEL_map)
+
L3_START_KERNEL = pud_index(__START_KERNEL_map)

.text
@@ -363,9 +363,9 @@ NEXT_PAGE(early_dynamic_pgts)
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
- .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
+ .org init_top_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
- .org init_top_pgt + PGD_START_KERNEL*8, 0
+ .org init_top_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC

diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1ecd419811a2..027987638e98 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -17,9 +17,6 @@ config XEN_PV
bool "Xen PV guest support"
default y
depends on XEN
- # XEN_PV is not ready to work with 5-level paging.
- # Changes to hypervisor are also required.
- depends on !X86_5LEVEL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
help
@@ -78,6 +75,4 @@ config XEN_DEBUG_FS
config XEN_PVH
bool "Support for running as a PVH guest"
depends on XEN && XEN_PVHVM && ACPI
- # Pre-built page tables are not ready to handle 5-level paging.
- depends on !X86_5LEVEL
def_bool n
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index bc5fddd64217..55b529c36f16 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -558,6 +558,22 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)

xen_mc_issue(PARAVIRT_LAZY_MMU);
}
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+__visible p4dval_t xen_p4d_val(p4d_t p4d)
+{
+ return pte_mfn_to_pfn(p4d.p4d);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val);
+
+__visible p4d_t xen_make_p4d(p4dval_t p4d)
+{
+ p4d = pte_pfn_to_mfn(p4d);
+
+ return native_make_p4d(p4d);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
#endif /* CONFIG_X86_64 */

static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
@@ -2430,6 +2446,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {

.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+ .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
+ .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
+#endif
#endif /* CONFIG_X86_64 */

.activate_mm = xen_activate_mm,
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:03 UTC
Permalink
This patch changes page table helpers to fold p4d at runtime.
The logic is the same as in <asm-generic/pgtable-nop4d.h>.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
arch/x86/include/asm/paravirt.h | 10 ++++++----
arch/x86/include/asm/pgalloc.h | 5 ++++-
arch/x86/include/asm/pgtable.h | 10 +++++++++-
3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9ccac1926587..69c3cb792f34 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -606,14 +606,16 @@ static inline p4dval_t p4d_val(p4d_t p4d)

static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- pgdval_t val = native_pgd_val(pgd);
-
- PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
+ if (p4d_folded)
+ set_p4d((p4d_t *)(pgdp), (p4d_t) { pgd.pgd });
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
}

static inline void pgd_clear(pgd_t *pgdp)
{
- set_pgd(pgdp, __pgd(0));
+ if (!p4d_folded)
+ set_pgd(pgdp, __pgd(0));
}

#endif /* CONFIG_PGTABLE_LEVELS == 5 */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b2d0cd8288aa..5c42262169d0 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -155,6 +155,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
#if CONFIG_PGTABLE_LEVELS > 4
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
+ if (p4d_folded)
+ return;
paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
}
@@ -179,7 +181,8 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long address)
{
- ___p4d_free_tlb(tlb, p4d);
+ if (!p4d_folded)
+ ___p4d_free_tlb(tlb, p4d);
}

#endif /* CONFIG_PGTABLE_LEVELS > 4 */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index bbeae4a2bd01..5114495e4bfd 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags;

#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd) native_pgd_clear(pgd)
+#define pgd_clear(pgd) (!p4d_folded ? native_pgd_clear(pgd) : 0)
#endif

#ifndef set_p4d
@@ -861,6 +861,8 @@ static inline unsigned long p4d_index(unsigned long address)
#if CONFIG_PGTABLE_LEVELS > 4
static inline int pgd_present(pgd_t pgd)
{
+ if (p4d_folded)
+ return 1;
return pgd_flags(pgd) & _PAGE_PRESENT;
}

@@ -878,16 +880,22 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
/* to find an entry in a page-table-directory. */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
+ if (p4d_folded)
+ return (p4d_t *)pgd;
return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}

static inline int pgd_bad(pgd_t pgd)
{
+ if (p4d_folded)
+ return 0;
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
}

static inline int pgd_none(pgd_t pgd)
{
+ if (p4d_folded)
+ return 0;
/*
* There is no need to do a workaround for the KNL stray
* A/D bit erratum here. PGDs only point to page tables
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:03 UTC
Permalink
Size of mem_section array depends on size of physical address space.

In preparation for boot-time switching between paging modes on x86-64
we need to make allocation of mem_section dynamic.

The patch allocates the array on the first call to
sparse_memory_present_with_active_regions().

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
include/linux/mmzone.h | 2 +-
mm/page_alloc.c | 10 ++++++++++
mm/sparse.c | 3 +--
3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b3f6ce..c8eb668eab79 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1137,7 +1137,7 @@ struct mem_section {
#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)

#ifdef CONFIG_SPARSEMEM_EXTREME
-extern struct mem_section *mem_section[NR_SECTION_ROOTS];
+extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d30e914afb6..639fd2dce0c4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5681,6 +5681,16 @@ void __init sparse_memory_present_with_active_regions(int nid)
unsigned long start_pfn, end_pfn;
int i, this_nid;

+#ifdef CONFIG_SPARSEMEM_EXTREME
+ if (!mem_section) {
+ unsigned long size, align;
+
+ size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_virt_alloc(size, align);
+ }
+#endif
+
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
memory_present(this_nid, start_pfn, end_pfn);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..3a226a8c1968 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -22,8 +22,7 @@
* 1) mem_section - memory sections, mem_map's for valid memory
*/
#ifdef CONFIG_SPARSEMEM_EXTREME
-struct mem_section *mem_section[NR_SECTION_ROOTS]
- ____cacheline_internodealigned_in_smp;
+struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
____cacheline_internodealigned_in_smp;
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:03 UTC
Permalink
This patch prepare decompression code to boot-time switching between 4-
and 5-level paging.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
---
arch/x86/boot/compressed/head_64.S | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fbf4c32d0b62..2e362aea3319 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -347,6 +347,28 @@ preferred_addr:
leaq boot_stack_end(%rbx), %rsp

#ifdef CONFIG_X86_5LEVEL
+ /* Preserve rbx across cpuid */
+ movq %rbx, %r8
+
+ /* Check if leaf 7 is supported */
+ movl $0, %eax
+ cpuid
+ cmpl $7, %eax
+ jb lvl5
+
+ /*
+ * Check if la57 is supported.
+ * The feature is enumerated with CPUID.(EAX=07H, ECX=0):ECX[bit 16]
+ */
+ movl $7, %eax
+ movl $0, %ecx
+ cpuid
+ andl $(1 << 16), %ecx
+ jz lvl5
+
+ /* Restore rbx */
+ movq %r8, %rbx
+
/* Check if 5-level paging has already enabled */
movq %cr4, %rax
testl $X86_CR4_LA57, %eax
@@ -386,6 +408,8 @@ preferred_addr:
pushq %rax
lretq
lvl5:
+ /* Restore rbx */
+ movq %r8, %rbx
#endif

/* Zero EFLAGS */
--
2.13.2
Kirill A. Shutemov
2017-08-07 14:20:04 UTC
Permalink
It was decided 5-level paging is not going to be supported in XEN_PV.

Let's drop the dead code from XEN_PV code.

Signed-off-by: Kirill A. Shutemov <***@linux.intel.com>
Cc: Juergen Gross <***@suse.com>
---
arch/x86/xen/mmu_pv.c | 159 +++++++++++++++++++-------------------------------
1 file changed, 60 insertions(+), 99 deletions(-)

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index e437714750f8..bc5fddd64217 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -469,7 +469,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

-#if CONFIG_PGTABLE_LEVELS == 4
+#ifdef CONFIG_X86_64
__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
@@ -558,7 +558,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)

xen_mc_issue(PARAVIRT_LAZY_MMU);
}
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_X86_64 */

static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
@@ -600,21 +600,17 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
bool last, unsigned long limit)
{
- int i, nr, flush = 0;
+ int flush = 0;
+ pud_t *pud;

- nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
- for (i = 0; i < nr; i++) {
- pud_t *pud;

- if (p4d_none(p4d[i]))
- continue;
+ if (p4d_none(*p4d))
+ return flush;

- pud = pud_offset(&p4d[i], 0);
- if (PTRS_PER_PUD > 1)
- flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
- flush |= xen_pud_walk(mm, pud, func,
- last && i == nr - 1, limit);
- }
+ pud = pud_offset(p4d, 0);
+ if (PTRS_PER_PUD > 1)
+ flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
+ flush |= xen_pud_walk(mm, pud, func, last, limit);
return flush;
}

@@ -664,8 +660,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
continue;

p4d = p4d_offset(&pgd[i], 0);
- if (PTRS_PER_P4D > 1)
- flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
}

@@ -1196,22 +1190,14 @@ static void __init xen_cleanmfnmap(unsigned long vaddr)
{
pgd_t *pgd;
p4d_t *p4d;
- unsigned int i;
bool unpin;

unpin = (vaddr == 2 * PGDIR_SIZE);
vaddr &= PMD_MASK;
pgd = pgd_offset_k(vaddr);
p4d = p4d_offset(pgd, 0);
- for (i = 0; i < PTRS_PER_P4D; i++) {
- if (p4d_none(p4d[i]))
- continue;
- xen_cleanmfnmap_p4d(p4d + i, unpin);
- }
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- set_pgd(pgd, __pgd(0));
- xen_cleanmfnmap_free_pgtbl(p4d, unpin);
- }
+ if (!p4d_none(*p4d))
+ xen_cleanmfnmap_p4d(p4d, unpin);
}

static void __init xen_pagetable_p2m_free(void)
@@ -1717,7 +1703,7 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}

-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2054,13 +2040,12 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
*/
void __init xen_relocate_p2m(void)
{
- phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
- int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
+ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
pte_t *pt;
pmd_t *pmd;
pud_t *pud;
- p4d_t *p4d = NULL;
pgd_t *pgd;
unsigned long *new_p2m;
int save_pud;
@@ -2070,11 +2055,7 @@ void __init xen_relocate_p2m(void)
n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
- if (PTRS_PER_P4D > 1)
- n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
- else
- n_p4d = 0;
- n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
+ n_frames = n_pte + n_pt + n_pmd + n_pud;

new_area = xen_find_free_area(PFN_PHYS(n_frames));
if (!new_area) {
@@ -2090,76 +2071,56 @@ void __init xen_relocate_p2m(void)
* To avoid any possible virtual address collision, just use
* 2 * PUD_SIZE for the new area.
*/
- p4d_phys = new_area;
- pud_phys = p4d_phys + PFN_PHYS(n_p4d);
+ pud_phys = new_area;
pmd_phys = pud_phys + PFN_PHYS(n_pud);
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;

pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
- idx_p4d = 0;
save_pud = n_pud;
- do {
- if (n_p4d > 0) {
- p4d = early_memremap(p4d_phys, PAGE_SIZE);
- clear_page(p4d);
- n_pud = min(save_pud, PTRS_PER_P4D);
- }
- for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
- pud = early_memremap(pud_phys, PAGE_SIZE);
- clear_page(pud);
- for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
- idx_pmd++) {
- pmd = early_memremap(pmd_phys, PAGE_SIZE);
- clear_page(pmd);
- for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
- idx_pt++) {
- pt = early_memremap(pt_phys, PAGE_SIZE);
- clear_page(pt);
- for (idx_pte = 0;
- idx_pte < min(n_pte, PTRS_PER_PTE);
- idx_pte++) {
- set_pte(pt + idx_pte,
- pfn_pte(p2m_pfn, PAGE_KERNEL));
- p2m_pfn++;
- }
- n_pte -= PTRS_PER_PTE;
- early_memunmap(pt, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pt_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
- PFN_DOWN(pt_phys));
- set_pmd(pmd + idx_pt,
- __pmd(_PAGE_TABLE | pt_phys));
- pt_phys += PAGE_SIZE;
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ set_pte(pt + idx_pte,
+ pfn_pte(p2m_pfn, PAGE_KERNEL));
+ p2m_pfn++;
}
- n_pt -= PTRS_PER_PMD;
- early_memunmap(pmd, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pmd_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
- PFN_DOWN(pmd_phys));
- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
- pmd_phys += PAGE_SIZE;
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ set_pmd(pmd + idx_pt,
+ __pmd(_PAGE_TABLE | pt_phys));
+ pt_phys += PAGE_SIZE;
}
- n_pmd -= PTRS_PER_PUD;
- early_memunmap(pud, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pud_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
- if (n_p4d > 0)
- set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
- else
- set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
- pud_phys += PAGE_SIZE;
- }
- if (n_p4d > 0) {
- save_pud -= PTRS_PER_P4D;
- early_memunmap(p4d, PAGE_SIZE);
- make_lowmem_page_readonly(__va(p4d_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
- set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
- p4d_phys += PAGE_SIZE;
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pmd_phys += PAGE_SIZE;
}
- } while (++idx_p4d < n_p4d);
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
+ }

/* Now copy the old p2m info to the new area. */
memcpy(new_p2m, xen_p2m_addr, size);
@@ -2386,7 +2347,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
pv_mmu_ops.set_p4d = xen_set_p4d;
#endif

@@ -2396,7 +2357,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
@@ -2462,14 +2423,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_p4d = xen_set_p4d_hyper,

.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_X86_64 */

.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
--
2.13.2
Loading...