Discussion:
mm: BUG in unmap_page_range
Sasha Levin
2014-08-02 22:10:01 UTC
Hi all,

While fuzzing with trinity inside a KVM tools guest running the latest -next
kernel, I've stumbled on the following spew:

[ 2957.087977] BUG: unable to handle kernel paging request at ffffea0003480008
[ 2957.088008] IP: unmap_page_range (mm/memory.c:1132 mm/memory.c:1256 mm/memory.c:1277 mm/memory.c:1301)
[ 2957.088024] PGD 7fffc6067 PUD 7fffc5067 PMD 0
[ 2957.088041] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 2957.088087] Dumping ftrace buffer:
[ 2957.088266] (ftrace buffer empty)
[ 2957.088279] Modules linked in:
[ 2957.088293] CPU: 2 PID: 15417 Comm: trinity-c200 Not tainted 3.16.0-rc7-next-20140801-sasha-00047-gd6ce559 #990
[ 2957.088301] task: ffff8807a8c50000 ti: ffff880739fb4000 task.ti: ffff880739fb4000
[ 2957.088320] RIP: unmap_page_range (mm/memory.c:1132 mm/memory.c:1256 mm/memory.c:1277 mm/memory.c:1301)
[ 2957.088328] RSP: 0018:ffff880739fb7c58 EFLAGS: 00010246
[ 2957.088336] RAX: 0000000000000000 RBX: ffff880eb2bdbed8 RCX: dfff971b42800000
[ 2957.088343] RDX: 1ffff100e73f6fc4 RSI: 00007f00e85db000 RDI: ffffea0003480008
[ 2957.088350] RBP: ffff880739fb7d58 R08: 0000000000000001 R09: 0000000000b6e000
[ 2957.088357] R10: 0000000000000000 R11: 0000000000000001 R12: ffffea0003480000
[ 2957.088365] R13: 00000000d2000700 R14: 00007f00e85dc000 R15: 00007f00e85db000
[ 2957.088374] FS: 00007f00e85d8700(0000) GS:ffff88177fa00000(0000) knlGS:0000000000000000
[ 2957.088381] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2957.088387] CR2: ffffea0003480008 CR3: 00000007a802a000 CR4: 00000000000006a0
[ 2957.088406] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2957.088413] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 2957.088416] Stack:
[ 2957.088432] ffff88171726d570 0000000000000010 0000000000000008 00000000d2000730
[ 2957.088450] 0000000019d00250 00007f00e85dc000 ffff880f9d311900 ffff880739fb7e20
[ 2957.088466] ffff8807a8c507a0 ffff8807a8c50000 ffff8807a75fe000 ffff8807ceaa7a10
[ 2957.088469] Call Trace:
[ 2957.088490] unmap_single_vma (mm/memory.c:1348)
[ 2957.088505] unmap_vmas (mm/memory.c:1375 (discriminator 3))
[ 2957.088520] unmap_region (mm/mmap.c:2386 (discriminator 4))
[ 2957.088542] ? vma_rb_erase (mm/mmap.c:454 include/linux/rbtree_augmented.h:219 include/linux/rbtree_augmented.h:227 mm/mmap.c:493)
[ 2957.088559] ? vmacache_update (mm/vmacache.c:61)
[ 2957.088572] do_munmap (mm/mmap.c:2581)
[ 2957.088583] vm_munmap (mm/mmap.c:2596)
[ 2957.088595] SyS_munmap (mm/mmap.c:2601)
[ 2957.088616] tracesys (arch/x86/kernel/entry_64.S:541)
[ 2957.088770] Code: ff ff e8 f9 5f 07 00 48 8b 45 90 80 48 18 01 4d 85 e4 0f 84 8b fe ff ff 45 84 ed 0f 85 fc 03 00 00 49 8d 7c 24 08 e8 b5 67 07 00 <41> f6 44 24 08 01 0f 84 29 02 00 00 83 6d c8 01 4c 89 e7 e8 bd
All code
========
0: ff (bad)
1: ff e8 ljmpq *<internal disassembler error>
3: f9 stc
4: 5f pop %rdi
5: 07 (bad)
6: 00 48 8b add %cl,-0x75(%rax)
9: 45 90 rex.RB xchg %eax,%r8d
b: 80 48 18 01 orb $0x1,0x18(%rax)
f: 4d 85 e4 test %r12,%r12
12: 0f 84 8b fe ff ff je 0xfffffffffffffea3
18: 45 84 ed test %r13b,%r13b
1b: 0f 85 fc 03 00 00 jne 0x41d
21: 49 8d 7c 24 08 lea 0x8(%r12),%rdi
26: e8 b5 67 07 00 callq 0x767e0
2b:* 41 f6 44 24 08 01 testb $0x1,0x8(%r12) <-- trapping instruction
31: 0f 84 29 02 00 00 je 0x260
37: 83 6d c8 01 subl $0x1,-0x38(%rbp)
3b: 4c 89 e7 mov %r12,%rdi
3e: e8 .byte 0xe8
3f: bd .byte 0xbd
...

Code starting with the faulting instruction
===========================================
0: 41 f6 44 24 08 01 testb $0x1,0x8(%r12)
6: 0f 84 29 02 00 00 je 0x235
c: 83 6d c8 01 subl $0x1,-0x38(%rbp)
10: 4c 89 e7 mov %r12,%rdi
13: e8 .byte 0xe8
14: bd .byte 0xbd
...
[ 2957.088784] RIP unmap_page_range (mm/memory.c:1132 mm/memory.c:1256 mm/memory.c:1277 mm/memory.c:1301)
[ 2957.088789] RSP <ffff880739fb7c58>
[ 2957.088794] CR2: ffffea0003480008


Thanks,
Sasha
Hugh Dickins
2014-08-04 11:50:02 UTC
Post by Sasha Levin
Hi all,
While fuzzing with trinity inside a KVM tools guest running the latest -next
[ 2957.087977] BUG: unable to handle kernel paging request at ffffea0003480008
[ 2957.088008] IP: unmap_page_range (mm/memory.c:1132 mm/memory.c:1256 mm/memory.c:1277 mm/memory.c:1301)
[ 2957.088024] PGD 7fffc6067 PUD 7fffc5067 PMD 0
[ 2957.088041] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 2957.088266] (ftrace buffer empty)
[ 2957.088293] CPU: 2 PID: 15417 Comm: trinity-c200 Not tainted 3.16.0-rc7-next-20140801-sasha-00047-gd6ce559 #990
[ 2957.088301] task: ffff8807a8c50000 ti: ffff880739fb4000 task.ti: ffff880739fb4000
[ 2957.088320] RIP: unmap_page_range (mm/memory.c:1132 mm/memory.c:1256 mm/memory.c:1277 mm/memory.c:1301)
[ 2957.088328] RSP: 0018:ffff880739fb7c58 EFLAGS: 00010246
[ 2957.088336] RAX: 0000000000000000 RBX: ffff880eb2bdbed8 RCX: dfff971b42800000
[ 2957.088343] RDX: 1ffff100e73f6fc4 RSI: 00007f00e85db000 RDI: ffffea0003480008
[ 2957.088350] RBP: ffff880739fb7d58 R08: 0000000000000001 R09: 0000000000b6e000
[ 2957.088357] R10: 0000000000000000 R11: 0000000000000001 R12: ffffea0003480000
[ 2957.088365] R13: 00000000d2000700 R14: 00007f00e85dc000 R15: 00007f00e85db000
[ 2957.088374] FS: 00007f00e85d8700(0000) GS:ffff88177fa00000(0000) knlGS:0000000000000000
[ 2957.088381] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2957.088387] CR2: ffffea0003480008 CR3: 00000007a802a000 CR4: 00000000000006a0
[ 2957.088406] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2957.088413] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 2957.088432] ffff88171726d570 0000000000000010 0000000000000008 00000000d2000730
[ 2957.088450] 0000000019d00250 00007f00e85dc000 ffff880f9d311900 ffff880739fb7e20
[ 2957.088466] ffff8807a8c507a0 ffff8807a8c50000 ffff8807a75fe000 ffff8807ceaa7a10
[ 2957.088490] unmap_single_vma (mm/memory.c:1348)
[ 2957.088505] unmap_vmas (mm/memory.c:1375 (discriminator 3))
[ 2957.088520] unmap_region (mm/mmap.c:2386 (discriminator 4))
[ 2957.088542] ? vma_rb_erase (mm/mmap.c:454 include/linux/rbtree_augmented.h:219 include/linux/rbtree_augmented.h:227 mm/mmap.c:493)
[ 2957.088559] ? vmacache_update (mm/vmacache.c:61)
[ 2957.088572] do_munmap (mm/mmap.c:2581)
[ 2957.088583] vm_munmap (mm/mmap.c:2596)
[ 2957.088595] SyS_munmap (mm/mmap.c:2601)
[ 2957.088616] tracesys (arch/x86/kernel/entry_64.S:541)
This differs in which functions got inlined (unmap_page_range showing up
in place of zap_pte_range), but this is the same "if (PageAnon(page))"
that Sasha reported in the "hang in shmem_fallocate" thread on June 26th.
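
(For reference, the trapping "testb $0x1,0x8(%r12)" seen above is the
compiled form of that PageAnon() test. A minimal sketch of the check,
assuming the 3.16 x86_64 struct page layout with ->mapping at offset 8:)

/*
 * PAGE_MAPPING_ANON is bit 0 of page->mapping; %r12 held ffffea0003480000,
 * the struct page for pfn d2000, which was never backed by memory, so the
 * load of ->mapping (offset 8) faults before the bit can even be tested.
 */
#define PAGE_MAPPING_ANON	1

static inline int sketch_PageAnon(struct page *page)
{
	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}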

I can see what it is now, and here is most of a patch (which I don't
expect to satisfy Trinity yet); at this point I think I had better
hand it over to Mel, to complete or to discard.

[INCOMPLETE PATCH] x86,mm: fix pte_special versus pte_numa

Sasha Levin has shown oopses on ffffea0003480048 and ffffea0003480008
at mm/memory.c:1132, running Trinity on different 3.16-rc-next kernels:
where zap_pte_range() checks page->mapping to see if PageAnon(page).

Those addresses fit struct pages for pfns d2001 and d2000, and in each
dump a register or a stack slot showed d2001730 or d2000730: pte flags
0x730 are PCD ACCESSED PROTNONE SPECIAL IOMAP; and Sasha's e820 map has
a hole between cfffffff and 100000000, which would need special access.
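
(Those numbers can be checked standalone, assuming the x86_64 vmemmap base
0xffffea0000000000 and a 64-byte struct page:)

#include <stdio.h>

int main(void)
{
	unsigned long vmemmap = 0xffffea0000000000UL;
	unsigned long fault   = 0xffffea0003480008UL;	/* CR2 from the oops */

	/* 64 bytes per struct page: quotient is the pfn, remainder the field */
	printf("pfn %#lx, offset %#lx into struct page\n",
	       (fault - vmemmap) / 64, (fault - vmemmap) % 64);
	/* prints "pfn 0xd2000, offset 0x8" -- offset 8 is page->mapping;
	 * and (0xd2000 << 12) | 0x730 gives the d2000730 seen in the dump */
	return 0;
}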

Commit c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on
the PMD and PTE levels") has broken vm_normal_page(): a PROTNONE SPECIAL
pte no longer passes the pte_special() test, so zap_pte_range() goes on
to try to access a non-existent struct page.

Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE)
to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE).

It's unclear why c46a7c817e66 added pte_numa() test to vm_normal_page(),
and moved its is_zero_pfn() test from slow to fast path: I suspect both
were papering over PROT_NONE issues seen with inadequate pte_special().
Revert vm_normal_page() to how it was before, relying on pte_special().
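
(Spelled out against the x86 flag bits, the states the refined helpers have
to separate, per the patch below, are sketched by:)

/*
 *   _PAGE_PRESENT | _PAGE_SPECIAL    pte_special()  live special mapping
 *   _PAGE_PROTNONE | _PAGE_SPECIAL   pte_special()  special under PROT_NONE
 *   _PAGE_SPECIAL alone              pte_numa()     NUMA hinting entry
 *
 * The old test insisted on _PAGE_PRESENT, so the middle case fell through
 * and zap_pte_range() took it for a normal page with a valid struct page.
 */
static inline int refined_pte_special(pteval_t flags)
{
	return (flags & _PAGE_SPECIAL) &&
	       (flags & (_PAGE_PRESENT | _PAGE_PROTNONE));
}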

I find it confusing, that the only example of ARCH_USES_NUMA_PROT_NONE
no longer uses PROTNONE for NUMA, but SPECIAL instead: update the
asm-generic comment a little, but that config option remains unhelpful.

But more seriously, I think this patch is incomplete: aren't there
other places which need to be handling PROTNONE along with PRESENT?
For example, pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA,
but on a PROT_NONE area, I think that will now make it pte_special()?
So it ought to clear _PAGE_PROTNONE too. Or maybe we can never
pte_mknuma() on a PROT_NONE area - there would be no point?
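
(Worked through on the x86 bits, that concern looks like this for a
hypothetical pte from a PROT_NONE area:)

/*
 * pte_mknuma() on a PROT_NONE pte, flag by flag:
 *
 *   start:   _PAGE_PROTNONE | _PAGE_ACCESSED      (PRESENT already clear)
 *   val &= ~_PAGE_PRESENT;                        (no effect here)
 *   val |= _PAGE_NUMA;                            (== _PAGE_SPECIAL on x86)
 *   result:  _PAGE_PROTNONE | _PAGE_SPECIAL | _PAGE_ACCESSED
 *
 * which the refined pte_special() above reports as special, not numa --
 * unless _PAGE_PROTNONE is cleared too, or such ptes never reach here.
 */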

Around here I began to wonder if it was just a mistake to have deserted
the PROTNONE for NUMA model: I know Linus had a strong reaction against
it, and I've never delved into its drawbacks myself; but bringing yet
another (SPECIAL) flag into the game is not an obvious improvement.
Should we just revert c46a7c817e66, or would that be a mistake?

Let me hand this over to Mel now...

Partially-Fixes: c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels")
Reported-by: Sasha Levin <***@oracle.com>
Not-yet-Signed-off-by: Hugh Dickins <***@google.com>
Cc: ***@vger.kernel.org [3.16]
---

arch/x86/include/asm/pgtable.h | 9 +++++++--
include/asm-generic/pgtable.h | 6 +++---
mm/memory.c | 7 +++----
3 files changed, 13 insertions(+), 9 deletions(-)

--- v3.16/arch/x86/include/asm/pgtable.h 2014-08-03 15:25:02.000000000 -0700
+++ linux/arch/x86/include/asm/pgtable.h 2014-08-03 17:36:02.364552987 -0700
@@ -131,8 +131,13 @@ static inline int pte_exec(pte_t pte)

static inline int pte_special(pte_t pte)
{
- return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
- (_PAGE_PRESENT|_PAGE_SPECIAL);
+ /*
+ * See CONFIG_NUMA_BALANCING CONFIG_ARCH_USES_NUMA_PROT_NONE pte_numa()
+ * in include/asm-generic/pgtable.h: on x86 we have _PAGE_BIT_NUMA ==
+ * _PAGE_BIT_GLOBAL+1 == _PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
+ */
+ return (pte_flags(pte) & _PAGE_SPECIAL) &&
+ (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
}

static inline unsigned long pte_pfn(pte_t pte)
--- v3.16/include/asm-generic/pgtable.h 2014-08-03 15:25:02.000000000 -0700
+++ linux/include/asm-generic/pgtable.h 2014-08-03 17:36:02.364552987 -0700
@@ -662,9 +662,9 @@ static inline int pmd_trans_unstable(pmd
#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
- * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
- * same bit too). It's set only when _PAGE_PRESET is not set and it's
- * never set if _PAGE_PRESENT is set.
+ * _PAGE_NUMA works identically to _PAGE_PROTNONE.
+ * It is set only when neither _PAGE_PRESENT nor _PAGE_PROTNONE is set.
+ * This allows it to share a bit set only when present e.g. _PAGE_SPECIAL.
*
* pte/pmd_present() returns true if pte/pmd_numa returns true. Page
* fault triggers on those regions if pte/pmd_numa returns true
--- v3.16/mm/memory.c 2014-08-03 15:25:02.000000000 -0700
+++ linux/mm/memory.c 2014-08-03 17:36:02.368552987 -0700
@@ -751,7 +751,7 @@ struct page *vm_normal_page(struct vm_ar
unsigned long pfn = pte_pfn(pte);

if (HAVE_PTE_SPECIAL) {
- if (likely(!pte_special(pte) || pte_numa(pte)))
+ if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
@@ -777,15 +777,14 @@ struct page *vm_normal_page(struct vm_ar
}
}

+ if (is_zero_pfn(pfn))
+ return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}

- if (is_zero_pfn(pfn))
- return NULL;
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
Mel Gorman
2014-08-05 14:50:01 UTC
Post by Hugh Dickins
This differs in which functions got inlined (unmap_page_range showing up
in place of zap_pte_range), but this is the same "if (PageAnon(page))"
that Sasha reported in the "hang in shmem_fallocate" thread on June 26th.
I can see what it is now, and here is most of a patch (which I don't
expect to satisfy Trinity yet); at this point I think I had better
hand it over to Mel, to complete or to discard.
[INCOMPLETE PATCH] x86,mm: fix pte_special versus pte_numa
Sasha Levin has shown oopses on ffffea0003480048 and ffffea0003480008
where zap_pte_range() checks page->mapping to see if PageAnon(page).
Those addresses fit struct pages for pfns d2001 and d2000, and in each
dump a register or a stack slot showed d2001730 or d2000730: pte flags
0x730 are PCD ACCESSED PROTNONE SPECIAL IOMAP; and Sasha's e820 map has
a hole between cfffffff and 100000000, which would need special access.
Commit c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on
the PMD and PTE levels") has broken vm_normal_page(): a PROTNONE SPECIAL
pte no longer passes the pte_special() test, so zap_pte_range() goes on
to try to access a non-existent struct page.
:(
Post by Hugh Dickins
Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE)
to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE).
It's unclear why c46a7c817e66 added pte_numa() test to vm_normal_page(),
and moved its is_zero_pfn() test from slow to fast path: I suspect both
were papering over PROT_NONE issues seen with inadequate pte_special().
Revert vm_normal_page() to how it was before, relying on pte_special().
Rather than answering directly I updated your changelog

Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE)
to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE).

A hint that this was a problem was that c46a7c817e66 added pte_numa()
test to vm_normal_page(), and moved its is_zero_pfn() test from slow to
fast path: This was papering over a pte_special() snag when the zero
page was encountered during zap. This patch reverts vm_normal_page()
to how it was before, relying on pte_special().
Post by Hugh Dickins
I find it confusing, that the only example of ARCH_USES_NUMA_PROT_NONE
no longer uses PROTNONE for NUMA, but SPECIAL instead: update the
asm-generic comment a little, but that config option remains unhelpful.
ARCH_USES_NUMA_PROT_NONE should have been sent to the farm at the same time
as that patch and by rights unified with the powerpc helpers. With the new
_PAGE_NUMA bit, there is no reason they should have different implementations
of pte_numa and related functions. Unfortunately unifying them is a little
problematic due to differences in fundamental types. It could be done with
#defines but I'm attaching a preliminary prototype to illustrate the issue.
Post by Hugh Dickins
But more seriously, I think this patch is incomplete: aren't there
other places which need to be handling PROTNONE along with PRESENT?
For example, pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA,
but on a PROT_NONE area, I think that will now make it pte_special()?
So it ought to clear _PAGE_PROTNONE too. Or maybe we can never
pte_mknuma() on a PROT_NONE area - there would be no point?
We are depending on the fact that inaccessible VMAs are skipped by the
NUMA hinting scanner.
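
(That skip is a vm_flags test in task_numa_work(); a standalone sketch of
the predicate, with flag values as in include/linux/mm.h:)

#include <stdbool.h>

#define VM_READ		0x00000001UL
#define VM_WRITE	0x00000002UL
#define VM_EXEC		0x00000004UL

/* True when the scanner leaves a VMA alone: no access rights at all means
 * PROT_NONE, so pte_mknuma() should never see ptes from such an area. */
static bool numa_scanner_skips(unsigned long vm_flags)
{
	return !(vm_flags & (VM_READ | VM_WRITE | VM_EXEC));
}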
Post by Hugh Dickins
Around here I began to wonder if it was just a mistake to have deserted
the PROTNONE for NUMA model: I know Linus had a strong reaction against
it, and I've never delved into its drawbacks myself; but bringing yet
another (SPECIAL) flag into the game is not an obvious improvement.
Should we just revert c46a7c817e66, or would that be a mistake?
It's replacing one type of complexity with another. The downside is that
_PAGE_NUMA == _PAGE_PROTNONE puts subtle traps all over the core for
powerpc to fall foul of.

I'm attaching a preliminary pair of patches. The first which deals with
ARCH_USES_NUMA_PROT_NONE and the second which is yours with a revised
changelog. I'm adding Aneesh to the cc to look at the powerpc portion of
the first patch.
--
Mel Gorman
SUSE Labs
Hugh Dickins
2014-08-06 00:50:01 UTC
Post by Mel Gorman
Post by Hugh Dickins
[INCOMPLETE PATCH] x86,mm: fix pte_special versus pte_numa
Sasha Levin has shown oopses on ffffea0003480048 and ffffea0003480008
where zap_pte_range() checks page->mapping to see if PageAnon(page).
Those addresses fit struct pages for pfns d2001 and d2000, and in each
dump a register or a stack slot showed d2001730 or d2000730: pte flags
0x730 are PCD ACCESSED PROTNONE SPECIAL IOMAP; and Sasha's e820 map has
a hole between cfffffff and 100000000, which would need special access.
Commit c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on
the PMD and PTE levels") has broken vm_normal_page(): a PROTNONE SPECIAL
pte no longer passes the pte_special() test, so zap_pte_range() goes on
to try to access a non-existent struct page.
:(
Post by Hugh Dickins
Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE)
to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE).
It's unclear why c46a7c817e66 added pte_numa() test to vm_normal_page(),
and moved its is_zero_pfn() test from slow to fast path: I suspect both
were papering over PROT_NONE issues seen with inadequate pte_special().
Revert vm_normal_page() to how it was before, relying on pte_special().
Rather than answering directly I updated your changelog
Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE)
to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE).
A hint that this was a problem was that c46a7c817e66 added pte_numa()
test to vm_normal_page(), and moved its is_zero_pfn() test from slow to
fast path: This was papering over a pte_special() snag when the zero
page was encountered during zap. This patch reverts vm_normal_page()
to how it was before, relying on pte_special().
Thanks, that's fine.
Post by Mel Gorman
Post by Hugh Dickins
I find it confusing, that the only example of ARCH_USES_NUMA_PROT_NONE
no longer uses PROTNONE for NUMA, but SPECIAL instead: update the
asm-generic comment a little, but that config option remains unhelpful.
ARCH_USES_NUMA_PROT_NONE should have been sent to the farm at the same time
as that patch and by rights unified with the powerpc helpers. With the new
_PAGE_NUMA bit, there is no reason they should have different implementations
of pte_numa and related functions. Unfortunately unifying them is a little
problematic due to differences in fundamental types. It could be done with
#defines but I'm attaching a preliminary prototype to illustrate the issue.
Post by Hugh Dickins
But more seriously, I think this patch is incomplete: aren't there
other places which need to be handling PROTNONE along with PRESENT?
For example, pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA,
but on a PROT_NONE area, I think that will now make it pte_special()?
So it ought to clear _PAGE_PROTNONE too. Or maybe we can never
pte_mknuma() on a PROT_NONE area - there would be no point?
We are depending on the fact that inaccessible VMAs are skipped by the
NUMA hinting scanner.
Ah, okay. And the other way round (mprotecting to PROT_NONE an area
which already contains _PAGE_NUMA ptes) already looked safe to me.
Post by Mel Gorman
Post by Hugh Dickins
Around here I began to wonder if it was just a mistake to have deserted
the PROTNONE for NUMA model: I know Linus had a strong reaction against
it, and I've never delved into its drawbacks myself; but bringing yet
another (SPECIAL) flag into the game is not an obvious improvement.
Should we just revert c46a7c817e66, or would that be a mistake?
It's replacing one type of complexity with another. The downside is that
_PAGE_NUMA == _PAGE_PROTNONE puts subtle traps all over the core for
powerpc to fall foul of.
Okay.
Post by Mel Gorman
I'm attaching a preliminary pair of patches. The first which deals with
ARCH_USES_NUMA_PROT_NONE and the second which is yours with a revised
changelog. I'm adding Aneesh to the cc to look at the powerpc portion of
the first patch.
Thanks a lot, Mel.

I am surprised by the ordering, but perhaps you meant nothing by it.
Isn't the first one a welcome but optional cleanup, and the second one
a fix that we need in 3.16-stable? Or does the fix actually depend in
some unstated way upon the cleanup, in powerpc-land perhaps?

Aside from that, for the first patch: yes, I heartily approve of the
disappearance of CONFIG_ARCH_WANTS_PROT_NUMA_PROT_NONE and
CONFIG_ARCH_USES_NUMA_PROT_NONE. If you wish, add
Acked-by: Hugh Dickins <***@google.com>
but of course it's really Aneesh and powerpc who are the test of it.

One thing I did wonder, though: at first I was reassured by the
VM_BUG_ON(!pte_present(pte)) you add to pte_mknuma(); but then thought
it would be better as VM_BUG_ON(!(val & _PAGE_PRESENT)), being stronger
- asserting that indeed we do not put NUMA hints on PROT_NONE areas.
(But I have not tested, perhaps such a VM_BUG_ON would actually fire.)
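
(The two assertions differ because 3.16's x86 pte_present() deliberately
accepts more than the raw bit; roughly:)

/*
 * x86 pte_present() is effectively
 *	pte_flags(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA)
 * so for a PROT_NONE pte (PROTNONE set, PRESENT clear):
 */
VM_BUG_ON(!pte_present(pte));		/* passes: PROTNONE satisfies it */
VM_BUG_ON(!(val & _PAGE_PRESENT));	/* fires: the raw bit is clear */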
Post by Mel Gorman
It still appears that this patch may be incomplete: aren't there other
places which need to be handling PROTNONE along with PRESENT? For example,
pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA, but on a PROT_NONE
area, that would make it it pte_special(). This is side-stepped by the fact
s/it it/it/
Post by Mel Gorman
that NUMA hinting faults skiped PROT_NONE VMAs and there are no grounds
s/skiped/skip/
Post by Mel Gorman
where a NUMA hinting fault on a PROT_NONE VMA would be interesting.
Partially-Fixes: c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels")
s/Partially-//
s/Not-yet-//
Ditto I must leave to you!
Post by Mel Gorman
---
arch/x86/include/asm/pgtable.h | 9 +++++++--
mm/memory.c | 7 +++----
2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0ec0560..230b811 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,8 +131,13 @@ static inline int pte_exec(pte_t pte)
static inline int pte_special(pte_t pte)
{
- return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
- (_PAGE_PRESENT|_PAGE_SPECIAL);
+ /*
+ * See CONFIG_NUMA_BALANCING CONFIG_ARCH_USES_NUMA_PROT_NONE pte_numa()
s/CONFIG_ARCH_USES_NUMA_PROT_NONE //
even if you do end up reordering this patch before the other.

Thanks!
Hugh
Sasha Levin
2014-08-06 01:10:01 UTC
Thanks Hugh, Mel. I've added both patches to my local tree and will update tomorrow
with the weather.
Post by Hugh Dickins
One thing I did wonder, though: at first I was reassured by the
VM_BUG_ON(!pte_present(pte)) you add to pte_mknuma(); but then thought
it would be better as VM_BUG_ON(!(val & _PAGE_PRESENT)), being stronger
- asserting that indeed we do not put NUMA hints on PROT_NONE areas.
(But I have not tested, perhaps such a VM_BUG_ON would actually fire.)
I've added VM_BUG_ON(!(val & _PAGE_PRESENT)) just as a curiosity; I'll
report back on how that one looks as well.


Thanks,
Sasha
Sasha Levin
2014-08-12 03:30:01 UTC
Post by Sasha Levin
Thanks Hugh, Mel. I've added both patches to my local tree and will update tomorrow
with the weather.
Post by Hugh Dickins
One thing I did wonder, though: at first I was reassured by the
VM_BUG_ON(!pte_present(pte)) you add to pte_mknuma(); but then thought
it would be better as VM_BUG_ON(!(val & _PAGE_PRESENT)), being stronger
- asserting that indeed we do not put NUMA hints on PROT_NONE areas.
(But I have not tested, perhaps such a VM_BUG_ON would actually fire.)
I've added VM_BUG_ON(!(val & _PAGE_PRESENT)) just as a curiosity; I'll
report back on how that one looks as well.
Sorry for the rather long delay.

The patch looks fine, the issue didn't reproduce.

The added VM_BUG_ON didn't trigger either, so maybe we should consider adding
it in.


Thanks,
Sasha

Mel Gorman
2014-08-12 10:50:01 UTC
Sasha Levin has shown oopses on ffffea0003480048 and ffffea0003480008
at mm/memory.c:1132, running Trinity on different 3.16-rc-next kernels:
where zap_pte_range() checks page->mapping to see if PageAnon(page).

Those addresses fit struct pages for pfns d2001 and d2000, and in each
dump a register or a stack slot showed d2001730 or d2000730: pte flags
0x730 are PCD ACCESSED PROTNONE SPECIAL IOMAP; and Sasha's e820 map has
a hole between cfffffff and 100000000, which would need special access.

Commit c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on
the PMD and PTE levels") has broken vm_normal_page(): a PROTNONE SPECIAL
pte no longer passes the pte_special() test, so zap_pte_range() goes on
to try to access a non-existent struct page.

Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE)
to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE).
A hint that this was a problem was that c46a7c817e66 added pte_numa()
test to vm_normal_page(), and moved its is_zero_pfn() test from slow to
fast path: This was papering over a pte_special() snag when the zero page
was encountered during zap. This patch reverts vm_normal_page() to how it
was before, relying on pte_special().

It still appears that this patch may be incomplete: aren't there other
places which need to be handling PROTNONE along with PRESENT? For example,
pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA, but on a PROT_NONE
area, that would make it pte_special(). This is side-stepped by the fact
that NUMA hinting faults skipped PROT_NONE VMAs and there are no grounds
where a NUMA hinting fault on a PROT_NONE VMA would be interesting.

Fixes: c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels")
Reported-and-tested-by: Sasha Levin <***@oracle.com>
Signed-off-by: Hugh Dickins <***@google.com>
Signed-off-by: Mel Gorman <***@suse.de>
Cc: ***@vger.kernel.org [3.16]
---
arch/x86/include/asm/pgtable.h | 9 +++++++--
mm/memory.c | 7 +++----
2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0ec0560..aa97a07 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,8 +131,13 @@ static inline int pte_exec(pte_t pte)

static inline int pte_special(pte_t pte)
{
- return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
- (_PAGE_PRESENT|_PAGE_SPECIAL);
+ /*
+ * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h.
+ * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 ==
+ * _PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
+ */
+ return (pte_flags(pte) & _PAGE_SPECIAL) &&
+ (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
}

static inline unsigned long pte_pfn(pte_t pte)
diff --git a/mm/memory.c b/mm/memory.c
index 8b44f76..0a21f3d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -751,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pte_pfn(pte);

if (HAVE_PTE_SPECIAL) {
- if (likely(!pte_special(pte) || pte_numa(pte)))
+ if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
@@ -777,15 +777,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
}
}

+ if (is_zero_pfn(pfn))
+ return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}

- if (is_zero_pfn(pfn))
- return NULL;
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
Mel Gorman
2014-08-12 11:10:02 UTC
ARCH_USES_NUMA_PROT_NONE was defined for architectures that implemented
_PAGE_NUMA using _PROT_NONE. This saved using an additional PTE bit and
relied on the fact that PROT_NONE vmas were skipped by the NUMA hinting
fault scanner. This was found to be conceptually confusing with a lot of
implicit assumptions and it was asked that an alternative be found.

Commit c46a7c81 "x86: define _PAGE_NUMA by reusing software bits on the
PMD and PTE levels" redefined _PAGE_NUMA on x86 to be one of the swap
PTE bits and shrunk the maximum possible swap size but it did not go far
enough. There are no architectures that reuse _PAGE_PROTNONE as _PAGE_NUMA
but the relics still exist.

This patch removes ARCH_USES_NUMA_PROT_NONE and removes some unnecessary
duplication in powerpc vs the generic implementation by defining the types
the core NUMA helpers expected to exist from x86 with their ppc64 equivalent.
This necessitated that a PTE bit mask be created that identified the bits
that distinguish present from NUMA pte entries but it is expected this
will only differ between arches based on _PAGE_PROTNONE. The naming for
the generic helpers was taken from x86 originally but ppc64 has types that
are equivalent for the purposes of the helper so they are mapped instead
of duplicating code.

Signed-off-by: Mel Gorman <***@suse.de>
---
arch/powerpc/include/asm/pgtable.h | 57 ++++++++---------------------------
arch/powerpc/include/asm/pte-common.h | 5 +++
arch/x86/Kconfig | 1 -
arch/x86/include/asm/pgtable_types.h | 14 +++++++++
include/asm-generic/pgtable.h | 27 ++++++-----------
init/Kconfig | 11 -------
6 files changed, 40 insertions(+), 75 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index d98c1ec..f60d4ea 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -38,10 +38,9 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK)
static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }

#ifdef CONFIG_NUMA_BALANCING
-
static inline int pte_present(pte_t pte)
{
- return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+ return pte_val(pte) & _PAGE_NUMA_MASK;
}

#define pte_present_nonuma pte_present_nonuma
@@ -50,37 +49,6 @@ static inline int pte_present_nonuma(pte_t pte)
return pte_val(pte) & (_PAGE_PRESENT);
}

-#define pte_numa pte_numa
-static inline int pte_numa(pte_t pte)
-{
- return (pte_val(pte) &
- (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
-}
-
-#define pte_mknonnuma pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
- pte_val(pte) &= ~_PAGE_NUMA;
- pte_val(pte) |= _PAGE_PRESENT | _PAGE_ACCESSED;
- return pte;
-}
-
-#define pte_mknuma pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
- /*
- * We should not set _PAGE_NUMA on non present ptes. Also clear the
- * present bit so that hash_page will return 1 and we collect this
- * as numa fault.
- */
- if (pte_present(pte)) {
- pte_val(pte) |= _PAGE_NUMA;
- pte_val(pte) &= ~_PAGE_PRESENT;
- } else
- VM_BUG_ON(1);
- return pte;
-}
-
#define ptep_set_numa ptep_set_numa
static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
@@ -92,12 +60,6 @@ static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
return;
}

-#define pmd_numa pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
- return pte_numa(pmd_pte(pmd));
-}
-
#define pmdp_set_numa pmdp_set_numa
static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
@@ -109,16 +71,21 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
return;
}

-#define pmd_mknonnuma pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+/*
+ * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist
+ * which was inherited from x86. For the purposes of powerpc pte_basic_t and
+ * pmd_t are equivalent
+ */
+#define pteval_t pte_basic_t
+#define pmdval_t pmd_t
+static inline pteval_t ptenuma_flags(pte_t pte)
{
- return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
+ return pte_val(pte) & _PAGE_NUMA_MASK;
}

-#define pmd_mknuma pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
+static inline pmdval_t pmdnuma_flags(pmd_t pmd)
{
- return pte_pmd(pte_mknuma(pmd_pte(pmd)));
+ return pmd_val(pmd) & _PAGE_NUMA_MASK;
}

# else
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index 8d1569c..e040c35 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -98,6 +98,11 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
_PAGE_USER | _PAGE_ACCESSED | \
_PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC)

+#ifdef CONFIG_NUMA_BALANCING
+/* Mask of bits that distinguish present and numa ptes */
+#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT)
+#endif
+
/*
* We define 2 sets of base prot bits, one for basic pages (ie,
* cacheable kernel and user pages) and one for non cacheable
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d24887b..0a3f32b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,7 +28,6 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_INT128 if X86_64
- select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f216963..0f9724c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -325,6 +325,20 @@ static inline pteval_t pte_flags(pte_t pte)
return native_pte_val(pte) & PTE_FLAGS_MASK;
}

+#ifdef CONFIG_NUMA_BALANCING
+/* Set of bits that distinguishes present, prot_none and numa ptes */
+#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
+static inline pteval_t ptenuma_flags(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_NUMA_MASK;
+}
+
+static inline pmdval_t pmdnuma_flags(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_NUMA_MASK;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 53b2acc..281870f 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -660,11 +660,12 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
}

#ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
- * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
- * same bit too). It's set only when _PAGE_PRESET is not set and it's
- * never set if _PAGE_PRESENT is set.
+ * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that
+ * is protected for PROT_NONE and a NUMA hinting fault entry. If the
+ * architecture defines _PAGE_PROTNONE then it should take that into account
+ * but those that do not can rely on the fact that the NUMA hinting scanner
+ * skips inaccessible VMAs.
*
* pte/pmd_present() returns true if pte/pmd_numa returns true. Page
* fault triggers on those regions if pte/pmd_numa returns true
@@ -673,16 +674,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
#ifndef pte_numa
static inline int pte_numa(pte_t pte)
{
- return (pte_flags(pte) &
- (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
+ return ptenuma_flags(pte) == _PAGE_NUMA;
}
#endif

#ifndef pmd_numa
static inline int pmd_numa(pmd_t pmd)
{
- return (pmd_flags(pmd) &
- (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
+ return pmdnuma_flags(pmd) == _PAGE_NUMA;
}
#endif

@@ -722,6 +721,8 @@ static inline pte_t pte_mknuma(pte_t pte)
{
pteval_t val = pte_val(pte);

+ VM_BUG_ON(!(val & _PAGE_PRESENT));
+
val &= ~_PAGE_PRESENT;
val |= _PAGE_NUMA;

@@ -765,16 +766,6 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
}
#endif
#else
-extern int pte_numa(pte_t pte);
-extern int pmd_numa(pmd_t pmd);
-extern pte_t pte_mknonnuma(pte_t pte);
-extern pmd_t pmd_mknonnuma(pmd_t pmd);
-extern pte_t pte_mknuma(pte_t pte);
-extern pmd_t pmd_mknuma(pmd_t pmd);
-extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp);
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
-#else
static inline int pmd_numa(pmd_t pmd)
{
return 0;
diff --git a/init/Kconfig b/init/Kconfig
index 9d76b99..60fa415 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -844,17 +844,6 @@ config ARCH_SUPPORTS_INT128
config ARCH_WANT_NUMA_VARIABLE_LOCALITY
bool

-#
-# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
-config ARCH_WANTS_PROT_NUMA_PROT_NONE
- bool
-
-config ARCH_USES_NUMA_PROT_NONE
- bool
- default y
- depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
- depends on NUMA_BALANCING
-
config NUMA_BALANCING_DEFAULT_ENABLED
bool "Automatically enable NUMA aware memory/task placement"
default y
Aneesh Kumar K.V
2014-08-13 13:20:03 UTC
Post by Mel Gorman
ARCH_USES_NUMA_PROT_NONE was defined for architectures that implemented
_PAGE_NUMA using _PROT_NONE. This saved using an additional PTE bit and
relied on the fact that PROT_NONE vmas were skipped by the NUMA hinting
fault scanner. This was found to be conceptually confusing with a lot of
implicit assumptions and it was asked that an alternative be found.
Commit c46a7c81 "x86: define _PAGE_NUMA by reusing software bits on the
PMD and PTE levels" redefined _PAGE_NUMA on x86 to be one of the swap
PTE bits and shrunk the maximum possible swap size but it did not go far
enough. There are no architectures that reuse _PROT_NONE as _PROT_NUMA
but the relics still exist.
This patch removes ARCH_USES_NUMA_PROT_NONE and removes some unnecessary
duplication in powerpc vs the generic implementation by defining the types
the core NUMA helpers expected to exist from x86 with their ppc64 equivalent.
This necessitated that a PTE bit mask be created that identified the bits
that distinguish present from NUMA pte entries but it is expected this
will only differ between arches based on _PAGE_PROTNONE. The naming for
the generic helpers was taken from x86 originally but ppc64 has types that
are equivalent for the purposes of the helper so they are mapped instead
of duplicating code.
Sasha Levin
2014-08-27 03:20:01 UTC
Post by Sasha Levin
Post by Sasha Levin
Thanks Hugh, Mel. I've added both patches to my local tree and will update tomorrow
with the weather.
Post by Hugh Dickins
One thing I did wonder, though: at first I was reassured by the
VM_BUG_ON(!pte_present(pte)) you add to pte_mknuma(); but then thought
it would be better as VM_BUG_ON(!(val & _PAGE_PRESENT)), being stronger
- asserting that indeed we do not put NUMA hints on PROT_NONE areas.
(But I have not tested, perhaps such a VM_BUG_ON would actually fire.)
I've added VM_BUG_ON(!(val & _PAGE_PRESENT)) just as a curiosity; I'll
report back on how that one looks as well.
Sorry for the rather long delay.
The patch looks fine, the issue didn't reproduce.
The added VM_BUG_ON didn't trigger either, so maybe we should consider adding
it in.
It took a while, but I've managed to hit that VM_BUG_ON:

[ 707.975456] kernel BUG at include/asm-generic/pgtable.h:724!
[ 707.977147] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 707.978974] Dumping ftrace buffer:
[ 707.980110] (ftrace buffer empty)
[ 707.981221] Modules linked in:
[ 707.982312] CPU: 18 PID: 9488 Comm: trinity-c538 Not tainted 3.17.0-rc2-next-20140826-sasha-00031-gc48c9ac-dirty #1079
[ 707.982801] task: ffff880165e28000 ti: ffff880165e30000 task.ti: ffff880165e30000
[ 707.982801] RIP: 0010:[<ffffffffb42e3dda>] [<ffffffffb42e3dda>] change_protection_range+0x94a/0x970
[ 707.982801] RSP: 0018:ffff880165e33d98 EFLAGS: 00010246
[ 707.982801] RAX: 000000009d340902 RBX: ffff880511204a08 RCX: 0000000000000100
[ 707.982801] RDX: 000000009d340902 RSI: 0000000041741000 RDI: 000000009d340902
[ 707.982801] RBP: ffff880165e33e88 R08: ffff880708a23c00 R09: 0000000000b52000
[ 707.982801] R10: 0000000000001e01 R11: 0000000000000008 R12: 0000000041751000
[ 707.982801] R13: 00000000000000f7 R14: 000000009d340902 R15: 0000000041741000
[ 707.982801] FS: 00007f358a9aa700(0000) GS:ffff88071c600000(0000) knlGS:0000000000000000
[ 707.982801] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 707.982801] CR2: 00007f3586b69490 CR3: 0000000165d88000 CR4: 00000000000006a0
[ 707.982801] Stack:
[ 707.982801] ffff8804db88d058 0000000000000000 ffff88070fb17cf0 0000000000000000
[ 707.982801] ffff880165d88000 0000000000000000 ffff8801686a5000 000000004163e000
[ 707.982801] ffff8801686a5000 0000000000000001 0000000000000025 0000000041750fff
[ 707.982801] Call Trace:
[ 707.982801] [<ffffffffb42e3e14>] change_protection+0x14/0x30
[ 707.982801] [<ffffffffb42fda3b>] change_prot_numa+0x1b/0x40
[ 707.982801] [<ffffffffb41ad766>] task_numa_work+0x1f6/0x330
[ 707.982801] [<ffffffffb41937c4>] task_work_run+0xc4/0xf0
[ 707.982801] [<ffffffffb40712e7>] do_notify_resume+0x97/0xb0
[ 707.982801] [<ffffffffb74fd6ea>] int_signal+0x12/0x17
[ 707.982801] Code: e8 2c 84 21 03 e9 72 ff ff ff 0f 1f 80 00 00 00 00 0f 0b 48 8b 7d a8 4c 89 f2 4c 89 fe e8 9f 7b 03 00 e9 47 f9 ff ff 0f 0b 0f 0b <0f> 0b 0f 0b 48 8b b5 70 ff ff ff 4c 89 ea 48 89 c7 e8 10 d5 01
[ 707.982801] RIP [<ffffffffb42e3dda>] change_protection_range+0x94a/0x970
[ 707.982801] RSP <ffff880165e33d98>


Thanks,
Sasha
Sasha Levin
2014-08-27 18:30:01 UTC
Post by Mel Gorman
Sasha, how long does it typically take to trigger this? Are you
using any particular switches for trinity that would trigger the bug
faster?
It took a couple of weeks (I've been running with it since the beginning
of August). I don't have any special trinity options, just the default
fuzzing. Do you think that focusing on any of the mm syscalls would
increase the odds of hitting it?

There's always the chance that this is a fluke due to corruption somewhere
else. I'll keep running it with the new debug patch, and if it doesn't
reproduce any time soon we can probably safely assume that.


Thanks,
Sasha
Sasha Levin
2014-08-30 01:30:01 UTC
Post by Mel Gorman
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 281870f..ffea570 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -723,6 +723,9 @@ static inline pte_t pte_mknuma(pte_t pte)
VM_BUG_ON(!(val & _PAGE_PRESENT));
+ /* debugging only, specific to x86 */
+ VM_BUG_ON(val & _PAGE_PROTNONE);
+
val &= ~_PAGE_PRESENT;
val |= _PAGE_NUMA;
Triggered again, the first VM_BUG_ON got hit, the second one never did.


Thanks,
Sasha
Sasha Levin
2014-09-04 09:10:02 UTC
Post by Sasha Levin
Post by Mel Gorman
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 281870f..ffea570 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -723,6 +723,9 @@ static inline pte_t pte_mknuma(pte_t pte)
VM_BUG_ON(!(val & _PAGE_PRESENT));
+ /* debugging only, specific to x86 */
+ VM_BUG_ON(val & _PAGE_PROTNONE);
+
val &= ~_PAGE_PRESENT;
val |= _PAGE_NUMA;
Triggered again, the first VM_BUG_ON got hit, the second one never did.
Okay, this bug has reproduced often enough since then that I no longer
suspect it's random memory corruption. I'd be happy to try out more debug
patches if you have any leads.


Thanks,
Sasha
Sasha Levin
2014-09-08 17:30:03 UTC
Post by Sasha Levin
Post by Sasha Levin
Post by Mel Gorman
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 281870f..ffea570 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -723,6 +723,9 @@ static inline pte_t pte_mknuma(pte_t pte)
VM_BUG_ON(!(val & _PAGE_PRESENT));
+ /* debugging only, specific to x86 */
+ VM_BUG_ON(val & _PAGE_PROTNONE);
+
val &= ~_PAGE_PRESENT;
val |= _PAGE_NUMA;
Triggered again, the first VM_BUG_ON got hit, the second one never did.
Okay, this bug has reproduced enough times since then that I no longer
suspect it's random memory corruption. I'd be happy to try out more debug
patches if you have any leads.
The fact the second one doesn't trigger makes me think that this is not
related to how the helpers are called and is instead related to timing.
I tried reproducing this but got nothing after 3 hours. How long does it
typically take to reproduce in a given run? You mentioned that it takes a
few weeks to hit but maybe the frequency has changed since. I tried today's
linux-next kernel but it didn't even boot, so I tried next-20140826 to match
your original report but got nothing. Can you also send me the config you
used, in case that's a factor?
The frequency seems to have changed; I can trigger this 5-10 times a day now.

Config is attached.


Thanks,
Sasha
I had one hunch that this may somehow be related to a collision between
pagetable teardown during exit and the scanner but I could not find a
way that could actually happen. During teardown there should be only one
user of the mm and it can't race with itself.
A worse possibility is that somehow the lock is getting corrupted but
that's also a tough sell considering that the locks should be allocated
from a dedicated cache. I guess I could try breaking that to allocate
one page per lock so DEBUG_PAGEALLOC triggers but I'm not very
optimistic.
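
For context, the "dedicated cache" is the split-PTL allocation path in
mm/memory.c: when spinlock_t is too large to embed in struct page (e.g.
with lockdep enabled), every page-table lock is carved out of one slab
cache. Roughly (a from-memory sketch, not verbatim):

static struct kmem_cache *page_ptl_cachep;

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	/* All split page-table locks come from this one cache */
	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

Because every lock lives in an ordinary slab page, a stray write to a
freed lock keeps DEBUG_PAGEALLOC silent; giving each lock its own page
would make such a write fault immediately.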
Mel Gorman
2014-09-09 21:40:02 UTC
A worse possibility is that somehow the lock is getting corrupted but
that's also a tough sell considering that the locks should be allocated
from a dedicated cache. I guess I could try breaking that to allocate
one page per lock so DEBUG_PAGEALLOC triggers but I'm not very
optimistic.
https://lkml.org/lkml/2014/9/4/599
Could this be related?
Possibly although the likely explanation then would be that there is
just general corruption coming from somewhere. Even using your config
and applying a patch to make linux-next boot (already in Tejun's tree)
I was unable to reproduce the problem after running for several hours. I
had to run trinity on tmpfs as ext4 and xfs blew up almost immediately
so I have a few questions.

1. What filesystem are you using?

2. Which compiler, in case it's an experimental one? I ask because I
think I saw a patch from you adding support for building the kernel
with gcc 5.

3. Does your hardware support TSX or anything similarly funky that would
potentially affect locking?

4. How many sockets are on your test machine in case reproducing it
depends on a machine large enough to open a timing race?

As I'm drawing a blank on what would trigger the bug I'm hoping I can
reproduce this locally and experiment a bit.

Thanks.
--
Mel Gorman
SUSE Labs
Mel Gorman
2014-09-10 12:50:02 UTC
Post by Mel Gorman
A worse possibility is that somehow the lock is getting corrupted but
that's also a tough sell considering that the locks should be allocated
from a dedicated cache. I guess I could try breaking that to allocate
one page per lock so DEBUG_PAGEALLOC triggers but I'm not very
optimistic.
https://lkml.org/lkml/2014/9/4/599
Could this be related?
Possibly although the likely explanation then would be that there is
just general corruption coming from somewhere. Even using your config
and applying a patch to make linux-next boot (already in Tejun's tree)
I was unable to reproduce the problem after running for several hours. I
had to run trinity on tmpfs as ext4 and xfs blew up almost immediately
so I have a few questions.
I agree it could be a case of random corruption somewhere else, it's just
that the number of times this exact issue has reproduced makes that unlikely.
Yes, I doubt it's random corruption; but I've been no more successful
than Mel in working it out (I share responsibility for that VM_BUG_ON).
Sasha, you say you're getting plenty of these now, but I've only seen
the dump for one of them, on Aug26: please post a few more dumps, so
that we can look for commonality.
It's also worth knowing that this is a test running in KVM with fake NUMA. The
hint was that the filesystem used was virtio-9p. I haven't formulated a
theory on how KVM could cause any damage here but it's interesting.
And please attach a disassembly of change_protection_range() (noting that
"Code" just shows a cluster of ud2s for the unlikely bugs at the end of the
function), since we cannot tell at all what should be in the registers by then.
I've been rather assuming that the 9d340902 seen in many of the
registers in that Aug26 dump is the pte val in question: that's
SOFT_DIRTY|PROTNONE|RW.
I think RW on PROTNONE is unusual but not impossible (migration entry
replacement racing with mprotect setting PROT_NONE, after it's updated
vm_page_prot, before it's reached the page table).
At the risk of sounding thick, I need to spell this out because I'm
having trouble seeing exactly what race you are thinking of.

Migration entry replacement is protected against parallel NUMA hinting
updates by the page table lock (either PMD or PTE level). It's taken by
remove_migration_pte on one side and lock_pte_protection on the other.

For the mprotect case racing against migration, migration entries are not
present so change_pte_range() should ignore it. On migration completion
the VMA flags determine the permissions of the new PTE. Parallel faults
wait on the migration entry and see the correct value afterwards.

When creating migration entries, try_to_unmap calls page_check_address
which takes the PTL before doing anything. On the mprotect side,
lock_pte_protection will block before seeing PROTNONE.

I think the race you are thinking of is a migration entry created for write,
parallel mprotect(PROTNONE) and migration completion. The migration entry
was created for write but remove_migration_pte does not double check the VMA
protections and mmap_sem is not taken for write across a full migration to
protect against changes to vm_page_prot. However, change_pte_range checks
for migration entries marked for write under the PTL and marks them read if
one is encountered. The consequence is that we potentially take a spurious
fault to mark the PTE write again after migration completes but I can't
see how that causes a problem as such.
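
For readers following along, the change_pte_range() branch being described
looks roughly like this in that era's mm/mprotect.c (a from-memory sketch,
not verbatim):

} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
	swp_entry_t entry = pte_to_swp_entry(oldpte);

	if (is_write_migration_entry(entry)) {
		pte_t newpte;

		/*
		 * Rechecking the protections here is awkward, so be
		 * safe: demote the entry to read and let a later
		 * fault restore write if the VMA still allows it.
		 */
		make_migration_entry_read(&entry);
		newpte = swp_entry_to_pte(entry);
		set_pte_at(mm, addr, pte, newpte);
	}
}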

I'm missing some part of your reasoning that leads to the RW|PROTNONE :(
But exciting though
that line of thought is, I cannot actually bring it to a pte_mknuma bug,
or any bug at all.
On x86, PROTNONE|RW translates as GLOBAL|RW which would be unexpected. It
wouldn't cause this bug but it's sufficiently suspicious to be worth
correcting. In case this is the race you're thinking of, the patch is below.
Unfortunately, I cannot see how it would affect this problem but worth
giving a whirl anyway.
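
As an aside, for anyone decoding these pte values by hand: the oddity here
is that PROTNONE reuses the GLOBAL bit, and only PRESENT decides which
meaning applies. A minimal user-space sketch, assuming the x86 bit
positions of that era (PRESENT=0, RW=1, GLOBAL/PROTNONE=8, SOFT_DIRTY=11):

#include <stdio.h>
#include <stdint.h>

#define _PAGE_PRESENT    (1ULL << 0)
#define _PAGE_RW         (1ULL << 1)
#define _PAGE_PROTNONE   (1ULL << 8)	/* same bit as GLOBAL; reads as
					   PROTNONE only while PRESENT is clear */
#define _PAGE_SOFT_DIRTY (1ULL << 11)

int main(void)
{
	uint64_t val = 0x9d340902ULL;	/* value seen in the Aug26 dump */

	/* prints: present=0 rw=1 protnone/global=1 soft_dirty=1 */
	printf("present=%d rw=%d protnone/global=%d soft_dirty=%d\n",
	       !!(val & _PAGE_PRESENT), !!(val & _PAGE_RW),
	       !!(val & _PAGE_PROTNONE), !!(val & _PAGE_SOFT_DIRTY));
	return 0;
}

which matches the SOFT_DIRTY|PROTNONE|RW reading above.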
Mel, no way can it be the cause of this bug - unless Sasha's later
traces actually show a different stack - but I don't see the call
to change_prot_numa() from queue_pages_range() sharing the same
avoidance of PROT_NONE that task_numa_work() has (though it does
have an outdated comment about PROT_NONE which should be removed).
So I think that site probably does need PROT_NONE checking added.
That site should have checked PROT_NONE but it can't be the same bug
that trinity is seeing. Minimally trinity is unaware of MPOL_MF_LAZY
according to git grep of the trinity source.

Worth adding this to the debugging mix? It should warn if it encounters
the problem but avoid adding the problematic RW bit.

---8<---
migrate: debug patch to try identify race between migration completion and mprotect

A migration entry is marked as write if pte_write was true at the
time the entry was created. The VMA protections are not double checked
when migration entries are being removed but mprotect itself will mark
write-migration-entries as read to avoid problems. It means we potentially
take a spurious fault to mark these ptes write again but otherwise it's
harmless. Still, one dump indicates that this situation can actually
happen, so this debugging patch spits out a warning if the situation occurs,
and hopefully the resulting warning will contain a clue as to how exactly
it happens.

Not-signed-off
---
mm/migrate.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 09d489c..631725c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -146,8 +146,16 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (pte_swp_soft_dirty(*ptep))
pte = pte_mksoft_dirty(pte);
- if (is_write_migration_entry(entry))
- pte = pte_mkwrite(pte);
+ if (is_write_migration_entry(entry)) {
+ /*
+ * This WARN_ON_ONCE is temporary for the purposes of seeing if
+ * it's a case encountered by trinity in Sasha's testing
+ */
+ if (!(vma->vm_flags & (VM_WRITE)))
+ WARN_ON_ONCE(1);
+ else
+ pte = pte_mkwrite(pte);
+ }
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
Hugh Dickins
2014-09-10 19:40:04 UTC
Post by Mel Gorman
I've been rather assuming that the 9d340902 seen in many of the
registers in that Aug26 dump is the pte val in question: that's
SOFT_DIRTY|PROTNONE|RW.
The 900s in the latest dumps imply that that 902 was not important.
(If any of them are in fact the pte val.)
Post by Mel Gorman
I think RW on PROTNONE is unusual but not impossible (migration entry
replacement racing with mprotect setting PROT_NONE, after it's updated
vm_page_prot, before it's reached the page table).
At the risk of sounding thick, I need to spell this out because I'm
having trouble seeing exactly what race you are thinking of.
Migration entry replacement is protected against parallel NUMA hinting
updates by the page table lock (either PMD or PTE level). It's taken by
remove_migration_pte on one side and lock_pte_protection on the other.
For the mprotect case racing against migration, migration entries are not
present so change_pte_range() should ignore it. On migration completion
the VMA flags determine the permissions of the new PTE. Parallel faults
wait on the migration entry and see the correct value afterwards.
When creating migration entries, try_to_unmap calls page_check_address
which takes the PTL before doing anything. On the mprotect side,
lock_pte_protection will block before seeing PROTNONE.
I think the race you are thinking of is a migration entry created for write,
parallel mprotect(PROTNONE) and migration completion. The migration entry
was created for write but remove_migration_pte does not double check the VMA
protections and mmap_sem is not taken for write across a full migration to
protect against changes to vm_page_prot.
Yes, the "if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte);"
arguably should take the latest value of vma->vm_page_prot into account.
Post by Mel Gorman
However, change_pte_range checks
for migration entries marked for write under the PTL and marks them read if
one is encountered. The consequence is that we potentially take a spurious
fault to mark the PTE write again after migration completes but I can't
see how that causes a problem as such.
Yes, once mprotect's page table walk reaches that pte, it updates it
correctly along with all the others nearby (which were not migrated),
removing the temporary oddity.
Post by Mel Gorman
I'm missing some part of your reasoning that leads to the RW|PROTNONE :(
You don't appear to be missing it at all, you are seeing the possibility
of an RW|PROTNONE yourself, and how it gets "corrected" afterwards
("corrected" in quotes because without the present bit, it's not an error).
Post by Mel Gorman
But exciting though
that line of thought is, I cannot actually bring it to a pte_mknuma bug,
or any bug at all.
And I wasn't saying that it led to this bug, just that it was an oddity
worth thinking about, and worth mentioning to you, in case you could work
out a way it might lead to the bug, when I had failed to do so.

But we now (almost) know that 902 is irrelevant to this bug anyway.
Post by Mel Gorman
On x86, PROTNONE|RW translates as GLOBAL|RW which would be unexpected. It
GLOBAL once PRESENT is set, but PROTNONE so long as it is not.
Post by Mel Gorman
wouldn't cause this bug but it's sufficiently suspicious to be worth
correcting. In case this is the race you're thinking of, the patch is below.
Unfortunately, I cannot see how it would affect this problem but worth
giving a whirl anyway.
Mel, no way can it be the cause of this bug - unless Sasha's later
traces actually show a different stack - but I don't see the call
to change_prot_numa() from queue_pages_range() sharing the same
avoidance of PROT_NONE that task_numa_work() has (though it does
have an outdated comment about PROT_NONE which should be removed).
So I think that site probably does need PROT_NONE checking added.
That site should have checked PROT_NONE but it can't be the same bug
that trinity is seeing. Minimally trinity is unaware of MPOL_MF_LAZY
according to git grep of the trinity source.
Yes, queue_pages_range() is not implicated in any of Sasha's traces.
Something to fix, but not relevant to this bug.
Post by Mel Gorman
Worth adding this to the debugging mix? It should warn if it encounters
the problem but avoid adding the problematic RW bit.
---8<---
migrate: debug patch to try identify race between migration completion and mprotect
A migration entry is marked as write if pte_write was true at the
time the entry was created. The VMA protections are not double checked
when migration entries are being removed but mprotect itself will mark
write-migration-entries as read to avoid problems. It means we potentially
take a spurious fault to mark these ptes write again but otherwise it's
harmless. Still, one dump indicates that this situation can actually
happen so this debugging patch spits out a warning if the situation occurs
and hopefully the resulting warning will contain a clue as to how exactly
it happens
Not-signed-off
---
mm/migrate.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 09d489c..631725c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -146,8 +146,16 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (pte_swp_soft_dirty(*ptep))
pte = pte_mksoft_dirty(pte);
- if (is_write_migration_entry(entry))
- pte = pte_mkwrite(pte);
+ if (is_write_migration_entry(entry)) {
+ /*
+ * This WARN_ON_ONCE is temporary for the purposes of seeing if
+ * it's a case encountered by trinity in Sasha's testing
+ */
+ if (!(vma->vm_flags & (VM_WRITE)))
+ WARN_ON_ONCE(1);
+ else
+ pte = pte_mkwrite(pte);
+ }
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
Right, and Sasha reports that that can fire, but he sees the bug
with this patch in and without that firing.

Consider 902 of no interest, it was just something worth mentioning
before we got more info.

Hugh
Sasha Levin
2014-09-11 02:50:01 UTC
Post by Hugh Dickins
Post by Mel Gorman
migrate: debug patch to try identify race between migration completion and mprotect
Post by Mel Gorman
A migration entry is marked as write if pte_write was true at the
time the entry was created. The VMA protections are not double checked
when migration entries are being removed but mprotect itself will mark
write-migration-entries as read to avoid problems. It means we potentially
take a spurious fault to mark these ptes write again but otherwise it's
harmless. Still, one dump indicates that this situation can actually
happen so this debugging patch spits out a warning if the situation occurs
and hopefully the resulting warning will contain a clue as to how exactly
it happens
Not-signed-off
---
mm/migrate.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 09d489c..631725c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -146,8 +146,16 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (pte_swp_soft_dirty(*ptep))
pte = pte_mksoft_dirty(pte);
- if (is_write_migration_entry(entry))
- pte = pte_mkwrite(pte);
+ if (is_write_migration_entry(entry)) {
+ /*
+ * This WARN_ON_ONCE is temporary for the purposes of seeing if
+ * it's a case encountered by trinity in Sasha's testing
+ */
+ if (!(vma->vm_flags & (VM_WRITE)))
+ WARN_ON_ONCE(1);
+ else
+ pte = pte_mkwrite(pte);
+ }
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
Right, and Sasha reports that that can fire, but he sees the bug
with this patch in and without that firing.
I've changed that WARN_ON_ONCE() to a VM_BUG_ON_VMA() to get some useful VMA information
out, and got the following:

[ 4018.870776] vma ffff8801a0f1e800 start 00007f3fd0ca7000 end 00007f3fd16a7000
[ 4018.870776] next ffff8804e1b89800 prev ffff88008cd9a000 mm ffff88054b17d000
[ 4018.870776] prot 120 anon_vma ffff880bc858a200 vm_ops (null)
[ 4018.870776] pgoff 41bc8 file (null) private_data (null)
[ 4018.879731] flags: 0x8100070(mayread|maywrite|mayexec|account)
[ 4018.881324] ------------[ cut here ]------------
[ 4018.882612] kernel BUG at mm/migrate.c:155!
[ 4018.883649] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 4018.889647] Dumping ftrace buffer:
[ 4018.890323] (ftrace buffer empty)
[ 4018.890323] Modules linked in:
[ 4018.890323] CPU: 4 PID: 9966 Comm: trinity-main Tainted: G W 3.17.0-rc4-next-20140910-sasha-00042-ga4bad9b-dirty #1140
[ 4018.890323] task: ffff880695b83000 ti: ffff880560c44000 task.ti: ffff880560c44000
[ 4018.890323] RIP: 0010:[<ffffffff9b2fd4c1>] [<ffffffff9b2fd4c1>] remove_migration_pte+0x3e1/0x3f0
[ 4018.890323] RSP: 0000:ffff880560c477c8 EFLAGS: 00010292
[ 4018.890323] RAX: 0000000000000001 RBX: 00007f3fd129b000 RCX: 0000000000000000
[ 4018.890323] RDX: 0000000000000001 RSI: ffffffff9e4ba395 RDI: 0000000000000001
[ 4018.890323] RBP: ffff880560c47800 R08: 0000000000000001 R09: 0000000000000001
[ 4018.890323] R10: 0000000000045401 R11: 0000000000000001 R12: ffff8801a0f1e800
[ 4018.890323] R13: ffff88054b17d000 R14: ffffea000478eb40 R15: ffff880122bcf070
[ 4018.890323] FS: 00007f3fd55bb700(0000) GS:ffff8803d6a00000(0000) knlGS:0000000000000000
[ 4018.890323] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 4018.890323] CR2: 0000000000fcbca8 CR3: 0000000561bab000 CR4: 00000000000006a0
[ 4018.890323] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4018.890323] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 4018.890323] Stack:
[ 4018.890323] ffffea00046ed980 ffff88011079c4d8 ffffea000478eb40 ffff880560c47858
[ 4018.890323] ffff88019fde0330 00000000000421bc ffff8801a0f1e800 ffff880560c47848
[ 4018.890323] ffffffff9b2d1b0f ffff880bc858a200 ffff880560c47850 ffffea000478eb40
[ 4018.890323] Call Trace:
[ 4018.890323] [<ffffffff9b2d1b0f>] rmap_walk+0x22f/0x380
[ 4018.890323] [<ffffffff9b2fc841>] remove_migration_ptes+0x41/0x50
[ 4018.890323] [<ffffffff9b2fd0e0>] ? __migration_entry_wait.isra.24+0x160/0x160
[ 4018.890323] [<ffffffff9b2fd4d0>] ? remove_migration_pte+0x3f0/0x3f0
[ 4018.890323] [<ffffffff9b2fe73b>] move_to_new_page+0x16b/0x230
[ 4018.890323] [<ffffffff9b2d1e8c>] ? try_to_unmap+0x6c/0xf0
[ 4018.890323] [<ffffffff9b2d08a0>] ? try_to_unmap_nonlinear+0x5c0/0x5c0
[ 4018.890323] [<ffffffff9b2cf0a0>] ? invalid_migration_vma+0x30/0x30
[ 4018.890323] [<ffffffff9b2d02e0>] ? page_remove_rmap+0x320/0x320
[ 4018.890323] [<ffffffff9b2ff19c>] migrate_pages+0x85c/0x930
[ 4018.890323] [<ffffffff9b2b8e20>] ? isolate_freepages_block+0x410/0x410
[ 4018.890323] [<ffffffff9b2b7a60>] ? arch_local_save_flags+0x30/0x30
[ 4018.890323] [<ffffffff9b2b9803>] compact_zone+0x4d3/0x8a0
[ 4018.890323] [<ffffffff9b2b9c2f>] compact_zone_order+0x5f/0xa0
[ 4018.890323] [<ffffffff9b2b9f87>] try_to_compact_pages+0x127/0x2f0
[ 4018.890323] [<ffffffff9b298c98>] __alloc_pages_direct_compact+0x68/0x200
[ 4018.890323] [<ffffffff9b2995af>] __alloc_pages_nodemask+0x77f/0xd90
[ 4018.890323] [<ffffffff9b192fad>] ? sched_clock_local+0x1d/0x90
[ 4018.890323] [<ffffffff9b2e8a1c>] alloc_pages_vma+0x13c/0x270
[ 4018.890323] [<ffffffff9b305934>] ? do_huge_pmd_wp_page+0x494/0xc90
[ 4018.890323] [<ffffffff9b305934>] do_huge_pmd_wp_page+0x494/0xc90
[ 4018.890323] [<ffffffff9b308d40>] ? __mem_cgroup_count_vm_event+0xd0/0x240
[ 4018.890323] [<ffffffff9b2c4b7d>] handle_mm_fault+0x8bd/0xc50
[ 4018.890323] [<ffffffff9b1ba6e6>] ? __lock_is_held+0x56/0x80
[ 4018.890323] [<ffffffff9b0afbc7>] __do_page_fault+0x1b7/0x660
[ 4018.890323] [<ffffffff9b1b5c5e>] ? put_lock_stats.isra.13+0xe/0x30
[ 4018.890323] [<ffffffff9b193f41>] ? vtime_account_user+0x91/0xa0
[ 4018.890323] [<ffffffff9b28ac35>] ? context_tracking_user_exit+0xb5/0x1b0
[ 4018.890323] [<ffffffff9bb55d33>] ? __this_cpu_preempt_check+0x13/0x20
[ 4018.890323] [<ffffffff9b1b62e2>] ? trace_hardirqs_off_caller+0xe2/0x1b0
[ 4018.890323] [<ffffffff9b0b0141>] trace_do_page_fault+0x51/0x2b0
[ 4018.890323] [<ffffffff9b0a6e83>] do_async_page_fault+0x63/0xd0
[ 4018.890323] [<ffffffff9e4bccf8>] async_page_fault+0x28/0x30
[ 4018.890323] Code: 0f 0b 48 c7 c6 b0 f2 71 9f 4c 89 f7 e8 b9 79 f9 ff 0f 0b 48 83 c9 02 41 f6 44 24 50 02 0f 85 70 fe ff ff 4c 89 e7 e8 af 4a f9 ff <0f> 0b 0f 0b 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 55
[ 4018.890323] RIP [<ffffffff9b2fd4c1>] remove_migration_pte+0x3e1/0x3f0
[ 4018.890323] RSP <ffff880560c477c8>

And from a different log:

[ 2035.602565] vma ffff88054b666c00 start 00007f561ffad000 end 00007f56203ad000
[ 2035.602565] next ffff88054b665a00 prev ffff8801f7a31800 mm ffff8804f207a000
[ 2035.602565] prot 120 anon_vma (null) vm_ops ffffffffb5671e80
[ 2035.602565] pgoff 0 file ffff88054b430a80 private_data (null)
[ 2035.608469] flags: 0x80000f8(shared|mayread|maywrite|mayexec|mayshare)


And on a maybe related note, I've started seeing the following today. It may
be because we fixed mbind() in trinity but it could also be related to
this issue (free_pgtables() is in the call chain). If you don't think it has
anything to do with it let me know and I'll start a new thread:

[ 1195.996803] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 1196.001744] IP: __rb_erase_color (include/linux/rbtree_augmented.h:107 lib/rbtree.c:229 lib/rbtree.c:367)
[ 1196.001744] PGD 196787067 PUD 117522067 PMD 0
[ 1196.001744] Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 1196.001744] Dumping ftrace buffer:
[ 1196.001744] (ftrace buffer empty)
[ 1196.001744] Modules linked in:
[ 1196.001744] CPU: 5 PID: 5724 Comm: trinity-c890 Not tainted 3.17.0-rc4-next-20140910-sasha-00042-ga4bad9b-dirty #1140
[ 1196.001744] task: ffff88024207b000 ti: ffff8808b25e0000 task.ti: ffff8808b25e0000
[ 1196.001744] RIP: __rb_erase_color (include/linux/rbtree_augmented.h:107 lib/rbtree.c:229 lib/rbtree.c:367)
[ 1196.001744] RSP: 0018:ffff8808b25e3d18 EFLAGS: 00010286
[ 1196.001744] RAX: ffff8808890ed059 RBX: ffff88091f75f458 RCX: 0000000000000000
[ 1196.001744] RDX: 0000000000000000 RSI: ffff8800b83396c8 RDI: ffff8808890ed058
[ 1196.001744] RBP: ffff8808b25e3d40 R08: ffff8808890ed058 R09: 0000000000000000
[ 1196.001744] R10: 0000000000000000 R11: ffff88085697d658 R12: ffff8808890ed058
[ 1196.001744] R13: ffffffff912ba700 R14: ffff8800b83396c8 R15: 0000000000000000
[ 1196.001744] FS: 00007f00e4458700(0000) GS:ffff880492c00000(0000) knlGS:0000000000000000
[ 1196.001744] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1196.001744] CR2: 0000000000000000 CR3: 0000000196786000 CR4: 00000000000006a0
[ 1196.001744] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1196.001744] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000070602
[ 1196.001744] Stack:
[ 1196.001744] ffff88085697d600 ffff8800b5d13480 ffff8800b83396e0 ffff8800b8339660
[ 1196.001744] ffff88085697d600 ffff8808b25e3d58 ffffffff912ba9e4 ffff88085697d600
[ 1196.001744] ffff8808b25e3d78 ffffffff912c8446 ffff88085697d600 ffff8800b5d13480
[ 1196.001744] Call Trace:
[ 1196.001744] vma_interval_tree_remove (mm/interval_tree.c:24)
[ 1196.001744] __remove_shared_vm_struct (mm/mmap.c:232)
[ 1196.001744] unlink_file_vma (mm/mmap.c:246)
[ 1196.001744] free_pgtables (mm/memory.c:547)
[ 1196.001744] exit_mmap (mm/mmap.c:2826)
[ 1196.001744] mmput (kernel/fork.c:654)
[ 1196.001744] do_exit (./arch/x86/include/asm/thread_info.h:168 kernel/exit.c:461 kernel/exit.c:746)
[ 1196.001744] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
[ 1196.001744] ? trace_hardirqs_on (kernel/locking/lockdep.c:2609)
[ 1196.001744] do_group_exit (./arch/x86/include/asm/current.h:14 kernel/exit.c:874)
[ 1196.001744] SyS_exit_group (kernel/exit.c:900)
[ 1196.001744] tracesys (arch/x86/kernel/entry_64.S:542)
[ 1196.001744] Code: e2 49 89 c4 49 8b 5c 24 08 48 39 d3 0f 84 e2 00 00 00 f6 03 01 75 ad 4c 8b 7b 10 4c 89 e0 48 83 c8 01 4d 89 7c 24 08 4c 89 63 10 <49> 89 07 49 8b 04 24 48 89 03 48 83 e0 fc 49 89 1c 24 0f 84 69
All code
========
0: e2 49 loop 0x4b
2: 89 c4 mov %eax,%esp
4: 49 8b 5c 24 08 mov 0x8(%r12),%rbx
9: 48 39 d3 cmp %rdx,%rbx
c: 0f 84 e2 00 00 00 je 0xf4
12: f6 03 01 testb $0x1,(%rbx)
15: 75 ad jne 0xffffffffffffffc4
17: 4c 8b 7b 10 mov 0x10(%rbx),%r15
1b: 4c 89 e0 mov %r12,%rax
1e: 48 83 c8 01 or $0x1,%rax
22: 4d 89 7c 24 08 mov %r15,0x8(%r12)
27: 4c 89 63 10 mov %r12,0x10(%rbx)
2b:* 49 89 07 mov %rax,(%r15) <-- trapping instruction
2e: 49 8b 04 24 mov (%r12),%rax
32: 48 89 03 mov %rax,(%rbx)
35: 48 83 e0 fc and $0xfffffffffffffffc,%rax
39: 49 89 1c 24 mov %rbx,(%r12)
3d: 0f .byte 0xf
3e: 84 69 00 test %ch,0x0(%rcx)

Code starting with the faulting instruction
===========================================
0: 49 89 07 mov %rax,(%r15)
3: 49 8b 04 24 mov (%r12),%rax
7: 48 89 03 mov %rax,(%rbx)
a: 48 83 e0 fc and $0xfffffffffffffffc,%rax
e: 49 89 1c 24 mov %rbx,(%r12)
12: 0f .byte 0xf
13: 84 69 00 test %ch,0x0(%rcx)
[ 1196.001744] RIP __rb_erase_color (include/linux/rbtree_augmented.h:107 lib/rbtree.c:229 lib/rbtree.c:367)
[ 1196.001744] RSP <ffff8808b25e3d18>
[ 1196.001744] CR2: 0000000000000000
[ 1196.001744] ---[ end trace 67e0103d243f3c04 ]---
[ 1196.050031] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
[ 1196.050031] IP: __rb_insert_augmented (lib/rbtree.c:94 lib/rbtree.c:411)
[ 1196.050031] PGD a3ea09067 PUD a69b38067 PMD 0
[ 1196.050031] Oops: 0000 [#2] PREEMPT SMP DEBUG_PAGEALLOC
[ 1196.050031] Dumping ftrace buffer:
[ 1196.050031] (ftrace buffer empty)
[ 1196.050031] Modules linked in:
[ 1196.050031] CPU: 3 PID: 5688 Comm: trinity-c802 Tainted: G D 3.17.0-rc4-next-20140910-sasha-00042-ga4bad9b-dirty #1140
[ 1196.050031] task: ffff880a508f8000 ti: ffff880a6950c000 task.ti: ffff880a6950c000
[ 1196.050031] RIP: __rb_insert_augmented (lib/rbtree.c:94 lib/rbtree.c:411)
[ 1196.050031] RSP: 0018:ffff880a6950fd68 EFLAGS: 00010246
[ 1196.050031] RAX: ffff88091f75a058 RBX: 0000000000000000 RCX: 0000000000000000
[ 1196.050031] RDX: ffffffff912ba700 RSI: ffff8800b4cb3718 RDI: ffff8802d786ca58
[ 1196.050031] RBP: ffff880a6950fd90 R08: ffff8802d786ca00 R09: ffff8800b4cb3718
[ 1196.050031] R10: 0000000000000001 R11: 0000000000000001 R12: ffff8801fd067400
[ 1196.050031] R13: ffff8802d786ca00 R14: ffff8800b4cb3718 R15: 00007f00e44589d0
[ 1196.050031] FS: 00007f00e4458700(0000) GS:ffff88031ac00000(0000) knlGS:0000000000000000
[ 1196.050031] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1196.050031] CR2: 0000000000000008 CR3: 0000000a62597000 CR4: 00000000000006a0
[ 1196.050031] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1196.050031] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000070602
[ 1196.050031] Stack:
[ 1196.050031] ffffffff9115499e ffff88028bc7a000 ffff8801fd067400 ffff8802d786ca00
[ 1196.050031] ffff8800b4cb3730 ffff880a6950fda0 ffffffff912babfd ffff880a6950fe70
[ 1196.050031] ffffffff91154a77 ffff8800b4cb36b0 000000003ebe3540 0000000000000000
[ 1196.050031] Call Trace:
[ 1196.050031] ? copy_process (kernel/fork.c:409 kernel/fork.c:859 kernel/fork.c:913 kernel/fork.c:1381)
[ 1196.050031] vma_interval_tree_insert_after (mm/interval_tree.c:60)
[ 1196.050031] copy_process (kernel/fork.c:442 kernel/fork.c:859 kernel/fork.c:913 kernel/fork.c:1381)
[ 1196.050031] do_fork (kernel/fork.c:1644)
[ 1196.050031] ? context_tracking_user_exit (./arch/x86/include/asm/paravirt.h:809 (discriminator 2) kernel/context_tracking.c:184 (discriminator 2))
[ 1196.050031] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
[ 1196.050031] ? trace_hardirqs_on_caller (kernel/locking/lockdep.c:2559 kernel/locking/lockdep.c:2601)
[ 1196.050031] ? trace_hardirqs_on (kernel/locking/lockdep.c:2609)
[ 1196.050031] SyS_clone (kernel/fork.c:1733)
[ 1196.050031] stub_clone (arch/x86/kernel/entry_64.S:637)
[ 1196.050031] ? tracesys (arch/x86/kernel/entry_64.S:542)
[ 1196.050031] Code: ff ff 0f 1f 00 48 8b 07 48 85 c0 0f 84 a4 01 00 00 55 48 89 e5 41 56 49 89 f6 41 55 41 54 53 48 83 ec 08 48 8b 18 f6 c3 01 75 6b <48> 8b 4b 08 49 89 d8 48 39 c8 0f 84 a5 00 00 00 48 85 c9 74 05
All code
========
0: ff (bad)
1: ff 0f decl (%rdi)
3: 1f (bad)
4: 00 48 8b add %cl,-0x75(%rax)
7: 07 (bad)
8: 48 85 c0 test %rax,%rax
b: 0f 84 a4 01 00 00 je 0x1b5
11: 55 push %rbp
12: 48 89 e5 mov %rsp,%rbp
15: 41 56 push %r14
17: 49 89 f6 mov %rsi,%r14
1a: 41 55 push %r13
1c: 41 54 push %r12
1e: 53 push %rbx
1f: 48 83 ec 08 sub $0x8,%rsp
23: 48 8b 18 mov (%rax),%rbx
26: f6 c3 01 test $0x1,%bl
29: 75 6b jne 0x96
2b:* 48 8b 4b 08 mov 0x8(%rbx),%rcx <-- trapping instruction
2f: 49 89 d8 mov %rbx,%r8
32: 48 39 c8 cmp %rcx,%rax
35: 0f 84 a5 00 00 00 je 0xe0
3b: 48 85 c9 test %rcx,%rcx
3e: 74 05 je 0x45
...

Code starting with the faulting instruction
===========================================
0: 48 8b 4b 08 mov 0x8(%rbx),%rcx
4: 49 89 d8 mov %rbx,%r8
7: 48 39 c8 cmp %rcx,%rax
a: 0f 84 a5 00 00 00 je 0xb5
10: 48 85 c9 test %rcx,%rcx
13: 74 05 je 0x1a
...
[ 1196.050031] RIP __rb_insert_augmented (lib/rbtree.c:94 lib/rbtree.c:411)
[ 1196.050031] RSP <ffff880a6950fd68>
[ 1196.050031] CR2: 0000000000000008


Thanks,
Sasha
Hugh Dickins
2014-09-11 11:50:03 UTC
Post by Sasha Levin
Post by Hugh Dickins
Right, and Sasha reports that that can fire, but he sees the bug
with this patch in and without that firing.
I've changed that WARN_ON_ONCE() to a VM_BUG_ON_VMA() to get some useful
Well, thanks, but Mel and I have both failed to perceive any actual
problem arising from that peculiarity. And Mel's warning, and the 900s
in yesterday's dumps, have shown that it is not correlated with the
pte_mknuma() bug we are chasing. So there isn't anything that I want to
look up in these vmas. Or did you notice something interesting in them?
Post by Sasha Levin
And on a maybe related note, I've started seeing the following today. It may
be because we fixed mbind() in trinity but it could also be related to
The fixed trinity may be counter-productive for now, since we think
there is an understandable pte_mknuma() bug coming from that direction,
but have not posted a patch for it yet.
Post by Sasha Levin
this issue (free_pgtables() is in the call chain). If you don't think it has
[ 1195.996803] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 1196.001744] IP: __rb_erase_color (include/linux/rbtree_augmented.h:107 lib/rbtree.c:229 lib/rbtree.c:367)
[ 1196.001744] vma_interval_tree_remove (mm/interval_tree.c:24)
[ 1196.001744] __remove_shared_vm_struct (mm/mmap.c:232)
[ 1196.001744] unlink_file_vma (mm/mmap.c:246)
[ 1196.001744] free_pgtables (mm/memory.c:547)
[ 1196.001744] exit_mmap (mm/mmap.c:2826)
[ 1196.001744] mmput (kernel/fork.c:654)
[ 1196.001744] do_exit (./arch/x86/include/asm/thread_info.h:168 kernel/exit.c:461 kernel/exit.c:746)
I didn't study in any detail, but this one seems much more like the
zeroing and vma corruption that you've been seeing in other dumps.

Though a single pte_mknuma() crash could presumably be caused by vma
corruption (but I think not mere zeroing), the recurrent way in which
you hit that pte_mknuma() bug in particular makes it unlikely to be
caused by random corruption.

You are generating new crashes faster than we can keep up with them.
Would this be a suitable point for you to switch over to testing
3.17-rc, to see if that is as unstable for you as -next is?

That VM_BUG_ON(!(val & _PAGE_PRESENT)) is not in the 3.17-rc tree,
but I think you can "safely" add it to 3.17-rc. Quotes around
"safely" meaning that we know that there's a bug to hit, at least
in -next, but I don't think it's going to be hit for stupid obvious
reasons.
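
For anyone adding the assertion by hand: reconstructed from the diff quoted
earlier in the thread (so a sketch, not verbatim), the helper in
include/asm-generic/pgtable.h ends up looking like this:

static inline pte_t pte_mknuma(pte_t pte)
{
	pteval_t val = pte_val(pte);

	VM_BUG_ON(!(val & _PAGE_PRESENT));	/* the check that fires */

	val &= ~_PAGE_PRESENT;
	val |= _PAGE_NUMA;

	return __pte(val);
}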

And you're using a gcc 5 these days? That's another variable to
try removing from the mix, to see if it makes a difference.

Hugh
Sasha Levin
2014-09-11 14:30:02 UTC
Post by Hugh Dickins
Post by Sasha Levin
Post by Hugh Dickins
Right, and Sasha reports that that can fire, but he sees the bug
with this patch in and without that firing.
I've changed that WARN_ON_ONCE() to a VM_BUG_ON_VMA() to get some useful
Well, thanks, but Mel and I have both failed to perceive any actual
problem arising from that peculiarity. And Mel's warning, and the 900s
in yesterday's dumps, have shown that it is not correlated with the
pte_mknuma() bug we are chasing. So there isn't anything that I want to
look up in these vmas. Or did you notice something interesting in them?
I thought this was a separate issue that would need taking care of as well.
Post by Hugh Dickins
Post by Sasha Levin
And on a maybe related note, I've started seeing the following today. It may
be because we fixed mbind() in trinity but it could also be related to
The fixed trinity may be counter-productive for now, since we think
there is an understandable pte_mknuma() bug coming from that direction,
but have not posted a patch for it yet.
I'm still seeing the bug with fixed trinity; it was a matter of adding more flags
to mbind.
Post by Hugh Dickins
Post by Sasha Levin
this issue (free_pgtables() is in the call chain). If you don't think it has
[ 1195.996803] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 1196.001744] IP: __rb_erase_color (include/linux/rbtree_augmented.h:107 lib/rbtree.c:229 lib/rbtree.c:367)
[ 1196.001744] vma_interval_tree_remove (mm/interval_tree.c:24)
[ 1196.001744] __remove_shared_vm_struct (mm/mmap.c:232)
[ 1196.001744] unlink_file_vma (mm/mmap.c:246)
[ 1196.001744] free_pgtables (mm/memory.c:547)
[ 1196.001744] exit_mmap (mm/mmap.c:2826)
[ 1196.001744] mmput (kernel/fork.c:654)
[ 1196.001744] do_exit (./arch/x86/include/asm/thread_info.h:168 kernel/exit.c:461 kernel/exit.c:746)
I didn't study in any detail, but this one seems much more like the
zeroing and vma corruption that you've been seeing in other dumps.
Though a single pte_mknuma() crash could presumably be caused by vma
corruption (but I think not mere zeroing), the recurrent way in which
you hit that pte_mknuma() bug in particular makes it unlikely to be
caused by random corruption.
You are generating new crashes faster than we can keep up with them.
Would this be a suitable point for you to switch over to testing
3.17-rc, to see if that is as unstable for you as -next is?
That VM_BUG_ON(!(val & _PAGE_PRESENT)) is not in the 3.17-rc tree,
but I think you can "safely" add it to 3.17-rc. Quotes around
"safely" meaning that we know that there's a bug to hit, at least
in -next, but I don't think it's going to be hit for stupid obvious
reasons.
I'll try it; usually I just hit a bunch of issues that were already fixed
in -next, which is why I try sticking to one tree.
Post by Hugh Dickins
And you're using a gcc 5 these days? That's another variable to
try removing from the mix, to see if it makes a difference.
I'm seeing the BUG getting hit with 4.7.2, so I don't think it's compiler
dependent. I'll try reproducing everything I reported yesterday with 4.7.2
just in case, but I don't think that this is the issue.


Thanks,
Sasha

Dave Jones
2014-09-11 14:40:03 UTC
Post by Sasha Levin
Post by Hugh Dickins
The fixed trinity may be counter-productive for now, since we think
there is an understandable pte_mknuma() bug coming from that direction,
but have not posted a patch for it yet.
I'm still seeing the bug with fixed trinity; it was a matter of adding more flags
to mbind.
What did I miss? Anything not in the MPOL_MF_VALID mask should be -EINVAL.
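
For reference, the check in question sits at the top of do_mbind() in
mm/mempolicy.c; roughly, with the flag definitions of that era (a
from-memory sketch, not verbatim):

/* include/uapi/linux/mempolicy.h */
#define MPOL_MF_STRICT   (1 << 0)	/* verify existing pages in the mapping */
#define MPOL_MF_MOVE     (1 << 1)	/* move pages owned by this process */
#define MPOL_MF_MOVE_ALL (1 << 2)	/* move every page in the mapping */
#define MPOL_MF_LAZY     (1 << 3)	/* defined, but deliberately left out of
					   MPOL_MF_VALID at the time */

#define MPOL_MF_VALID (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)

/* mm/mempolicy.c, do_mbind() */
if (flags & ~(unsigned long)MPOL_MF_VALID)
	return -EINVAL;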

Dave

Mel Gorman
2014-09-11 16:30:02 UTC
Post by Hugh Dickins
Post by Sasha Levin
Post by Hugh Dickins
Right, and Sasha reports that that can fire, but he sees the bug
with this patch in and without that firing.
I've changed that WARN_ON_ONCE() to a VM_BUG_ON_VMA() to get some useful
Well, thanks, but Mel and I have both failed to perceive any actual
problem arising from that peculiarity. And Mel's warning, and the 900s
in yesterday's dumps, have shown that it is not correlated with the
pte_mknuma() bug we are chasing. So there isn't anything that I want to
look up in these vmas. Or did you notice something interesting in them?
Post by Sasha Levin
And on a maybe related note, I've started seeing the following today. It may
be because we fixed mbind() in trinity but it could also be related to
The fixed trinity may be counter-productive for now, since we think
there is an understandable pte_mknuma() bug coming from that direction,
but have not posted a patch for it yet.
Post by Sasha Levin
this issue (free_pgtables() is in the call chain). If you don't think it has
[ 1195.996803] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 1196.001744] IP: __rb_erase_color (include/linux/rbtree_augmented.h:107 lib/rbtree.c:229 lib/rbtree.c:367)
[ 1196.001744] vma_interval_tree_remove (mm/interval_tree.c:24)
[ 1196.001744] __remove_shared_vm_struct (mm/mmap.c:232)
[ 1196.001744] unlink_file_vma (mm/mmap.c:246)
[ 1196.001744] free_pgtables (mm/memory.c:547)
[ 1196.001744] exit_mmap (mm/mmap.c:2826)
[ 1196.001744] mmput (kernel/fork.c:654)
[ 1196.001744] do_exit (./arch/x86/include/asm/thread_info.h:168 kernel/exit.c:461 kernel/exit.c:746)
I didn't study in any detail, but this one seems much more like the
zeroing and vma corruption that you've been seeing in other dumps.
I didn't look through the dumps closely today because I spent the time
putting together a KVM setup similar to Sasha's (many cpus, fake NUMA,
etc) so I could run trinity in it in another attempt to reproduce this.
I did not encounter the same VM_BUG_ON unfortunately. However, trinity
itself crashed after 2.5 hours complaining

[watchdog] pid 32188 has disappeared. Reaping.
[watchdog] pid 32024 has disappeared. Reaping.
[watchdog] pid 32300 has disappeared. Reaping.
[watchdog] Sanity check failed! Found pid 0 at pidslot 35!

This did not happen when running on bare metal. This error makes me wonder
if it is evidence that there is zeroing corruption occurring when running
inside KVM. Another possibility is that it's somehow related to fake NUMA
although it's hard to see how. It's still possible the bug is with the
page table handling and KVM affects timing enough to cause problems so
I'm not ruling that out.
Post by Hugh Dickins
Though a single pte_mknuma() crash could presumably be caused by vma
corruption (but I think not mere zeroing), the recurrent way in which
you hit that pte_mknuma() bug in particular makes it unlikely to be
caused by random corruption.
You are generating new crashes faster than we can keep up with them.
Would this be a suitable point for you to switch over to testing
3.17-rc, to see if that is as unstable for you as -next is?
That VM_BUG_ON(!(val & _PAGE_PRESENT)) is not in the 3.17-rc tree,
but I think you can "safely" add it to 3.17-rc. Quotes around
"safely" meaning that we know that there's a bug to hit, at least
in -next, but I don't think it's going to be hit for stupid obvious
reasons.
Agreed. If 3.17-rc4 looks stable with the VM_BUG_ON then it would be
really nice if you could bisect 3.17-rc4 to linux-next carrying the
VM_BUG_ON(!(val & _PAGE_PRESENT)) check at each bisection point. I'm not
100% sure if I'm seeing the same corruption as you or some other issue and
do not want to conflate numerous different problems into one. I know this
is a pain in the ass but if 3.17-rc4 looks stable then a bisection might
be faster overall than my constant head scratching :(
--
Mel Gorman
SUSE Labs
Sasha Levin
2014-09-17 21:40:01 UTC
Post by Mel Gorman
Agreed. If 3.17-rc4 looks stable with the VM_BUG_ON then it would be
really nice if you could bisect 3.17-rc4 to linux-next carrying the
VM_BUG_ON(!(val & _PAGE_PRESENT)) check at each bisection point. I'm not
100% sure if I'm seeing the same corruption as you or some other issue and
do not want to conflate numerous different problems into one. I know this
is a pain in the ass but if 3.17-rc4 looks stable then a bisection might
be faster overall than my constant head scratching :(
The good news is that 3.17-rc4 seems to be stable. I'll start the bisection,
which I suspect would take several days. I'll update when I run into something.
I might need a bit of help here. The bisection is going sideways because I
can't reliably reproduce the issue.

We don't know what's causing this issue, but we know what the symptoms are. Is
there a VM_BUG_ON we could add somewhere so that it would be more likely to
trigger?


Thanks,
Sasha

Sasha Levin
2014-09-10 13:20:02 UTC
Sasha, you say you're getting plenty of these now, but I've only seen
the dump for one of them, on Aug26: please post a few more dumps, so
that we can look for commonality.
I wasn't saving older logs for this issue so I only have 2 traces from
tonight. If that's not enough please let me know and I'll try to add
a few more.

[ 1125.600123] kernel BUG at include/asm-generic/pgtable.h:724!
[ 1125.600123] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 1125.600123] Dumping ftrace buffer:
[ 1125.600123] (ftrace buffer empty)
[ 1125.600123] Modules linked in:
[ 1125.600123] CPU: 16 PID: 11903 Comm: trinity-c517 Not tainted 3.17.0-rc4-next-20140909-sasha-00032-gc16d47b #1135
[ 1125.600123] task: ffff880661730000 ti: ffff880582c20000 task.ti: ffff880582c20000
[ 1125.600123] RIP: 0010:[<ffffffffa32e500a>] [<ffffffffa32e500a>] change_pte_range+0x4ea/0x4f0
[ 1125.600123] RSP: 0018:ffff880582c23d68 EFLAGS: 00010246
[ 1125.600123] RAX: 0000000936d9a900 RBX: 00007ffdb17c8000 RCX: 0000000000000100
[ 1125.600123] RDX: 0000000936d9a900 RSI: 00007ffdb17c8000 RDI: 0000000936d9a900
[ 1125.600123] RBP: ffff880582c23dc8 R08: ffff8802a8f2d400 R09: 0000000000b56000
[ 1125.600123] R10: 0000000000020201 R11: 0000000000000008 R12: ffff88004dd6ee40
[ 1125.600123] R13: 8000000000000025 R14: 00007ffdb1800000 R15: ffffc00000000fff
[ 1125.600123] FS: 00007ffdb6382700(0000) GS:ffff880278200000(0000) knlGS:0000000000000000
[ 1125.600123] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1125.600123] CR2: 00007ffdb617e60c CR3: 000000050ff12000 CR4: 00000000000006a0
[ 1125.600123] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1125.600123] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 1125.600123] Stack:
[ 1125.600123] 0000000000000001 0000000936d9a900 0000000000000046 ffff8804bd549f40
[ 1125.600123] 000000001f989000 ffff8802a8f2d400 ffff88051f989000 00007f9f40604cfdb1ac8000
[ 1125.600123] ffff88032fcc3c58 00007ffdb16df000 00007ffdb16df000 00007ffdb1800000
[ 1125.600123] Call Trace:
[ 1125.600123] [<ffffffffa32e52c4>] change_protection+0x2b4/0x4e0
[ 1125.600123] [<ffffffffa32fefdb>] change_prot_numa+0x1b/0x40
[ 1125.600123] [<ffffffffa31add86>] task_numa_work+0x1f6/0x330
[ 1125.600123] [<ffffffffa3193d84>] task_work_run+0xc4/0xf0
[ 1125.600123] [<ffffffffa3071477>] do_notify_resume+0x97/0xb0
[ 1125.600123] [<ffffffffa650daea>] int_signal+0x12/0x17
[ 1125.600123] Code: 66 90 48 8b 7d b8 e8 f6 75 22 03 48 8b 45 b0 e9 6f ff ff ff 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b <0f> 0b 0f 0b 0f 0b 66 66 66 66 90 55 48 89 e5 41 57 49 89 d7 41
[ 1125.600123] RIP [<ffffffffa32e500a>] change_pte_range+0x4ea/0x4f0
[ 1125.600123] RSP <ffff880582c23d68>

[ 3131.084176] kernel BUG at include/asm-generic/pgtable.h:724!
[ 3131.087358] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 3131.090143] Dumping ftrace buffer:
[ 3131.090143] (ftrace buffer empty)
[ 3131.090143] Modules linked in:
[ 3131.090143] CPU: 8 PID: 20595 Comm: trinity-c34 Not tainted 3.17.0-rc4-next-20140909-sasha-00032-gc16d47b #1135
[ 3131.090143] task: ffff8801ded60000 ti: ffff8803204ec000 task.ti: ffff8803204ec000
[ 3131.090143] RIP: 0010:[<ffffffffa72e500a>] [<ffffffffa72e500a>] change_pte_range+0x4ea/0x4f0
[ 3131.090143] RSP: 0000:ffff8803204efd68 EFLAGS: 00010246
[ 3131.090143] RAX: 0000000971bba900 RBX: 00007ffda1d4d000 RCX: 0000000000000100
[ 3131.090143] RDX: 0000000971bba900 RSI: 00007ffda1d4d000 RDI: 0000000971bba900
[ 3131.120281] RBP: ffff8803204efdc8 R08: ffff88026bed8800 R09: 0000000000b48000
[ 3131.120281] R10: 0000000000076501 R11: 0000000000000008 R12: ffff8801ca071a68
[ 3131.120281] R13: 8000000000000025 R14: 00007ffda1dbf000 R15: ffffc00000000fff
[ 3131.120281] FS: 00007ffda5cd4700(0000) GS:ffff880277e00000(0000) knlGS:0000000000000000
[ 3131.120281] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3131.120281] CR2: 00000000025d6000 CR3: 00000004bcde2000 CR4: 00000000000006a0
[ 3131.120281] Stack:
[ 3131.120281] 0000000000000001 0000000971bba900 000000000000005c ffff8800661a7b60
[ 3131.120281] 00000000f4953000 ffff88026bed8800 ffff8801f4953000 00007ffda1dbf000
[ 3131.120281] ffff8802b3319870 00007ffda1c1b000 00007ffda1c1b000 00007ffda1dbf000
[ 3131.120281] Call Trace:
[ 3131.120281] [<ffffffffa72e52c4>] change_protection+0x2b4/0x4e0
[ 3131.120281] [<ffffffffa72fefdb>] change_prot_numa+0x1b/0x40
[ 3131.120281] [<ffffffffa71add86>] task_numa_work+0x1f6/0x330
[ 3131.120281] [<ffffffffa7193d84>] task_work_run+0xc4/0xf0
[ 3131.120281] [<ffffffffa7071477>] do_notify_resume+0x97/0xb0
[ 3131.120281] [<ffffffffaa50e6ae>] retint_signal+0x4d/0x9f
[ 3131.120281] Code: 66 90 48 8b 7d b8 e8 f6 75 22 03 48 8b 45 b0 e9 6f ff ff ff 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b <0f> 0b 0f 0b 0f 0b 66 66 66 66 90 55 48 89 e5 41 57 49 89 d7 41
[ 3131.120281] RIP [<ffffffffa72e500a>] change_pte_range+0x4ea/0x4f0
[ 3131.120281] RSP <ffff8803204efd68>
And please attach a disassembly of change_protection_range() (noting that
"Code" just shows a cluster of ud2s for the unlikely bugs at the end of the
function), since we cannot tell at all what should be in the registers by then.
change_protection_range() got inlined into change_protection(); the disassembly
below applies to both traces above:

00000000000004f0 <change_protection>:
4f0: e8 00 00 00 00 callq 4f5 <change_protection+0x5>
4f1: R_X86_64_PC32 __fentry__-0x4
4f5: 55 push %rbp
4f6: 48 89 e5 mov %rsp,%rbp
4f9: 41 57 push %r15
4fb: 49 89 d7 mov %rdx,%r15
4fe: 41 56 push %r14
500: 41 55 push %r13
502: 41 54 push %r12
504: 53 push %rbx
505: 48 81 ec 98 00 00 00 sub $0x98,%rsp
50c: 48 89 7d c8 mov %rdi,-0x38(%rbp)
510: 48 89 75 c0 mov %rsi,-0x40(%rbp)
514: 48 89 4d b8 mov %rcx,-0x48(%rbp)
518: 44 89 45 98 mov %r8d,-0x68(%rbp)
51c: 44 89 4d 9c mov %r9d,-0x64(%rbp)
520: f6 47 52 40 testb $0x40,0x52(%rdi)
524: 0f 85 96 03 00 00 jne 8c0 <change_protection+0x3d0>
52a: 48 8b 45 c8 mov -0x38(%rbp),%rax
52e: 48 8b 40 40 mov 0x40(%rax),%rax
532: 48 89 45 80 mov %rax,-0x80(%rbp)
536: 48 39 55 c0 cmp %rdx,-0x40(%rbp)
53a: 0f 83 40 04 00 00 jae 980 <change_protection+0x490>
540: 4c 8b 5d c0 mov -0x40(%rbp),%r11
544: 48 8b 4d 80 mov -0x80(%rbp),%rcx
548: 4c 89 d8 mov %r11,%rax
54b: 48 c1 e8 24 shr $0x24,%rax
54f: c6 81 dc 08 00 00 01 movb $0x1,0x8dc(%rcx)
556: 25 f8 0f 00 00 and $0xff8,%eax
55b: 48 03 41 40 add 0x40(%rcx),%rax
55f: 48 8d 52 ff lea -0x1(%rdx),%rdx
563: 4c 89 7d d0 mov %r15,-0x30(%rbp)
567: 49 89 c7 mov %rax,%r15
56a: 48 89 55 b0 mov %rdx,-0x50(%rbp)
56e: 48 c7 45 a8 00 00 00 movq $0x0,-0x58(%rbp)
575: 00
576: 48 b8 00 00 00 00 80 movabs $0x8000000000,%rax
57d: 00 00 00
580: 49 8b 3f mov (%r15),%rdi
583: 49 bd 00 00 00 00 80 movabs $0xffffff8000000000,%r13
58a: ff ff ff
58d: 4c 01 d8 add %r11,%rax
590: 49 21 c5 and %rax,%r13
593: 49 8d 45 ff lea -0x1(%r13),%rax
597: 48 3b 45 b0 cmp -0x50(%rbp),%rax
59b: 4c 0f 43 6d d0 cmovae -0x30(%rbp),%r13
5a0: 48 85 ff test %rdi,%rdi
5a3: 0f 84 2f 02 00 00 je 7d8 <change_protection+0x2e8>
5a9: 48 b8 fb 0f 00 00 00 movabs $0xffffc00000000ffb,%rax
5b0: c0 ff ff
5b3: 48 21 f8 and %rdi,%rax
5b6: 48 83 f8 63 cmp $0x63,%rax
5ba: 0f 85 98 03 00 00 jne 958 <change_protection+0x468>
5c0: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 5c8 <change_protection+0xd8>
5c7: 00
5c3: R_X86_64_PC32 pv_mmu_ops+0xf3
5c8: 0f 84 d2 03 00 00 je 9a0 <change_protection+0x4b0>
5ce: ff 14 25 00 00 00 00 callq *0x0
5d1: R_X86_64_32S pv_mmu_ops+0xf8
5d5: 4c 89 df mov %r11,%rdi
5d8: 4d 89 ea mov %r13,%r10
5db: 4c 89 bd 60 ff ff ff mov %r15,-0xa0(%rbp)
5e2: 48 ba 00 f0 ff ff ff movabs $0x3ffffffff000,%rdx
5e9: 3f 00 00
5ec: 48 c1 ef 1b shr $0x1b,%rdi
5f0: 48 21 d0 and %rdx,%rax
5f3: 48 be 00 00 00 00 00 movabs $0xffff880000000000,%rsi
5fa: 88 ff ff
5fd: 48 c7 85 68 ff ff ff movq $0x0,-0x98(%rbp)
604: 00 00 00 00
608: 81 e7 f8 0f 00 00 and $0xff8,%edi
60e: 48 89 95 70 ff ff ff mov %rdx,-0x90(%rbp)
615: 48 01 f7 add %rsi,%rdi
618: 4c 8d 34 07 lea (%rdi,%rax,1),%r14
61c: 49 8d 45 ff lea -0x1(%r13),%rax
620: 4d 89 f5 mov %r14,%r13
623: 4d 89 de mov %r11,%r14
626: 48 89 45 a0 mov %rax,-0x60(%rbp)
62a: 49 8d 9e 00 00 00 40 lea 0x40000000(%r14),%rbx
631: 49 8b 7d 00 mov 0x0(%r13),%rdi
635: 48 81 e3 00 00 00 c0 and $0xffffffffc0000000,%rbx
63c: 48 8d 43 ff lea -0x1(%rbx),%rax
640: 48 3b 45 a0 cmp -0x60(%rbp),%rax
644: 49 0f 43 da cmovae %r10,%rbx
648: 48 85 ff test %rdi,%rdi
64b: 0f 84 ff 01 00 00 je 850 <change_protection+0x360>
651: 48 b8 98 0f 00 00 00 movabs $0xffffc00000000f98,%rax
658: c0 ff ff
65b: 48 85 c7 test %rax,%rdi
65e: 0f 85 04 03 00 00 jne 968 <change_protection+0x478>
664: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 66c <change_protection+0x17c>
66b: 00
667: R_X86_64_PC32 pv_mmu_ops+0x11b
66c: 0f 84 4e 03 00 00 je 9c0 <change_protection+0x4d0>
672: 48 8b 45 c8 mov -0x38(%rbp),%rax
676: 48 8b 40 40 mov 0x40(%rax),%rax
67a: 48 89 85 78 ff ff ff mov %rax,-0x88(%rbp)
681: ff 14 25 00 00 00 00 callq *0x0
684: R_X86_64_32S pv_mmu_ops+0x120
688: 48 23 85 70 ff ff ff and -0x90(%rbp),%rax
68f: 4d 89 f4 mov %r14,%r12
692: 45 31 db xor %r11d,%r11d
695: 4c 89 ad 48 ff ff ff mov %r13,-0xb8(%rbp)
69c: 49 c1 ec 12 shr $0x12,%r12
6a0: 48 c7 45 88 00 00 00 movq $0x0,-0x78(%rbp)
6a7: 00
6a8: 4d 89 dd mov %r11,%r13
6ab: 41 81 e4 f8 0f 00 00 and $0xff8,%r12d
6b2: 4c 89 95 50 ff ff ff mov %r10,-0xb0(%rbp)
6b9: 48 ba 00 00 00 00 00 movabs $0xffff880000000000,%rdx
6c0: 88 ff ff
6c3: 48 c7 85 58 ff ff ff movq $0x0,-0xa8(%rbp)
6ca: 00 00 00 00
6ce: 49 01 d4 add %rdx,%r12
6d1: 49 01 c4 add %rax,%r12
6d4: 48 8d 43 ff lea -0x1(%rbx),%rax
6d8: 48 89 45 90 mov %rax,-0x70(%rbp)
6dc: 4d 8d be 00 00 20 00 lea 0x200000(%r14),%r15
6e3: 49 8b 3c 24 mov (%r12),%rdi
6e7: 49 81 e7 00 00 e0 ff and $0xffffffffffe00000,%r15
6ee: 49 8d 47 ff lea -0x1(%r15),%rax
6f2: 48 3b 45 90 cmp -0x70(%rbp),%rax
6f6: 4c 0f 43 fb cmovae %rbx,%r15
6fa: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 702 <change_protection+0x212>
701: 00
6fd: R_X86_64_PC32 pv_mmu_ops+0x10b
702: 0f 84 60 01 00 00 je 868 <change_protection+0x378>
708: ff 14 25 00 00 00 00 callq *0x0
70b: R_X86_64_32S pv_mmu_ops+0x110
70f: a8 80 test $0x80,%al
711: 0f 84 59 01 00 00 je 870 <change_protection+0x380>
717: 4d 85 ed test %r13,%r13
71a: 75 18 jne 734 <change_protection+0x244>
71c: 48 8b 85 78 ff ff ff mov -0x88(%rbp),%rax
723: 4d 89 f5 mov %r14,%r13
726: 48 83 b8 c0 04 00 00 cmpq $0x0,0x4c0(%rax)
72d: 00
72e: 0f 85 54 02 00 00 jne 988 <change_protection+0x498>
734: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 73c <change_protection+0x24c>
73b: 00
737: R_X86_64_PC32 pv_mmu_ops+0x10b
73c: 49 8b 3c 24 mov (%r12),%rdi
740: 0f 84 22 01 00 00 je 868 <change_protection+0x378>
746: ff 14 25 00 00 00 00 callq *0x0
749: R_X86_64_32S pv_mmu_ops+0x110
74d: a8 80 test $0x80,%al
74f: 74 33 je 784 <change_protection+0x294>
751: 4c 89 f8 mov %r15,%rax
754: 4c 29 f0 sub %r14,%rax
757: 48 3d 00 00 20 00 cmp $0x200000,%rax
75d: 0f 84 7d 01 00 00 je 8e0 <change_protection+0x3f0>
763: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 76b <change_protection+0x27b>
76a: 00
766: R_X86_64_PC32 pv_mmu_ops+0x10b
76b: 49 8b 3c 24 mov (%r12),%rdi
76f: 0f 84 f3 00 00 00 je 868 <change_protection+0x378>
775: ff 14 25 00 00 00 00 callq *0x0
778: R_X86_64_32S pv_mmu_ops+0x110
77c: a8 80 test $0x80,%al
77e: 0f 85 24 02 00 00 jne 9a8 <change_protection+0x4b8>
784: 8b 45 9c mov -0x64(%rbp),%eax
787: 4c 89 f9 mov %r15,%rcx
78a: 4c 89 f2 mov %r14,%rdx
78d: 4c 89 e6 mov %r12,%rsi
790: 44 8b 4d 98 mov -0x68(%rbp),%r9d
794: 4c 8b 45 b8 mov -0x48(%rbp),%r8
798: 48 8b 7d c8 mov -0x38(%rbp),%rdi
79c: 89 04 24 mov %eax,(%rsp)
79f: e8 5c f8 ff ff callq 0 <change_pte_range>
7a4: 48 01 45 88 add %rax,-0x78(%rbp)
7a8: 49 83 c4 08 add $0x8,%r12
7ac: 4c 39 fb cmp %r15,%rbx
7af: 74 3f je 7f0 <change_protection+0x300>
7b1: 4d 89 fe mov %r15,%r14
7b4: e9 23 ff ff ff jmpq 6dc <change_protection+0x1ec>
7b9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
7c0: 48 8b b5 68 ff ff ff mov -0x98(%rbp),%rsi
7c7: 4d 89 d5 mov %r10,%r13
7ca: 4c 8b bd 60 ff ff ff mov -0xa0(%rbp),%r15
7d1: 48 01 75 a8 add %rsi,-0x58(%rbp)
7d5: 0f 1f 00 nopl (%rax)
7d8: 49 83 c7 08 add $0x8,%r15
7dc: 4c 39 6d d0 cmp %r13,-0x30(%rbp)
7e0: 0f 84 3a 01 00 00 je 920 <change_protection+0x430>
7e6: 4d 89 eb mov %r13,%r11
7e9: e9 88 fd ff ff jmpq 576 <change_protection+0x86>
7ee: 66 90 xchg %ax,%ax
7f0: 4d 89 eb mov %r13,%r11
7f3: 4c 8b 95 50 ff ff ff mov -0xb0(%rbp),%r10
7fa: 4c 8b ad 48 ff ff ff mov -0xb8(%rbp),%r13
801: 4d 85 db test %r11,%r11
804: 74 2a je 830 <change_protection+0x340>
806: 48 8b 85 78 ff ff ff mov -0x88(%rbp),%rax
80d: 48 83 b8 c0 04 00 00 cmpq $0x0,0x4c0(%rax)
814: 00
815: 74 19 je 830 <change_protection+0x340>
817: 48 89 da mov %rbx,%rdx
81a: 4c 89 de mov %r11,%rsi
81d: 48 89 c7 mov %rax,%rdi
820: 4c 89 55 90 mov %r10,-0x70(%rbp)
824: e8 00 00 00 00 callq 829 <change_protection+0x339>
825: R_X86_64_PC32 __mmu_notifier_invalidate_range_end-0x4
829: 4c 8b 55 90 mov -0x70(%rbp),%r10
82d: 0f 1f 00 nopl (%rax)
830: 48 8b 85 58 ff ff ff mov -0xa8(%rbp),%rax
837: 48 85 c0 test %rax,%rax
83a: 74 09 je 845 <change_protection+0x355>
83c: 65 48 01 04 25 00 00 add %rax,%gs:0x0
843: 00 00
841: R_X86_64_32S vm_event_states+0x170
845: 48 8b 75 88 mov -0x78(%rbp),%rsi
849: 48 01 b5 68 ff ff ff add %rsi,-0x98(%rbp)
850: 49 83 c5 08 add $0x8,%r13
854: 49 39 da cmp %rbx,%r10
857: 0f 84 63 ff ff ff je 7c0 <change_protection+0x2d0>
85d: 49 89 de mov %rbx,%r14
860: e9 c5 fd ff ff jmpq 62a <change_protection+0x13a>
865: 0f 1f 00 nopl (%rax)
868: 0f 0b ud2
86a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
870: 49 8b 04 24 mov (%r12),%rax
874: 48 85 c0 test %rax,%rax
877: 0f 84 2b ff ff ff je 7a8 <change_protection+0x2b8>
87d: 48 89 c2 mov %rax,%rdx
880: 81 e2 01 02 00 00 and $0x201,%edx
886: 48 81 fa 00 02 00 00 cmp $0x200,%rdx
88d: 0f 84 84 fe ff ff je 717 <change_protection+0x227>
893: 48 be fb 0f 00 00 00 movabs $0xffffc00000000ffb,%rsi
89a: c0 ff ff
89d: 48 21 f0 and %rsi,%rax
8a0: 48 83 f8 63 cmp $0x63,%rax
8a4: 0f 84 6d fe ff ff je 717 <change_protection+0x227>
8aa: 4c 89 e7 mov %r12,%rdi
8ad: e8 00 00 00 00 callq 8b2 <change_protection+0x3c2>
8ae: R_X86_64_PC32 pmd_clear_bad-0x4
8b2: e9 f1 fe ff ff jmpq 7a8 <change_protection+0x2b8>
8b7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
8be: 00 00
8c0: e8 00 00 00 00 callq 8c5 <change_protection+0x3d5>
8c1: R_X86_64_PC32 hugetlb_change_protection-0x4
8c5: 48 89 45 a8 mov %rax,-0x58(%rbp)
8c9: 48 8b 45 a8 mov -0x58(%rbp),%rax
8cd: 48 81 c4 98 00 00 00 add $0x98,%rsp
8d4: 5b pop %rbx
8d5: 41 5c pop %r12
8d7: 41 5d pop %r13
8d9: 41 5e pop %r14
8db: 41 5f pop %r15
8dd: 5d pop %rbp
8de: c3 retq
8df: 90 nop
8e0: 44 8b 45 9c mov -0x64(%rbp),%r8d
8e4: 4c 89 f2 mov %r14,%rdx
8e7: 4c 89 e6 mov %r12,%rsi
8ea: 48 8b 4d b8 mov -0x48(%rbp),%rcx
8ee: 48 8b 7d c8 mov -0x38(%rbp),%rdi
8f2: e8 00 00 00 00 callq 8f7 <change_protection+0x407>
8f3: R_X86_64_PC32 change_huge_pmd-0x4
8f7: 85 c0 test %eax,%eax
8f9: 0f 84 85 fe ff ff je 784 <change_protection+0x294>
8ff: 3d 00 02 00 00 cmp $0x200,%eax
904: 0f 85 9e fe ff ff jne 7a8 <change_protection+0x2b8>
90a: 48 81 45 88 00 02 00 addq $0x200,-0x78(%rbp)
911: 00
912: 48 83 85 58 ff ff ff addq $0x1,-0xa8(%rbp)
919: 01
91a: e9 89 fe ff ff jmpq 7a8 <change_protection+0x2b8>
91f: 90 nop
920: 48 83 7d a8 00 cmpq $0x0,-0x58(%rbp)
925: 4c 8b 7d d0 mov -0x30(%rbp),%r15
929: 74 18 je 943 <change_protection+0x453>
92b: 48 8b 45 c8 mov -0x38(%rbp),%rax
92f: 4c 89 fa mov %r15,%rdx
932: 48 8b 75 c0 mov -0x40(%rbp),%rsi
936: 48 8b 48 50 mov 0x50(%rax),%rcx
93a: 48 8b 78 40 mov 0x40(%rax),%rdi
93e: e8 00 00 00 00 callq 943 <change_protection+0x453>
93f: R_X86_64_PC32 flush_tlb_mm_range-0x4
943: 48 8b 45 80 mov -0x80(%rbp),%rax
947: c6 80 dc 08 00 00 00 movb $0x0,0x8dc(%rax)
94e: e9 76 ff ff ff jmpq 8c9 <change_protection+0x3d9>
953: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
958: 4c 89 ff mov %r15,%rdi
95b: e8 00 00 00 00 callq 960 <change_protection+0x470>
95c: R_X86_64_PC32 pgd_clear_bad-0x4
960: e9 73 fe ff ff jmpq 7d8 <change_protection+0x2e8>
965: 0f 1f 00 nopl (%rax)
968: 4c 89 ef mov %r13,%rdi
96b: 4c 89 55 90 mov %r10,-0x70(%rbp)
96f: e8 00 00 00 00 callq 974 <change_protection+0x484>
970: R_X86_64_PC32 pud_clear_bad-0x4
974: 4c 8b 55 90 mov -0x70(%rbp),%r10
978: e9 d3 fe ff ff jmpq 850 <change_protection+0x360>
97d: 0f 1f 00 nopl (%rax)
980: 0f 0b ud2
982: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
988: 48 89 da mov %rbx,%rdx
98b: 4c 89 f6 mov %r14,%rsi
98e: 48 89 c7 mov %rax,%rdi
991: e8 00 00 00 00 callq 996 <change_protection+0x4a6>
992: R_X86_64_PC32 __mmu_notifier_invalidate_range_start-0x4
996: e9 99 fd ff ff jmpq 734 <change_protection+0x244>
99b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
9a0: 0f 0b ud2
9a2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
9a8: 48 8b 7d c8 mov -0x38(%rbp),%rdi
9ac: 4c 89 e2 mov %r12,%rdx
9af: 4c 89 f6 mov %r14,%rsi
9b2: e8 00 00 00 00 callq 9b7 <change_protection+0x4c7>
9b3: R_X86_64_PC32 __split_huge_page_pmd-0x4
9b7: e9 c8 fd ff ff jmpq 784 <change_protection+0x294>
9bc: 0f 1f 40 00 nopl 0x0(%rax)
9c0: 0f 0b ud2
9c2: 66 66 66 66 66 2e 0f data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
9c9: 1f 84 00 00 00 00 00
I've been rather assuming that the 9d340902 seen in many of the
registers in that Aug26 dump is the pte val in question: that's
SOFT_DIRTY|PROTNONE|RW.
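A minimal userspace sketch of that decode, assuming the 3.16-era x86
layout (_PAGE_RW at bit 1, _PAGE_PROTNONE reusing the global bit 8,
_PAGE_SOFT_DIRTY at software bit 11):

	#include <stdio.h>

	#define _PAGE_RW         (1UL << 1)   /* 0x002 */
	#define _PAGE_PROTNONE   (1UL << 8)   /* 0x100, shares _PAGE_BIT_GLOBAL */
	#define _PAGE_SOFT_DIRTY (1UL << 11)  /* 0x800 */

	int main(void)
	{
		unsigned long pteval = 0x9d340902UL;  /* low word from the dump */

		printf("RW=%d PROTNONE=%d SOFT_DIRTY=%d\n",
		       !!(pteval & _PAGE_RW),
		       !!(pteval & _PAGE_PROTNONE),
		       !!(pteval & _PAGE_SOFT_DIRTY));
		return 0;  /* prints RW=1 PROTNONE=1 SOFT_DIRTY=1 */
	}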
I think RW on PROTNONE is unusual but not impossible (migration entry
replacement racing with mprotect setting PROT_NONE, after it's updated
vm_page_prot, before it's reached the page table). But exciting though
that line of thought is, I cannot actually bring it to a pte_mknuma bug,
or any bug at all.
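To sketch the interleaving I have in mind (illustrative only, based on
what remove_migration_pte() roughly does, not on a reproduced trace):

	/*
	 * CPU0: mprotect(PROT_NONE) has updated vma->vm_page_prot, but
	 * change_protection() has not yet reached this page table.
	 * CPU1: migration completes and remove_migration_pte() runs:
	 */
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); /* picks up PROTNONE */
	if (pte_swp_soft_dirty(*ptep))
		pte = pte_mksoft_dirty(pte);             /* ...SOFT_DIRTY */
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);                  /* ...and RW on PROTNONE */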
Mel, no way can it be the cause of this bug - unless Sasha's later
traces actually show a different stack - but I don't see the call
to change_prot_numa() from queue_pages_range() sharing the same
avoidance of PROT_NONE that task_numa_work() has (though it does
have an outdated comment about PROT_NONE which should be removed).
So I think that site probably does need PROT_NONE checking added.
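Something like the guard task_numa_work() already carries would do
there; a sketch only, not a tested hunk:

	/*
	 * Skip inaccessible VMAs to avoid any confusion between
	 * PROT_NONE protections and NUMA hinting faults - the check
	 * task_numa_work() has, which the queue_pages_range() path
	 * to change_prot_numa() appears to lack.
	 */
	if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
		continue;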
I've spotted a new trace in overnight fuzzing; it could be related to this issue:

[ 3494.324839] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 3494.332153] Dumping ftrace buffer:
[ 3494.332153] (ftrace buffer empty)
[ 3494.332153] Modules linked in:
[ 3494.332153] CPU: 8 PID: 2727 Comm: trinity-c929 Not tainted 3.17.0-rc4-next-20140909-sasha-00032-gc16d47b #1135
[ 3494.332153] task: ffff88047e52b000 ti: ffff8804d491c000 task.ti: ffff8804d491c000
[ 3494.332153] RIP: task_numa_work (include/linux/mempolicy.h:177 kernel/sched/fair.c:1956)
[ 3494.332153] RSP: 0000:ffff8804d491feb8 EFLAGS: 00010206
[ 3494.332153] RAX: 0000000000000000 RBX: ffff8804bf4e8000 RCX: 000000000000e8e8
[ 3494.343974] RDX: 000000000000000a RSI: 0000000000000000 RDI: ffff8804bd6d4da8
[ 3494.343974] RBP: ffff8804d491fef8 R08: ffff8804bf4e84c8 R09: 0000000000000000
[ 3494.343974] R10: 00007f53e443c000 R11: 0000000000000001 R12: 00007f53e443c000
[ 3494.343974] R13: 000000000000dc51 R14: 006f732e61727478 R15: ffff88047e52b000
[ 3494.343974] FS: 00007f53e463f700(0000) GS:ffff880277e00000(0000) knlGS:0000000000000000
[ 3494.343974] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 3494.369895] CR2: 0000000001670fa8 CR3: 0000000283562000 CR4: 00000000000006a0
[ 3494.369895] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3494.369895] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 3494.380081] Stack:
[ 3494.380081] ffff8804bf4e80a8 0000000000000014 00007f53e4437000 0000000000000000
[ 3494.380081] ffffffff9b976e70 ffff88047e52bbd8 ffff88047e52b000 0000000000000000
[ 3494.380081] ffff8804d491ff28 ffffffff95193d84 0000000000000002 ffff8804d491ff58
[ 3494.380081] Call Trace:
[ 3494.380081] task_work_run (kernel/task_work.c:125 (discriminator 1))
[ 3494.380081] do_notify_resume (include/linux/tracehook.h:190 arch/x86/kernel/signal.c:758)
[ 3494.380081] retint_signal (arch/x86/kernel/entry_64.S:918)
[ 3494.380081] Code: e8 1e e5 01 00 48 89 df 4c 89 e6 e8 a3 2d 13 00 49 89 c6 48 85 c0 0f 84 07 02 00 00 48 c7 45 c8 00 00 00 00 0f 1f 80 00 00 00 00 <49> f7 46 50 00 44 00 00 0f 85 42 01 00 00 49 8b 86 a0 00 00 00
All code
========
0: e8 1e e5 01 00 callq 0x1e523
5: 48 89 df mov %rbx,%rdi
8: 4c 89 e6 mov %r12,%rsi
b: e8 a3 2d 13 00 callq 0x132db3
10: 49 89 c6 mov %rax,%r14
13: 48 85 c0 test %rax,%rax
16: 0f 84 07 02 00 00 je 0x223
1c: 48 c7 45 c8 00 00 00 movq $0x0,-0x38(%rbp)
23: 00
24: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
2b:* 49 f7 46 50 00 44 00 testq $0x4400,0x50(%r14) <-- trapping instruction
32: 00
33: 0f 85 42 01 00 00 jne 0x17b
39: 49 8b 86 a0 00 00 00 mov 0xa0(%r14),%rax
...

Code starting with the faulting instruction
===========================================
0: 49 f7 46 50 00 44 00 testq $0x4400,0x50(%r14)
7: 00
8: 0f 85 42 01 00 00 jne 0x150
e: 49 8b 86 a0 00 00 00 mov 0xa0(%r14),%rax
...
[ 3494.380081] RIP task_numa_work (include/linux/mempolicy.h:177 kernel/sched/fair.c:1956)
[ 3494.380081] RSP <ffff8804d491feb8>


Thanks,
Sasha
Sasha Levin
2014-09-10 16:50:02 UTC
Permalink
<SNIP, haven't digested the rest>
[ 3494.324839] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 3494.332153] (ftrace buffer empty)
[ 3494.332153] CPU: 8 PID: 2727 Comm: trinity-c929 Not tainted 3.17.0-rc4-next-20140909-sasha-00032-gc16d47b #1135
[ 3494.332153] task: ffff88047e52b000 ti: ffff8804d491c000 task.ti: ffff8804d491c000
[ 3494.332153] RIP: task_numa_work (include/linux/mempolicy.h:177 kernel/sched/fair.c:1956)
[ 3494.332153] RSP: 0000:ffff8804d491feb8 EFLAGS: 00010206
[ 3494.332153] RAX: 0000000000000000 RBX: ffff8804bf4e8000 RCX: 000000000000e8e8
[ 3494.343974] RDX: 000000000000000a RSI: 0000000000000000 RDI: ffff8804bd6d4da8
[ 3494.343974] RBP: ffff8804d491fef8 R08: ffff8804bf4e84c8 R09: 0000000000000000
[ 3494.343974] R10: 00007f53e443c000 R11: 0000000000000001 R12: 00007f53e443c000
[ 3494.343974] R13: 000000000000dc51 R14: 006f732e61727478 R15: ffff88047e52b000
[ 3494.343974] FS: 00007f53e463f700(0000) GS:ffff880277e00000(0000) knlGS:0000000000000000
[ 3494.343974] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 3494.369895] CR2: 0000000001670fa8 CR3: 0000000283562000 CR4: 00000000000006a0
[ 3494.369895] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3494.369895] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 3494.380081] ffff8804bf4e80a8 0000000000000014 00007f53e4437000 0000000000000000
[ 3494.380081] ffffffff9b976e70 ffff88047e52bbd8 ffff88047e52b000 0000000000000000
[ 3494.380081] ffff8804d491ff28 ffffffff95193d84 0000000000000002 ffff8804d491ff58
[ 3494.380081] task_work_run (kernel/task_work.c:125 (discriminator 1))
[ 3494.380081] do_notify_resume (include/linux/tracehook.h:190 arch/x86/kernel/signal.c:758)
[ 3494.380081] retint_signal (arch/x86/kernel/entry_64.S:918)
[ 3494.380081] Code: e8 1e e5 01 00 48 89 df 4c 89 e6 e8 a3 2d 13 00 49 89 c6 48 85 c0 0f 84 07 02 00 00 48 c7 45 c8 00 00 00 00 0f 1f 80 00 00 00 00 <49> f7 46 50 00 44 00 00 0f 85 42 01 00 00 49 8b 86 a0 00 00 00
Shot in the dark, but can you test this please? Pagetable teardown can
schedule, and I'm wondering if we are trying to add hinting faults to an
address space that is in the process of going away. The TASK_DEAD check
is bogus, so I'm replacing it.
Mel, I ran today's -next with both of your patches, but the issue still remains:

[ 3114.540976] kernel BUG at include/asm-generic/pgtable.h:724!
[ 3114.541857] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 3114.543112] Dumping ftrace buffer:
[ 3114.544056] (ftrace buffer empty)
[ 3114.545000] Modules linked in:
[ 3114.545717] CPU: 18 PID: 30217 Comm: trinity-c617 Tainted: G W 3.17.0-rc4-next-20140910-sasha-00032-g6825fb5-dirty #1137
[ 3114.548058] task: ffff880415050000 ti: ffff88076f584000 task.ti: ffff88076f584000
[ 3114.549284] RIP: 0010:[<ffffffff952e527a>] [<ffffffff952e527a>] change_pte_range+0x4ea/0x4f0
[ 3114.550028] RSP: 0000:ffff88076f587d68 EFLAGS: 00010246
[ 3114.550028] RAX: 0000000314625900 RBX: 0000000041218000 RCX: 0000000000000100
[ 3114.550028] RDX: 0000000314625900 RSI: 0000000041218000 RDI: 0000000314625900
[ 3114.550028] RBP: ffff88076f587dc8 R08: ffff8802cf973600 R09: 0000000000b50000
[ 3114.550028] R10: 0000000000032c01 R11: 0000000000000008 R12: ffff8802a81070c0
[ 3114.550028] R13: 8000000000000025 R14: 0000000041343000 R15: ffffc00000000fff
[ 3114.550028] FS: 00007fabb91c8700(0000) GS:ffff88025ec00000(0000) knlGS:0000000000000000
[ 3114.550028] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 3114.550028] CR2: 00007fffdb7678e8 CR3: 0000000713935000 CR4: 00000000000006a0
[ 3114.550028] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3114.550028] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000050602
[ 3114.550028] Stack:
[ 3114.550028] 0000000000000001 0000000314625900 0000000000000018 ffff8802685f2260
[ 3114.550028] 0000000016840000 ffff8802cf973600 ffff880616840000 0000000041343000
[ 3114.550028] ffff880108805048 0000000041005000 0000000041200000 0000000041343000
[ 3114.550028] Call Trace:
[ 3114.550028] [<ffffffff952e5534>] change_protection+0x2b4/0x4e0
[ 3114.550028] [<ffffffff952ff24b>] change_prot_numa+0x1b/0x40
[ 3114.550028] [<ffffffff951adf16>] task_numa_work+0x1f6/0x330
[ 3114.550028] [<ffffffff95193de4>] task_work_run+0xc4/0xf0
[ 3114.550028] [<ffffffff95071477>] do_notify_resume+0x97/0xb0
[ 3114.550028] [<ffffffff9850f06a>] int_signal+0x12/0x17
[ 3114.550028] Code: 66 90 48 8b 7d b8 e8 e6 88 22 03 48 8b 45 b0 e9 6f ff ff ff 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b <0f> 0b 0f 0b 0f 0b 66 66 66 66 90 55 48 89 e5 41 57 49 89 d7 41
[ 3114.550028] RIP [<ffffffff952e527a>] change_pte_range+0x4ea/0x4f0
[ 3114.550028] RSP <ffff88076f587d68>


Thanks,
Sasha

Sasha Levin
2014-09-10 20:40:03 UTC
Permalink
Thanks for supplying, but the change in inlining means that
change_protection_range() and change_protection() are no longer
relevant for these traces; we now need to see change_pte_range()
instead, to confirm that what I expect are ptes are indeed ptes.
If you can include line numbers (objdump -ld) in the disassembly, so
much the better, but it should be decipherable without. (Or objdump -Sd
for source, but I often find that harder to unscramble, can't say why.)
Here it is. Note that the source includes both of Mel's debug patches.
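The listing was generated along the lines Hugh suggested, something like

	objdump -ld mm/mprotect.o

(the exact object path and flags here are illustrative).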
For reference, here's one trace of the issue with those patches:

[ 3114.540976] kernel BUG at include/asm-generic/pgtable.h:724!
[ 3114.541857] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 3114.543112] Dumping ftrace buffer:
[ 3114.544056] (ftrace buffer empty)
[ 3114.545000] Modules linked in:
[ 3114.545717] CPU: 18 PID: 30217 Comm: trinity-c617 Tainted: G W 3.17.0-rc4-next-20140910-sasha-00032-g6825fb5-dirty #1137
[ 3114.548058] task: ffff880415050000 ti: ffff88076f584000 task.ti: ffff88076f584000
[ 3114.549284] RIP: 0010:[<ffffffff952e527a>] [<ffffffff952e527a>] change_pte_range+0x4ea/0x4f0
[ 3114.550028] RSP: 0000:ffff88076f587d68 EFLAGS: 00010246
[ 3114.550028] RAX: 0000000314625900 RBX: 0000000041218000 RCX: 0000000000000100
[ 3114.550028] RDX: 0000000314625900 RSI: 0000000041218000 RDI: 0000000314625900
[ 3114.550028] RBP: ffff88076f587dc8 R08: ffff8802cf973600 R09: 0000000000b50000
[ 3114.550028] R10: 0000000000032c01 R11: 0000000000000008 R12: ffff8802a81070c0
[ 3114.550028] R13: 8000000000000025 R14: 0000000041343000 R15: ffffc00000000fff
[ 3114.550028] FS: 00007fabb91c8700(0000) GS:ffff88025ec00000(0000) knlGS:0000000000000000
[ 3114.550028] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 3114.550028] CR2: 00007fffdb7678e8 CR3: 0000000713935000 CR4: 00000000000006a0
[ 3114.550028] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3114.550028] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000050602
[ 3114.550028] Stack:
[ 3114.550028] 0000000000000001 0000000314625900 0000000000000018 ffff8802685f2260
[ 3114.550028] 0000000016840000 ffff8802cf973600 ffff880616840000 0000000041343000
[ 3114.550028] ffff880108805048 0000000041005000 0000000041200000 0000000041343000
[ 3114.550028] Call Trace:
[ 3114.550028] [<ffffffff952e5534>] change_protection+0x2b4/0x4e0
[ 3114.550028] [<ffffffff952ff24b>] change_prot_numa+0x1b/0x40
[ 3114.550028] [<ffffffff951adf16>] task_numa_work+0x1f6/0x330
[ 3114.550028] [<ffffffff95193de4>] task_work_run+0xc4/0xf0
[ 3114.550028] [<ffffffff95071477>] do_notify_resume+0x97/0xb0
[ 3114.550028] [<ffffffff9850f06a>] int_signal+0x12/0x17
[ 3114.550028] Code: 66 90 48 8b 7d b8 e8 e6 88 22 03 48 8b 45 b0 e9 6f ff ff ff 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b <0f> 0b 0f 0b 0f 0b 66 66 66 66 90 55 48 89 e5 41 57 49 89 d7 41
[ 3114.550028] RIP [<ffffffff952e527a>] change_pte_range+0x4ea/0x4f0
[ 3114.550028] RSP <ffff88076f587d68>

And the disassembly:

0000000000000000 <change_pte_range>:
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:70
0: e8 00 00 00 00 callq 5 <change_pte_range+0x5>
1: R_X86_64_PC32 __fentry__-0x4
5: 55 push %rbp
6: 48 89 e5 mov %rsp,%rbp
9: 41 57 push %r15
b: 41 56 push %r14
d: 49 89 ce mov %rcx,%r14
10: 41 55 push %r13
12: 4d 89 c5 mov %r8,%r13
15: 41 54 push %r12
17: 49 89 f4 mov %rsi,%r12
1a: 53 push %rbx
1b: 48 89 d3 mov %rdx,%rbx
1e: 48 83 ec 38 sub $0x38,%rsp
/home/sasha/linux-next/mm/mprotect.c:71
22: 48 8b 47 40 mov 0x40(%rdi),%rax
/home/sasha/linux-next/mm/mprotect.c:70
26: 48 89 7d c8 mov %rdi,-0x38(%rbp)
lock_pte_protection():
/home/sasha/linux-next/mm/mprotect.c:53
2a: 8b 4d 10 mov 0x10(%rbp),%ecx
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:70
2d: 44 89 4d c4 mov %r9d,-0x3c(%rbp)
/home/sasha/linux-next/mm/mprotect.c:71
31: 48 89 45 d0 mov %rax,-0x30(%rbp)
lock_pte_protection():
/home/sasha/linux-next/mm/mprotect.c:53
35: 85 c9 test %ecx,%ecx
37: 0f 84 6b 03 00 00 je 3a8 <change_pte_range+0x3a8>
pmd_to_page():
/home/sasha/linux-next/include/linux/mm.h:1538
3d: 48 89 f7 mov %rsi,%rdi
40: 48 81 e7 00 f0 ff ff and $0xfffffffffffff000,%rdi
47: e8 00 00 00 00 callq 4c <change_pte_range+0x4c>
48: R_X86_64_PC32 __phys_addr-0x4
4c: 48 ba 00 00 00 00 00 movabs $0xffffea0000000000,%rdx
53: ea ff ff
56: 48 c1 e8 0c shr $0xc,%rax
spin_lock():
/home/sasha/linux-next/include/linux/spinlock.h:309
5a: 48 89 55 b8 mov %rdx,-0x48(%rbp)
5e: 48 c1 e0 06 shl $0x6,%rax
62: 4c 8b 7c 10 30 mov 0x30(%rax,%rdx,1),%r15
67: 4c 89 ff mov %r15,%rdi
6a: e8 00 00 00 00 callq 6f <change_pte_range+0x6f>
6b: R_X86_64_PC32 _raw_spin_lock-0x4
6f: 49 8b 3c 24 mov (%r12),%rdi
pmd_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
73: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 7b <change_pte_range+0x7b>
7a: 00
76: R_X86_64_PC32 pv_mmu_ops+0x10b
7b: 48 8b 55 b8 mov -0x48(%rbp),%rdx
7f: 0f 84 ab 03 00 00 je 430 <change_pte_range+0x430>
85: ff 14 25 00 00 00 00 callq *0x0
88: R_X86_64_32S pv_mmu_ops+0x110
lock_pte_protection():
/home/sasha/linux-next/mm/mprotect.c:57
8c: a8 80 test $0x80,%al
8e: 0f 85 a4 03 00 00 jne 438 <change_pte_range+0x438>
94: 49 8b 3c 24 mov (%r12),%rdi
98: 48 85 ff test %rdi,%rdi
9b: 0f 84 97 03 00 00 je 438 <change_pte_range+0x438>
pmd_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
a1: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # a9 <change_pte_range+0xa9>
a8: 00
a4: R_X86_64_PC32 pv_mmu_ops+0x10b
a9: 0f 84 81 03 00 00 je 430 <change_pte_range+0x430>
af: ff 14 25 00 00 00 00 callq *0x0
b2: R_X86_64_32S pv_mmu_ops+0x110
b6: 48 b9 00 f0 ff ff ff movabs $0x3ffffffff000,%rcx
bd: 3f 00 00
c0: 48 21 c8 and %rcx,%rax
c3: 48 89 c7 mov %rax,%rdi
c6: 48 c1 ef 06 shr $0x6,%rdi
ca: 48 8b 44 3a 30 mov 0x30(%rdx,%rdi,1),%rax
cf: 49 8b 3c 24 mov (%r12),%rdi
d3: 48 89 45 b8 mov %rax,-0x48(%rbp)
pte_offset_kernel():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:551
d7: 48 89 d8 mov %rbx,%rax
da: 48 c1 e8 09 shr $0x9,%rax
de: 25 f8 0f 00 00 and $0xff8,%eax
pmd_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
e3: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # eb <change_pte_range+0xeb>
ea: 00
e6: R_X86_64_PC32 pv_mmu_ops+0x10b
pte_offset_kernel():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:551
eb: 48 89 c2 mov %rax,%rdx
pmd_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
ee: 0f 84 3c 03 00 00 je 430 <change_pte_range+0x430>
f4: ff 14 25 00 00 00 00 callq *0x0
f7: R_X86_64_32S pv_mmu_ops+0x110
spin_lock():
/home/sasha/linux-next/include/linux/spinlock.h:309
fb: 48 8b 7d b8 mov -0x48(%rbp),%rdi
pmd_page_vaddr():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
ff: 49 89 c4 mov %rax,%r12
pte_offset_kernel():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:551
102: 48 b8 00 00 00 00 00 movabs $0xffff880000000000,%rax
109: 88 ff ff
10c: 48 01 d0 add %rdx,%rax
10f: 4c 21 e1 and %r12,%rcx
112: 4c 8d 24 08 lea (%rax,%rcx,1),%r12
spin_lock():
/home/sasha/linux-next/include/linux/spinlock.h:309
116: e8 00 00 00 00 callq 11b <change_pte_range+0x11b>
117: R_X86_64_PC32 _raw_spin_lock-0x4
spin_unlock():
/home/sasha/linux-next/include/linux/spinlock.h:349
11b: 4c 89 ff mov %r15,%rdi
11e: e8 00 00 00 00 callq 123 <change_pte_range+0x123>
11f: R_X86_64_PC32 _raw_spin_unlock-0x4
arch_enter_lazy_mmu_mode():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:694
123: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 12b <change_pte_range+0x12b>
12a: 00
126: R_X86_64_PC32 pv_mmu_ops+0x133
12b: 0f 84 a7 03 00 00 je 4d8 <change_pte_range+0x4d8>
131: ff 14 25 00 00 00 00 callq *0x0
134: R_X86_64_32S pv_mmu_ops+0x138
massage_pgprot():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:351
138: 4c 89 e8 mov %r13,%rax
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:74
13b: 48 c7 45 b0 00 00 00 movq $0x0,-0x50(%rbp)
142: 00
pte_present():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:460
143: 49 bf ff 0f 00 00 00 movabs $0xffffc00000000fff,%r15
14a: c0 ff ff
massage_pgprot():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:351
14d: 83 e0 01 and $0x1,%eax
150: 48 89 45 a0 mov %rax,-0x60(%rbp)
154: e9 fa 00 00 00 jmpq 253 <change_pte_range+0x253>
159: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:87
160: 8b 55 10 mov 0x10(%rbp),%edx
163: 85 d2 test %edx,%edx
165: 0f 85 85 01 00 00 jne 2f0 <change_pte_range+0x2f0>
ptep_modify_prot_start():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:490
16b: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 173 <change_pte_range+0x173>
172: 00
16e: R_X86_64_PC32 pv_mmu_ops+0xd3
173: 0f 84 2f 03 00 00 je 4a8 <change_pte_range+0x4a8>
179: 48 8b 7d d0 mov -0x30(%rbp),%rdi
17d: 48 89 de mov %rbx,%rsi
180: 4c 89 e2 mov %r12,%rdx
183: ff 14 25 00 00 00 00 callq *0x0
186: R_X86_64_32S pv_mmu_ops+0xd8
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:89
18a: 48 89 c2 mov %rax,%rdx
ptep_modify_prot_start():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:490
18d: 48 89 c7 mov %rax,%rdi
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:89
190: 81 e2 01 03 00 00 and $0x301,%edx
196: 48 81 fa 00 02 00 00 cmp $0x200,%rdx
19d: 0f 84 bd 02 00 00 je 460 <change_pte_range+0x460>
pte_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:450
1a3: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 1ab <change_pte_range+0x1ab>
1aa: 00
1a6: R_X86_64_PC32 pv_mmu_ops+0xe3
1ab: 0f 84 a7 02 00 00 je 458 <change_pte_range+0x458>
1b1: ff 14 25 00 00 00 00 callq *0x0
1b4: R_X86_64_32S pv_mmu_ops+0xe8
pte_modify():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:377
1b8: 48 be 78 fa ff ff ff movabs $0x3ffffffffa78,%rsi
1bf: 3f 00 00
massage_pgprot():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:352
1c2: 4c 89 ef mov %r13,%rdi
1c5: 48 23 3d 00 00 00 00 and 0x0(%rip),%rdi # 1cc <change_pte_range+0x1cc>
1c8: R_X86_64_PC32 __supported_pte_mask-0x4
pte_modify():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:378
1cc: 48 ba 87 05 00 00 00 movabs $0xffffc00000000587,%rdx
1d3: c0 ff ff
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:377
1d6: 48 21 f0 and %rsi,%rax
massage_pgprot():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:352
1d9: 48 83 7d a0 00 cmpq $0x0,-0x60(%rbp)
1de: 49 0f 44 fd cmove %r13,%rdi
pte_modify():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:378
1e2: 48 89 f9 mov %rdi,%rcx
1e5: 48 21 d1 and %rdx,%rcx
1e8: 48 09 c1 or %rax,%rcx
__pte():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:435
1eb: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 1f3 <change_pte_range+0x1f3>
1f2: 00
1ee: R_X86_64_PC32 pv_mmu_ops+0xeb
pte_modify():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:378
1f3: 48 89 cf mov %rcx,%rdi
__pte():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:435
1f6: 0f 84 a4 02 00 00 je 4a0 <change_pte_range+0x4a0>
1fc: ff 14 25 00 00 00 00 callq *0x0
1ff: R_X86_64_32S pv_mmu_ops+0xf0
203: 48 89 c1 mov %rax,%rcx
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:96
206: 8b 45 c4 mov -0x3c(%rbp),%eax
209: 85 c0 test %eax,%eax
20b: 74 0e je 21b <change_pte_range+0x21b>
pte_set_flags():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:186 (discriminator 1)
20d: 48 89 c8 mov %rcx,%rax
210: 48 83 c8 02 or $0x2,%rax
214: f6 c1 40 test $0x40,%cl
217: 48 0f 45 c8 cmovne %rax,%rcx
ptep_modify_prot_commit():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:503
21b: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 223 <change_pte_range+0x223>
222: 00
21e: R_X86_64_PC32 pv_mmu_ops+0xdb
223: 0f 84 b7 02 00 00 je 4e0 <change_pte_range+0x4e0>
229: 48 8b 7d d0 mov -0x30(%rbp),%rdi
22d: 48 89 de mov %rbx,%rsi
230: 4c 89 e2 mov %r12,%rdx
233: ff 14 25 00 00 00 00 callq *0x0
236: R_X86_64_32S pv_mmu_ops+0xe0
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:128
23a: 48 83 45 b0 01 addq $0x1,-0x50(%rbp)
/home/sasha/linux-next/mm/mprotect.c:131
23f: 48 81 c3 00 10 00 00 add $0x1000,%rbx
246: 49 83 c4 08 add $0x8,%r12
24a: 4c 39 f3 cmp %r14,%rbx
24d: 0f 84 5d 02 00 00 je 4b0 <change_pte_range+0x4b0>
/home/sasha/linux-next/mm/mprotect.c:82
253: 49 8b 0c 24 mov (%r12),%rcx
pte_present():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:460
257: 48 89 c8 mov %rcx,%rax
25a: 4c 21 f8 and %r15,%rax
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:83
25d: a9 01 03 00 00 test $0x301,%eax
262: 0f 85 f8 fe ff ff jne 160 <change_pte_range+0x160>
/home/sasha/linux-next/mm/mprotect.c:113
268: a8 40 test $0x40,%al
26a: 75 d3 jne 23f <change_pte_range+0x23f>
pte_swp_soft_dirty():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:885
26c: a9 01 01 00 00 test $0x101,%eax
271: 0f 85 71 02 00 00 jne 4e8 <change_pte_range+0x4e8>
pte_clear_flags():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:193
277: 48 89 ca mov %rcx,%rdx
27a: 41 89 c0 mov %eax,%r8d
27d: 80 e2 7f and $0x7f,%dl
280: 41 81 e0 80 00 00 00 and $0x80,%r8d
287: 48 0f 45 ca cmovne %rdx,%rcx
pte_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:450
28b: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 293 <change_pte_range+0x293>
292: 00
28e: R_X86_64_PC32 pv_mmu_ops+0xe3
293: 0f 84 bf 01 00 00 je 458 <change_pte_range+0x458>
299: 48 89 cf mov %rcx,%rdi
29c: ff 14 25 00 00 00 00 callq *0x0
29f: R_X86_64_32S pv_mmu_ops+0xe8
swp_entry():
/home/sasha/linux-next/include/linux/swapops.h:30
2a3: 48 89 c1 mov %rax,%rcx
2a6: 48 c1 e8 0a shr $0xa,%rax
2aa: 48 d1 e9 shr %rcx
2ad: 83 e1 1f and $0x1f,%ecx
2b0: 48 c1 e1 39 shl $0x39,%rcx
2b4: 48 09 c8 or %rcx,%rax
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:116
2b7: 48 89 c2 mov %rax,%rdx
2ba: 48 c1 ea 39 shr $0x39,%rdx
2be: 48 83 fa 1f cmp $0x1f,%rdx
2c2: 0f 85 77 ff ff ff jne 23f <change_pte_range+0x23f>
swp_entry_to_pte():
/home/sasha/linux-next/include/linux/swapops.h:84
2c8: 48 c1 e0 0a shl $0xa,%rax
2cc: 48 89 c1 mov %rax,%rcx
2cf: 0c bc or $0xbc,%al
2d1: 48 83 c9 3c or $0x3c,%rcx
2d5: 45 85 c0 test %r8d,%r8d
2d8: 48 0f 45 c8 cmovne %rax,%rcx
set_pte_at():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:524
2dc: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 2e4 <change_pte_range+0x2e4>
2e3: 00
2df: R_X86_64_PC32 pv_mmu_ops+0x9b
2e4: 0f 85 a4 00 00 00 jne 38e <change_pte_range+0x38e>
2ea: 0f 0b ud2
2ec: 0f 1f 40 00 nopl 0x0(%rax)
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:103
2f0: 48 8b 7d c8 mov -0x38(%rbp),%rdi
2f4: 48 89 ca mov %rcx,%rdx
2f7: 48 89 de mov %rbx,%rsi
2fa: 48 89 4d a8 mov %rcx,-0x58(%rbp)
2fe: e8 00 00 00 00 callq 303 <change_pte_range+0x303>
2ff: R_X86_64_PC32 vm_normal_page-0x4
/home/sasha/linux-next/mm/mprotect.c:104
303: 48 85 c0 test %rax,%rax
306: 0f 84 33 ff ff ff je 23f <change_pte_range+0x23f>
/home/sasha/linux-next/mm/mprotect.c:104 (discriminator 1)
30c: 48 8b 40 08 mov 0x8(%rax),%rax
310: 83 e0 03 and $0x3,%eax
313: 48 83 f8 03 cmp $0x3,%rax
317: 0f 84 22 ff ff ff je 23f <change_pte_range+0x23f>
/home/sasha/linux-next/mm/mprotect.c:105
31d: 48 8b 4d a8 mov -0x58(%rbp),%rcx
321: 81 e1 01 03 00 00 and $0x301,%ecx
327: 48 81 f9 00 02 00 00 cmp $0x200,%rcx
32e: 0f 84 0b ff ff ff je 23f <change_pte_range+0x23f>
pte_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:450
334: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 33c <change_pte_range+0x33c>
33b: 00
337: R_X86_64_PC32 pv_mmu_ops+0xe3
ptep_set_numa():
/home/sasha/linux-next/include/asm-generic/pgtable.h:740
33c: 49 8b 3c 24 mov (%r12),%rdi
pte_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:450
340: 0f 84 12 01 00 00 je 458 <change_pte_range+0x458>
346: ff 14 25 00 00 00 00 callq *0x0
349: R_X86_64_32S pv_mmu_ops+0xe8
pte_mknuma():
/home/sasha/linux-next/include/asm-generic/pgtable.h:724
34d: a8 01 test $0x1,%al
34f: 0f 84 95 01 00 00 je 4ea <change_pte_range+0x4ea>
/home/sasha/linux-next/include/asm-generic/pgtable.h:727
355: f6 c4 01 test $0x1,%ah
358: 0f 85 8e 01 00 00 jne 4ec <change_pte_range+0x4ec>
/home/sasha/linux-next/include/asm-generic/pgtable.h:729
35e: 48 83 e0 fe and $0xfffffffffffffffe,%rax
/home/sasha/linux-next/include/asm-generic/pgtable.h:730
362: 80 cc 02 or $0x2,%ah
__pte():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:435
365: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 36d <change_pte_range+0x36d>
36c: 00
368: R_X86_64_PC32 pv_mmu_ops+0xeb
pte_mknuma():
/home/sasha/linux-next/include/asm-generic/pgtable.h:730
36d: 48 89 c7 mov %rax,%rdi
__pte():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:435
370: 0f 84 2a 01 00 00 je 4a0 <change_pte_range+0x4a0>
376: ff 14 25 00 00 00 00 callq *0x0
379: R_X86_64_32S pv_mmu_ops+0xf0
set_pte_at():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:524
37d: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 385 <change_pte_range+0x385>
384: 00
380: R_X86_64_PC32 pv_mmu_ops+0x9b
pte_mknuma():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:435
385: 48 89 c1 mov %rax,%rcx
set_pte_at():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:524
388: 0f 84 5c ff ff ff je 2ea <change_pte_range+0x2ea>
38e: 48 8b 7d d0 mov -0x30(%rbp),%rdi
392: 48 89 de mov %rbx,%rsi
395: 4c 89 e2 mov %r12,%rdx
398: ff 14 25 00 00 00 00 callq *0x0
39b: R_X86_64_32S pv_mmu_ops+0xa0
39f: e9 96 fe ff ff jmpq 23a <change_pte_range+0x23a>
3a4: 0f 1f 40 00 nopl 0x0(%rax)
pmd_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
3a8: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 3b0 <change_pte_range+0x3b0>
3af: 00
3ab: R_X86_64_PC32 pv_mmu_ops+0x10b
3b0: 48 8b 3e mov (%rsi),%rdi
3b3: 74 7b je 430 <change_pte_range+0x430>
3b5: ff 14 25 00 00 00 00 callq *0x0
3b8: R_X86_64_32S pv_mmu_ops+0x110
3bc: 48 ba 00 f0 ff ff ff movabs $0x3ffffffff000,%rdx
3c3: 3f 00 00
3c6: 48 21 d0 and %rdx,%rax
3c9: 48 89 c7 mov %rax,%rdi
3cc: 48 b8 00 00 00 00 00 movabs $0xffffea0000000000,%rax
3d3: ea ff ff
3d6: 48 c1 ef 06 shr $0x6,%rdi
3da: 48 8b 44 07 30 mov 0x30(%rdi,%rax,1),%rax
3df: 48 8b 3e mov (%rsi),%rdi
3e2: 48 89 45 b8 mov %rax,-0x48(%rbp)
pte_offset_kernel():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:551
3e6: 48 89 d8 mov %rbx,%rax
3e9: 48 c1 e8 09 shr $0x9,%rax
3ed: 25 f8 0f 00 00 and $0xff8,%eax
pmd_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
3f2: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 3fa <change_pte_range+0x3fa>
3f9: 00
3f5: R_X86_64_PC32 pv_mmu_ops+0x10b
pte_offset_kernel():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:551
3fa: 48 89 c1 mov %rax,%rcx
pmd_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
3fd: 74 31 je 430 <change_pte_range+0x430>
3ff: ff 14 25 00 00 00 00 callq *0x0
402: R_X86_64_32S pv_mmu_ops+0x110
spin_lock():
/home/sasha/linux-next/include/linux/spinlock.h:309
406: 48 8b 7d b8 mov -0x48(%rbp),%rdi
pmd_page_vaddr():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
40a: 49 89 c4 mov %rax,%r12
pte_offset_kernel():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:551
40d: 48 b8 00 00 00 00 00 movabs $0xffff880000000000,%rax
414: 88 ff ff
417: 48 01 c8 add %rcx,%rax
41a: 4c 21 e2 and %r12,%rdx
41d: 4c 8d 24 10 lea (%rax,%rdx,1),%r12
spin_lock():
/home/sasha/linux-next/include/linux/spinlock.h:309
421: e8 00 00 00 00 callq 426 <change_pte_range+0x426>
422: R_X86_64_PC32 _raw_spin_lock-0x4
426: e9 f8 fc ff ff jmpq 123 <change_pte_range+0x123>
42b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
pmd_val():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:571
430: 0f 0b ud2
432: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
spin_unlock():
/home/sasha/linux-next/include/linux/spinlock.h:349
438: 4c 89 ff mov %r15,%rdi
43b: e8 00 00 00 00 callq 440 <change_pte_range+0x440>
43c: R_X86_64_PC32 _raw_spin_unlock-0x4
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:78
440: 31 c0 xor %eax,%eax
/home/sasha/linux-next/mm/mprotect.c:136
442: 48 83 c4 38 add $0x38,%rsp
446: 5b pop %rbx
447: 41 5c pop %r12
449: 41 5d pop %r13
44b: 41 5e pop %r14
44d: 41 5f pop %r15
44f: 5d pop %rbp
450: c3 retq
451: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
pte_to_swp_entry():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:450
458: 0f 0b ud2
45a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
pte_val():
460: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 468 <change_pte_range+0x468>
467: 00
463: R_X86_64_PC32 pv_mmu_ops+0xe3
468: 74 ee je 458 <change_pte_range+0x458>
46a: 48 89 c7 mov %rax,%rdi
46d: ff 14 25 00 00 00 00 callq *0x0
470: R_X86_64_32S pv_mmu_ops+0xe8
pte_mknonnuma():
/home/sasha/linux-next/include/asm-generic/pgtable.h:701
474: 80 e4 fd and $0xfd,%ah
/home/sasha/linux-next/include/asm-generic/pgtable.h:702
477: 48 83 c8 21 or $0x21,%rax
__pte():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:435
47b: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 483 <change_pte_range+0x483>
482: 00
47e: R_X86_64_PC32 pv_mmu_ops+0xeb
pte_mknonnuma():
/home/sasha/linux-next/include/asm-generic/pgtable.h:702
483: 48 89 c7 mov %rax,%rdi
__pte():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:435
486: 74 18 je 4a0 <change_pte_range+0x4a0>
488: ff 14 25 00 00 00 00 callq *0x0
48b: R_X86_64_32S pv_mmu_ops+0xf0
48f: 48 89 c7 mov %rax,%rdi
492: e9 0c fd ff ff jmpq 1a3 <change_pte_range+0x1a3>
497: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
49e: 00 00
4a0: 0f 0b ud2
4a2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
ptep_modify_prot_start():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:490
4a8: 0f 0b ud2
4aa: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
arch_leave_lazy_mmu_mode():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:699
4b0: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 4b8 <change_pte_range+0x4b8>
4b7: 00
4b3: R_X86_64_PC32 pv_mmu_ops+0x13b
4b8: 74 34 je 4ee <change_pte_range+0x4ee>
4ba: ff 14 25 00 00 00 00 callq *0x0
4bd: R_X86_64_32S pv_mmu_ops+0x140
spin_unlock():
/home/sasha/linux-next/include/linux/spinlock.h:349
4c1: 48 8b 7d b8 mov -0x48(%rbp),%rdi
4c5: e8 00 00 00 00 callq 4ca <change_pte_range+0x4ca>
4c6: R_X86_64_PC32 _raw_spin_unlock-0x4
change_pte_range():
/home/sasha/linux-next/mm/mprotect.c:135
4ca: 48 8b 45 b0 mov -0x50(%rbp),%rax
4ce: e9 6f ff ff ff jmpq 442 <change_pte_range+0x442>
4d3: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
arch_enter_lazy_mmu_mode():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:694
4d8: 0f 0b ud2
4da: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
ptep_modify_prot_commit():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:503
4e0: 0f 0b ud2
4e2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
pte_to_swp_entry():
/home/sasha/linux-next/./arch/x86/include/asm/pgtable.h:885
4e8: 0f 0b ud2
ptep_set_numa():
/home/sasha/linux-next/include/asm-generic/pgtable.h:724
4ea: 0f 0b ud2
/home/sasha/linux-next/include/asm-generic/pgtable.h:727
4ec: 0f 0b ud2
arch_leave_lazy_mmu_mode():
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:699
4ee: 0f 0b ud2


Thanks,
Sasha
Hugh Dickins
2014-09-10 23:10:02 UTC
Permalink
Post by Sasha Levin
Thanks for supplying, but the change in inlining means that
change_protection_range() and change_protection() are no longer
relevant for these traces; we now need to see change_pte_range()
instead, to confirm that what I expect are ptes are indeed ptes.
If you can include line numbers (objdump -ld) in the disassembly, so
much the better, but it should be decipherable without. (Or objdump -Sd
for source, but I often find that harder to unscramble, can't say why.)
Here it is. Note that the source includes both of Mel's debug patches.
[ 3114.540976] kernel BUG at include/asm-generic/pgtable.h:724!
[ 3114.541857] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 3114.544056] (ftrace buffer empty)
[ 3114.545717] CPU: 18 PID: 30217 Comm: trinity-c617 Tainted: G W 3.17.0-rc4-next-20140910-sasha-00032-g6825fb5-dirty #1137
[ 3114.548058] task: ffff880415050000 ti: ffff88076f584000 task.ti: ffff88076f584000
[ 3114.549284] RIP: 0010:[<ffffffff952e527a>] [<ffffffff952e527a>] change_pte_range+0x4ea/0x4f0
[ 3114.550028] RSP: 0000:ffff88076f587d68 EFLAGS: 00010246
[ 3114.550028] RAX: 0000000314625900 RBX: 0000000041218000 RCX: 0000000000000100
[ 3114.550028] RDX: 0000000314625900 RSI: 0000000041218000 RDI: 0000000314625900
[ 3114.550028] RBP: ffff88076f587dc8 R08: ffff8802cf973600 R09: 0000000000b50000
[ 3114.550028] R10: 0000000000032c01 R11: 0000000000000008 R12: ffff8802a81070c0
[ 3114.550028] R13: 8000000000000025 R14: 0000000041343000 R15: ffffc00000000fff
[ 3114.550028] FS: 00007fabb91c8700(0000) GS:ffff88025ec00000(0000) knlGS:0000000000000000
[ 3114.550028] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 3114.550028] CR2: 00007fffdb7678e8 CR3: 0000000713935000 CR4: 00000000000006a0
[ 3114.550028] DR0: 00000000006f0000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3114.550028] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000050602
[ 3114.550028] 0000000000000001 0000000314625900 0000000000000018 ffff8802685f2260
[ 3114.550028] 0000000016840000 ffff8802cf973600 ffff880616840000 0000000041343000
[ 3114.550028] ffff880108805048 0000000041005000 0000000041200000 0000000041343000
[ 3114.550028] [<ffffffff952e5534>] change_protection+0x2b4/0x4e0
[ 3114.550028] [<ffffffff952ff24b>] change_prot_numa+0x1b/0x40
[ 3114.550028] [<ffffffff951adf16>] task_numa_work+0x1f6/0x330
[ 3114.550028] [<ffffffff95193de4>] task_work_run+0xc4/0xf0
[ 3114.550028] [<ffffffff95071477>] do_notify_resume+0x97/0xb0
[ 3114.550028] [<ffffffff9850f06a>] int_signal+0x12/0x17
[ 3114.550028] Code: 66 90 48 8b 7d b8 e8 e6 88 22 03 48 8b 45 b0 e9 6f ff ff ff 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b 66 0f 1f 44 00 00 0f 0b <0f> 0b 0f 0b 0f 0b 66 66 66 66 90 55 48 89 e5 41 57 49 89 d7 41
[ 3114.550028] RIP [<ffffffff952e527a>] change_pte_range+0x4ea/0x4f0
[ 3114.550028] RSP <ffff88076f587d68>
...
Post by Sasha Levin
/home/sasha/linux-next/mm/mprotect.c:105
31d: 48 8b 4d a8 mov -0x58(%rbp),%rcx
321: 81 e1 01 03 00 00 and $0x301,%ecx
327: 48 81 f9 00 02 00 00 cmp $0x200,%rcx
32e: 0f 84 0b ff ff ff je 23f <change_pte_range+0x23f>
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:450
334: 48 83 3d 00 00 00 00 cmpq $0x0,0x0(%rip) # 33c <change_pte_range+0x33c>
33b: 00
337: R_X86_64_PC32 pv_mmu_ops+0xe3
/home/sasha/linux-next/include/asm-generic/pgtable.h:740
33c: 49 8b 3c 24 mov (%r12),%rdi
/home/sasha/linux-next/./arch/x86/include/asm/paravirt.h:450
340: 0f 84 12 01 00 00 je 458 <change_pte_range+0x458>
346: ff 14 25 00 00 00 00 callq *0x0
349: R_X86_64_32S pv_mmu_ops+0xe8
/home/sasha/linux-next/include/asm-generic/pgtable.h:724
34d: a8 01 test $0x1,%al
34f: 0f 84 95 01 00 00 je 4ea <change_pte_range+0x4ea>
...
Post by Sasha Levin
/home/sasha/linux-next/include/asm-generic/pgtable.h:724
4ea: 0f 0b ud2
Thanks, yes, there is enough in there to be sure that the ...900 is
indeed the oldpte. I wasn't expecting that pv_mmu_ops function call,
but there's no evidence that it does anything worse than just return
in %rax what it's given in %rdi; and the second long on the stack is
the -0x58(%rbp) from which oldpte is retrieved for !pte_numa(oldpte)
at the beginning of the extract above.
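(Decoded from the listing, the test applied there is pte_numa(), i.e.

	(pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE|_PAGE_NUMA)) == _PAGE_NUMA

which is the and $0x301 / cmp $0x200 pair in the disassembly, _PAGE_NUMA
being the 0x200 bit in this build.)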

Hugh
Mel Gorman
2014-08-06 10:40:02 UTC
Permalink
Post by Hugh Dickins
<SNIP>
I'm attaching a preliminary pair of patches: the first deals with
ARCH_USES_NUMA_PROT_NONE, and the second is yours with a revised
changelog. I'm adding Aneesh to the cc to look at the powerpc portion of
the first patch.
Thanks a lot, Mel.
I am surprised by the ordering, but perhaps you meant nothing by it.
I didn't mean anything by it; it was based on the order I looked at the
patches in. I revisited c46a7c817, looked at the ARCH_USES_NUMA_PROT_NONE
issue to see if it had any potential impact on your patch, and then moved
on to your patch.
Post by Hugh Dickins
Isn't the first one a welcome but optional cleanup, and the second one
a fix that we need in 3.16-stable? Or does the fix actually depend in
some unstated way upon the cleanup, in powerpc-land perhaps?
It shouldn't, as powerpc can use its old helpers. I've included Aneesh in
the cc just in case.
Post by Hugh Dickins
Aside from that, for the first patch: yes, I heartily approve of the
disappearance of CONFIG_ARCH_WANTS_PROT_NUMA_PROT_NONE and
CONFIG_ARCH_USES_NUMA_PROT_NONE. If you wish, add my Acked-by,
but of course it's really Aneesh and powerpc who are the test of it.
Thanks. I have a second version finished for that which I'll send once
this bug is addressed.
Post by Hugh Dickins
One thing I did wonder, though: at first I was reassured by the
VM_BUG_ON(!pte_present(pte)) you add to pte_mknuma(); but then thought
it would be better as VM_BUG_ON(!(val & _PAGE_PRESENT)), being stronger
- asserting that indeed we do not put NUMA hints on PROT_NONE areas.
(But I have not tested, perhaps such a VM_BUG_ON would actually fire.)
It shouldn't, so I'll use the stronger test.
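To spell out the difference, as a sketch against x86's definitions,
where pte_present() also accepts _PAGE_PROTNONE and _PAGE_NUMA:

	VM_BUG_ON(!pte_present(pte));      /* weaker: a PROTNONE pte passes too */
	VM_BUG_ON(!(val & _PAGE_PRESENT)); /* stronger: demands the hardware
	                                      present bit itself */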

Sasha, if it's not too late, would you mind testing this patch in isolation
as a -stable candidate for 3.16, please? It worked for me, including within
trinity, but then again I was not seeing crashes with 3.16 either, so I do
not consider my trinity testing to be a reliable indicator.

---8<---
x86,mm: fix pte_special versus pte_numa

Sasha Levin has shown oopses on ffffea0003480048 and ffffea0003480008
at mm/memory.c:1132, running Trinity on different 3.16-rc-next kernels,
where zap_pte_range() checks page->mapping to see if PageAnon(page).

Those addresses fit struct pages for pfns d2001 and d2000, and in each
dump a register or a stack slot showed d2001730 or d2000730: pte flags
0x730 are PCD ACCESSED PROTNONE SPECIAL IOMAP; and Sasha's e820 map has
a hole between cfffffff and 100000000, which would need special access.
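(Worked through: with vmemmap at ffffea0000000000 and a 64-byte struct
page, (0xffffea0003480000 - 0xffffea0000000000) / 64 == 0xd2000; the
faulting offsets 0x08 and 0x48 land 8 bytes into the struct pages for
pfns d2000 and d2001 respectively, i.e. on page->mapping.)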

Commit c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on
the PMD and PTE levels") has broken vm_normal_page(): a PROTNONE SPECIAL
pte no longer passes the pte_special() test, so zap_pte_range() goes on
to try to access a non-existent struct page.

Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE)
to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE).
A hint that this was a problem was that c46a7c817e66 added a pte_numa()
test to vm_normal_page(), and moved its is_zero_pfn() test from the slow
to the fast path: this was papering over a pte_special() snag when the
zero page was encountered during zap. This patch reverts vm_normal_page()
to how it was before, relying on pte_special().

It still appears that this patch may be incomplete: aren't there other
places which need to handle PROTNONE along with PRESENT? For example,
pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA, but on a PROT_NONE
area that would make it pte_special(). This is side-stepped by the fact
that NUMA hinting faults skip PROT_NONE VMAs, and there is no case in
which a NUMA hinting fault on a PROT_NONE VMA would be interesting.

Fixes: c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels")
Reported-by: Sasha Levin <***@oracle.com>
Signed-off-by: Hugh Dickins <***@google.com>
Signed-off-by: Mel Gorman <***@suse.de>
Cc: ***@vger.kernel.org [3.16]
---
arch/x86/include/asm/pgtable.h | 9 +++++++--
mm/memory.c | 7 +++----
2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0ec0560..aa97a07 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,8 +131,13 @@ static inline int pte_exec(pte_t pte)

static inline int pte_special(pte_t pte)
{
- return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
- (_PAGE_PRESENT|_PAGE_SPECIAL);
+ /*
+ * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h.
+ * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 ==
+ * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
+ */
+ return (pte_flags(pte) & _PAGE_SPECIAL) &&
+ (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
}

static inline unsigned long pte_pfn(pte_t pte)
diff --git a/mm/memory.c b/mm/memory.c
index 8b44f76..0a21f3d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -751,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pte_pfn(pte);

if (HAVE_PTE_SPECIAL) {
- if (likely(!pte_special(pte) || pte_numa(pte)))
+ if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
@@ -777,15 +777,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
}
}

+ if (is_zero_pfn(pfn))
+ return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}

- if (is_zero_pfn(pfn))
- return NULL;
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
Mel Gorman
2014-08-06 10:30:04 UTC
Permalink
-#define pmd_mknonnuma pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+/*
+ * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist
+ * which was inherited from x86. For the purposes of powerpc pte_basic_t is
+ * equivalent
+ */
+#define pteval_t pte_basic_t
+#define pmdval_t pmd_t
+static inline pteval_t pte_flags(pte_t pte)
{
- return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
+ return pte_val(pte) & PAGE_PROT_BITS;
PAGE_PROT_BITS doesn't include _PAGE_NUMA and _PAGE_PRESENT. I will have
to check further to find out why the mask doesn't include
_PAGE_PRESENT.
Dumb of me; not sure how I managed that. For the purposes of what is
required, it doesn't matter what PAGE_PROT_BITS does. It is clearer if
there is a mask that defines which bits are of interest to the generic
helpers, which is what this version attempts to do. It's not tested on
powerpc at all, unfortunately.

---8<---
mm: Remove misleading ARCH_USES_NUMA_PROT_NONE

ARCH_USES_NUMA_PROT_NONE was defined for architectures that implemented
_PAGE_NUMA using _PROT_NONE. This saved using an additional PTE bit and
relied on the fact that PROT_NONE vmas were skipped by the NUMA hinting
fault scanner. This was found to be conceptually confusing with a lot of
implicit assumptions and it was asked that an alternative be found.

Commit c46a7c81 ("x86: define _PAGE_NUMA by reusing software bits on the
PMD and PTE levels") redefined _PAGE_NUMA on x86 to be one of the swap
PTE bits and shrank the maximum possible swap size, but it did not go far
enough. There are no architectures that reuse _PROT_NONE as _PROT_NUMA,
but the relics still exist.

This patch removes ARCH_USES_NUMA_PROT_NONE and some unnecessary
duplication between powerpc and the generic implementation by mapping
the types the core NUMA helpers expect (inherited from x86) to their
ppc64 equivalents. This necessitated creating a PTE bit mask that
identifies the bits distinguishing present from NUMA pte entries, but it
is expected this will only differ between arches based on
_PAGE_PROTNONE. The naming of the generic helpers was originally taken
from x86, but ppc64 has types that are equivalent for the purposes of
the helpers, so they are mapped instead of duplicating code.

Signed-off-by: Mel Gorman <***@suse.de>
---
arch/powerpc/include/asm/pgtable.h | 57 ++++++++---------------------------
arch/powerpc/include/asm/pte-common.h | 5 +++
arch/x86/Kconfig | 1 -
arch/x86/include/asm/pgtable_types.h | 7 +++++
include/asm-generic/pgtable.h | 27 ++++++-----------
init/Kconfig | 11 -------
6 files changed, 33 insertions(+), 75 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index d98c1ec..beeb09e 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -38,10 +38,9 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK)
static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }

#ifdef CONFIG_NUMA_BALANCING
-
static inline int pte_present(pte_t pte)
{
- return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+ return pte_val(pte) & _PAGE_NUMA_MASK;
}

#define pte_present_nonuma pte_present_nonuma
@@ -50,37 +49,6 @@ static inline int pte_present_nonuma(pte_t pte)
return pte_val(pte) & (_PAGE_PRESENT);
}

-#define pte_numa pte_numa
-static inline int pte_numa(pte_t pte)
-{
- return (pte_val(pte) &
- (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
-}
-
-#define pte_mknonnuma pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
- pte_val(pte) &= ~_PAGE_NUMA;
- pte_val(pte) |= _PAGE_PRESENT | _PAGE_ACCESSED;
- return pte;
-}
-
-#define pte_mknuma pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
- /*
- * We should not set _PAGE_NUMA on non present ptes. Also clear the
- * present bit so that hash_page will return 1 and we collect this
- * as numa fault.
- */
- if (pte_present(pte)) {
- pte_val(pte) |= _PAGE_NUMA;
- pte_val(pte) &= ~_PAGE_PRESENT;
- } else
- VM_BUG_ON(1);
- return pte;
-}
-
#define ptep_set_numa ptep_set_numa
static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
@@ -92,12 +60,6 @@ static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
return;
}

-#define pmd_numa pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
- return pte_numa(pmd_pte(pmd));
-}
-
#define pmdp_set_numa pmdp_set_numa
static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
@@ -109,16 +71,21 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
return;
}

-#define pmd_mknonnuma pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+/*
+ * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist
+ * which was inherited from x86. For the purposes of powerpc pte_basic_t and
+ * pmd_t are equivalent
+ */
+#define pteval_t pte_basic_t
+#define pmdval_t pmd_t
+static inline pteval_t ptenuma_flags(pte_t pte)
{
- return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
+ return pte_val(pte) & _PAGE_NUMA_MASK;
}

-#define pmd_mknuma pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
+static inline pmdval_t pmdnuma_flags(pmd_t pmd)
{
- return pte_pmd(pte_mknuma(pmd_pte(pmd)));
+	return pmd_val(pmd) & _PAGE_NUMA_MASK;
}

# else
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index 8d1569c..e040c35 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -98,6 +98,11 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
_PAGE_USER | _PAGE_ACCESSED | \
_PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC)

+#ifdef CONFIG_NUMA_BALANCING
+/* Mask of bits that distinguish present and numa ptes */
+#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT)
+#endif
+
/*
* We define 2 sets of base prot bits, one for basic pages (ie,
* cacheable kernel and user pages) and one for non cacheable
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d24887b..0a3f32b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,7 +28,6 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_INT128 if X86_64
- select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f216963..34ffe7e 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -129,6 +129,13 @@
_PAGE_SOFT_DIRTY | _PAGE_NUMA)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)

+#ifdef CONFIG_NUMA_BALANCING
+/* Set of bits that distinguishes present, prot_none and numa ptes */
+#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
+#define ptenuma_flags pte_flags
+#define pmdnuma_flags pmd_flags
+#endif /* CONFIG_NUMA_BALANCING */
+
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0)
#define _PAGE_CACHE_WC (_PAGE_PWT)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 53b2acc..196c124 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -660,11 +660,12 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
}

#ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
- * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
- * same bit too). It's set only when _PAGE_PRESET is not set and it's
- * never set if _PAGE_PRESENT is set.
+ * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that
+ * is protected for PROT_NONE and a NUMA hinting fault entry. If the
+ * architecture defines _PAGE_PROTNONE then it should take that into account
+ * but those that do not can rely on the fact that the NUMA hinting scanner
+ * skips inaccessible VMAs.
*
* pte/pmd_present() returns true if pte/pmd_numa returns true. Page
* fault triggers on those regions if pte/pmd_numa returns true
@@ -673,16 +674,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
#ifndef pte_numa
static inline int pte_numa(pte_t pte)
{
- return (pte_flags(pte) &
- (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
+ return (ptenuma_flags(pte) & _PAGE_NUMA_MASK) == _PAGE_NUMA;
}
#endif

#ifndef pmd_numa
static inline int pmd_numa(pmd_t pmd)
{
- return (pmd_flags(pmd) &
- (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
+ return (pmdnuma_flags(pmd) & _PAGE_NUMA_MASK) == _PAGE_NUMA;
}
#endif

@@ -722,6 +721,8 @@ static inline pte_t pte_mknuma(pte_t pte)
{
pteval_t val = pte_val(pte);

+ VM_BUG_ON(!(val & _PAGE_PRESENT));
+
val &= ~_PAGE_PRESENT;
val |= _PAGE_NUMA;

@@ -765,16 +766,6 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
}
#endif
#else
-extern int pte_numa(pte_t pte);
-extern int pmd_numa(pmd_t pmd);
-extern pte_t pte_mknonnuma(pte_t pte);
-extern pmd_t pmd_mknonnuma(pmd_t pmd);
-extern pte_t pte_mknuma(pte_t pte);
-extern pmd_t pmd_mknuma(pmd_t pmd);
-extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp);
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
-#else
static inline int pmd_numa(pmd_t pmd)
{
return 0;
diff --git a/init/Kconfig b/init/Kconfig
index 9d76b99..60fa415 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -844,17 +844,6 @@ config ARCH_SUPPORTS_INT128
config ARCH_WANT_NUMA_VARIABLE_LOCALITY
bool

-#
-# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
-config ARCH_WANTS_PROT_NUMA_PROT_NONE
- bool
-
-config ARCH_USES_NUMA_PROT_NONE
- bool
- default y
- depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
- depends on NUMA_BALANCING
-
config NUMA_BALANCING_DEFAULT_ENABLED
bool "Automatically enable NUMA aware memory/task placement"
default y