Patch Detail

get:
Show a patch.

patch:
Update a patch (partial update).

put:
Update a patch (full update).
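A minimal sketch of driving this endpoint with the third-party Python
requests library. The API token is a placeholder and the "accepted"
state slug is illustrative; updating a patch is assumed to require
maintainer permissions on the project.

    import requests

    BASE = "http://patchwork.ozlabs.org/api"
    PATCH_ID = 811800

    # get: show a patch (read access needs no authentication).
    resp = requests.get(f"{BASE}/patches/{PATCH_ID}/")
    resp.raise_for_status()
    patch = resp.json()
    print(patch["name"], "->", patch["state"])

    # patch: partially update a patch, here its state. Patchwork's
    # REST API uses DRF token authentication (hypothetical token).
    resp = requests.patch(
        f"{BASE}/patches/{PATCH_ID}/",
        json={"state": "accepted"},
        headers={"Authorization": "Token YOUR-API-TOKEN"},
    )
    resp.raise_for_status()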
GET /api/patches/811800/?format=api
{ "id": 811800, "url": "http://patchwork.ozlabs.org/api/patches/811800/?format=api", "web_url": "http://patchwork.ozlabs.org/project/linuxppc-dev/patch/1504894024-2750-15-git-send-email-ldufour@linux.vnet.ibm.com/", "project": { "id": 2, "url": "http://patchwork.ozlabs.org/api/projects/2/?format=api", "name": "Linux PPC development", "link_name": "linuxppc-dev", "list_id": "linuxppc-dev.lists.ozlabs.org", "list_email": "linuxppc-dev@lists.ozlabs.org", "web_url": "https://github.com/linuxppc/wiki/wiki", "scm_url": "https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git", "webscm_url": "https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/", "list_archive_url": "https://lore.kernel.org/linuxppc-dev/", "list_archive_url_format": "https://lore.kernel.org/linuxppc-dev/{}/", "commit_url_format": "https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?id={}" }, "msgid": "<1504894024-2750-15-git-send-email-ldufour@linux.vnet.ibm.com>", "list_archive_url": "https://lore.kernel.org/linuxppc-dev/1504894024-2750-15-git-send-email-ldufour@linux.vnet.ibm.com/", "date": "2017-09-08T18:06:58", "name": "[v3,14/20] mm: Provide speculative fault infrastructure", "commit_ref": null, "pull_url": null, "state": "not-applicable", "archived": false, "hash": "cffa98f4952904f4a7669afd6e33856c70b977a3", "submitter": { "id": 40248, "url": "http://patchwork.ozlabs.org/api/people/40248/?format=api", "name": "Laurent Dufour", "email": "ldufour@linux.vnet.ibm.com" }, "delegate": null, "mbox": "http://patchwork.ozlabs.org/project/linuxppc-dev/patch/1504894024-2750-15-git-send-email-ldufour@linux.vnet.ibm.com/mbox/", "series": [ { "id": 2269, "url": "http://patchwork.ozlabs.org/api/series/2269/?format=api", "web_url": "http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=2269", "date": "2017-09-08T18:06:44", "name": "Speculative page faults", "version": 3, "mbox": "http://patchwork.ozlabs.org/series/2269/mbox/" } ], "comments": "http://patchwork.ozlabs.org/api/patches/811800/comments/", "check": "pending", "checks": "http://patchwork.ozlabs.org/api/patches/811800/checks/", "tags": {}, "related": [], "headers": { "Return-Path": "<linuxppc-dev-bounces+patchwork-incoming=ozlabs.org@lists.ozlabs.org>", "X-Original-To": [ "patchwork-incoming@ozlabs.org", "linuxppc-dev@lists.ozlabs.org" ], "Delivered-To": [ "patchwork-incoming@ozlabs.org", "linuxppc-dev@lists.ozlabs.org" ], "Received": [ "from lists.ozlabs.org (lists.ozlabs.org [103.22.144.68])\n\t(using TLSv1.2 with cipher ADH-AES256-GCM-SHA384 (256/256 bits))\n\t(No client certificate requested)\n\tby ozlabs.org (Postfix) with ESMTPS id 3xpmb54SMkz9sBd\n\tfor <patchwork-incoming@ozlabs.org>;\n\tSat, 9 Sep 2017 04:49:53 +1000 (AEST)", "from lists.ozlabs.org (lists.ozlabs.org [IPv6:2401:3900:2:1::3])\n\tby lists.ozlabs.org (Postfix) with ESMTP id 3xpmb53HCjzDrWZ\n\tfor <patchwork-incoming@ozlabs.org>;\n\tSat, 9 Sep 2017 04:49:53 +1000 (AEST)", "from mx0a-001b2d01.pphosted.com (mx0a-001b2d01.pphosted.com\n\t[148.163.156.1])\n\t(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256\n\tbits)) (No client certificate requested)\n\tby lists.ozlabs.org (Postfix) with ESMTPS id 3xplfs4Y9bzDrVn\n\tfor <linuxppc-dev@lists.ozlabs.org>;\n\tSat, 9 Sep 2017 04:08:05 +1000 (AEST)", "from pps.filterd (m0098404.ppops.net [127.0.0.1])\n\tby mx0a-001b2d01.pphosted.com (8.16.0.21/8.16.0.21) with SMTP id\n\tv88I6hcX022617\n\tfor <linuxppc-dev@lists.ozlabs.org>; Fri, 8 Sep 2017 14:08:03 -0400", "from e06smtp12.uk.ibm.com 
(e06smtp12.uk.ibm.com [195.75.94.108])\n\tby mx0a-001b2d01.pphosted.com with ESMTP id 2cuut55s05-1\n\t(version=TLSv1.2 cipher=AES256-SHA bits=256 verify=NOT)\n\tfor <linuxppc-dev@lists.ozlabs.org>; Fri, 08 Sep 2017 14:08:03 -0400", "from localhost\n\tby e06smtp12.uk.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use\n\tOnly! Violators will be prosecuted\n\tfor <linuxppc-dev@lists.ozlabs.org> from <ldufour@linux.vnet.ibm.com>;\n\tFri, 8 Sep 2017 19:08:00 +0100", "from b06cxnps3075.portsmouth.uk.ibm.com (9.149.109.195)\n\tby e06smtp12.uk.ibm.com (192.168.101.142) with IBM ESMTP SMTP\n\tGateway: Authorized Use Only! Violators will be prosecuted; \n\tFri, 8 Sep 2017 19:07:54 +0100", "from d06av24.portsmouth.uk.ibm.com (mk.ibm.com [9.149.105.60])\n\tby b06cxnps3075.portsmouth.uk.ibm.com (8.14.9/8.14.9/NCO v10.0) with\n\tESMTP id v88I7sWe27197474; Fri, 8 Sep 2017 18:07:54 GMT", "from d06av24.portsmouth.uk.ibm.com (unknown [127.0.0.1])\n\tby IMSVA (Postfix) with ESMTP id 337AE42042;\n\tFri, 8 Sep 2017 19:04:21 +0100 (BST)", "from d06av24.portsmouth.uk.ibm.com (unknown [127.0.0.1])\n\tby IMSVA (Postfix) with ESMTP id 0032942041;\n\tFri, 8 Sep 2017 19:04:18 +0100 (BST)", "from nimbus.lab.toulouse-stg.fr.ibm.com (unknown [9.145.31.125])\n\tby d06av24.portsmouth.uk.ibm.com (Postfix) with ESMTP;\n\tFri, 8 Sep 2017 19:04:17 +0100 (BST)" ], "Authentication-Results": "ozlabs.org;\n\tspf=none (mailfrom) smtp.mailfrom=linux.vnet.ibm.com\n\t(client-ip=148.163.156.1; helo=mx0a-001b2d01.pphosted.com;\n\tenvelope-from=ldufour@linux.vnet.ibm.com; receiver=<UNKNOWN>)", "From": "Laurent Dufour <ldufour@linux.vnet.ibm.com>", "To": "paulmck@linux.vnet.ibm.com, peterz@infradead.org,\n\takpm@linux-foundation.org, kirill@shutemov.name, ak@linux.intel.com, \n\tmhocko@kernel.org, dave@stgolabs.net, jack@suse.cz,\n\tMatthew Wilcox <willy@infradead.org>, benh@kernel.crashing.org,\n\tmpe@ellerman.id.au, paulus@samba.org,\n\tThomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@redhat.com>, \n\thpa@zytor.com, Will Deacon <will.deacon@arm.com>,\n\tSergey Senozhatsky <sergey.senozhatsky@gmail.com>", "Subject": "[PATCH v3 14/20] mm: Provide speculative fault infrastructure", "Date": "Fri, 8 Sep 2017 20:06:58 +0200", "X-Mailer": "git-send-email 2.7.4", "In-Reply-To": "<1504894024-2750-1-git-send-email-ldufour@linux.vnet.ibm.com>", "References": "<1504894024-2750-1-git-send-email-ldufour@linux.vnet.ibm.com>", "X-TM-AS-GCONF": "00", "x-cbid": "17090818-0008-0000-0000-000004959E6B", "X-IBM-AV-DETECTION": "SAVI=unused REMOTE=unused XFE=unused", "x-cbparentid": "17090818-0009-0000-0000-00001E26A5FB", "Message-Id": "<1504894024-2750-15-git-send-email-ldufour@linux.vnet.ibm.com>", "X-Proofpoint-Virus-Version": "vendor=fsecure engine=2.50.10432:, ,\n\tdefinitions=2017-09-08_12:, , signatures=0", "X-Proofpoint-Spam-Details": "rule=outbound_notspam policy=outbound score=0\n\tspamscore=0 suspectscore=2\n\tmalwarescore=0 phishscore=0 adultscore=0 bulkscore=0 classifier=spam\n\tadjust=0 reason=mlx scancount=1 engine=8.0.1-1707230000\n\tdefinitions=main-1709080270", "X-BeenThere": "linuxppc-dev@lists.ozlabs.org", "X-Mailman-Version": "2.1.23", "Precedence": "list", "List-Id": "Linux on PowerPC Developers Mail List\n\t<linuxppc-dev.lists.ozlabs.org>", "List-Unsubscribe": "<https://lists.ozlabs.org/options/linuxppc-dev>,\n\t<mailto:linuxppc-dev-request@lists.ozlabs.org?subject=unsubscribe>", "List-Archive": "<http://lists.ozlabs.org/pipermail/linuxppc-dev/>", "List-Post": "<mailto:linuxppc-dev@lists.ozlabs.org>", "List-Help": 
"<mailto:linuxppc-dev-request@lists.ozlabs.org?subject=help>", "List-Subscribe": "<https://lists.ozlabs.org/listinfo/linuxppc-dev>,\n\t<mailto:linuxppc-dev-request@lists.ozlabs.org?subject=subscribe>", "Cc": "linuxppc-dev@lists.ozlabs.org, x86@kernel.org,\n\tlinux-kernel@vger.kernel.org, npiggin@gmail.com, linux-mm@kvack.org,\n\tTim Chen <tim.c.chen@linux.intel.com>, \n\tharen@linux.vnet.ibm.com, khandual@linux.vnet.ibm.com", "Errors-To": "linuxppc-dev-bounces+patchwork-incoming=ozlabs.org@lists.ozlabs.org", "Sender": "\"Linuxppc-dev\"\n\t<linuxppc-dev-bounces+patchwork-incoming=ozlabs.org@lists.ozlabs.org>" }, "content": "From: Peter Zijlstra <peterz@infradead.org>\n\nProvide infrastructure to do a speculative fault (not holding\nmmap_sem).\n\nThe not holding of mmap_sem means we can race against VMA\nchange/removal and page-table destruction. We use the SRCU VMA freeing\nto keep the VMA around. We use the VMA seqcount to detect change\n(including umapping / page-table deletion) and we use gup_fast() style\npage-table walking to deal with page-table races.\n\nOnce we've obtained the page and are ready to update the PTE, we\nvalidate if the state we started the fault with is still valid, if\nnot, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the\nPTE and we're done.\n\nSigned-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>\n\n[Manage the newly introduced pte_spinlock() for speculative page\n fault to fail if the VMA is touched in our back]\n[Rename vma_is_dead() to vma_has_changed() and declare it here]\n[Call p4d_alloc() as it is safe since pgd is valid]\n[Call pud_alloc() as it is safe since p4d is valid]\n[Set fe.sequence in __handle_mm_fault()]\n[Abort speculative path when handle_userfault() has to be called]\n[Add additional VMA's flags checks in handle_speculative_fault()]\n[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]\n[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]\n[Remove warning comment about waiting for !seq&1 since we don't want\n to wait]\n[Remove warning about no huge page support, mention it explictly]\n[Don't call do_fault() in the speculative path as __do_fault() calls\n vma->vm_ops->fault() which may want to release mmap_sem]\n[Only vm_fault pointer argument for vma_has_changed()]\n[Fix check against huge page, calling pmd_trans_huge()]\n[Introduce __HAVE_ARCH_CALL_SPF to declare the SPF handler only when\n architecture is supporting it]\n[Use READ_ONCE() when reading VMA's fields in the speculative path]\n[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support for\n processing done in vm_normal_page()]\n[Check that vma->anon_vma is already set when starting the speculative\n path]\n[Check for memory policy as we can't support MPOL_INTERLEAVE case due to\n the processing done in mpol_misplaced()]\n[Don't support VMA growing up or down]\n[Move check on vm_sequence just before calling handle_pte_fault()]\nSigned-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>\n---\n include/linux/hugetlb_inline.h | 2 +-\n include/linux/mm.h | 5 +\n include/linux/pagemap.h | 4 +-\n mm/internal.h | 14 +++\n mm/memory.c | 249 ++++++++++++++++++++++++++++++++++++++++-\n 5 files changed, 266 insertions(+), 8 deletions(-)", "diff": "diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h\nindex a4e7ca0f3585..6cfdfca4cc2a 100644\n--- a/include/linux/hugetlb_inline.h\n+++ b/include/linux/hugetlb_inline.h\n@@ -7,7 +7,7 @@\n \n static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)\n {\n-\treturn 
!!(vma->vm_flags & VM_HUGETLB);\n+\treturn !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);\n }\n \n #else\ndiff --git a/include/linux/mm.h b/include/linux/mm.h\nindex a2857aaa03f1..966b69f10f57 100644\n--- a/include/linux/mm.h\n+++ b/include/linux/mm.h\n@@ -320,6 +320,7 @@ struct vm_fault {\n \tgfp_t gfp_mask;\t\t\t/* gfp mask to be used for allocations */\n \tpgoff_t pgoff;\t\t\t/* Logical page offset based on vma */\n \tunsigned long address;\t\t/* Faulting virtual address */\n+\tunsigned int sequence;\n \tpmd_t *pmd;\t\t\t/* Pointer to pmd entry matching\n \t\t\t\t\t * the 'address' */\n \tpud_t *pud;\t\t\t/* Pointer to pud entry matching\n@@ -1342,6 +1343,10 @@ int invalidate_inode_page(struct page *page);\n #ifdef CONFIG_MMU\n extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,\n \t\tunsigned int flags);\n+#ifdef __HAVE_ARCH_CALL_SPF\n+extern int handle_speculative_fault(struct mm_struct *mm,\n+\t\t\t\t unsigned long address, unsigned int flags);\n+#endif /* __HAVE_ARCH_CALL_SPF */\n extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,\n \t\t\t unsigned long address, unsigned int fault_flags,\n \t\t\t bool *unlocked);\ndiff --git a/include/linux/pagemap.h b/include/linux/pagemap.h\nindex 5bbd6780f205..832aa3ec7d00 100644\n--- a/include/linux/pagemap.h\n+++ b/include/linux/pagemap.h\n@@ -451,8 +451,8 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,\n \tpgoff_t pgoff;\n \tif (unlikely(is_vm_hugetlb_page(vma)))\n \t\treturn linear_hugepage_index(vma, address);\n-\tpgoff = (address - vma->vm_start) >> PAGE_SHIFT;\n-\tpgoff += vma->vm_pgoff;\n+\tpgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;\n+\tpgoff += READ_ONCE(vma->vm_pgoff);\n \treturn pgoff;\n }\n \ndiff --git a/mm/internal.h b/mm/internal.h\nindex 84360184eafd..4ddadc440c26 100644\n--- a/mm/internal.h\n+++ b/mm/internal.h\n@@ -45,6 +45,20 @@ extern struct srcu_struct vma_srcu;\n extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm,\n \t\t\t\t\t unsigned long addr);\n \n+static inline bool vma_has_changed(struct vm_fault *vmf)\n+{\n+\tint ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);\n+\tunsigned seq = ACCESS_ONCE(vmf->vma->vm_sequence.sequence);\n+\n+\t/*\n+\t * Matches both the wmb in write_seqlock_{begin,end}() and\n+\t * the wmb in vma_rb_erase().\n+\t */\n+\tsmp_rmb();\n+\n+\treturn ret || seq != vmf->sequence;\n+}\n+\n void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,\n \t\tunsigned long floor, unsigned long ceiling);\n \ndiff --git a/mm/memory.c b/mm/memory.c\nindex 479b47a8ed7c..5e98259c7ac0 100644\n--- a/mm/memory.c\n+++ b/mm/memory.c\n@@ -762,7 +762,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,\n \tif (page)\n \t\tdump_page(page, \"bad pte\");\n \tpr_alert(\"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\\n\",\n-\t\t (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);\n+\t\t (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma,\n+\t\t mapping, index);\n \t/*\n \t * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y\n \t */\n@@ -2417,15 +2418,69 @@ static inline void wp_page_reuse(struct vm_fault *vmf)\n \n static bool pte_spinlock(struct vm_fault *vmf)\n {\n+\tbool ret = false;\n+\n+\t/* Check if vma is still valid */\n+\tif (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {\n+\t\tvmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);\n+\t\tspin_lock(vmf->ptl);\n+\t\treturn true;\n+\t}\n+\n+\tlocal_irq_disable();\n+\tif (vma_has_changed(vmf))\n+\t\tgoto out;\n+\n 
\tvmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);\n \tspin_lock(vmf->ptl);\n-\treturn true;\n+\n+\tif (vma_has_changed(vmf)) {\n+\t\tspin_unlock(vmf->ptl);\n+\t\tgoto out;\n+\t}\n+\n+\tret = true;\n+out:\n+\tlocal_irq_enable();\n+\treturn ret;\n }\n \n static bool pte_map_lock(struct vm_fault *vmf)\n {\n-\tvmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);\n-\treturn true;\n+\tbool ret = false;\n+\tpte_t *pte;\n+\tspinlock_t *ptl;\n+\n+\tif (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {\n+\t\tvmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,\n+\t\t\t\t\t vmf->address, &vmf->ptl);\n+\t\treturn true;\n+\t}\n+\n+\t/*\n+\t * The first vma_has_changed() guarantees the page-tables are still\n+\t * valid, having IRQs disabled ensures they stay around, hence the\n+\t * second vma_has_changed() to make sure they are still valid once\n+\t * we've got the lock. After that a concurrent zap_pte_range() will\n+\t * block on the PTL and thus we're safe.\n+\t */\n+\tlocal_irq_disable();\n+\tif (vma_has_changed(vmf))\n+\t\tgoto out;\n+\n+\tpte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,\n+\t\t\t\t vmf->address, &ptl);\n+\tif (vma_has_changed(vmf)) {\n+\t\tpte_unmap_unlock(pte, ptl);\n+\t\tgoto out;\n+\t}\n+\n+\tvmf->pte = pte;\n+\tvmf->ptl = ptl;\n+\tret = true;\n+out:\n+\tlocal_irq_enable();\n+\treturn ret;\n }\n \n /*\n@@ -3094,6 +3149,14 @@ static int do_anonymous_page(struct vm_fault *vmf)\n \t\tret = check_stable_address_space(vma->vm_mm);\n \t\tif (ret)\n \t\t\tgoto unlock;\n+\t\t/*\n+\t\t * Don't call the userfaultfd during the speculative path.\n+\t\t * We already checked for the VMA to not be managed through\n+\t\t * userfaultfd, but it may be set in our back once we have lock\n+\t\t * the pte. In such a case we can ignore it this time.\n+\t\t */\n+\t\tif (vmf->flags & FAULT_FLAG_SPECULATIVE)\n+\t\t\tgoto setpte;\n \t\t/* Deliver the page fault to userland, check inside PT lock */\n \t\tif (userfaultfd_missing(vma)) {\n \t\t\tpte_unmap_unlock(vmf->pte, vmf->ptl);\n@@ -3136,7 +3199,7 @@ static int do_anonymous_page(struct vm_fault *vmf)\n \t\tgoto release;\n \n \t/* Deliver the page fault to userland, check inside PT lock */\n-\tif (userfaultfd_missing(vma)) {\n+\tif (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) {\n \t\tpte_unmap_unlock(vmf->pte, vmf->ptl);\n \t\tmem_cgroup_cancel_charge(page, memcg, false);\n \t\tput_page(page);\n@@ -3915,6 +3978,8 @@ static int handle_pte_fault(struct vm_fault *vmf)\n \tif (!vmf->pte) {\n \t\tif (vma_is_anonymous(vmf->vma))\n \t\t\treturn do_anonymous_page(vmf);\n+\t\telse if (vmf->flags & FAULT_FLAG_SPECULATIVE)\n+\t\t\treturn VM_FAULT_RETRY;\n \t\telse\n \t\t\treturn do_fault(vmf);\n \t}\n@@ -4012,6 +4077,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,\n \tvmf.pmd = pmd_alloc(mm, vmf.pud, address);\n \tif (!vmf.pmd)\n \t\treturn VM_FAULT_OOM;\n+\tvmf.sequence = raw_read_seqcount(&vma->vm_sequence);\n \tif (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {\n \t\tret = create_huge_pmd(&vmf);\n \t\tif (!(ret & VM_FAULT_FALLBACK))\n@@ -4045,6 +4111,179 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,\n \treturn handle_pte_fault(&vmf);\n }\n \n+#ifdef __HAVE_ARCH_CALL_SPF\n+\n+#ifndef __HAVE_ARCH_PTE_SPECIAL\n+/* This is required by vm_normal_page() */\n+#error \"Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL\"\n+#endif\n+\n+/*\n+ * vm_normal_page() adds some processing which should be done while\n+ * 
hodling the mmap_sem.\n+ */\n+int handle_speculative_fault(struct mm_struct *mm, unsigned long address,\n+\t\t\t unsigned int flags)\n+{\n+\tstruct vm_fault vmf = {\n+\t\t.address = address,\n+\t};\n+\tpgd_t *pgd;\n+\tp4d_t *p4d;\n+\tpud_t *pud;\n+\tpmd_t *pmd;\n+\tint dead, seq, idx, ret = VM_FAULT_RETRY;\n+\tstruct vm_area_struct *vma;\n+#ifdef CONFIG_NUMA\n+\tstruct mempolicy *pol;\n+#endif\n+\n+\t/* Clear flags that may lead to release the mmap_sem to retry */\n+\tflags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);\n+\tflags |= FAULT_FLAG_SPECULATIVE;\n+\n+\tidx = srcu_read_lock(&vma_srcu);\n+\tvma = find_vma_srcu(mm, address);\n+\tif (!vma)\n+\t\tgoto unlock;\n+\n+\t/*\n+\t * Validate the VMA found by the lockless lookup.\n+\t */\n+\tdead = RB_EMPTY_NODE(&vma->vm_rb);\n+\tseq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock,vma_rb_erase() */\n+\tif ((seq & 1) || dead)\n+\t\tgoto unlock;\n+\n+\t/*\n+\t * Can't call vm_ops service has we don't know what they would do\n+\t * with the VMA.\n+\t * This include huge page from hugetlbfs.\n+\t */\n+\tif (vma->vm_ops)\n+\t\tgoto unlock;\n+\n+\t/*\n+\t * __anon_vma_prepare() requires the mmap_sem to be held\n+\t * because vm_next and vm_prev must be safe. This can't be guaranteed\n+\t * in the speculative path.\n+\t */\n+\tif (unlikely(!vma->anon_vma))\n+\t\tgoto unlock;\n+\n+\tvmf.vma_flags = READ_ONCE(vma->vm_flags);\n+\tvmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);\n+\n+\t/* Can't call userland page fault handler in the speculative path */\n+\tif (unlikely(vmf.vma_flags & VM_UFFD_MISSING))\n+\t\tgoto unlock;\n+\n+#ifdef CONFIG_NUMA\n+\t/*\n+\t * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which\n+\t * are not compatible with the speculative page fault processing.\n+\t */\n+\tpol = __get_vma_policy(vma, address);\n+\tif (!pol)\n+\t\tpol = get_task_policy(current);\n+\tif (pol && pol->mode == MPOL_INTERLEAVE)\n+\t\tgoto unlock;\n+#endif\n+\n+\tif (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)\n+\t\t/*\n+\t\t * This could be detected by the check address against VMA's\n+\t\t * boundaries but we want to trace it as not supported instead\n+\t\t * of changed.\n+\t\t */\n+\t\tgoto unlock;\n+\n+\tif (address < READ_ONCE(vma->vm_start)\n+\t || READ_ONCE(vma->vm_end) <= address)\n+\t\tgoto unlock;\n+\n+\tif (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,\n+\t\t\t\t flags & FAULT_FLAG_INSTRUCTION,\n+\t\t\t\t flags & FAULT_FLAG_REMOTE)) {\n+\t\tret = VM_FAULT_SIGSEGV;\n+\t\tgoto unlock;\n+\t}\n+\n+\t/* This is one is required to check that the VMA has write access set */\n+\tif (flags & FAULT_FLAG_WRITE) {\n+\t\tif (unlikely(!(vmf.vma_flags & VM_WRITE))) {\n+\t\t\tret = VM_FAULT_SIGSEGV;\n+\t\t\tgoto unlock;\n+\t\t}\n+\t} else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {\n+\t\tret = VM_FAULT_SIGSEGV;\n+\t\tgoto unlock;\n+\t}\n+\n+\t/*\n+\t * Do a speculative lookup of the PTE entry.\n+\t */\n+\tlocal_irq_disable();\n+\tpgd = pgd_offset(mm, address);\n+\tif (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))\n+\t\tgoto out_walk;\n+\n+\tp4d = p4d_alloc(mm, pgd, address);\n+\tif (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))\n+\t\tgoto out_walk;\n+\n+\tpud = pud_alloc(mm, p4d, address);\n+\tif (pud_none(*pud) || unlikely(pud_bad(*pud)))\n+\t\tgoto out_walk;\n+\n+\t/* Transparent huge pages are not supported. 
*/\n+\tif (unlikely(pud_trans_huge(*pud)))\n+\t\tgoto out_walk;\n+\n+\tpmd = pmd_offset(pud, address);\n+\tif (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))\n+\t\tgoto out_walk;\n+\n+\t/*\n+\t * The above does not allocate/instantiate page-tables because doing so\n+\t * would lead to the possibility of instantiating page-tables after\n+\t * free_pgtables() -- and consequently leaking them.\n+\t *\n+\t * The result is that we take at least one !speculative fault per PMD\n+\t * in order to instantiate it.\n+\t */\n+\t/* Transparent huge pages are not supported. */\n+\tif (unlikely(pmd_trans_huge(*pmd)))\n+\t\tgoto out_walk;\n+\n+\tvmf.vma = vma;\n+\tvmf.pmd = pmd;\n+\tvmf.pgoff = linear_page_index(vma, address);\n+\tvmf.gfp_mask = __get_fault_gfp_mask(vma);\n+\tvmf.sequence = seq;\n+\tvmf.flags = flags;\n+\n+\tlocal_irq_enable();\n+\n+\t/*\n+\t * We need to re-validate the VMA after checking the bounds, otherwise\n+\t * we might have a false positive on the bounds.\n+\t */\n+\tif (read_seqcount_retry(&vma->vm_sequence, seq))\n+\t\tgoto unlock;\n+\n+\tret = handle_pte_fault(&vmf);\n+\n+unlock:\n+\tsrcu_read_unlock(&vma_srcu, idx);\n+\treturn ret;\n+\n+out_walk:\n+\tlocal_irq_enable();\n+\tgoto unlock;\n+}\n+#endif /* __HAVE_ARCH_CALL_SPF */\n+\n /*\n * By the time we get here, we already hold the mm semaphore\n *\n", "prefixes": [ "v3", "14/20" ] }
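The response body carries everything needed to work with the patch
programmatically: the "mbox" field points at a raw download suitable
for git am, and each "series" entry links the patch to the rest of its
posting. A minimal sketch, assuming the same public instance and the
requests library as above:

    import requests

    patch = requests.get(
        "http://patchwork.ozlabs.org/api/patches/811800/"
    ).json()

    # Download the raw patch in mbox format, ready for `git am`.
    with open("patch.mbox", "w") as f:
        f.write(requests.get(patch["mbox"]).text)

    # A patch records the series it was posted in; the series mbox
    # bundles all patches of "Speculative page faults" v3 at once.
    for series in patch["series"]:
        print(series["version"], series["name"], series["mbox"])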