From patchwork Wed Aug 1 22:23:45 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Richard Henderson X-Patchwork-Id: 952438 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=sourceware.org (client-ip=209.132.180.131; helo=sourceware.org; envelope-from=libc-alpha-return-94987-incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Authentication-Results: ozlabs.org; dmarc=none (p=none dis=none) header.from=twiddle.net Authentication-Results: ozlabs.org; dkim=pass (2048-bit key; unprotected) header.d=gmail.com header.i=@gmail.com header.b="AUW5hRh8"; dkim-atps=neutral Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 41gnsm02bwz9s3q for ; Thu, 2 Aug 2018 08:24:27 +1000 (AEST) Received: (qmail 24395 invoked by alias); 1 Aug 2018 22:24:00 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 24267 invoked by uid 89); 1 Aug 2018 22:23:59 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-26.6 required=5.0 tests=BAYES_00, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, RCVD_IN_DNSWL_NONE, SPF_PASS autolearn=ham version=3.3.2 spammy=PROF, Jump X-HELO: mail-ua0-f173.google.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025; h=sender:from:to:cc:subject:date:message-id:in-reply-to:references; bh=sOhRp65/pGulKyAWQDLjGPI89+CTFVdgL3WNY/sc4X4=; 
b=AUW5hRh81h5n8nP4IbeGivMSIWMtM9SKr4mkjg9yZYmN8atGBDfDCYzvba79PiRxUo krJ33cOIFsOQnfRi/n3vUiy77hTOSXwQwNN05uX/qgQC1EomLcLNFcG1awAkNMBb4qS3 /kysswfSrYKnoc1r3+bk55t4yjg2IQBWN8AjZiQLPU7vCnMF94tGACb59FMNxRYtWXMO q+EaKhR1LTY4IDUnjlvvThm0GwJLgads7XwOmBN6N3cCrp+VEsiTqzQhcWplDR71Ne13 NgVqK+kWVa09qSO/aDAm/snG4L0EAcW8CUzZNOswknoCYAeU1BHZzDWuZHa1I2NEoHo4 Tynw== Sender: Richard Henderson From: rth@twiddle.net To: libc-alpha@sourceware.org Cc: marcus.shawcroft@linaro.org, szabolcs.nagy@arm.com, Richard Henderson Subject: [PATCH 1/3] aarch64: Clean up _dl_runtime_resolve Date: Wed, 1 Aug 2018 18:23:45 -0400 Message-Id: <20180801222347.18903-2-rth@twiddle.net> In-Reply-To: <20180801222347.18903-1-rth@twiddle.net> References: <20180801222347.18903-1-rth@twiddle.net> From: Richard Henderson * sysdeps/aarch64/dl-trampoline.S (_dl_runtime_resolve): Do not record unwind info for arguments; this is unneeded; do not save x9 just to have a register to pair with x8; properly include the 16 bytes of PLT stack into the unwind; create a frame pointer with the spare stack slot; rearrange the exit to only adjust the stack once. --- sysdeps/aarch64/dl-trampoline.S | 50 +++++++++------------------------ 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S index a86d0722d4..e8e2af485a 100644 --- a/sysdeps/aarch64/dl-trampoline.S +++ b/sysdeps/aarch64/dl-trampoline.S @@ -32,7 +32,6 @@ .text .globl _dl_runtime_resolve .type _dl_runtime_resolve, #function - cfi_startproc .align 2 _dl_runtime_resolve: /* AArch64 we get called with: @@ -41,46 +40,24 @@ _dl_runtime_resolve: [sp, #8] lr [sp, #0] &PLTGOT[n] */ - + cfi_startproc + cfi_adjust_cfa_offset(16) /* Incorporate PLT */ cfi_rel_offset (lr, 8) /* Save arguments. */ - stp x8, x9, [sp, #-(80+8*16)]! + stp x29, x8, [sp, #-(80+8*16)]! 
cfi_adjust_cfa_offset (80+8*16) - cfi_rel_offset (x8, 0) - cfi_rel_offset (x9, 8) + cfi_rel_offset (x29, 0) + mov x29, sp stp x6, x7, [sp, #16] - cfi_rel_offset (x6, 16) - cfi_rel_offset (x7, 24) - stp x4, x5, [sp, #32] - cfi_rel_offset (x4, 32) - cfi_rel_offset (x5, 40) - stp x2, x3, [sp, #48] - cfi_rel_offset (x2, 48) - cfi_rel_offset (x3, 56) - stp x0, x1, [sp, #64] - cfi_rel_offset (x0, 64) - cfi_rel_offset (x1, 72) - stp q0, q1, [sp, #(80+0*16)] - cfi_rel_offset (q0, 80+0*16) - cfi_rel_offset (q1, 80+1*16) - stp q2, q3, [sp, #(80+2*16)] - cfi_rel_offset (q0, 80+2*16) - cfi_rel_offset (q1, 80+3*16) - stp q4, q5, [sp, #(80+4*16)] - cfi_rel_offset (q0, 80+4*16) - cfi_rel_offset (q1, 80+5*16) - stp q6, q7, [sp, #(80+6*16)] - cfi_rel_offset (q0, 80+6*16) - cfi_rel_offset (q1, 80+7*16) /* Get pointer to linker struct. */ ldr PTR_REG (0), [ip0, #-PTR_SIZE] @@ -101,25 +78,26 @@ _dl_runtime_resolve: mov ip0, x0 /* Get arguments and return address back. */ - ldp q0, q1, [sp, #(80+0*16)] - ldp q2, q3, [sp, #(80+2*16)] - ldp q4, q5, [sp, #(80+4*16)] + ldr lr, [sp, #80+8*16+8] ldp q6, q7, [sp, #(80+6*16)] + ldp q4, q5, [sp, #(80+4*16)] + ldp q2, q3, [sp, #(80+2*16)] + ldp q0, q1, [sp, #(80+0*16)] ldp x0, x1, [sp, #64] ldp x2, x3, [sp, #48] ldp x4, x5, [sp, #32] ldp x6, x7, [sp, #16] - ldp x8, x9, [sp], #(80+8*16) - cfi_adjust_cfa_offset (-(80+8*16)) - - ldp ip1, lr, [sp], #16 - cfi_adjust_cfa_offset (-16) + ldp x29, x8, [sp], 80+8*16+16 + cfi_adjust_cfa_offset (-(80+8*16+16)) + cfi_restore (lr) + cfi_restore (x29) /* Jump to the newly found address. 
*/ br ip0 cfi_endproc .size _dl_runtime_resolve, .-_dl_runtime_resolve + #ifndef PROF .globl _dl_runtime_profile .type _dl_runtime_profile, #function From patchwork Wed Aug 1 22:23:46 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Richard Henderson X-Patchwork-Id: 952439 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=sourceware.org (client-ip=209.132.180.131; helo=sourceware.org; envelope-from=libc-alpha-return-94988-incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Authentication-Results: ozlabs.org; dmarc=none (p=none dis=none) header.from=twiddle.net Authentication-Results: ozlabs.org; dkim=pass (2048-bit key; unprotected) header.d=gmail.com header.i=@gmail.com header.b="ERiXIqAw"; dkim-atps=neutral Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 41gnt43rRCz9s3q for ; Thu, 2 Aug 2018 08:24:44 +1000 (AEST) Received: (qmail 25145 invoked by alias); 1 Aug 2018 22:24:05 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 24737 invoked by uid 89); 1 Aug 2018 22:24:02 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-26.6 required=5.0 tests=BAYES_00, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, RCVD_IN_DNSWL_NONE, SPF_PASS autolearn=ham version=3.3.2 spammy= X-HELO: mail-ua0-f172.google.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025; 
h=sender:from:to:cc:subject:date:message-id:in-reply-to:references; bh=aL1gAGPGZHQcFp7XdCKPoN1KsedXCn7XSo0W7UBK6Qo=; b=ERiXIqAwOubPdMusvYtOQVq5B6WNF9QOYwFpU+CAfGCxFW7xaudiCmUjruXfuUm1kp T3SvwayXPH8Ku19vFy8NKmlDWDCfa5D4ON3yWsQ4hvq/OY4Ok/Bhcuuy6AMWePRZeFHD /F52Y0U1jZHRZ6pWaiYv0sI6CA9ME/5diutyTZh7L7n+JDUxK5CJktrfE7rVzgz76SaH WncUInI0DlI7Em/aOfTYm8xIeKWH7FTeQh4MUATuWMLvcZLCsA0AClv2MvguJlJdFzqv b3jTDUoGmtGlGmvMmGRDCr4PTvbu4UfZv1+eQOdMltJpf21hY5eJn9H9h96gyB9K9y1b 1jCQ== Sender: Richard Henderson From: rth@twiddle.net To: libc-alpha@sourceware.org Cc: marcus.shawcroft@linaro.org, szabolcs.nagy@arm.com, Richard Henderson Subject: [PATCH 2/3] aarch64: Clean up _dl_runtime_profile Date: Wed, 1 Aug 2018 18:23:46 -0400 Message-Id: <20180801222347.18903-3-rth@twiddle.net> In-Reply-To: <20180801222347.18903-1-rth@twiddle.net> References: <20180801222347.18903-1-rth@twiddle.net> From: Richard Henderson Not adjusting La_aarch64_regs or La_aarch64_retval for the new AdvSIMD vector ABI; that will require more thought and coordination. In the meantime, this will at least pass the proper values to each callee, even if the values are not visible to auditing. * sysdeps/aarch64/dl-trampoline.S (_dl_runtime_profile): Do not record unwind info for arguments -- this is unneeded; properly include the 16 bytes of PLT stack into the unwind; save and restore the structure return pointer, x8; save all of the AdvSIMD registers defined for the vector ABI. 
Reviewed-By: Szabolcs Nagy --- sysdeps/aarch64/dl-trampoline.S | 138 ++++++++++++++++---------------- 1 file changed, 71 insertions(+), 67 deletions(-) diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S index e8e2af485a..67a7c1b207 100644 --- a/sysdeps/aarch64/dl-trampoline.S +++ b/sysdeps/aarch64/dl-trampoline.S @@ -101,7 +101,6 @@ _dl_runtime_resolve: #ifndef PROF .globl _dl_runtime_profile .type _dl_runtime_profile, #function - cfi_startproc .align 2 _dl_runtime_profile: /* AArch64 we get called with: @@ -111,15 +110,16 @@ _dl_runtime_profile: [sp, #0] &PLTGOT[n] Stack frame layout: - [sp, #...] lr - [sp, #...] &PLTGOT[n] - [sp, #96] La_aarch64_regs - [sp, #48] La_aarch64_retval - [sp, #40] frame size return from pltenter - [sp, #32] dl_profile_call saved x1 - [sp, #24] dl_profile_call saved x0 - [sp, #16] t1 - [sp, #0] x29, lr <- x29 + [x29, #...] lr + [x29, #...] &PLTGOT[n] + [x29, #96] La_aarch64_regs + [x29, #48] La_aarch64_retval + [x29, #40] frame size return from pltenter + [x29, #32] dl_profile_call saved x1 + [x29, #24] dl_profile_call saved x0 + [x29, #16] t1 + [x29, #0] x29, x8 + [x29, #-128] full q[0-7] contents */ # define OFFSET_T1 16 @@ -127,46 +127,39 @@ _dl_runtime_profile: # define OFFSET_FS OFFSET_SAVED_CALL_X0 + 16 # define OFFSET_RV OFFSET_FS + 8 # define OFFSET_RG OFFSET_RV + DL_SIZEOF_RV +# define OFFSET_SAVED_VEC (-16 * 8) -# define SF_SIZE OFFSET_RG + DL_SIZEOF_RG +# define SF_SIZE (OFFSET_RG + DL_SIZEOF_RG) # define OFFSET_PLTGOTN SF_SIZE # define OFFSET_LR OFFSET_PLTGOTN + 8 - /* Save arguments. 
*/ - sub sp, sp, #SF_SIZE - cfi_adjust_cfa_offset (SF_SIZE) - stp x29, x30, [SP, #0] - mov x29, sp - cfi_def_cfa_register (x29) - cfi_rel_offset (x29, 0) + cfi_startproc + cfi_adjust_cfa_offset(16) /* Incorporate PLT */ cfi_rel_offset (lr, 8) - stp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0] - cfi_rel_offset (x0, OFFSET_RG + DL_OFFSET_RG_X0 + 16*0 + 0) - cfi_rel_offset (x1, OFFSET_RG + DL_OFFSET_RG_X0 + 16*0 + 8) - stp x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1] - cfi_rel_offset (x2, OFFSET_RG + DL_OFFSET_RG_X0 + 16*1 + 0) - cfi_rel_offset (x3, OFFSET_RG + DL_OFFSET_RG_X0 + 16*1 + 8) - stp x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2] - cfi_rel_offset (x4, OFFSET_RG + DL_OFFSET_RG_X0 + 16*2 + 0) - cfi_rel_offset (x5, OFFSET_RG + DL_OFFSET_RG_X0 + 16*2 + 8) - stp x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3] - cfi_rel_offset (x6, OFFSET_RG + DL_OFFSET_RG_X0 + 16*3 + 0) - cfi_rel_offset (x7, OFFSET_RG + DL_OFFSET_RG_X0 + 16*3 + 8) + stp x29, x8, [SP, #-SF_SIZE]! + cfi_adjust_cfa_offset (SF_SIZE) + cfi_rel_offset (x29, 0) + mov x29, sp + cfi_def_cfa_register (x29) + sub sp, sp, #-OFFSET_SAVED_VEC - stp d0, d1, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*0] - cfi_rel_offset (d0, OFFSET_RG + DL_OFFSET_RG_D0 + 16*0) - cfi_rel_offset (d1, OFFSET_RG + DL_OFFSET_RG_D0 + 16*0 + 8) - stp d2, d3, [X29, #OFFSET_RG+ DL_OFFSET_RG_D0 + 16*1] - cfi_rel_offset (d2, OFFSET_RG + DL_OFFSET_RG_D0 + 16*1 + 0) - cfi_rel_offset (d3, OFFSET_RG + DL_OFFSET_RG_D0 + 16*1 + 8) - stp d4, d5, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*2] - cfi_rel_offset (d4, OFFSET_RG + DL_OFFSET_RG_D0 + 16*2 + 0) - cfi_rel_offset (d5, OFFSET_RG + DL_OFFSET_RG_D0 + 16*2 + 8) - stp d6, d7, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*3] - cfi_rel_offset (d6, OFFSET_RG + DL_OFFSET_RG_D0 + 16*3 + 0) - cfi_rel_offset (d7, OFFSET_RG + DL_OFFSET_RG_D0 + 16*3 + 8) + /* Save La_aarch64_regs. 
*/ + stp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0] + stp x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1] + stp x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2] + stp x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3] + stp d0, d1, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*0] + stp d2, d3, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*1] + stp d4, d5, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*2] + stp d6, d7, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*3] + + /* Re-save the full contents of the vector arguments. */ + stp q0, q1, [x29, #OFFSET_SAVED_VEC + 16*0] + stp q2, q3, [x29, #OFFSET_SAVED_VEC + 16*2] + stp q4, q5, [x29, #OFFSET_SAVED_VEC + 16*4] + stp q6, q7, [x29, #OFFSET_SAVED_VEC + 16*6] add x0, x29, #SF_SIZE + 16 ldr x1, [x29, #OFFSET_LR] @@ -201,31 +194,28 @@ _dl_runtime_profile: mov ip0, x0 /* Get arguments and return address back. */ + ldr lr, [x29, #OFFSET_LR] ldp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0] ldp x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1] ldp x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2] ldp x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3] - ldp d0, d1, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*0] - ldp d2, d3, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*1] - ldp d4, d5, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*2] - ldp d6, d7, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*3] + ldp q0, q1, [x29, #OFFSET_SAVED_VEC + 16*0] + ldp q2, q3, [x29, #OFFSET_SAVED_VEC + 16*2] + ldp q4, q5, [x29, #OFFSET_SAVED_VEC + 16*4] + ldp q6, q7, [x29, #OFFSET_SAVED_VEC + 16*6] - cfi_def_cfa_register (sp) - ldp x29, x30, [x29, #0] - cfi_restore(x29) - cfi_restore(x30) - - add sp, sp, SF_SIZE + 16 - cfi_adjust_cfa_offset (- SF_SIZE - 16) + mov sp, x29 + ldp x29, x8, [sp], SF_SIZE + 16 + cfi_def_cfa (sp, 0) + cfi_restore (x29) + cfi_restore (lr) /* Jump to the newly found address. */ br ip0 cfi_restore_state -1: - /* The new frame size is in ip0. */ - - sub PTR_REG (1), PTR_REG (29), ip0l + /* The new frame size is in ip0, extended for pointer size. 
*/ +1: sub x1, sp, ip0 and sp, x1, #0xfffffffffffffff0 str x0, [x29, #OFFSET_T1] @@ -237,42 +227,56 @@ _dl_runtime_profile: ldr ip0, [x29, #OFFSET_T1] - /* Call the function. */ + /* Load the original arguments. */ ldp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0] ldp x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1] ldp x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2] ldp x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3] - ldp d0, d1, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*0] - ldp d2, d3, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*1] - ldp d4, d5, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*2] - ldp d6, d7, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*3] + ldr x8, [x29, 8] + ldp q0, q1, [x29, #OFFSET_SAVED_VEC + 16*0] + ldp q2, q3, [x29, #OFFSET_SAVED_VEC + 16*2] + ldp q4, q5, [x29, #OFFSET_SAVED_VEC + 16*4] + ldp q6, q7, [x29, #OFFSET_SAVED_VEC + 16*6] + + /* Call the function. */ blr ip0 + + /* Save La_aarch64_retval. */ stp x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0] stp d0, d1, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*0] stp d2, d3, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*1] + /* Re-save the full contents of the vector return. */ + stp q0, q1, [x29, #OFFSET_SAVED_VEC + 16*0] + stp q2, q3, [x29, #OFFSET_SAVED_VEC + 16*2] + stp q4, q5, [x29, #OFFSET_SAVED_VEC + 16*4] + stp q6, q7, [x29, #OFFSET_SAVED_VEC + 16*6] + /* Setup call to pltexit */ ldp x0, x1, [x29, #OFFSET_SAVED_CALL_X0] add x2, x29, #OFFSET_RG add x3, x29, #OFFSET_RV bl _dl_call_pltexit + /* Restore the full return value. 
*/ ldp x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0] - ldp d0, d1, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*0] - ldp d2, d3, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*1] + ldp q0, q1, [x29, #OFFSET_SAVED_VEC + 16*0] + ldp q2, q3, [x29, #OFFSET_SAVED_VEC + 16*2] + ldp q4, q5, [x29, #OFFSET_SAVED_VEC + 16*4] + ldp q6, q7, [x29, #OFFSET_SAVED_VEC + 16*6] + /* LR from within La_aarch64_reg */ ldr lr, [x29, #OFFSET_RG + DL_OFFSET_RG_LR] - cfi_restore(lr) mov sp, x29 cfi_def_cfa_register (sp) ldr x29, [x29, #0] - cfi_restore(x29) add sp, sp, SF_SIZE + 16 cfi_adjust_cfa_offset (- SF_SIZE - 16) + cfi_restore(x29) + cfi_restore(lr) br lr cfi_endproc .size _dl_runtime_profile, .-_dl_runtime_profile #endif - .previous From patchwork Wed Aug 1 22:23:47 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Richard Henderson X-Patchwork-Id: 952440 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=sourceware.org (client-ip=209.132.180.131; helo=sourceware.org; envelope-from=libc-alpha-return-94989-incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Authentication-Results: ozlabs.org; dmarc=none (p=none dis=none) header.from=twiddle.net Authentication-Results: ozlabs.org; dkim=pass (2048-bit key; unprotected) header.d=gmail.com header.i=@gmail.com header.b="iDjhLltq"; dkim-atps=neutral Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 41gntB4tHKz9s4c for ; Thu, 2 Aug 2018 08:24:50 +1000 (AEST) Received: (qmail 25350 invoked by alias); 1 Aug 2018 22:24:06 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , 
Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 25028 invoked by uid 89); 1 Aug 2018 22:24:04 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-26.6 required=5.0 tests=BAYES_00, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, RCVD_IN_DNSWL_NONE, SPF_PASS autolearn=ham version=3.3.2 spammy=H*r:sk:h1-v6so, HX-Received:sk:l27-v6m, amended X-HELO: mail-ua0-f195.google.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025; h=sender:from:to:cc:subject:date:message-id:in-reply-to:references; bh=hBKtMhCjVGD6X8lBZBjb5AN6l81Hdrcy8nCWSz88XjE=; b=iDjhLltq88m0yQiPEIzUGI70PMyrCp6gvKcHldmRTr4sLgDf7DbnxfVxMGdq1+JzNA xw3FMLv7xYQdVZUu4vT9eZEwbLmwy4kl3IW9gzpLrFo6T5o+ZbsnBE1Jvzn1fbMdl8Sv gZPahMxhoT8KUUYZHMoMWnj9H7m/XVZ7RwMt+razhK8dHYdnylz2qCjRF84goB2CUwxm 8x8WbcWLs152EmOgv9csuqKwt+j2SqrBVsVq+Jb6rRN+fYN90G9sy5uKIR2/JHX4V1rm KFqn2qeMlCTIxwoLMr0xISNByGeNyLk3f0L/WJBe8BnCuBleGIBXnUH7lKnn8OYwnp1+ D75w== Sender: Richard Henderson From: rth@twiddle.net To: libc-alpha@sourceware.org Cc: marcus.shawcroft@linaro.org, szabolcs.nagy@arm.com, Richard Henderson Subject: [PATCH 3/3] aarch64: Save and restore SVE registers in ld.so Date: Wed, 1 Aug 2018 18:23:47 -0400 Message-Id: <20180801222347.18903-4-rth@twiddle.net> In-Reply-To: <20180801222347.18903-1-rth@twiddle.net> References: <20180801222347.18903-1-rth@twiddle.net> From: Richard Henderson Add SVE versions of _dl_runtime_resolve and _dl_runtime_profile. This honors the extended vector calling convention described in ARM_100986_0000_00_en (SVEpcs 00bet1). * sysdeps/aarch64/dl-trampoline.S (_dl_runtime_resolve_sve): New. (_dl_runtime_profile_sve): New. * sysdeps/aarch64/dl-machine.h (elf_machine_runtime_setup): Use the new routines if HWCAP_SVE is set. 
--- sysdeps/aarch64/dl-machine.h | 13 +- sysdeps/aarch64/dl-trampoline.S | 343 ++++++++++++++++++++++++++++++++ 2 files changed, 353 insertions(+), 3 deletions(-) diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h index 4935aa7c54..ea7c5c71d5 100644 --- a/sysdeps/aarch64/dl-machine.h +++ b/sysdeps/aarch64/dl-machine.h @@ -69,6 +69,9 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) ElfW(Addr) *got; extern void _dl_runtime_resolve (ElfW(Word)); extern void _dl_runtime_profile (ElfW(Word)); + extern void _dl_runtime_resolve_sve (ElfW(Word)); + extern void _dl_runtime_profile_sve (ElfW(Word)); + unsigned has_sve = GLRO(dl_hwcap) & HWCAP_SVE; got = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]); if (got[1]) @@ -83,9 +86,11 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) to intercept the calls to collect information. In this case we don't store the address in the GOT so that all future calls also end in this function. */ - if ( profile) + if (profile) { - got[2] = (ElfW(Addr)) &_dl_runtime_profile; + got[2] = (has_sve + ? (ElfW(Addr)) &_dl_runtime_profile_sve + : (ElfW(Addr)) &_dl_runtime_profile); if (GLRO(dl_profile) != NULL && _dl_name_match_p (GLRO(dl_profile), l)) @@ -98,7 +103,9 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) /* This function will get called to fix up the GOT entry indicated by the offset on the stack, and then jump to the resolved address. */ - got[2] = (ElfW(Addr)) &_dl_runtime_resolve; + got[2] = (has_sve + ? 
(ElfW(Addr)) &_dl_runtime_resolve_sve + : (ElfW(Addr)) &_dl_runtime_resolve); } } diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S index 67a7c1b207..e23e5f1aad 100644 --- a/sysdeps/aarch64/dl-trampoline.S +++ b/sysdeps/aarch64/dl-trampoline.S @@ -280,3 +280,346 @@ _dl_runtime_profile: cfi_endproc .size _dl_runtime_profile, .-_dl_runtime_profile #endif + +/* + * For functions conforming to the procedure call standard as + * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)), + * we must save the entire contents of Z0-Z7 as well as P0-P3. + */ + .arch armv8-a+sve + + .globl _dl_runtime_resolve_sve + .type _dl_runtime_resolve_sve, #function + .align 2 +_dl_runtime_resolve_sve: + /* AArch64 we get called with: + ip0 &PLTGOT[2] + ip1 temp(dl resolver entry point) + [sp, #8] lr + [sp, #0] &PLTGOT[n] + */ + cfi_startproc + cfi_adjust_cfa_offset(16) /* Incorporate PLT */ + cfi_rel_offset (lr, 8) + + /* Save arguments. */ + stp x29, x8, [sp, #-80]! + cfi_adjust_cfa_offset (80) + cfi_rel_offset (x29, 0) + mov x29, sp + cfi_def_cfa_register (x29) + + stp x6, x7, [sp, #16] + stp x4, x5, [sp, #32] + stp x2, x3, [sp, #48] + stp x0, x1, [sp, #64] + + /* Allocate space for, and store, Z[0-7]. */ + addvl sp, sp, #-8 + str z0, [sp, #0, mul vl] + str z1, [sp, #1, mul vl] + str z2, [sp, #2, mul vl] + str z3, [sp, #3, mul vl] + str z4, [sp, #4, mul vl] + str z5, [sp, #5, mul vl] + str z6, [sp, #6, mul vl] + str z7, [sp, #7, mul vl] + + /* Allocate space for, and store, P[0-3]. */ + addpl sp, sp, #-4 + str p0, [sp, #0, mul vl] + str p1, [sp, #1, mul vl] + str p2, [sp, #2, mul vl] + str p3, [sp, #3, mul vl] + + /* Get pointer to linker struct. */ + ldr PTR_REG (0), [ip0, #-PTR_SIZE] + + /* Prepare to call _dl_fixup(). */ + ldr x1, [x29, 80] /* Recover &PLTGOT[n] */ + + sub x1, x1, ip0 + add x1, x1, x1, lsl #1 + lsl x1, x1, #3 + sub x1, x1, #(RELA_SIZE<<3) + lsr x1, x1, #3 + + /* Call fixup routine. */ + bl _dl_fixup + + /* Save the return. 
*/ + mov ip0, x0 + + /* Get arguments and return address back. */ + ldr p0, [sp, #0, mul vl] + ldr p1, [sp, #1, mul vl] + ldr p2, [sp, #2, mul vl] + ldr p3, [sp, #3, mul vl] + addpl sp, sp, #4 + + ldr z0, [sp, #0, mul vl] + ldr z1, [sp, #1, mul vl] + ldr z2, [sp, #2, mul vl] + ldr z3, [sp, #3, mul vl] + ldr z4, [sp, #4, mul vl] + ldr z5, [sp, #5, mul vl] + ldr z6, [sp, #6, mul vl] + ldr z7, [sp, #7, mul vl] + addvl sp, sp, #8 + + ldr lr, [sp, #88] + ldp x0, x1, [sp, #64] + ldp x2, x3, [sp, #48] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #16] + ldp x29, x8, [sp], #96 + cfi_def_cfa (sp, 0) + cfi_restore (lr) + cfi_restore (x29) + + /* Jump to the newly found address. */ + br ip0 + + cfi_endproc + .size _dl_runtime_resolve_sve, .-_dl_runtime_resolve_sve + +#ifndef PROF + .globl _dl_runtime_profile_sve + .type _dl_runtime_profile_sve, #function + .align 2 +_dl_runtime_profile_sve: + /* AArch64 we get called with: + ip0 &PLTGOT[2] + ip1 temp(dl resolver entry point) + [sp, #8] lr + [sp, #0] &PLTGOT[n] + + Stack frame layout: + [x29, #...] lr + [x29, #...] &PLTGOT[n] + [x29, #96] La_aarch64_regs + [x29, #48] La_aarch64_retval + [x29, #40] frame size return from pltenter + [x29, #32] dl_profile_call saved x1 + [x29, #24] dl_profile_call saved x0 + [x29, #16] t1 + [x29, #0] x29, x8 <- x29 + [x29, #-1, mul vl] full p[0-3] + [x29, #-2, mul vl] full z[0-7] <- sp + + ??? Extending the profiling hook for full SVE register export + is tricky given the variable register size. Perhaps the new + La_aarch64_regs should contain pointers to Z0 and P0, and + the current VL, and one infers the addresses from there. + + This one new form could be used for all, with AdvSIMD + devolving into VL=16 with no predicate registers. + + In the meantime, this function simply saves the contents of + the SVE registers, but only exposes the AdvSIMD portion to + the profile hooks. 
+ */ + + cfi_startproc + cfi_adjust_cfa_offset(16) /* Incorporate PLT */ + cfi_rel_offset (lr, 8) + + stp x29, x8, [SP, #-SF_SIZE]! + cfi_adjust_cfa_offset (SF_SIZE) + cfi_rel_offset (x29, 0) + mov x29, sp + cfi_def_cfa_register (x29) + + /* Save La_aarch64_regs. */ + stp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0] + stp x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1] + stp x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2] + stp x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3] + stp d0, d1, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*0] + stp d2, d3, [X29, #OFFSET_RG+ DL_OFFSET_RG_D0 + 16*1] + stp d4, d5, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*2] + stp d6, d7, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*3] + + /* Re-save the full contents of the vector arguments. + + Note that PL = VL/8, so we can save all 4 predicates + in (less than) the space of one vector; this minimizes + the number of stack adjustments required, and gives a + predictable place for each register. + + Despite the unfortunate assembler mnemonics, the vector + stores do not overlap the preceding predicate stores. */ + addvl sp, sp, #-9 + + str p0, [x29, #-1, mul vl] + str p1, [x29, #-2, mul vl] + str p2, [x29, #-3, mul vl] + str p3, [x29, #-4, mul vl] + + str z0, [x29, #-2, mul vl] + str z1, [x29, #-3, mul vl] + str z2, [x29, #-4, mul vl] + str z3, [x29, #-5, mul vl] + str z4, [x29, #-6, mul vl] + str z5, [x29, #-7, mul vl] + str z6, [x29, #-8, mul vl] + str z7, [x29, #-9, mul vl] + + add x0, x29, #SF_SIZE + 16 + ldr x1, [x29, #OFFSET_LR] + stp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP] + + /* Get pointer to linker struct. */ + ldr PTR_REG (0), [ip0, #-PTR_SIZE] + + /* Prepare to call _dl_profile_fixup(). 
*/ + ldr x1, [x29, OFFSET_PLTGOTN] /* Recover &PLTGOT[n] */ + + sub x1, x1, ip0 + add x1, x1, x1, lsl #1 + lsl x1, x1, #3 + sub x1, x1, #(RELA_SIZE<<3) + lsr x1, x1, #3 + + stp x0, x1, [x29, #OFFSET_SAVED_CALL_X0] + + /* Set up extra args for _dl_profile_fixup */ + ldr x2, [x29, #OFFSET_LR] /* load saved LR */ + add x3, x29, #OFFSET_RG /* address of La_aarch64_reg */ + add x4, x29, #OFFSET_FS /* address of framesize */ + bl _dl_profile_fixup + + ldr ip0l, [x29, #OFFSET_FS] /* framesize == 0 */ + cmp ip0l, #0 + bge 1f + cfi_remember_state + + /* Save the return. */ + mov ip0, x0 + + /* Get arguments and return address back. */ + ldr p0, [x29, #-1, mul vl] + ldr p1, [x29, #-2, mul vl] + ldr p2, [x29, #-3, mul vl] + ldr p3, [x29, #-4, mul vl] + + ldr z0, [x29, #-2, mul vl] + ldr z1, [x29, #-3, mul vl] + ldr z2, [x29, #-4, mul vl] + ldr z3, [x29, #-5, mul vl] + ldr z4, [x29, #-6, mul vl] + ldr z5, [x29, #-7, mul vl] + ldr z6, [x29, #-8, mul vl] + ldr z7, [x29, #-9, mul vl] + + ldr lr, [x29, #OFFSET_LR] + ldp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0] + ldp x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1] + ldp x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2] + ldp x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3] + + mov sp, x29 + ldp x29, x8, [sp], SF_SIZE + 16 + cfi_def_cfa (sp, 0) + cfi_restore(x29) + cfi_restore(lr) + + /* Jump to the newly found address. */ + br ip0 + + cfi_restore_state + /* The new frame size is in ip0, extended for pointer size. */ +1: sub x1, sp, ip0 + and sp, x1, #0xfffffffffffffff0 + + str x0, [x29, #OFFSET_T1] + + mov x0, sp + add x1, x29, #SF_SIZE + 16 + mov x2, ip0 + bl memcpy + + ldr ip0, [x29, #OFFSET_T1] + + /* Reload the full arguments. 
*/ + ldp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0] + ldp x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1] + ldp x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2] + ldp x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3] + ldr x8, [x29, 8] + + ldr p0, [x29, #-1, mul vl] + ldr p1, [x29, #-2, mul vl] + ldr p2, [x29, #-3, mul vl] + ldr p3, [x29, #-4, mul vl] + + ldr z0, [x29, #-2, mul vl] + ldr z1, [x29, #-3, mul vl] + ldr z2, [x29, #-4, mul vl] + ldr z3, [x29, #-5, mul vl] + ldr z4, [x29, #-6, mul vl] + ldr z5, [x29, #-7, mul vl] + ldr z6, [x29, #-8, mul vl] + ldr z7, [x29, #-9, mul vl] + + /* Call the function. */ + blr ip0 + + /* Store La_aarch64_retval, as if for the non-vector ABI. */ + stp x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0] + stp d0, d1, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*0] + stp d2, d3, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*1] + + /* Store the full contents of the vector return. */ + str p0, [x29, #-1, mul vl] + str p1, [x29, #-2, mul vl] + str p2, [x29, #-3, mul vl] + str p3, [x29, #-4, mul vl] + + str z0, [x29, #-2, mul vl] + str z1, [x29, #-3, mul vl] + str z2, [x29, #-4, mul vl] + str z3, [x29, #-5, mul vl] + str z4, [x29, #-6, mul vl] + str z5, [x29, #-7, mul vl] + str z6, [x29, #-8, mul vl] + str z7, [x29, #-9, mul vl] + + /* Setup call to pltexit */ + ldp x0, x1, [x29, #OFFSET_SAVED_CALL_X0] + add x2, x29, #OFFSET_RG + add x3, x29, #OFFSET_RV + bl _dl_call_pltexit + + /* Reload the full return value. 
*/ + ldp x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0] + + ldr p0, [x29, #-1, mul vl] + ldr p1, [x29, #-2, mul vl] + ldr p2, [x29, #-3, mul vl] + ldr p3, [x29, #-4, mul vl] + + ldr z0, [x29, #-2, mul vl] + ldr z1, [x29, #-3, mul vl] + ldr z2, [x29, #-4, mul vl] + ldr z3, [x29, #-5, mul vl] + ldr z4, [x29, #-6, mul vl] + ldr z5, [x29, #-7, mul vl] + ldr z6, [x29, #-8, mul vl] + ldr z7, [x29, #-9, mul vl] + + /* LR from within La_aarch64_reg */ + ldr lr, [x29, #OFFSET_RG + DL_OFFSET_RG_LR] + mov sp, x29 + cfi_def_cfa_register (sp) + ldr x29, [x29, #0] + add sp, sp, SF_SIZE + 16 + cfi_adjust_cfa_offset (- SF_SIZE - 16) + cfi_restore(x29) + cfi_restore(lr) + + br lr + + cfi_endproc + .size _dl_runtime_profile_sve, .-_dl_runtime_profile_sve +#endif