diff mbox series

[v3,19/28] arm64/sve: ptrace and ELF coredump support

Message ID 1507660725-7986-20-git-send-email-Dave.Martin@arm.com
State New
Headers show
Series ARM Scalable Vector Extension (SVE) | expand

Commit Message

Dave Martin Oct. 10, 2017, 6:38 p.m. UTC
This patch defines and implements a new regset NT_ARM_SVE, which
describes a thread's SVE register state.  This allows a debugger to
manipulate the SVE state, as well as being included in ELF
coredumps for post-mortem debugging.

Because the regset size and layout are dependent on the thread's
current vector length, it is not possible to define a C struct to
describe the regset contents as is done for existing regsets.
Instead, and for the same reasons, NT_ARM_SVE is based on the
freeform variable-layout approach used for the SVE signal frame.

Additionally, to reduce debug overhead when debugging threads that
might or might not have live SVE register state, NT_ARM_SVE may be
presented in one of two different formats: the old struct
user_fpsimd_state format is embedded for describing the state of a
thread with no live SVE state, whereas a new variable-layout
structure is embedded for describing live SVE state.  This avoids a
debugger needing to poll NT_PRFPREG in addition to NT_ARM_SVE, and
allows existing userspace code to handle the non-SVE case without
too much modification.

For this to work, NT_ARM_SVE is defined with a fixed-format header
of type struct user_sve_header, which the recipient can use to
figure out the content, size and layout of the reset of the regset.
Accessor macros are defined to allow the vector-length-dependent
parts of the regset to be manipulated.

Signed-off-by: Alan Hayward <alan.hayward@arm.com>
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Cc: Alex Bennée <alex.bennee@linaro.org>
Cc: Okamoto Takayuki <tokamoto@jp.fujitsu.com>

---

Dropped Alex Bennée's reviewed-by, since bug fixes have been applied.

Changes since v2
----------------

Bug fixes:

 * Initialised user_sve_header.sve_max_vl properly in
   sve_init_header_from_task().  (Reported by Okamoto Takayuki.)

   This bug was introduced by the refactoring since v1 to remove
   BUG_ON()s.  A WARN_ON() was introduced to fix up the error case
   here, but the code for the non-error case was lost.

   This resulted in userspace seeing max_size == 0 when reading
   NT_ARM_SVE with PTRACE_GETREGSET.  NT_ARM_SVE would also get
   truncated in coredmups (though I've not tested that).

   This fix ensures that max_size is initialised, and only overrides
   the value written if it would be garbage.

 * In sve_set(), the flags for sve_set_vector_length() are shifted
   into the correct position.  Without this, a PTRACE_SETREGSET for
   NT_ARM_SVE may reject valid flags (including flags read via
   PTRACE_GETREGSET) with -EINVAL: thus, legitimate uses including
   save/restore may not work.

Miscellaneous:

 * Clarified comment in ptrace.h about keeping flags in sync with
   prctl.h so that it won't be misinterpreted as applying to
   SVE_PT_REGS_{MASK, FPSIMD,SVE} (which deliberately have no prctl
   equivalent).

 * Added comments explaining the intent, purpose and basic constraints
   for fpsimd.c helpers.
---
 arch/arm64/include/asm/fpsimd.h      |  13 +-
 arch/arm64/include/uapi/asm/ptrace.h | 138 ++++++++++++++++++
 arch/arm64/kernel/fpsimd.c           |  60 ++++++++
 arch/arm64/kernel/ptrace.c           | 271 +++++++++++++++++++++++++++++++++--
 include/uapi/linux/elf.h             |   1 +
 5 files changed, 474 insertions(+), 9 deletions(-)

Comments

Catalin Marinas Oct. 12, 2017, 5:06 p.m. UTC | #1
On Tue, Oct 10, 2017 at 07:38:36PM +0100, Dave P Martin wrote:
> @@ -702,6 +737,211 @@ static int system_call_set(struct task_struct *target,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_ARM64_SVE
> +
> +static void sve_init_header_from_task(struct user_sve_header *header,
> +				      struct task_struct *target)
> +{
> +	unsigned int vq;
> +
> +	memset(header, 0, sizeof(*header));
> +
> +	header->flags = test_tsk_thread_flag(target, TIF_SVE) ?
> +		SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD;

For PTRACE_SYSCALL, we may or may not have TIF_SVE depending on what
happened with the target. Just a thought: shall we clear TIF_SVE (and
sync it to fpsimd) in syscall_trace_enter()?

> +	if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT))
> +		header->flags |= SVE_PT_VL_INHERIT;
> +
> +	header->vl = target->thread.sve_vl;
> +	vq = sve_vq_from_vl(header->vl);
> +
> +	header->max_vl = sve_max_vl;
> +	if (WARN_ON(!sve_vl_valid(sve_max_vl)))
> +		header->max_vl = header->vl;
> +
> +	header->size = SVE_PT_SIZE(vq, header->flags);
> +	header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
> +				      SVE_PT_REGS_SVE);
> +}
[...]
> +static int sve_set(struct task_struct *target,
> +		   const struct user_regset *regset,
> +		   unsigned int pos, unsigned int count,
> +		   const void *kbuf, const void __user *ubuf)
> +{
> +	int ret;
> +	struct user_sve_header header;
> +	unsigned int vq;
> +	unsigned long start, end;
> +
> +	if (!system_supports_sve())
> +		return -EINVAL;
> +
> +	/* Header */
> +	if (count < sizeof(header))
> +		return -EINVAL;
> +	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header,
> +				 0, sizeof(header));
> +	if (ret)
> +		goto out;
> +
> +	/*
> +	 * Apart from PT_SVE_REGS_MASK, all PT_SVE_* flags are consumed by
> +	 * sve_set_vector_length(), which will also validate them for us:
> +	 */
> +	ret = sve_set_vector_length(target, header.vl,
> +		((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
> +	if (ret)
> +		goto out;
> +
> +	/* Actual VL set may be less than the user asked for: */
> +	vq = sve_vq_from_vl(target->thread.sve_vl);
> +
> +	/* Registers: FPSIMD-only case */
> +
> +	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
> +	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) {
> +		sve_sync_to_fpsimd(target);
> +
> +		ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
> +				SVE_PT_FPSIMD_OFFSET);
> +		clear_tsk_thread_flag(target, TIF_SVE);
> +		goto out;
> +	}

__fpr_set() already calls sve_sync_to_fpsimd(). Anyway, do you actually
need this since we are going to override the FPSIMD state anyway here.

> +
> +	/* Otherwise: full SVE case */
> +
> +	/*
> +	 * If setting a different VL from the requested VL and there is
> +	 * register data, the data layout will be wrong: don't even
> +	 * try to set the registers in this case.
> +	 */
> +	if (count && vq != sve_vq_from_vl(header.vl)) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	sve_alloc(target);
> +	fpsimd_sync_to_sve(target);

Similarly here, it's a full SVE case, so we are going to override it
anyway.

> +	set_tsk_thread_flag(target, TIF_SVE);

This has the side-effect of enabling TIF_SVE even for PTRACE_SYSCALL
which may be cleared in some circumstances. It may not be an issue
though.
Dave Martin Oct. 13, 2017, 4:16 p.m. UTC | #2
On Thu, Oct 12, 2017 at 06:06:32PM +0100, Catalin Marinas wrote:
> On Tue, Oct 10, 2017 at 07:38:36PM +0100, Dave P Martin wrote:
> > @@ -702,6 +737,211 @@ static int system_call_set(struct task_struct *target,
> >  	return ret;
> >  }
> >  
> > +#ifdef CONFIG_ARM64_SVE
> > +
> > +static void sve_init_header_from_task(struct user_sve_header *header,
> > +				      struct task_struct *target)
> > +{
> > +	unsigned int vq;
> > +
> > +	memset(header, 0, sizeof(*header));
> > +
> > +	header->flags = test_tsk_thread_flag(target, TIF_SVE) ?
> > +		SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD;
> 
> For PTRACE_SYSCALL, we may or may not have TIF_SVE depending on what
> happened with the target. Just a thought: shall we clear TIF_SVE (and
> sync it to fpsimd) in syscall_trace_enter()?

I'm not so sure: if we were to do that, a syscall that is cancelled by
writing -1 to REGSET_SYSCALL could still discard the SVE registers as a
side-effect.

The target committed to discard by executing SVC, but my feeling is
that cancellation of a syscall in this way shouldn't have avoidable
side-effects for the target.  But the semantics of cancelled syscalls
are a bit of a grey area, so I can see potential arguments on both
sides.

The current approach at least saves a bit of code.  What do you think?

> > +	if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT))
> > +		header->flags |= SVE_PT_VL_INHERIT;
> > +
> > +	header->vl = target->thread.sve_vl;
> > +	vq = sve_vq_from_vl(header->vl);
> > +
> > +	header->max_vl = sve_max_vl;
> > +	if (WARN_ON(!sve_vl_valid(sve_max_vl)))
> > +		header->max_vl = header->vl;
> > +
> > +	header->size = SVE_PT_SIZE(vq, header->flags);
> > +	header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
> > +				      SVE_PT_REGS_SVE);
> > +}
> [...]
> > +static int sve_set(struct task_struct *target,
> > +		   const struct user_regset *regset,
> > +		   unsigned int pos, unsigned int count,
> > +		   const void *kbuf, const void __user *ubuf)
> > +{
> > +	int ret;
> > +	struct user_sve_header header;
> > +	unsigned int vq;
> > +	unsigned long start, end;
> > +
> > +	if (!system_supports_sve())
> > +		return -EINVAL;
> > +
> > +	/* Header */
> > +	if (count < sizeof(header))
> > +		return -EINVAL;
> > +	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header,
> > +				 0, sizeof(header));
> > +	if (ret)
> > +		goto out;
> > +
> > +	/*
> > +	 * Apart from PT_SVE_REGS_MASK, all PT_SVE_* flags are consumed by
> > +	 * sve_set_vector_length(), which will also validate them for us:
> > +	 */
> > +	ret = sve_set_vector_length(target, header.vl,
> > +		((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
> > +	if (ret)
> > +		goto out;
> > +
> > +	/* Actual VL set may be less than the user asked for: */
> > +	vq = sve_vq_from_vl(target->thread.sve_vl);
> > +
> > +	/* Registers: FPSIMD-only case */
> > +
> > +	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
> > +	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) {
> > +		sve_sync_to_fpsimd(target);
> > +
> > +		ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
> > +				SVE_PT_FPSIMD_OFFSET);
> > +		clear_tsk_thread_flag(target, TIF_SVE);
> > +		goto out;
> > +	}
> 
> __fpr_set() already calls sve_sync_to_fpsimd(). Anyway, do you actually

Yes, the call to sve_sync_to_fpsimd() is superfluous here -- I think
that I realised that all callers of __fpr_set() need this to happen,
but never deleted the explicit call from sve_set().

I'll delete it.


Looking more closely at __fpr_set() though, I think it needs this change
too, because the sync is unintentionally placed after reading
thread.fpsimd_state instead of before:

@@ -652,11 +652,12 @@ static int __fpr_set(struct task_struct *target,
                     unsigned int start_pos)
 {
        int ret;
-       struct user_fpsimd_state newstate =
-               target->thread.fpsimd_state.user_fpsimd;
+       struct user_fpsimd_state newstate;
 
        sve_sync_to_fpsimd(target);
 
+       newstate = target->thread.fpsimd_state.user_fpsimd;
+
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate,
[...]

(Or were you confident that this was already OK?  Maybe I'm confusing
myself.)

> need this since we are going to override the FPSIMD state anyway here.

The underlying reason for this is the issue of what should happen
for short regset writes.  Historically, writes through fpr_set() can
be truncated arbitrarily, and the rest of fpsimd_state will remain
unchanged.

The issue is that if TIF_SVE is set, fpsimd_state can be stale for
target.  If the initial sve_sync_to_fpsimd() is removed in sve_set()
above, then we may resurrect old values for the untouched registers,
instead of simply leaving them unmodified.

Should I add comments explaining the purpose?  I guess it is rather
non-obvious.


Of course, I don't know whether userspace should really rely on partial
regset writes doing anything sane, but I figured the implemented
behaviour is at least less surprising with respect to the fpr_set()
behavior.

No legacy software can be relying on NT_ARM_SVE at all, so the behaviour
here may not matter that much.  My idea was to reduce the invasiveness
of porting ptrace clients to use NT_ARM_SVE.

> 
> > +
> > +	/* Otherwise: full SVE case */
> > +
> > +	/*
> > +	 * If setting a different VL from the requested VL and there is
> > +	 * register data, the data layout will be wrong: don't even
> > +	 * try to set the registers in this case.
> > +	 */
> > +	if (count && vq != sve_vq_from_vl(header.vl)) {
> > +		ret = -EIO;
> > +		goto out;
> > +	}
> > +
> > +	sve_alloc(target);
> > +	fpsimd_sync_to_sve(target);
> 
> Similarly here, it's a full SVE case, so we are going to override it
> anyway.

The argument here is similar: target->sve_state might already be
allocated and if so it could contain data that doesn't match the user
task's idea of what's in the V-regs.  (In fact, sve_alloc() currently
zeroes sve_state, but that's still different from the current V-regs.)

There is no possibility of legacy software relying on this code path,
so we could say that a short write to NT_ARM_SVE with PT_SVE_REGS_SVE
in flags zeroes any trailing data instead of preserving it.


Either way, I don't intend to document the behaviour of partial
writes to NT_ARM_SVE.  From a userspace point of view, I leave it
unspecified.

> > +	set_tsk_thread_flag(target, TIF_SVE);
> 
> This has the side-effect of enabling TIF_SVE even for PTRACE_SYSCALL
> which may be cleared in some circumstances. It may not be an issue
> though.

I would argue that this is correct behaviour: the syscall enter trap and
exit traps should both behave as if they are outside the syscall,
allowing the debugger to simulate the effect of inserting any
instructions the target could have inserted before or after the SVC.
This may include simulating SVE instructions or modifying SVE regs,
which would require TIF_SVE to get set.

If the tracer doesn't cancel the syscall at the enter trap, then a
real syscall will execute and that may cause the SVE state to be
discarded in the usual way in the case of preemption or blocking: it
seems cleaner for the debug illusion that this decision isn't made
differently just because the target is being traced.

(Spoiler alert though: the syscall exit trap will force the target
to be scheduled out, which will force discard with the current
task_fpsimd_save() behaviour ... but that could be changed in the
future, and I prefer not to document any sort of guarantee here.)


Does this make sense?  There may be issues or corner cases here
that I didn't spot...

Cheers
---Dave
Catalin Marinas Oct. 18, 2017, 10:32 a.m. UTC | #3
On Fri, Oct 13, 2017 at 05:16:39PM +0100, Dave P Martin wrote:
> On Thu, Oct 12, 2017 at 06:06:32PM +0100, Catalin Marinas wrote:
> > On Tue, Oct 10, 2017 at 07:38:36PM +0100, Dave P Martin wrote:
> > > @@ -702,6 +737,211 @@ static int system_call_set(struct task_struct *target,
> > >  	return ret;
> > >  }
> > >  
> > > +#ifdef CONFIG_ARM64_SVE
> > > +
> > > +static void sve_init_header_from_task(struct user_sve_header *header,
> > > +				      struct task_struct *target)
> > > +{
> > > +	unsigned int vq;
> > > +
> > > +	memset(header, 0, sizeof(*header));
> > > +
> > > +	header->flags = test_tsk_thread_flag(target, TIF_SVE) ?
> > > +		SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD;
> > 
> > For PTRACE_SYSCALL, we may or may not have TIF_SVE depending on what
> > happened with the target. Just a thought: shall we clear TIF_SVE (and
> > sync it to fpsimd) in syscall_trace_enter()?
> 
> I'm not so sure: if we were to do that, a syscall that is cancelled by
> writing -1 to REGSET_SYSCALL could still discard the SVE registers as a
> side-effect.
> 
> The target committed to discard by executing SVC, but my feeling is
> that cancellation of a syscall in this way shouldn't have avoidable
> side-effects for the target.  But the semantics of cancelled syscalls
> are a bit of a grey area, so I can see potential arguments on both
> sides.

We already can't guarantee this - if the task invoking a syscall gets
preempted before syscall_trace_enter(), TIF_SVE will be cleared. Are you
reinstating TIF_SVE if the syscall was cancelled?

So my comment was more about consistency on presenting the SVE state to
the debugger handling PTRACE_SYSCALL.

> > > +	/* Registers: FPSIMD-only case */
> > > +
> > > +	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
> > > +	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) {
> > > +		sve_sync_to_fpsimd(target);
> > > +
> > > +		ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
> > > +				SVE_PT_FPSIMD_OFFSET);
> > > +		clear_tsk_thread_flag(target, TIF_SVE);
> > > +		goto out;
> > > +	}
> > 
> > __fpr_set() already calls sve_sync_to_fpsimd(). Anyway, do you actually
> 
> Yes, the call to sve_sync_to_fpsimd() is superfluous here -- I think
> that I realised that all callers of __fpr_set() need this to happen,
> but never deleted the explicit call from sve_set().
> 
> I'll delete it.
> 
> 
> Looking more closely at __fpr_set() though, I think it needs this change
> too, because the sync is unintentionally placed after reading
> thread.fpsimd_state instead of before:
> 
> @@ -652,11 +652,12 @@ static int __fpr_set(struct task_struct *target,
>                      unsigned int start_pos)
>  {
>         int ret;
> -       struct user_fpsimd_state newstate =
> -               target->thread.fpsimd_state.user_fpsimd;
> +       struct user_fpsimd_state newstate;
>  
>         sve_sync_to_fpsimd(target);
>  
> +       newstate = target->thread.fpsimd_state.user_fpsimd;
> +
>         ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate,
> [...]
> 
> (Or were you confident that this was already OK?  Maybe I'm confusing
> myself.)

With the sve_sync_to_fpsimd() called before __fpr_set(), it was ok. Once
you removed that you indeed need the change to __fpr_set().

> > need this since we are going to override the FPSIMD state anyway here.
> 
> The underlying reason for this is the issue of what should happen
> for short regset writes.  Historically, writes through fpr_set() can
> be truncated arbitrarily, and the rest of fpsimd_state will remain
> unchanged.

Ah, I forgot about this.

> The issue is that if TIF_SVE is set, fpsimd_state can be stale for
> target.  If the initial sve_sync_to_fpsimd() is removed in sve_set()
> above, then we may resurrect old values for the untouched registers,
> instead of simply leaving them unmodified.
> 
> Should I add comments explaining the purpose?  I guess it is rather
> non-obvious.

Yes, please.

> > > +	set_tsk_thread_flag(target, TIF_SVE);
> > 
> > This has the side-effect of enabling TIF_SVE even for PTRACE_SYSCALL
> > which may be cleared in some circumstances. It may not be an issue
> > though.
> 
> I would argue that this is correct behaviour: the syscall enter trap and
> exit traps should both behave as if they are outside the syscall,
> allowing the debugger to simulate the effect of inserting any
> instructions the target could have inserted before or after the SVC.
> This may include simulating SVE instructions or modifying SVE regs,
> which would require TIF_SVE to get set.

I'm ok with this approach but I'm not sure we can guarantee it with
preemption enabled.

> If the tracer doesn't cancel the syscall at the enter trap, then a
> real syscall will execute and that may cause the SVE state to be
> discarded in the usual way in the case of preemption or blocking: it
> seems cleaner for the debug illusion that this decision isn't made
> differently just because the target is being traced.
> 
> (Spoiler alert though: the syscall exit trap will force the target
> to be scheduled out, which will force discard with the current
> task_fpsimd_save() behaviour ... but that could be changed in the
> future, and I prefer not to document any sort of guarantee here.)

If such state isn't guaranteed, then the de-facto ABI is that the
debugger cannot simulate any SVE instruction via NO_SYSCALL since the
SVE state may be discarded. So it can only rely on what's guaranteed and
changing the behaviour later won't actually help.
Dave Martin Oct. 18, 2017, 4:02 p.m. UTC | #4
On Wed, Oct 18, 2017 at 11:32:55AM +0100, Catalin Marinas wrote:
> On Fri, Oct 13, 2017 at 05:16:39PM +0100, Dave P Martin wrote:
> > On Thu, Oct 12, 2017 at 06:06:32PM +0100, Catalin Marinas wrote:
> > > On Tue, Oct 10, 2017 at 07:38:36PM +0100, Dave P Martin wrote:
> > > > @@ -702,6 +737,211 @@ static int system_call_set(struct task_struct *target,
> > > >  	return ret;
> > > >  }
> > > >  
> > > > +#ifdef CONFIG_ARM64_SVE
> > > > +
> > > > +static void sve_init_header_from_task(struct user_sve_header *header,
> > > > +				      struct task_struct *target)
> > > > +{
> > > > +	unsigned int vq;
> > > > +
> > > > +	memset(header, 0, sizeof(*header));
> > > > +
> > > > +	header->flags = test_tsk_thread_flag(target, TIF_SVE) ?
> > > > +		SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD;
> > > 
> > > For PTRACE_SYSCALL, we may or may not have TIF_SVE depending on what
> > > happened with the target. Just a thought: shall we clear TIF_SVE (and
> > > sync it to fpsimd) in syscall_trace_enter()?
> > 
> > I'm not so sure: if we were to do that, a syscall that is cancelled by
> > writing -1 to REGSET_SYSCALL could still discard the SVE registers as a
> > side-effect.
> > 
> > The target committed to discard by executing SVC, but my feeling is
> > that cancellation of a syscall in this way shouldn't have avoidable
> > side-effects for the target.  But the semantics of cancelled syscalls
> > are a bit of a grey area, so I can see potential arguments on both
> > sides.
> 
> We already can't guarantee this - if the task invoking a syscall gets
> preempted before syscall_trace_enter(), TIF_SVE will be cleared. Are you
> reinstating TIF_SVE if the syscall was cancelled?
> 
> So my comment was more about consistency on presenting the SVE state to
> the debugger handling PTRACE_SYSCALL.

See the end of this reply.

> > > > +	/* Registers: FPSIMD-only case */
> > > > +
> > > > +	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
> > > > +	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) {
> > > > +		sve_sync_to_fpsimd(target);
> > > > +
> > > > +		ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
> > > > +				SVE_PT_FPSIMD_OFFSET);
> > > > +		clear_tsk_thread_flag(target, TIF_SVE);
> > > > +		goto out;
> > > > +	}
> > > 
> > > __fpr_set() already calls sve_sync_to_fpsimd(). Anyway, do you actually
> > 
> > Yes, the call to sve_sync_to_fpsimd() is superfluous here -- I think
> > that I realised that all callers of __fpr_set() need this to happen,
> > but never deleted the explicit call from sve_set().
> > 
> > I'll delete it.
> > 
> > 
> > Looking more closely at __fpr_set() though, I think it needs this change
> > too, because the sync is unintentionally placed after reading
> > thread.fpsimd_state instead of before:
> > 
> > @@ -652,11 +652,12 @@ static int __fpr_set(struct task_struct *target,
> >                      unsigned int start_pos)
> >  {
> >         int ret;
> > -       struct user_fpsimd_state newstate =
> > -               target->thread.fpsimd_state.user_fpsimd;
> > +       struct user_fpsimd_state newstate;
> >  
> >         sve_sync_to_fpsimd(target);
> >  
> > +       newstate = target->thread.fpsimd_state.user_fpsimd;
> > +
> >         ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate,
> > [...]
> > 
> > (Or were you confident that this was already OK?  Maybe I'm confusing
> > myself.)
> 
> With the sve_sync_to_fpsimd() called before __fpr_set(), it was ok. Once
> you removed that you indeed need the change to __fpr_set().

Hmmm, yes.  Anyway, I've applied the above fix, so I think this should
be fine now.

> > > need this since we are going to override the FPSIMD state anyway here.
> > 
> > The underlying reason for this is the issue of what should happen
> > for short regset writes.  Historically, writes through fpr_set() can
> > be truncated arbitrarily, and the rest of fpsimd_state will remain
> > unchanged.
> 
> Ah, I forgot about this.
> 
> > The issue is that if TIF_SVE is set, fpsimd_state can be stale for
> > target.  If the initial sve_sync_to_fpsimd() is removed in sve_set()
> > above, then we may resurrect old values for the untouched registers,
> > instead of simply leaving them unmodified.
> > 
> > Should I add comments explaining the purpose?  I guess it is rather
> > non-obvious.
> 
> Yes, please.

Will do.  It's pretty yucky.

> 
> > > > +	set_tsk_thread_flag(target, TIF_SVE);
> > > 
> > > This has the side-effect of enabling TIF_SVE even for PTRACE_SYSCALL
> > > which may be cleared in some circumstances. It may not be an issue
> > > though.
> > 
> > I would argue that this is correct behaviour: the syscall enter trap and
> > exit traps should both behave as if they are outside the syscall,
> > allowing the debugger to simulate the effect of inserting any
> > instructions the target could have inserted before or after the SVC.
> > This may include simulating SVE instructions or modifying SVE regs,
> > which would require TIF_SVE to get set.
> 
> I'm ok with this approach but I'm not sure we can guarantee it with
> preemption enabled.

Hmmm, I think I'm guilty of inventing a spurious argument here.


Apparently gdb does not do syscall cancelling.

Strace does though.
(See http://man7.org/linux/man-pages/man1/strace.1.html, search for
"syscall tampering" ;)

However, it never tries to skip a syscall entirelity AFAICT:
instead, it only attempts to fake what occurs during the syscall,
such as bailing out with a predetermined return value.

> > If the tracer doesn't cancel the syscall at the enter trap, then a
> > real syscall will execute and that may cause the SVE state to be
> > discarded in the usual way in the case of preemption or blocking: it
> > seems cleaner for the debug illusion that this decision isn't made
> > differently just because the target is being traced.
> > 
> > (Spoiler alert though: the syscall exit trap will force the target
> > to be scheduled out, which will force discard with the current
> > task_fpsimd_save() behaviour ... but that could be changed in the
> > future, and I prefer not to document any sort of guarantee here.)
> 
> If such state isn't guaranteed, then the de-facto ABI is that the
> debugger cannot simulate any SVE instruction via NO_SYSCALL since the
> SVE state may be discarded. So it can only rely on what's guaranteed and
> changing the behaviour later won't actually help.

I think you're right.

Simulating the content of the syscall does work though, and doesn't
depend on whether SVE discard occurs or not.

If the tracer sets the SVE regs at the syscall enter/exit traps,
they could still be discarded again before the tracee reenters
userspace, but that's no different from what happens in a real
syscall.


So I think my overall argument would be:

ptrace should be as orthogonal as possible to SVE discard.


Currently ptrace neither assumes that SVE discard will happen or that it
won't: it just does what the tracer asks for and tries to be safe with
either and doesn't try to hide the effects from the tracer or tracee.

I worry that introducing more interdependencies between ptrace and the
fpsimd.c code will complicate maintenance rather than making it easier.

Does that may my position clearer?

If we go with this, I should add a note to the documentation explaining
how NT_ARM_SVE writes interact with ptrace syscall traps.


The alternative would be to forcibly discard SVE in
syscall_trace_enter(), but this doesn't seem to simplify the NT_ARM_SVE
implementation at all -- that code needs to work for all types of
ptrace-stop, not just syscall traps: other that syscall traps, SVE
is potentially live for the tracee and must not be discarded.
So I'm still not clear on what the gain is.

Cheers
---Dave
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index bad72fd..ee6db38 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -38,13 +38,16 @@  struct fpsimd_state {
 			__uint128_t vregs[32];
 			u32 fpsr;
 			u32 fpcr;
+			/*
+			 * For ptrace compatibility, pad to next 128-bit
+			 * boundary here if extending this struct.
+			 */
 		};
 	};
 	/* the id of the last cpu to have restored this state */
 	unsigned int cpu;
 };
 
-
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /* Masks for extracting the FPSR and FPCR from the FPSCR */
 #define VFP_FPSCR_STAT_MASK	0xf800009f
@@ -89,6 +92,10 @@  extern size_t sve_state_size(struct task_struct const *task);
 
 extern void sve_alloc(struct task_struct *task);
 extern void fpsimd_release_thread(struct task_struct *task);
+extern void fpsimd_sync_to_sve(struct task_struct *task);
+extern void sve_sync_to_fpsimd(struct task_struct *task);
+extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);
+
 extern int sve_set_vector_length(struct task_struct *task,
 				 unsigned long vl, unsigned long flags);
 
@@ -105,6 +112,10 @@  extern void __init sve_setup(void);
 
 static void __maybe_unused sve_alloc(struct task_struct *task) { }
 static void __maybe_unused fpsimd_release_thread(struct task_struct *task) { }
+static void __maybe_unused sve_sync_to_fpsimd(struct task_struct *task) { }
+static void __maybe_unused sve_sync_from_fpsimd_zeropad(
+	struct task_struct *task) { }
+
 static void __maybe_unused sve_init_vq_map(void) { }
 static void __maybe_unused sve_update_vq_map(void) { }
 static int __maybe_unused sve_verify_vq_map(void) { return 0; }
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index d1ff83d..a0fc6a8 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -22,6 +22,7 @@ 
 #include <linux/types.h>
 
 #include <asm/hwcap.h>
+#include <asm/sigcontext.h>
 
 
 /*
@@ -63,6 +64,8 @@ 
 
 #ifndef __ASSEMBLY__
 
+#include <linux/prctl.h>
+
 /*
  * User structures for general purpose, floating point and debug registers.
  */
@@ -90,6 +93,141 @@  struct user_hwdebug_state {
 	}		dbg_regs[16];
 };
 
+/* SVE/FP/SIMD state (NT_ARM_SVE) */
+
+struct user_sve_header {
+	__u32 size; /* total meaningful regset content in bytes */
+	__u32 max_size; /* maxmium possible size for this thread */
+	__u16 vl; /* current vector length */
+	__u16 max_vl; /* maximum possible vector length */
+	__u16 flags;
+	__u16 __reserved;
+};
+
+/* Definitions for user_sve_header.flags: */
+#define SVE_PT_REGS_MASK		(1 << 0)
+
+#define SVE_PT_REGS_FPSIMD		0
+#define SVE_PT_REGS_SVE			SVE_PT_REGS_MASK
+
+/*
+ * Common SVE_PT_* flags:
+ * These must be kept in sync with prctl interface in <linux/ptrace.h>
+ */
+#define SVE_PT_VL_INHERIT		(PR_SVE_VL_INHERIT >> 16)
+#define SVE_PT_VL_ONEXEC		(PR_SVE_SET_VL_ONEXEC >> 16)
+
+
+/*
+ * The remainder of the SVE state follows struct user_sve_header.  The
+ * total size of the SVE state (including header) depends on the
+ * metadata in the header:  SVE_PT_SIZE(vq, flags) gives the total size
+ * of the state in bytes, including the header.
+ *
+ * Refer to <asm/sigcontext.h> for details of how to pass the correct
+ * "vq" argument to these macros.
+ */
+
+/* Offset from the start of struct user_sve_header to the register data */
+#define SVE_PT_REGS_OFFSET					\
+	((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1))	\
+		/ SVE_VQ_BYTES * SVE_VQ_BYTES)
+
+/*
+ * The register data content and layout depends on the value of the
+ * flags field.
+ */
+
+/*
+ * (flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD case:
+ *
+ * The payload starts at offset SVE_PT_FPSIMD_OFFSET, and is of type
+ * struct user_fpsimd_state.  Additional data might be appended in the
+ * future: use SVE_PT_FPSIMD_SIZE(vq, flags) to compute the total size.
+ * SVE_PT_FPSIMD_SIZE(vq, flags) will never be less than
+ * sizeof(struct user_fpsimd_state).
+ */
+
+#define SVE_PT_FPSIMD_OFFSET		SVE_PT_REGS_OFFSET
+
+#define SVE_PT_FPSIMD_SIZE(vq, flags)	(sizeof(struct user_fpsimd_state))
+
+/*
+ * (flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE case:
+ *
+ * The payload starts at offset SVE_PT_SVE_OFFSET, and is of size
+ * SVE_PT_SVE_SIZE(vq, flags).
+ *
+ * Additional macros describe the contents and layout of the payload.
+ * For each, SVE_PT_SVE_x_OFFSET(args) is the start offset relative to
+ * the start of struct user_sve_header, and SVE_PT_SVE_x_SIZE(args) is
+ * the size in bytes:
+ *
+ *	x	type				description
+ *	-	----				-----------
+ *	ZREGS		\
+ *	ZREG		|
+ *	PREGS		| refer to <asm/sigcontext.h>
+ *	PREG		|
+ *	FFR		/
+ *
+ *	FPSR	uint32_t			FPSR
+ *	FPCR	uint32_t			FPCR
+ *
+ * Additional data might be appended in the future.
+ */
+
+#define SVE_PT_SVE_ZREG_SIZE(vq)	SVE_SIG_ZREG_SIZE(vq)
+#define SVE_PT_SVE_PREG_SIZE(vq)	SVE_SIG_PREG_SIZE(vq)
+#define SVE_PT_SVE_FFR_SIZE(vq)		SVE_SIG_FFR_SIZE(vq)
+#define SVE_PT_SVE_FPSR_SIZE		sizeof(__u32)
+#define SVE_PT_SVE_FPCR_SIZE		sizeof(__u32)
+
+#define __SVE_SIG_TO_PT(offset) \
+	((offset) - SVE_SIG_REGS_OFFSET + SVE_PT_REGS_OFFSET)
+
+#define SVE_PT_SVE_OFFSET		SVE_PT_REGS_OFFSET
+
+#define SVE_PT_SVE_ZREGS_OFFSET \
+	__SVE_SIG_TO_PT(SVE_SIG_ZREGS_OFFSET)
+#define SVE_PT_SVE_ZREG_OFFSET(vq, n) \
+	__SVE_SIG_TO_PT(SVE_SIG_ZREG_OFFSET(vq, n))
+#define SVE_PT_SVE_ZREGS_SIZE(vq) \
+	(SVE_PT_SVE_ZREG_OFFSET(vq, SVE_NUM_ZREGS) - SVE_PT_SVE_ZREGS_OFFSET)
+
+#define SVE_PT_SVE_PREGS_OFFSET(vq) \
+	__SVE_SIG_TO_PT(SVE_SIG_PREGS_OFFSET(vq))
+#define SVE_PT_SVE_PREG_OFFSET(vq, n) \
+	__SVE_SIG_TO_PT(SVE_SIG_PREG_OFFSET(vq, n))
+#define SVE_PT_SVE_PREGS_SIZE(vq) \
+	(SVE_PT_SVE_PREG_OFFSET(vq, SVE_NUM_PREGS) - \
+		SVE_PT_SVE_PREGS_OFFSET(vq))
+
+#define SVE_PT_SVE_FFR_OFFSET(vq) \
+	__SVE_SIG_TO_PT(SVE_SIG_FFR_OFFSET(vq))
+
+#define SVE_PT_SVE_FPSR_OFFSET(vq)				\
+	((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) +	\
+			(SVE_VQ_BYTES - 1))			\
+		/ SVE_VQ_BYTES * SVE_VQ_BYTES)
+#define SVE_PT_SVE_FPCR_OFFSET(vq) \
+	(SVE_PT_SVE_FPSR_OFFSET(vq) + SVE_PT_SVE_FPSR_SIZE)
+
+/*
+ * Any future extension appended after FPCR must be aligned to the next
+ * 128-bit boundary.
+ */
+
+#define SVE_PT_SVE_SIZE(vq, flags)					\
+	((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE		\
+			- SVE_PT_SVE_OFFSET + (SVE_VQ_BYTES - 1))	\
+		/ SVE_VQ_BYTES * SVE_VQ_BYTES)
+
+#define SVE_PT_SIZE(vq, flags)						\
+	 (((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ?		\
+		  SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags)	\
+		: SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags))
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI__ASM_PTRACE_H */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index c194627..6db9f30 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -438,6 +438,66 @@  void sve_alloc(struct task_struct *task)
 	BUG_ON(!task->thread.sve_state);
 }
 
+
+/*
+ * Ensure that task->thread.sve_state is up to date with respect to
+ * the user task, irrespective of when SVE is in use or not.
+ *
+ * This should only be called by ptrace.  task must be non-runnable.
+ * task->thread.sve_state must point to at least sve_state_size(task)
+ * bytes of allocated kernel memory.
+ */
+void fpsimd_sync_to_sve(struct task_struct *task)
+{
+	if (!test_tsk_thread_flag(task, TIF_SVE))
+		fpsimd_to_sve(task);
+}
+
+/*
+ * Ensure that task->thread.fpsimd_state is up to date with respect to
+ * the user task, irrespective of whether SVE is in use or not.
+ *
+ * This should only be called by ptrace.  task must be non-runnable.
+ * task->thread.sve_state must point to at least sve_state_size(task)
+ * bytes of allocated kernel memory.
+ */
+void sve_sync_to_fpsimd(struct task_struct *task)
+{
+	if (test_tsk_thread_flag(task, TIF_SVE))
+		sve_to_fpsimd(task);
+}
+
+/*
+ * Ensure that task->thread.sve_state is up to date with respect to
+ * the task->thread.fpsimd_state.
+ *
+ * This should only be called by ptrace to merge new FPSIMD register
+ * values into a task for which SVE is currently active.
+ * task must be non-runnable.
+ * task->thread.sve_state must point to at least sve_state_size(task)
+ * bytes of allocated kernel memory.
+ * task->thread.fpsimd_state must already have been initialised with
+ * the new FPSIMD register values to be merged in.
+ */
+void sve_sync_from_fpsimd_zeropad(struct task_struct *task)
+{
+	unsigned int vq;
+	void *sst = task->thread.sve_state;
+	struct fpsimd_state const *fst = &task->thread.fpsimd_state;
+	unsigned int i;
+
+	if (!test_tsk_thread_flag(task, TIF_SVE))
+		return;
+
+	vq = sve_vq_from_vl(task->thread.sve_vl);
+
+	memset(sst, 0, SVE_SIG_REGS_SIZE(vq));
+
+	for (i = 0; i < 32; ++i)
+		memcpy(ZREG(sst, vq, i), &fst->vregs[i],
+		       sizeof(fst->vregs[i]));
+}
+
 int sve_set_vector_length(struct task_struct *task,
 			  unsigned long vl, unsigned long flags)
 {
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 9cbb612..7252209 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -32,6 +32,7 @@ 
 #include <linux/security.h>
 #include <linux/init.h>
 #include <linux/signal.h>
+#include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/perf_event.h>
 #include <linux/hw_breakpoint.h>
@@ -40,6 +41,7 @@ 
 #include <linux/elf.h>
 
 #include <asm/compat.h>
+#include <asm/cpufeature.h>
 #include <asm/debug-monitors.h>
 #include <asm/pgtable.h>
 #include <asm/stacktrace.h>
@@ -618,33 +620,66 @@  static int gpr_set(struct task_struct *target, const struct user_regset *regset,
 /*
  * TODO: update fp accessors for lazy context switching (sync/flush hwstate)
  */
-static int fpr_get(struct task_struct *target, const struct user_regset *regset,
-		   unsigned int pos, unsigned int count,
-		   void *kbuf, void __user *ubuf)
+static int __fpr_get(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     void *kbuf, void __user *ubuf, unsigned int start_pos)
 {
 	struct user_fpsimd_state *uregs;
+
+	sve_sync_to_fpsimd(target);
+
 	uregs = &target->thread.fpsimd_state.user_fpsimd;
 
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs,
+				   start_pos, start_pos + sizeof(*uregs));
+}
+
+static int fpr_get(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
 	if (target == current)
 		fpsimd_preserve_current_state();
 
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0, -1);
+	return __fpr_get(target, regset, pos, count, kbuf, ubuf, 0);
 }
 
-static int fpr_set(struct task_struct *target, const struct user_regset *regset,
-		   unsigned int pos, unsigned int count,
-		   const void *kbuf, const void __user *ubuf)
+static int __fpr_set(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     const void *kbuf, const void __user *ubuf,
+		     unsigned int start_pos)
 {
 	int ret;
 	struct user_fpsimd_state newstate =
 		target->thread.fpsimd_state.user_fpsimd;
 
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1);
+	sve_sync_to_fpsimd(target);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate,
+				 start_pos, start_pos + sizeof(newstate));
 	if (ret)
 		return ret;
 
 	target->thread.fpsimd_state.user_fpsimd = newstate;
+
+	return ret;
+}
+
+static int fpr_set(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, 0);
+	if (ret)
+		return ret;
+
+	sve_sync_from_fpsimd_zeropad(target);
 	fpsimd_flush_task_state(target);
+
 	return ret;
 }
 
@@ -702,6 +737,211 @@  static int system_call_set(struct task_struct *target,
 	return ret;
 }
 
+#ifdef CONFIG_ARM64_SVE
+
+static void sve_init_header_from_task(struct user_sve_header *header,
+				      struct task_struct *target)
+{
+	unsigned int vq;
+
+	memset(header, 0, sizeof(*header));
+
+	header->flags = test_tsk_thread_flag(target, TIF_SVE) ?
+		SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD;
+	if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT))
+		header->flags |= SVE_PT_VL_INHERIT;
+
+	header->vl = target->thread.sve_vl;
+	vq = sve_vq_from_vl(header->vl);
+
+	header->max_vl = sve_max_vl;
+	if (WARN_ON(!sve_vl_valid(sve_max_vl)))
+		header->max_vl = header->vl;
+
+	header->size = SVE_PT_SIZE(vq, header->flags);
+	header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
+				      SVE_PT_REGS_SVE);
+}
+
+static unsigned int sve_size_from_header(struct user_sve_header const *header)
+{
+	return ALIGN(header->size, SVE_VQ_BYTES);
+}
+
+static unsigned int sve_get_size(struct task_struct *target,
+				 const struct user_regset *regset)
+{
+	struct user_sve_header header;
+
+	if (!system_supports_sve())
+		return 0;
+
+	sve_init_header_from_task(&header, target);
+	return sve_size_from_header(&header);
+}
+
+static int sve_get(struct task_struct *target,
+		   const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
+	int ret;
+	struct user_sve_header header;
+	unsigned int vq;
+	unsigned long start, end;
+
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	/* Header */
+	sve_init_header_from_task(&header, target);
+	vq = sve_vq_from_vl(header.vl);
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header,
+				  0, sizeof(header));
+	if (ret)
+		return ret;
+
+	if (target == current)
+		fpsimd_preserve_current_state();
+
+	/* Registers: FPSIMD-only case */
+
+	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
+	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD)
+		return __fpr_get(target, regset, pos, count, kbuf, ubuf,
+				 SVE_PT_FPSIMD_OFFSET);
+
+	/* Otherwise: full SVE case */
+
+	BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header));
+	start = SVE_PT_SVE_OFFSET;
+	end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  target->thread.sve_state,
+				  start, end);
+	if (ret)
+		return ret;
+
+	start = end;
+	end = SVE_PT_SVE_FPSR_OFFSET(vq);
+	ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+				       start, end);
+	if (ret)
+		return ret;
+
+	/*
+	 * Copy fpsr, and fpcr which must follow contiguously in
+	 * struct fpsimd_state:
+	 */
+	start = end;
+	end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.fpsimd_state.fpsr,
+				  start, end);
+	if (ret)
+		return ret;
+
+	start = end;
+	end = sve_size_from_header(&header);
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					start, end);
+}
+
+static int sve_set(struct task_struct *target,
+		   const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+	struct user_sve_header header;
+	unsigned int vq;
+	unsigned long start, end;
+
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	/* Header */
+	if (count < sizeof(header))
+		return -EINVAL;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header,
+				 0, sizeof(header));
+	if (ret)
+		goto out;
+
+	/*
+	 * Apart from PT_SVE_REGS_MASK, all PT_SVE_* flags are consumed by
+	 * sve_set_vector_length(), which will also validate them for us:
+	 */
+	ret = sve_set_vector_length(target, header.vl,
+		((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
+	if (ret)
+		goto out;
+
+	/* Actual VL set may be less than the user asked for: */
+	vq = sve_vq_from_vl(target->thread.sve_vl);
+
+	/* Registers: FPSIMD-only case */
+
+	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
+	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) {
+		sve_sync_to_fpsimd(target);
+
+		ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
+				SVE_PT_FPSIMD_OFFSET);
+		clear_tsk_thread_flag(target, TIF_SVE);
+		goto out;
+	}
+
+	/* Otherwise: full SVE case */
+
+	/*
+	 * If setting a different VL from the requested VL and there is
+	 * register data, the data layout will be wrong: don't even
+	 * try to set the registers in this case.
+	 */
+	if (count && vq != sve_vq_from_vl(header.vl)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	sve_alloc(target);
+	fpsimd_sync_to_sve(target);
+	set_tsk_thread_flag(target, TIF_SVE);
+
+	BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header));
+	start = SVE_PT_SVE_OFFSET;
+	end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 target->thread.sve_state,
+				 start, end);
+	if (ret)
+		goto out;
+
+	start = end;
+	end = SVE_PT_SVE_FPSR_OFFSET(vq);
+	ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					start, end);
+	if (ret)
+		goto out;
+
+	/*
+	 * Copy fpsr, and fpcr which must follow contiguously in
+	 * struct fpsimd_state:
+	 */
+	start = end;
+	end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &target->thread.fpsimd_state.fpsr,
+				 start, end);
+
+out:
+	fpsimd_flush_task_state(target);
+	return ret;
+}
+
+#endif /* CONFIG_ARM64_SVE */
+
 enum aarch64_regset {
 	REGSET_GPR,
 	REGSET_FPR,
@@ -711,6 +951,9 @@  enum aarch64_regset {
 	REGSET_HW_WATCH,
 #endif
 	REGSET_SYSTEM_CALL,
+#ifdef CONFIG_ARM64_SVE
+	REGSET_SVE,
+#endif
 };
 
 static const struct user_regset aarch64_regsets[] = {
@@ -768,6 +1011,18 @@  static const struct user_regset aarch64_regsets[] = {
 		.get = system_call_get,
 		.set = system_call_set,
 	},
+#ifdef CONFIG_ARM64_SVE
+	[REGSET_SVE] = { /* Scalable Vector Extension */
+		.core_note_type = NT_ARM_SVE,
+		.n = DIV_ROUND_UP(SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE),
+				  SVE_VQ_BYTES),
+		.size = SVE_VQ_BYTES,
+		.align = SVE_VQ_BYTES,
+		.get = sve_get,
+		.set = sve_set,
+		.get_size = sve_get_size,
+	},
+#endif
 };
 
 static const struct user_regset_view user_aarch64_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b5280db..735b8f4 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -416,6 +416,7 @@  typedef struct elf64_shdr {
 #define NT_ARM_HW_BREAK	0x402		/* ARM hardware breakpoint registers */
 #define NT_ARM_HW_WATCH	0x403		/* ARM hardware watchpoint registers */
 #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
+#define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
 #define NT_METAG_CBUF	0x500		/* Metag catch buffer registers */
 #define NT_METAG_RPIPE	0x501		/* Metag read pipeline state */
 #define NT_METAG_TLS	0x502		/* Metag TLS pointer */