
ARC: ARCv2: jump label: implement jump label patching

Message ID 20190614164049.31626-1-Eugeniy.Paltsev@synopsys.com
State New
Series ARC: ARCv2: jump label: implement jump label patching

Commit Message

Eugeniy Paltsev June 14, 2019, 4:40 p.m. UTC
Implement jump label patching for ARC. Jump labels provide
an interface to generate dynamic branches using
self-modifying code.

This allows us to implement conditional branches where
changing branch direction is expensive but branch selection
is basically 'free'.

This implementation uses 32-bit NOP and BRANCH instructions
which are forced to be 4-byte aligned to guarantee that they
don't cross an L1 cache line and can be updated atomically.

Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
---
 arch/arc/Kconfig                  |   8 ++
 arch/arc/include/asm/jump_label.h |  68 ++++++++++++
 arch/arc/kernel/Makefile          |   1 +
 arch/arc/kernel/jump_label.c      | 168 ++++++++++++++++++++++++++++++
 4 files changed, 245 insertions(+)
 create mode 100644 arch/arc/include/asm/jump_label.h
 create mode 100644 arch/arc/kernel/jump_label.c
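
For context, these patched NOP/branch sites back the generic static-key API; a
minimal usage sketch (generic <linux/jump_label.h> API, hypothetical key and
function names, not code from this patch):

---------------------------->8-------------------------
#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(my_feature_key);		/* hypothetical key */

extern void do_slow_feature(void);			/* hypothetical callee */

void hot_path(void)
{
	/* compiles to the patched NOP; becomes a branch once the key is enabled */
	if (static_branch_unlikely(&my_feature_key))
		do_slow_feature();
}

void enable_feature(void)
{
	static_branch_enable(&my_feature_key);	/* triggers arch_jump_label_transform() */
}
---------------------------->8-------------------------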

Comments

Vineet Gupta June 18, 2019, 4:16 p.m. UTC | #1
On 6/14/19 9:41 AM, Eugeniy Paltsev wrote:
> Implement jump label patching for ARC. Jump labels provide
> an interface to generate dynamic branches using
> self-modifying code.
>
> This allows us to implement conditional branches where
> changing branch direction is expensive but branch selection
> is basically 'free'
>
> This implementation uses 32-bit NOP and BRANCH instructions
> which forced to be aligned by 4 to guarantee that they don't
> cross L1 cache line and can be update atomically.
>
> Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>

LGTM overall - nits below.

> ---
>  arch/arc/Kconfig                  |   8 ++
>  arch/arc/include/asm/jump_label.h |  68 ++++++++++++
>  arch/arc/kernel/Makefile          |   1 +
>  arch/arc/kernel/jump_label.c      | 168 ++++++++++++++++++++++++++++++
>  4 files changed, 245 insertions(+)
>  create mode 100644 arch/arc/include/asm/jump_label.h
>  create mode 100644 arch/arc/kernel/jump_label.c
>
> diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
> index c781e45d1d99..b1313e016c54 100644
> --- a/arch/arc/Kconfig
> +++ b/arch/arc/Kconfig
> @@ -47,6 +47,7 @@ config ARC
>  	select OF_EARLY_FLATTREE
>  	select PCI_SYSCALL if PCI
>  	select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING
> +	select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
>  
>  config ARCH_HAS_CACHE_LINE_SIZE
>  	def_bool y
> @@ -529,6 +530,13 @@ config ARC_DW2_UNWIND
>  config ARC_DBG_TLB_PARANOIA
>  	bool "Paranoia Checks in Low Level TLB Handlers"
>  
> +config ARC_DBG_STATIC_KEYS
> +	bool "Paranoid checks in Static Keys code"
> +	depends on JUMP_LABEL
> +	select STATIC_KEYS_SELFTEST
> +	help
> +	  Enable paranoid checks and self-test of both ARC-specific and generic
> +	  part of static-keys-related code.

Why can't we just enable this if STATIC_KEYS_SELFTEST

>  endif
>  
>  config ARC_BUILTIN_DTB_NAME
> diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h
> new file mode 100644
> index 000000000000..8971d0608f2c
> --- /dev/null
> +++ b/arch/arc/include/asm/jump_label.h
> @@ -0,0 +1,68 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_ARC_JUMP_LABEL_H
> +#define _ASM_ARC_JUMP_LABEL_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <linux/types.h>
> +
> +#define JUMP_LABEL_NOP_SIZE 4
> +
> +/*
> + * To make atomic update of patched instruction available we need to guarantee
> + * that this instruction doesn't cross L1 cache line boundary.
> + *
> + * As of today we simply align instruction which can be patched by 4 byte using
> + * ".balign 4" directive. In that case patched instruction is aligned with one
> + * 16-bit NOP_S if this is required.
> + * However 'align by 4' directive is much stricter than it actually required.
> + * It's enough that our 32-bit instruction don't cross l1 cache line boundary
> + * which can be achieved by using ".bundle_align_mode" directive. That will save
> + * us from adding useless NOP_S padding in most of the cases.
> + *
> + * TODO: switch to ".bundle_align_mode" directive using whin it will be
> + * supported by ARC toolchain.
> + */
> +

So block comments on top of a function imply summary of function etc.
What you are doing here is calling out the need for .balign quirk. So better to
phrase it is as "Note about .balign" and then describe the thing

> +static __always_inline bool arch_static_branch(struct static_key *key,
> +					       bool branch)
> +{
> +	asm_volatile_goto(".balign 4			\n"
> +		 "1:					\n"
> +		 "nop					\n"
> +		 ".pushsection __jump_table, \"aw\"	\n"
> +		 ".word 1b, %l[l_yes], %c0		\n"
> +		 ".popsection				\n"
> +		 : : "i" (&((char *)key)[branch]) : : l_yes);
> +
> +	return false;
> +l_yes:
> +	return true;
> +}
> +
> +static __always_inline bool arch_static_branch_jump(struct static_key *key,
> +						    bool branch)
> +{
> +	asm_volatile_goto(".balign 4			\n"
> +		 "1:					\n"
> +		 "b %l[l_yes]				\n"
> +		 ".pushsection __jump_table, \"aw\"	\n"
> +		 ".word 1b, %l[l_yes], %c0		\n"
> +		 ".popsection				\n"
> +		 : : "i" (&((char *)key)[branch]) : : l_yes);
> +
> +	return false;
> +l_yes:
> +	return true;
> +}
> +
> +typedef u32 jump_label_t;
> +
> +struct jump_entry {
> +	jump_label_t code;
> +	jump_label_t target;
> +	jump_label_t key;
> +};
> +
> +#endif  /* __ASSEMBLY__ */
> +#endif
> diff --git a/arch/arc/kernel/Makefile b/arch/arc/kernel/Makefile
> index 2dc5f4296d44..307f74156d99 100644
> --- a/arch/arc/kernel/Makefile
> +++ b/arch/arc/kernel/Makefile
> @@ -22,6 +22,7 @@ obj-$(CONFIG_ARC_EMUL_UNALIGNED) 	+= unaligned.o
>  obj-$(CONFIG_KGDB)			+= kgdb.o
>  obj-$(CONFIG_ARC_METAWARE_HLINK)	+= arc_hostlink.o
>  obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
> +obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
>  
>  obj-$(CONFIG_ARC_FPU_SAVE_RESTORE)	+= fpu.o
>  CFLAGS_fpu.o   += -mdpfp
> diff --git a/arch/arc/kernel/jump_label.c b/arch/arc/kernel/jump_label.c
> new file mode 100644
> index 000000000000..93e3bc84288f
> --- /dev/null
> +++ b/arch/arc/kernel/jump_label.c
> @@ -0,0 +1,168 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/kernel.h>
> +#include <linux/jump_label.h>
> +
> +#include "asm/cacheflush.h"
> +
> +#define JUMPLABEL_ERR	"ARC: jump_label: ERROR: "
> +
> +/* Halt system on fatal error to make debug easier */
> +#define arc_jl_fatal(format...)						\
> +({									\
> +	pr_err(JUMPLABEL_ERR format);					\
> +	BUG();								\

Does it make sense to bring down the whole system vs. failing and returning.
I see there is no error propagation to core code, but still.

> +})
> +
> +static inline u32 arc_gen_nop(void)
> +{
> +	/* 1x 32bit NOP in middle endian */
> +	return 0x7000264a;
> +}
> +
> +static inline bool cross_l1_cache_line(void *addr, int len)
> +{
> +	unsigned long a = (unsigned long)addr;
> +
> +	return (a >> L1_CACHE_SHIFT) != ((a + len - 1) >> L1_CACHE_SHIFT);
> +}
> +
> +/*
> + * ARCv2 'Branch unconditionally' instruction:
> + * 00000ssssssssss1SSSSSSSSSSNRtttt
> + * s S[n:0] lower bits signed immediate (number is bitfield size)
> + * S S[m:n+1] upper bits signed immediate (number is bitfield size)
> + * t S[24:21] upper bits signed immediate (branch unconditionally far)
> + * N N <.d> delay slot mode
> + * R R Reserved
> + */
> +static inline u32 arc_gen_branch(jump_label_t pc, jump_label_t target)
> +{
> +	u32 instruction_l, instruction_r;
> +	u32 pcl = pc & GENMASK(31, 2);
> +	u32 u_offset = target - pcl;
> +	u32 s, S, t;
> +
> +	/*
> +	 * Offset in 32-bit branch instruction must to fit into s25.
> +	 * Something is terribly broken if we get such huge offset within one
> +	 * function.
> +	 */
> +	if ((s32)u_offset < -16777216 || (s32)u_offset > 16777214)
> +		arc_jl_fatal("gen branch with offset (%d) not fit in s25\n",
> +			     (s32)u_offset);
> +
> +	/*
> +	 * All instructions are aligned by 2 bytes so we should never get offset
> +	 * here which is not 2 bytes aligned.
> +	 */
> +	if (u_offset & 0x1)
> +		arc_jl_fatal("gen branch with offset (%d) unaligned to 2 bytes\n",
> +			     (s32)u_offset);
> +
> +	s = (u_offset >> 1)  & GENMASK(9, 0);
> +	S = (u_offset >> 11) & GENMASK(9, 0);
> +	t = (u_offset >> 21) & GENMASK(3, 0);
> +
> +	/* 00000ssssssssss1 */
> +	instruction_l = (s << 1) | 0x1;
> +	/* SSSSSSSSSSNRtttt */
> +	instruction_r = (S << 6) | t;
> +
> +	return (instruction_r << 16) | (instruction_l & GENMASK(15, 0));
> +}
> +
> +void arch_jump_label_transform(struct jump_entry *entry,
> +			       enum jump_label_type type)
> +{
> +	jump_label_t *instr_addr = (jump_label_t *)entry->code;
> +	u32 instr;
> +
> +	/*
> +	 * Atomic update of patched instruction is only available if this
> +	 * instruction doesn't cross L1 cache line boundary. You can read about
> +	 * the way we achieve this in arc/include/asm/jump_label.h
> +	 */
> +	if (cross_l1_cache_line(instr_addr, JUMP_LABEL_NOP_SIZE))
> +		arc_jl_fatal("instruction (addr %px) cross L1 cache line",
> +			     instr_addr);
> +
> +	if (type == JUMP_LABEL_JMP)
> +		instr = arc_gen_branch(entry->code, entry->target);
> +	else
> +		instr = arc_gen_nop();
> +
> +	WRITE_ONCE(*instr_addr, instr);
> +	flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
> +}
> +
> +void arch_jump_label_transform_static(struct jump_entry *entry,
> +				      enum jump_label_type type)
> +{
> +	/*
> +	 * We use only one NOP type (1x, 4 byte) in arch_static_branch, so
> +	 * there's no need to patch an identical NOP over the top of it here.
> +	 * The generic code calls 'arch_jump_label_transform' if the NOP needs
> +	 * to be replaced by a branch, so 'arch_jump_label_transform_static' is
> +	 * newer called with type other than JUMP_LABEL_NOP.

s/newer/never

> +	 */
> +	BUG_ON(type != JUMP_LABEL_NOP);
> +}
> +
> +#ifdef CONFIG_ARC_DBG_STATIC_KEYS
> +#define SELFTEST_MSG	"ARC: instruction generation self-test: "
> +
> +struct arc_gen_branch_testdata {
> +	jump_label_t pc;
> +	jump_label_t target_address;
> +	u32 expected_instr;
> +};
> +
> +static __init int branch_gen_test(struct arc_gen_branch_testdata *test_data)
> +{
> +	u32 instr_got;
> +
> +	instr_got = arc_gen_branch(test_data->pc, test_data->target_address);
> +	if (instr_got != test_data->expected_instr) {
> +		pr_err(SELFTEST_MSG "FAIL:\n arc_gen_branch(0x%08x, 0x%08x) != 0x%08x, got 0x%08x\n",
> +		       test_data->pc, test_data->target_address,
> +		       test_data->expected_instr, instr_got);
> +
> +		return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +static __init int instr_gen_test(void)
> +{
> +	int i, ret;
> +
> +	struct arc_gen_branch_testdata test_data[] = {
> +		{0x90007548, 0x90007514, 0xffcf07cd}, /* tiny (-52) offs */
> +		{0x9000c9c0, 0x9000c782, 0xffcf05c3}, /* tiny (-574) offs */
> +		{0x9000cc1c, 0x9000c782, 0xffcf0367}, /* tiny (-1178) offs */
> +		{0x9009dce0, 0x9009d106, 0xff8f0427}, /* small (-3034) offs */
> +		{0x9000f5de, 0x90007d30, 0xfc0f0755}, /* big  (-30892) offs */
> +		{0x900a2444, 0x90035f64, 0xc9cf0321}, /* huge (-443616) offs */
> +		{0x90007514, 0x9000752c, 0x00000019}, /* tiny (+24) offs */
> +		{0x9001a578, 0x9001a77a, 0x00000203}, /* tiny (+514) offs */
> +		{0x90031ed8, 0x90032634, 0x0000075d}, /* tiny (+1884) offs */
> +		{0x9008c7f2, 0x9008d3f0, 0x00400401}, /* small (+3072) offs */
> +		{0x9000bb38, 0x9003b340, 0x17c00009}, /* big  (+194568) offs */
> +		{0x90008f44, 0x90578d80, 0xb7c2063d}  /* huge (+5701180) offs */
> +	};
> +
> +	for (i = 0; i < ARRAY_SIZE(test_data); i++) {
> +		ret = branch_gen_test(&test_data[i]);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	pr_info(SELFTEST_MSG "OK\n");
> +
> +	return 0;
> +}
> +early_initcall(instr_gen_test);
> +
> +#endif /* CONFIG_ARC_STATIC_KEYS_DEBUG */
Peter Zijlstra June 19, 2019, 8:12 a.m. UTC | #2
On Tue, Jun 18, 2019 at 04:16:20PM +0000, Vineet Gupta wrote:

> > +/*
> > + * To make atomic update of patched instruction available we need to guarantee
> > + * that this instruction doesn't cross L1 cache line boundary.
> > + *

Oh urgh. Is that the only way ARC can do text patching? We've actually
considered something like this on x86 at some point, but there we have
the 'fun' detail that the i-fetch window does not in fact align with L1
size on all uarchs, so that got complicated real fast.

I'm assuming you've looked at what x86 currently does and found
something like that doesn't work for ARC?

> > +/* Halt system on fatal error to make debug easier */
> > +#define arc_jl_fatal(format...)						\
> > +({									\
> > +	pr_err(JUMPLABEL_ERR format);					\
> > +	BUG();								\
> 
> Does it make sense to bring down the whole system vs. failing and returning.
> I see there is no error propagation to core code, but still.

It is what x86 does. And I've been fairly adamant about doing so. When
the kernel text is compromised, do you really want to continue running
it?

> > +})
> > +
> > +static inline u32 arc_gen_nop(void)
> > +{
> > +	/* 1x 32bit NOP in middle endian */

I dare not ask...

> > +	return 0x7000264a;
> > +}
> > +

> > +	/*
> > +	 * All instructions are aligned by 2 bytes so we should never get offset
> > +	 * here which is not 2 bytes aligned.
> > +	 */

> > +	WRITE_ONCE(*instr_addr, instr);
> > +	flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);

So do you have a 2 byte opcode that traps unconditionally? In that case
I'm thinking you could do something like x86 does. And it would avoid
that NOP padding you do to get the alignment.
Vineet Gupta June 19, 2019, 11:55 p.m. UTC | #3
On 6/19/19 1:12 AM, Peter Zijlstra wrote:
> On Tue, Jun 18, 2019 at 04:16:20PM +0000, Vineet Gupta wrote:
>
>>> +/*
>>> + * To make atomic update of patched instruction available we need to guarantee
>>> + * that this instruction doesn't cross L1 cache line boundary.
>>> + *
> Oh urgh. Is that the only way ARC can do text patching? 

Nothing seems out of the ordinary here. Perhaps Eugeniy's comment confused you, so
let me explain the whole thing - likely obvious to you anyways.

I-cache is non snooping on ARC (like most embedded style arches) so self modifying
code has to flush corresponding D and I cache lines. Instructions can be 2 byte
aligned and could be 2, 4, 6, 8 bytes long, so a 4 byte NOP can potentially
straddle cache line, needing 2 lines to be flushed. The cache maintenance ops work
on region or line but coherency unit nonetheless operates on line sized units
meaning 2 line ops may not be atomic on a remote cpu. So in the pathetic corner
case the other (non patching) CPU can execute the code around the patched PC with
partial old/new fragments. So we ensure a patched instruction never crosses a
cache line - using .balign 4. This causes a slight mis-optimization in that all
patched instruction locations are forced to be 4 byte aligned while the ISA allows
code to be 2 byte aligned. The cost is an extra NOP_S (2 bytes) - no big deal in
the grand scheme of things IMO.

FWIW I tried to avoid all of this by using the 2 byte NOP_S and B_S variants which
ensure we can never straddle a cache line, so the alignment issue goes away. There's
a nice code size reduction too - see [1]. But I get build link errors in
networking code around DO_ONCE where the unlikely code is too far away and the
offset can't be encoded in the signed 10 bits that B_S allows.

[1] http://lists.infradead.org/pipermail/linux-snps-arc/2019-June/005875.html
> We've actually
> considered something like this on x86 at some point, but there we have
> the 'fun' detail that the i-fetch window does not in fact align with L1
> size on all uarchs, so that got complicated real fast.

As described above we don't have such an issue. I/D flushing works - it's just that
they are not atomic.

> I'm assuming you've looked at what x86 currently does and found
> something like that doesn't work for ARC?

Just looked at x86 code and it seems similar

>
>>> +/* Halt system on fatal error to make debug easier */
>>> +#define arc_jl_fatal(format...)						\
>>> +({									\
>>> +	pr_err(JUMPLABEL_ERR format);					\
>>> +	BUG();								\
>> Does it make sense to bring down the whole system vs. failing and returning.
>> I see there is no error propagation to core code, but still.
> It is what x86 does. And I've been fairly adamant about doing so. When
> the kernel text is compromised, do you really want to continue running
> it?

Agree, but the errors here are not in the middle of code patching itself. They are
found before committing to patching, say because the patched code straddles a line
(which BTW can never happen given the .balign; it is perhaps a pedantic safety net),
or the offset can't be encoded in B. So it is possible to do a pr_err and just
return w/o any patching, like a failed API call. But given that there is no error
propagation to core - the assumption is it either always works or panics - there
are no "failing" semantics.

>
>>> +})
>>> +
>>> +static inline u32 arc_gen_nop(void)
>>> +{
>>> +	/* 1x 32bit NOP in middle endian */
> I dare not ask...

:-) The public PRM is being worked on for *real* so this will be remedied in a few
months at best.

Short answer is it specifies how instruction stream is laid out in code memory for
efficient next PC decoding given that ARC can freely intermix 2, 4, 6, 8 byte
instruction fragments w/o any restrictions.

Consider the SWI insn encoding; per the PRM it is

31                                     0
--------------------------------------------------------------
00100    010    01    101111    0    000    000000    111111
--------------------------------------------------------------

In regular little endian notation, in memory it would have looked as

31                  0
 0x22    0x6F    0x00    0x3F 
  b3     b2      b1      b0

However in middle endian format, the 2 short words are flipped

31                   0
0x00    0x3F   0x22     0x6F   
  b1     b0      b3       b2

>
>>> +	WRITE_ONCE(*instr_addr, instr);
>>> +	flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
> So do you have a 2 byte opcode that traps unconditionally? In that case
> I'm thinking you could do something like x86 does. And it would avoid
> that NOP padding you do to get the alignment.

Just to be clear there is no trapping going on in the canonical sense of it. There
are regular instructions for NO-OP and Branch.
We do have 2 byte opcodes for both but as described the branch offset is too
limited so not usable.

-Vineet
Peter Zijlstra June 20, 2019, 7:01 a.m. UTC | #4
On Wed, Jun 19, 2019 at 11:55:41PM +0000, Vineet Gupta wrote:
> On 6/19/19 1:12 AM, Peter Zijlstra wrote:

> > I'm assuming you've looked at what x86 currently does and found
> > something like that doesn't work for ARC?
> 
> Just looked at x86 code and it seems similar

I think you missed a bit.

> >>> +	WRITE_ONCE(*instr_addr, instr);
> >>> +	flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
> > So do you have a 2 byte opcode that traps unconditionally? In that case
> > I'm thinking you could do something like x86 does. And it would avoid
> > that NOP padding you do to get the alignment.
> 
> Just to be clear there is no trapping going on in the canonical sense of it. There
> are regular instructions for NO-OP and Branch.
> We do have 2 byte opcodes for both but as described the branch offset is too
> limited so not usable.

In particular we do not need the alignment.

So what the x86 code does is:

 - overwrite the first byte of the instruction with a single byte trap
   instruction

 - machine wide IPI which synchronizes I$

At this point, any CPU that encounters this instruction will trap; and
the trap handler will emulate the 'new' instruction -- typically a jump.

  - overwrite the tail of the instruction (if there is a tail)

  - machine wide IPI which synchronizes I$

At this point, nobody will execute the tail, because we'll still trap on
that first single byte instruction, but if they were to read the
instruction stream, the tail must be there.

  - overwrite the first byte of the instruction to now have a complete
    instruction.

  - machine wide IPI which synchronizes I$

At this point, any CPU will encounter the new instruction as a whole,
irrespective of alignment.


So the benefit of this scheme is that it works irrespective of the
instruction fetch window size and doesn't need the 'funny' alignment
stuff.

Now, I've no idea if something like this is feasible on ARC; for it to
work you need that 2 byte trap instruction -- since all instructions are
2 byte aligned, you can always poke that without issue.
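
For illustration, a minimal C sketch of that three-step sequence; patch_text_bytes(),
sync_cores_ipi() and TRAP_BYTE are assumed helper names, not the actual x86
implementation:

---------------------------->8-------------------------
#include <stddef.h>

/* Assumed helpers, for illustration only */
void patch_text_bytes(void *addr, const void *src, size_t len); /* write kernel text */
void sync_cores_ipi(void);	/* machine-wide IPI that resyncs instruction fetch */

#define TRAP_BYTE	0xcc	/* a one-byte trap opcode (int3 on x86) */

static void poke_insn(void *addr, const unsigned char *new_insn, size_t len)
{
	unsigned char trap = TRAP_BYTE;

	/* 1) every CPU now traps (and emulates the new insn) at this location */
	patch_text_bytes(addr, &trap, 1);
	sync_cores_ipi();

	/* 2) install the tail of the new instruction, if there is one */
	if (len > 1) {
		patch_text_bytes((unsigned char *)addr + 1, new_insn + 1, len - 1);
		sync_cores_ipi();
	}

	/* 3) replace the trap byte, completing the new instruction */
	patch_text_bytes(addr, new_insn, 1);
	sync_cores_ipi();
}
---------------------------->8-------------------------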
Peter Zijlstra June 20, 2019, 7:15 a.m. UTC | #5
On Wed, Jun 19, 2019 at 11:55:41PM +0000, Vineet Gupta wrote:

> FWIW I tried to avoid all of this by using the 2 byte NOP_S and B_S variants which
> ensures we can never straddle cache line so the alignment issue goes away. There's
> a nice code size reduction too - see [1] . But I get build link errors in
> networking code around DO_ONCE where the unlikely code is too much and offset
> can't be encoded in signed 10 bits which B_S is allowed.

Yeah, so on x86 we have a 2 byte and a 5 byte relative jump and have the
exact same issue. We're currently using 5 byte jumps unconditionally for
the same reason.

Getting it to use the 2 byte one where possible is a 'fun' project for
someone with spare time at some point.  It might need a GCC plugin to
pull off; I've not put too much thought into it.
Peter Zijlstra June 20, 2019, 7:21 a.m. UTC | #6
On Wed, Jun 19, 2019 at 11:55:41PM +0000, Vineet Gupta wrote:
> So we ensure a patched instruction never crosses a
> cache line - using .balign 4. This causes a slight mis-optimization that all
> patched instruction locations are forced to be 4 bytes aligned while ISA allows
> code to be 2 byte aligned. The cost is an extra NOP_S (2 bytes) - no big deal in
> grand scheme of things in IMO.

Right, so the scheme x86 uses (which I outlined in an earlier email)
allows you to get rid of those extra NOPs.

Given jump labels are typically used on fast paths, and NOPs still take
up cycles to, at the very least, fetch and decode, some people might care.

But if you're OK with having them, then sure, your scheme certainly
should work.
Peter Zijlstra June 20, 2019, 7:52 a.m. UTC | #7
On Wed, Jun 19, 2019 at 11:55:41PM +0000, Vineet Gupta wrote:
> On 6/19/19 1:12 AM, Peter Zijlstra wrote:

> >>> +static inline u32 arc_gen_nop(void)
> >>> +{
> >>> +	/* 1x 32bit NOP in middle endian */
> > I dare not ask...
> 
> :-) The public PRM is being worked on for *real* so this will be remedied in a few
> months at best.
> 
> Short answer is it specifies how instruction stream is laid out in code memory for
> efficient next PC decoding given that ARC can freely intermix 2, 4, 6, 8 byte
> instruction fragments w/o any restrictions.
> 
> Consier SWI insn encoding: per PRM is
> 
> 31                                     0
> --------------------------------------------------------------
> 00100    010    01    101111    0    000    000000    111111
> --------------------------------------------------------------
> 
> In regular little endian notation, in memory it would have looked as
> 
> 31                  0
>  0x22    0x6F    0x00    0x3F 
>   b3     b2      b1      b0
> 
> However in middle endian format, the 2 short words are flipped
> 
> 31                   0
> 0x00    0x3F   0x22     0x6F   
>   b1     b0      b3       b2

I'm probably missing something here. I understand the I-fetch likes 2
byte chunks, but I'm not sure I understand how you end up with middle
endian.

With everything little endian, everything seems just fine. If you load
the first 2 byte at offset 0, you get the first 2 bytes of the
instruction.

If OTOH your arch big endian, and you were to load the first two bytes
at offset 0, you get the top two.

So this middle endian scheme, only seems to make sense if you're
otherwise big endian. But AFAICT ARC is bi-endian and the jump-label
patch under condsideration is explicitly !CPU_ENDIAN_BE32 -- which
suggests the instruction stream is sensitive to the endian selection.

Anyway, I was just 'surprised' at seeing middle endian mentioned,
curiosity killed the cat and all that :-)
Eugeniy Paltsev June 20, 2019, 6:34 p.m. UTC | #8
Hi Peter,

On Wed, 2019-06-19 at 10:12 +0200, Peter Zijlstra wrote:
> On Tue, Jun 18, 2019 at 04:16:20PM +0000, Vineet Gupta wrote:
> 
> > > +/*
> > > + * To make atomic update of patched instruction available we need to guarantee
> > > + * that this instruction doesn't cross L1 cache line boundary.
> > > + *
> 
> Oh urgh. Is that the only way ARC can do text patching? We've actually
> considered something like this on x86 at some point, but there we have
> the 'fun' detail that the i-fetch window does not in fact align with L1
> size on all uarchs, so that got complicated real fast.
> 
> I'm assuming you've looked at what x86 currently does and found
> something like that doesn't work for ARC?

To be honest I've mostly looked at the arm/arm64 implementation :)

But yeah it's good remark about i-fetch window.
It's named 'instruction_fetch_block_width' in ARC documentation and it's
smaller than L1 I$ line size. On ARCv2 it's 16 byte.
So in the current implementation we need to guarantee that this instruction doesn't
cross an 'instruction_fetch_block' boundary rather than an L1 cache line boundary.

So there is no problem with this code itself but the comment should be fixed.

[snip]
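
A sketch only, mirroring cross_l1_cache_line() from the patch but checking the
16-byte instruction_fetch_block instead; FETCH_BLOCK_SHIFT is an assumed name,
not an existing kernel define:

---------------------------->8-------------------------
#define FETCH_BLOCK_SHIFT	4	/* 16-byte ARCv2 instruction_fetch_block */

static inline bool cross_fetch_block(void *addr, int len)
{
	unsigned long a = (unsigned long)addr;

	return (a >> FETCH_BLOCK_SHIFT) != ((a + len - 1) >> FETCH_BLOCK_SHIFT);
}
---------------------------->8-------------------------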
Eugeniy Paltsev June 20, 2019, 6:34 p.m. UTC | #9
Hi Peter,
On Thu, 2019-06-20 at 09:01 +0200, Peter Zijlstra wrote:
> On Wed, Jun 19, 2019 at 11:55:41PM +0000, Vineet Gupta wrote:
> > On 6/19/19 1:12 AM, Peter Zijlstra wrote:
> > > I'm assuming you've looked at what x86 currently does and found
> > > something like that doesn't work for ARC?
> > 
> > Just looked at x86 code and it seems similar
> 
> I think you missed a bit.
> 
> > > > > +	WRITE_ONCE(*instr_addr, instr);
> > > > > +	flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
> > > So do you have a 2 byte opcode that traps unconditionally? In that case
> > > I'm thinking you could do something like x86 does. And it would avoid
> > > that NOP padding you do to get the alignment.
> > 
> > Just to be clear there is no trapping going on in the canonical sense of it. There
> > are regular instructions for NO-OP and Branch.
> > We do have 2 byte opcodes for both but as described the branch offset is too
> > limited so not usable.
> 
> In particular we do not need the alignment.
> 
> So what the x86 code does is:
> 
>  - overwrite the first byte of the instruction with a single byte trap
>    instruction
> 
>  - machine wide IPI which synchronizes I$
> 
> At this point, any CPU that encounters this instruction will trap; and
> the trap handler will emulate the 'new' instruction -- typically a jump.
> 
>   - overwrite the tail of the instruction (if there is a tail)
> 
>   - machine wide IPI which syncrhonizes I$
> 
> At this point, nobody will execute the tail, because we'll still trap on
> that first single byte instruction, but if they were to read the
> instruction stream, the tail must be there.
> 
>   - overwrite the first byte of the instruction to now have a complete
>     instruction.
> 
>   - machine wide IPI which syncrhonizes I$
> 
> At this point, any CPU will encounter the new instruction as a whole,
> irrespective of alignment.
> 
> 
> So the benefit of this scheme is that is works irrespective of the
> instruction fetch window size and don't need the 'funny' alignment
> stuff.
> 

Thanks for explanation. Now I understand how this x86 magic works.

However it looks even more complex than the ARM implementation.
As I understand on ARM they do something like that:
---------------------------->8-------------------------
on_each_cpu {
	write_instruction
	flush_data_cache_region
	invalidate_instruction_cache_region
}
---------------------------->8-------------------------

https://elixir.bootlin.com/linux/v5.1/source/arch/arm/kernel/patch.c#L121

Yep, there is some overhead - we don't actually need to do the write and D$ flush
on each cpu - but that makes the code simple and avoids additional checks.

And I don't understand in which cases the x86 approach with a trap is better.
In this ARM implementation we do one machine wide IPI instead of the three in the x86 trap approach.

Probably there are some x86 specifics I don't get?


> Now, I've no idea if something like this is feasible on ARC; for it to
> work you need that 2 byte trap instruction -- since all instructions are
> 2 byte aligned, you can always poke that without issue.

Yep we have 2 byte trap (trap_s instruction).

Actually there are even two other candidates which can be used
instead of trap_s to avoid adding additional code to the trap handler:
SWI_S - software interrupt
and
UNIMP_S - instruction with funny name 'unimplemented instruction'
Vineet Gupta June 20, 2019, 6:48 p.m. UTC | #10
On 6/20/19 12:01 AM, Peter Zijlstra wrote:

> 
> In particular we do not need the alignment.
> 
> So what the x86 code does is:
> 
>  - overwrite the first byte of the instruction with a single byte trap
>    instruction
> 
>  - machine wide IPI which synchronizes I$
> 
> At this point, any CPU that encounters this instruction will trap; and
> the trap handler will emulate the 'new' instruction -- typically a jump.
> 
>   - overwrite the tail of the instruction (if there is a tail)
> 
>   - machine wide IPI which syncrhonizes I$
> 
> At this point, nobody will execute the tail, because we'll still trap on
> that first single byte instruction, but if they were to read the
> instruction stream, the tail must be there.
> 
>   - overwrite the first byte of the instruction to now have a complete
>     instruction.
> 
>   - machine wide IPI which syncrhonizes I$
> 
> At this point, any CPU will encounter the new instruction as a whole,
> irrespective of alignment.
> 
> 
> So the benefit of this scheme is that is works irrespective of the
> instruction fetch window size and don't need the 'funny' alignment
> stuff.
> 
> Now, I've no idea if something like this is feasible on ARC; for it to
> work you need that 2 byte trap instruction -- since all instructions are
> 2 byte aligned, you can always poke that without issue.

We do have a 2 byte TRAP_S u6 which is used for all/any trap'ing: syscalls,
software breakpoint, kprobes etc. But using it like x86 seems a bit excessive for
ARC. Given that x86 doesn't implement flush_icache_range() it must have I$
snooping D$, and also this machine wide IPI to sync the I$ must be totally under the
hood, an all-hardware affair - unlike ARC which needs on_each_cpu(I$ line range).

Using TRAP_S would actually require 2 passes (and 2 rounds of IPI) for code
patching - the last one to undo the TRAP_S itself.

I do worry about the occasional alignment induced extra NOP_S instruction (2 bytes)
but there doesn't seem to be an easy solution. Heck, if only we could use NOP_S /
B_S in the first place. While not a clean solution by any standard, could anything be
done to reduce the code path of DO_ONCE() so that the unlikely code is not too far off?
Vineet Gupta June 20, 2019, 8:49 p.m. UTC | #11
On 6/20/19 12:52 AM, Peter Zijlstra wrote:
> 
> With everything little endian, everything seems just fine. If you load
> the first 2 byte at offset 0, you get the first 2 bytes of the
> instruction.

It has to do with the instruction encoding scheme and what part of instruction has
the major opcode/subopcode etc. For a canonical 4 byte instruction they happen to
be in the upper 16 bits (for some hardware efficiency reasons unknown to me). So
when a 2 byte insn follows a 4 byte, a 4 byte fetch already tells it what the 2nd
instruction opcode is and whether it is a 2, 4, 6 or 8 byte insn.


> 
> If OTOH your arch big endian, and you were to load the first two bytes
> at offset 0, you get the top two.
> 
> So this middle endian scheme, only seems to make sense if you're
> otherwise big endian.

Insn encoding is always middle endian - irrespective of endianness.

> But AFAICT ARC is bi-endian and the jump-label
> patch under condsideration is explicitly !CPU_ENDIAN_BE32 -- which
> suggests the instruction stream is sensitive to the endian selection.

Nope, they are not directly related. Likely the difference is that when we patch the
instruction to memory it is written as data, so it might need an endian swap. Not
really rocket science. Perhaps it saves some testing effort etc.
Also we don't seem to have any customers interested in BE these days.

> Anyway, I was just 'surprised' at seeing middle endian mentioned,
> curiosity killed the cat and all that :-)

Curiosity may certainly kill cats, but on lkml curiosity is almost always a good
thing both for the enquirer and enquiree ;-)
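
Illustration only: converting a canonically-encoded 32-bit ARC instruction to the
middle-endian form that gets written to memory is just a halfword swap - e.g.
swapping the halves of the canonical SWI encoding 0x226F003F from above gives
0x003F226F, and 0x264a7000 presumably maps to the 0x7000264a value arc_gen_nop()
returns:

---------------------------->8-------------------------
/* Sketch: swap the two 16-bit halfwords of a canonical 32-bit encoding */
static inline unsigned int arc_insn_to_middle_endian(unsigned int insn)
{
	return (insn << 16) | (insn >> 16);
}
---------------------------->8-------------------------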
Peter Zijlstra June 20, 2019, 9:12 p.m. UTC | #12
On Thu, Jun 20, 2019 at 06:34:29PM +0000, Eugeniy Paltsev wrote:
> On Wed, 2019-06-19 at 10:12 +0200, Peter Zijlstra wrote:
> > On Tue, Jun 18, 2019 at 04:16:20PM +0000, Vineet Gupta wrote:

> > I'm assuming you've looked at what x86 currently does and found
> > something like that doesn't work for ARC?
> 
> To be honest I've mostly look at arm/arm64 implementation :)

Yeah, but that's fixed instruction width RISC. They have it easy.
Peter Zijlstra June 20, 2019, 9:22 p.m. UTC | #13
On Thu, Jun 20, 2019 at 11:48:17AM -0700, Vineet Gupta wrote:
> On 6/20/19 12:01 AM, Peter Zijlstra wrote:
> 
> > 
> > In particular we do not need the alignment.
> > 
> > So what the x86 code does is:
> > 
> >  - overwrite the first byte of the instruction with a single byte trap
> >    instruction
> > 
> >  - machine wide IPI which synchronizes I$
> > 
> > At this point, any CPU that encounters this instruction will trap; and
> > the trap handler will emulate the 'new' instruction -- typically a jump.
> > 
> >   - overwrite the tail of the instruction (if there is a tail)
> > 
> >   - machine wide IPI which syncrhonizes I$
> > 
> > At this point, nobody will execute the tail, because we'll still trap on
> > that first single byte instruction, but if they were to read the
> > instruction stream, the tail must be there.
> > 
> >   - overwrite the first byte of the instruction to now have a complete
> >     instruction.
> > 
> >   - machine wide IPI which syncrhonizes I$
> > 
> > At this point, any CPU will encounter the new instruction as a whole,
> > irrespective of alignment.
> > 
> > 
> > So the benefit of this scheme is that is works irrespective of the
> > instruction fetch window size and don't need the 'funny' alignment
> > stuff.
> > 
> > Now, I've no idea if something like this is feasible on ARC; for it to
> > work you need that 2 byte trap instruction -- since all instructions are
> > 2 byte aligned, you can always poke that without issue.
> 
> We do have a 2 byte TRAP_S u6 which is used for all/any trap'ing: syscalls,
> software breakpoint, kprobes etc. But using it like x86 seems a bit excessive for
> ARC. Given that x86 doesn't implement flush_icache_range() it must have I$
> snooping D$ and also this machine wide IPI sync I$ must be totally under the hood
> all hardware affair - unlike ARC which needs on_each_cpu( I$ line range).

I always forget the exact details, but we do have to execute what is
called a serializing instruction to flush CPU state and force it to
re-read the actual instructions -- see sync_core().

> Using TRAP_S would actually requires 2 passes (and 2 rounds of IPI) for code
> patching - the last one to undo the TRAP_S itself.

Correct -- we do 3, like detailed in the other email. But we figured the
actual poking of text is the slow path anyway.

> I do worry about the occasional alignment induced extra NOP_S instruction (2 byte)
> but there doesn't seem to be an easy solution. Heck if we could use the NOP_S /
> B_S in first place. While not a clean solution by any standards, could anything be
> done to reduce the code path of DO_ONCE() so that unlikely code is not too far off.

if one could somehow get the arch_static_branch*() things to
conditionally emit either the 2 or 4 byte jump, depending on the offset
(which is known there, since we stick it in the __jump_table), then we
can have arch_jump_label_transform() use that same condition to switch
between 2 and 4 bytes too.

I just don't know if it's possible :-/
Peter Zijlstra June 20, 2019, 9:30 p.m. UTC | #14
On Thu, Jun 20, 2019 at 06:34:55PM +0000, Eugeniy Paltsev wrote:
> On Thu, 2019-06-20 at 09:01 +0200, Peter Zijlstra wrote:

> > In particular we do not need the alignment.
> > 
> > So what the x86 code does is:
> > 
> >  - overwrite the first byte of the instruction with a single byte trap
> >    instruction
> > 
> >  - machine wide IPI which synchronizes I$
> > 
> > At this point, any CPU that encounters this instruction will trap; and
> > the trap handler will emulate the 'new' instruction -- typically a jump.
> > 
> >   - overwrite the tail of the instruction (if there is a tail)
> > 
> >   - machine wide IPI which syncrhonizes I$
> > 
> > At this point, nobody will execute the tail, because we'll still trap on
> > that first single byte instruction, but if they were to read the
> > instruction stream, the tail must be there.
> > 
> >   - overwrite the first byte of the instruction to now have a complete
> >     instruction.
> > 
> >   - machine wide IPI which syncrhonizes I$
> > 
> > At this point, any CPU will encounter the new instruction as a whole,
> > irrespective of alignment.
> > 
> > 
> > So the benefit of this scheme is that is works irrespective of the
> > instruction fetch window size and don't need the 'funny' alignment
> > stuff.
> > 
> 
> Thanks for explanation. Now I understand how this x86 magic works.
> 
> However it looks like even more complex than ARM implementation.
> As I understand on ARM they do something like that:
> ---------------------------->8-------------------------
> on_each_cpu {
> 	write_instruction
> 	flush_data_cache_region
> 	invalidate_instruction_cache_region
> }
> ---------------------------->8-------------------------
> 
> https://elixir.bootlin.com/linux/v5.1/source/arch/arm/kernel/patch.c#L121
> 
> Yep, there is some overhead - as we don't need to do white and D$ flush on each cpu
> but that makes code simple and avoids additional checks.
> 
> And I don't understand in which cases x86 approach with trap is better.
> In this ARM implementation we do one machine wide IPI instead of three in x86 trap approach.
> 
> Probably there is some x86 specifics I don't get?

It's about variable instruction length; ARM (RISC in general) doesn't
have that, ARC does.

Your current proposal works by keeping the instruction inside of the
i-fetch window, but that then results in instruction padding (extra
NOPs). And that is fine, it really should work.

The x86 approach however allows you to get rid of that padding and
should work for unaligned variable length instructions (we have 1-15
byte instructions).

I just wanted to make sure you were aware of the possibilities such that
you made an informed decision, I'm not trying to force complexity on you
:-)
Peter Zijlstra June 21, 2019, 12:09 p.m. UTC | #15
On Thu, Jun 20, 2019 at 11:22:56PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 20, 2019 at 11:48:17AM -0700, Vineet Gupta wrote:

> > I do worry about the occasional alignment induced extra NOP_S instruction (2 byte)
> > but there doesn't seem to be an easy solution. Heck if we could use the NOP_S /
> > B_S in first place. While not a clean solution by any standards, could anything be
> > done to reduce the code path of DO_ONCE() so that unlikely code is not too far off.
> 
> if one could somehow get the arch_static_branch*() things to
> conditionally emit either the 2 or 4 byte jump, depending on the offset
> (which is known there, since we stick it in the __jump_table), then we
> can have arch_jump_label_transform() use that same condition to switch
> between 2 and 4 bytes too.
> 
> I just don't know if it's possible :-/

So I had to try; but GAS macro .if directives don't like labels as
arguments, not constant enough for them.

../arch/x86/include/asm/jump_label.h:26: Error: non-constant expression in ".if" statement

Damn!

---
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -12,24 +12,19 @@
 # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
 #endif
 
-#include <asm/asm.h>
-#include <asm/nops.h>
+asm(".include \"asm/jump_label_asm.h\"");
 
 #ifndef __ASSEMBLY__
 
 #include <linux/stringify.h>
 #include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/nops.h>
 
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:"
-		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
-		".pushsection __jump_table,  \"aw\" \n\t"
-		_ASM_ALIGN "\n\t"
-		".long 1b - ., %l[l_yes] - . \n\t"
-		_ASM_PTR "%c0 + %c1 - .\n\t"
-		".popsection \n\t"
-		: :  "i" (key), "i" (branch) : : l_yes);
+	asm_volatile_goto("STATIC_BRANCH_NOP l_yes=\"%l[l_yes]\", key=\"%c0\", branch=\"%c1\""
+			  : : "i" (key), "i" (branch) : : l_yes);
 
 	return false;
 l_yes:
@@ -38,57 +33,13 @@ static __always_inline bool arch_static_
 
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:"
-		".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
-		"2:\n\t"
-		".pushsection __jump_table,  \"aw\" \n\t"
-		_ASM_ALIGN "\n\t"
-		".long 1b - ., %l[l_yes] - . \n\t"
-		_ASM_PTR "%c0 + %c1 - .\n\t"
-		".popsection \n\t"
-		: :  "i" (key), "i" (branch) : : l_yes);
+	asm_volatile_goto("STATIC_BRANCH_JMP l_yes=\"%l[l_yes]\", key=\"%c0\", branch=\"%c1\""
+			  : : "i" (key), "i" (branch) : : l_yes);
 
 	return false;
 l_yes:
 	return true;
 }
 
-#else	/* __ASSEMBLY__ */
-
-.macro STATIC_JUMP_IF_TRUE target, key, def
-.Lstatic_jump_\@:
-	.if \def
-	/* Equivalent to "jmp.d32 \target" */
-	.byte		0xe9
-	.long		\target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
-	.else
-	.byte		STATIC_KEY_INIT_NOP
-	.endif
-	.pushsection __jump_table, "aw"
-	_ASM_ALIGN
-	.long		.Lstatic_jump_\@ - ., \target - .
-	_ASM_PTR	\key - .
-	.popsection
-.endm
-
-.macro STATIC_JUMP_IF_FALSE target, key, def
-.Lstatic_jump_\@:
-	.if \def
-	.byte		STATIC_KEY_INIT_NOP
-	.else
-	/* Equivalent to "jmp.d32 \target" */
-	.byte		0xe9
-	.long		\target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
-	.endif
-	.pushsection __jump_table, "aw"
-	_ASM_ALIGN
-	.long		.Lstatic_jump_\@ - ., \target - .
-	_ASM_PTR	\key + 1 - .
-	.popsection
-.endm
-
-#endif	/* __ASSEMBLY__ */
-
-#endif
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_JUMP_LABEL_H */
--- /dev/null
+++ b/arch/x86/include/asm/jump_label_asm.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_JUMP_LABEL_ASM_H
+#define _ASM_X86_JUMP_LABEL_ASM_H
+
+#include <asm/asm.h>
+#include <asm/nops.h>
+
+#ifdef __ASSEMBLY__
+
+.macro STATIC_BRANCH_ENTRY l_target:req l_yes:req key:req branch:req
+	.pushsection __jump_table, "aw"
+	.long		\l_target - ., \l_yes - .
+#ifdef __X86_64__
+	.quad		(\key - .) + \branch
+#else
+	.long		(\key - .) + \branch
+#endif
+	.popsection
+.endm
+
+.macro STATIC_BRANCH_NOP l_yes:req key:req branch:req
+.Lstatic_branch_nop_\@:
+.iflt 127 - .
+	.byte 0x66, 0x90
+.else
+	.byte STATIC_KEY_INIT_NOP
+.endif
+	STATIC_BRANCH_ENTRY l_target=.Lstatic_branch_nop_\@, l_yes=\l_yes, key=\key, branch=\branch
+.endm
+
+.macro STATIC_BRANCH_JMP l_yes:req key:req branch:req
+.Lstatic_branch_jmp_\@:
+.if \l_yes - . < 127
+	.byte 0xeb
+	.byte \l_yes - (. + 1)
+.else
+	.byte 0xe9
+	.long \l_yes - (. + 4)
+.endif
+	STATIC_BRANCH_ENTRY l_target=.Lstatic_branch_jmp_\@, l_yes=\l_yes, key=\key, branch=\branch
+.endm
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_JUMP_LABEL_ASM_H */
Peter Zijlstra June 21, 2019, 12:12 p.m. UTC | #16
On Fri, Jun 21, 2019 at 02:09:23PM +0200, Peter Zijlstra wrote:

> --- /dev/null
> +++ b/arch/x86/include/asm/jump_label_asm.h
> @@ -0,0 +1,44 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_JUMP_LABEL_ASM_H
> +#define _ASM_X86_JUMP_LABEL_ASM_H
> +
> +#include <asm/asm.h>
> +#include <asm/nops.h>
> +
> +#ifdef __ASSEMBLY__
> +
> +.macro STATIC_BRANCH_ENTRY l_target:req l_yes:req key:req branch:req
> +	.pushsection __jump_table, "aw"
> +	.long		\l_target - ., \l_yes - .
> +#ifdef __X86_64__
> +	.quad		(\key - .) + \branch
> +#else
> +	.long		(\key - .) + \branch
> +#endif
> +	.popsection
> +.endm
> +
> +.macro STATIC_BRANCH_NOP l_yes:req key:req branch:req
> +.Lstatic_branch_nop_\@:
> +.iflt 127 - .

That should've been:

.if \l_yes - . < 127

too, I had been playing with various forms to see when it compiles.
But as soon as a label (either \l_yes or '.') gets used, it barfs.

> +	.byte 0x66, 0x90
> +.else
> +	.byte STATIC_KEY_INIT_NOP
> +.endif
> +	STATIC_BRANCH_ENTRY l_target=.Lstatic_branch_nop_\@, l_yes=\l_yes, key=\key, branch=\branch
> +.endm
> +
> +.macro STATIC_BRANCH_JMP l_yes:req key:req branch:req
> +.Lstatic_branch_jmp_\@:
> +.if \l_yes - . < 127
> +	.byte 0xeb
> +	.byte \l_yes - (. + 1)
> +.else
> +	.byte 0xe9
> +	.long \l_yes - (. + 4)
> +.endif
> +	STATIC_BRANCH_ENTRY l_target=.Lstatic_branch_jmp_\@, l_yes=\l_yes, key=\key, branch=\branch
> +.endm
> +
> +#endif /* __ASSEMBLY__ */
> +#endif /* _ASM_X86_JUMP_LABEL_ASM_H */
Alexey Brodkin June 21, 2019, 3:39 p.m. UTC | #17
Hi Vineet,

> -----Original Message-----
> From: linux-snps-arc <linux-snps-arc-bounces@lists.infradead.org> On Behalf Of Vineet Gupta
> Sent: Thursday, June 20, 2019 11:50 PM
> To: Peter Zijlstra <peterz@infradead.org>
> Cc: linux-arch@vger.kernel.org; Ard Biesheuvel <ard.biesheuvel@linaro.org>; Alexey Brodkin
> <abrodkin@synopsys.com>; linux-kernel@vger.kernel.org; Jason Baron <jbaron@akamai.com>; Paolo Bonzini
> <pbonzini@redhat.com>; linux-snps-arc@lists.infradead.org; Eugeniy Paltsev
> <Eugeniy.Paltsev@synopsys.com>
> Subject: Re: [PATCH] ARC: ARCv2: jump label: implement jump label patching

[snip]

> Insn encoding is always middl eendina - irrespective of endianness.

Apparently only in little-endian mode instructions are encoded as middle-endian,
see:
-------------->8-------------
# cat endian.S

.global myfunc
myfunc:
        mov r0, r1
-------------->8-------------

Little-endian:
-------------->8-------------
# arc-linux-gcc -c -mcpu=archs endian.S -EL
# arc-linux-objdump -d endian.o

endian.o:     file format elf32-littlearc

Disassembly of section .text:
00000000 <myfunc>:
   0:   200a 0040               mov     r0,r1

# arc-linux-readelf -x .text endian.o
Hex dump of section '.text':
  0x00000000 0a204000                            . @.
-------------->8-------------

Big-endian:
-------------->8-------------
# arc-linux-gcc -c -mcpu=archs endian.S -EB
# arc-linux-objdump -d endian.o

endian.o:     file format elf32-bigarc

Disassembly of section .text:
00000000 <myfunc>:
   0:   200a 0040               mov     r0,r1

# arc-linux-readelf -x .text endian.o

Hex dump of section '.text':
  0x00000000 200a0040                             ..@
-------------->8-------------

-Alexey
Nadav Amit June 21, 2019, 6:37 p.m. UTC | #18
> On Jun 21, 2019, at 5:12 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Fri, Jun 21, 2019 at 02:09:23PM +0200, Peter Zijlstra wrote:
> 
>> --- /dev/null
>> +++ b/arch/x86/include/asm/jump_label_asm.h
>> @@ -0,0 +1,44 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef _ASM_X86_JUMP_LABEL_ASM_H
>> +#define _ASM_X86_JUMP_LABEL_ASM_H
>> +
>> +#include <asm/asm.h>
>> +#include <asm/nops.h>
>> +
>> +#ifdef __ASSEMBLY__
>> +
>> +.macro STATIC_BRANCH_ENTRY l_target:req l_yes:req key:req branch:req
>> +	.pushsection __jump_table, "aw"
>> +	.long		\l_target - ., \l_yes - .
>> +#ifdef __X86_64__
>> +	.quad		(\key - .) + \branch
>> +#else
>> +	.long		(\key - .) + \branch
>> +#endif
>> +	.popsection
>> +.endm
>> +
>> +.macro STATIC_BRANCH_NOP l_yes:req key:req branch:req
>> +.Lstatic_branch_nop_\@:
>> +.iflt 127 - .
> 
> That should've been:
> 
> .if \l_yes - . < 127
> 
> too, I had been playing with various forms to see when it compiles.
> But as soon as a label (either \l_yes or '.' gets used) it barfs.

I think the error makes sense as it creates a “circular dependency”:
assembly of the code might affect the label address, and this address affects
the assembly.
Vineet Gupta June 28, 2019, 10:59 p.m. UTC | #19
On 6/18/19 9:16 AM, Vineet Gupta wrote:
> On 6/14/19 9:41 AM, Eugeniy Paltsev wrote:
>> Implement jump label patching for ARC. Jump labels provide
>> an interface to generate dynamic branches using
>> self-modifying code.
>>
>> This allows us to implement conditional branches where
>> changing branch direction is expensive but branch selection
>> is basically 'free'
>>
>> This implementation uses 32-bit NOP and BRANCH instructions
>> which forced to be aligned by 4 to guarantee that they don't
>> cross L1 cache line and can be update atomically.
>>
>> Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
> 
> LGTM overall - nits below.

@Peter can we have your reviewed/ack or some such assuming you don't have any
objections !

Thx,
-Vineet
Vineet Gupta July 3, 2019, 4:15 p.m. UTC | #20
On 6/18/19 9:16 AM, Vineet Gupta wrote:
> On 6/14/19 9:41 AM, Eugeniy Paltsev wrote:
>> Implement jump label patching for ARC. Jump labels provide
>> an interface to generate dynamic branches using
>> self-modifying code.
>>
>> This allows us to implement conditional branches where
>> changing branch direction is expensive but branch selection
>> is basically 'free'
>>
>> This implementation uses 32-bit NOP and BRANCH instructions
>> which forced to be aligned by 4 to guarantee that they don't
>> cross L1 cache line and can be update atomically.
>>
>> Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
> LGTM overall - nits below.
> 

Can you address the review comments soon so this gets merged in 5.3 whose merge
window is looming !
Eugeniy Paltsev July 17, 2019, 3:09 p.m. UTC | #21
Hi Vineet,
I'm finally back, so see my comments below.

On Tue, 2019-06-18 at 16:16 +0000, Vineet Gupta wrote:
> On 6/14/19 9:41 AM, Eugeniy Paltsev wrote:
> > Implement jump label patching for ARC. Jump labels provide
> > an interface to generate dynamic branches using
> > self-modifying code.
> > 
> > This allows us to implement conditional branches where
> > changing branch direction is expensive but branch selection
> > is basically 'free'
> > 
> > This implementation uses 32-bit NOP and BRANCH instructions
> > which forced to be aligned by 4 to guarantee that they don't
> > cross L1 cache line and can be update atomically.
> > 
> > Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
> 
> LGTM overall - nits below.
> 
> > ---
> >  arch/arc/Kconfig                  |   8 ++
> >  arch/arc/include/asm/jump_label.h |  68 ++++++++++++
> >  arch/arc/kernel/Makefile          |   1 +
> >  arch/arc/kernel/jump_label.c      | 168 ++++++++++++++++++++++++++++++
> >  4 files changed, 245 insertions(+)
> >  create mode 100644 arch/arc/include/asm/jump_label.h
> >  create mode 100644 arch/arc/kernel/jump_label.c
> > 
> > diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
> > index c781e45d1d99..b1313e016c54 100644
> > --- a/arch/arc/Kconfig
> > +++ b/arch/arc/Kconfig
> > @@ -47,6 +47,7 @@ config ARC
> >  	select OF_EARLY_FLATTREE
> >  	select PCI_SYSCALL if PCI
> >  	select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING
> > +	select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
> >  
> >  config ARCH_HAS_CACHE_LINE_SIZE
> >  	def_bool y
> > @@ -529,6 +530,13 @@ config ARC_DW2_UNWIND
> >  config ARC_DBG_TLB_PARANOIA
> >  	bool "Paranoia Checks in Low Level TLB Handlers"
> >  
> > +config ARC_DBG_STATIC_KEYS
> > +	bool "Paranoid checks in Static Keys code"
> > +	depends on JUMP_LABEL
> > +	select STATIC_KEYS_SELFTEST
> > +	help
> > +	  Enable paranoid checks and self-test of both ARC-specific and generic
> > +	  part of static-keys-related code.
> 
> Why can't we just enable this if STATIC_KEYS_SELFTEST

As we extend the STATIC_KEYS_SELFTEST option, such a dependency looks more reasonable to me
(we enable our checks -> let's enable the corresponding generic ones).
I don't insist, though.

> >  endif
> >  
> >  config ARC_BUILTIN_DTB_NAME
> > diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h
> > new file mode 100644
> > index 000000000000..8971d0608f2c
> > --- /dev/null
> > +++ b/arch/arc/include/asm/jump_label.h
> > @@ -0,0 +1,68 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_ARC_JUMP_LABEL_H
> > +#define _ASM_ARC_JUMP_LABEL_H
> > +
> > +#ifndef __ASSEMBLY__
> > +
> > +#include <linux/types.h>
> > +
> > +#define JUMP_LABEL_NOP_SIZE 4
> > +
> > +/*
> > + * To make atomic update of patched instruction available we need to guarantee
> > + * that this instruction doesn't cross L1 cache line boundary.
> > + *
> > + * As of today we simply align instruction which can be patched by 4 byte using
> > + * ".balign 4" directive. In that case patched instruction is aligned with one
> > + * 16-bit NOP_S if this is required.
> > + * However 'align by 4' directive is much stricter than it actually required.
> > + * It's enough that our 32-bit instruction don't cross l1 cache line boundary
> > + * which can be achieved by using ".bundle_align_mode" directive. That will save
> > + * us from adding useless NOP_S padding in most of the cases.
> > + *
> > + * TODO: switch to ".bundle_align_mode" directive using whin it will be
> > + * supported by ARC toolchain.
> > + */
> > +
> 
> So block comments on top of a function imply summary of function etc.
> What you are doing here is calling out the need for .balign quirk. So better to
> phrase it is as "Note about .balign" and then describe the thing

Ok, will fix in v2.

> > +static __always_inline bool arch_static_branch(struct static_key *key,
> > +					       bool branch)
> > +{
> > +	asm_volatile_goto(".balign 4			\n"
> > +		 "1:					\n"
> > +		 "nop					\n"
> > +		 ".pushsection __jump_table, \"aw\"	\n"
> > +		 ".word 1b, %l[l_yes], %c0		\n"
> > +		 ".popsection				\n"
> > +		 : : "i" (&((char *)key)[branch]) : : l_yes);
> > +
> > +	return false;
> > +l_yes:
> > +	return true;
> > +}
> > +
> > +static __always_inline bool arch_static_branch_jump(struct static_key *key,
> > +						    bool branch)
> > +{
> > +	asm_volatile_goto(".balign 4			\n"
> > +		 "1:					\n"
> > +		 "b %l[l_yes]				\n"
> > +		 ".pushsection __jump_table, \"aw\"	\n"
> > +		 ".word 1b, %l[l_yes], %c0		\n"
> > +		 ".popsection				\n"
> > +		 : : "i" (&((char *)key)[branch]) : : l_yes);
> > +
> > +	return false;
> > +l_yes:
> > +	return true;
> > +}
> > +
> > +typedef u32 jump_label_t;
> > +
> > +struct jump_entry {
> > +	jump_label_t code;
> > +	jump_label_t target;
> > +	jump_label_t key;
> > +};
> > +
> > +#endif  /* __ASSEMBLY__ */
> > +#endif
> > diff --git a/arch/arc/kernel/Makefile b/arch/arc/kernel/Makefile
> > index 2dc5f4296d44..307f74156d99 100644
> > --- a/arch/arc/kernel/Makefile
> > +++ b/arch/arc/kernel/Makefile
> > @@ -22,6 +22,7 @@ obj-$(CONFIG_ARC_EMUL_UNALIGNED) 	+= unaligned.o
> >  obj-$(CONFIG_KGDB)			+= kgdb.o
> >  obj-$(CONFIG_ARC_METAWARE_HLINK)	+= arc_hostlink.o
> >  obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
> > +obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
> >  
> >  obj-$(CONFIG_ARC_FPU_SAVE_RESTORE)	+= fpu.o
> >  CFLAGS_fpu.o   += -mdpfp
> > diff --git a/arch/arc/kernel/jump_label.c b/arch/arc/kernel/jump_label.c
> > new file mode 100644
> > index 000000000000..93e3bc84288f
> > --- /dev/null
> > +++ b/arch/arc/kernel/jump_label.c
> > @@ -0,0 +1,168 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/jump_label.h>
> > +
> > +#include "asm/cacheflush.h"
> > +
> > +#define JUMPLABEL_ERR	"ARC: jump_label: ERROR: "
> > +
> > +/* Halt system on fatal error to make debug easier */
> > +#define arc_jl_fatal(format...)						\
> > +({									\
> > +	pr_err(JUMPLABEL_ERR format);					\
> > +	BUG();								\
> 
> Does it make sense to bring down the whole system vs. failing and returning.
> I see there is no error propagation to core code, but still.

I totally agree with Peter, and I prefer to stop the system on this errors. Here is few arguments:
All this checks can't be toggle in system operating normally and only may be caused by bad code generation (or some code/data corruption):
1) We got our instruction to patch unaligned by 4 bytes (despite the fact that we used '.balign 4' to align it)
2) We got branch offset unaligned (which means that we calculate it wrong at build time or corrupt it in run time)
3) We got branch offset which not fits into s25. As this is offset inside one function (inside one 'if' statement actually) we newer get such huge
offset in real life if code generation is ok.

If we only warn to the log and return, we will hit compromised kernel control flow later.
I.e. it could be an 'if' statement in kernel text which is switched to the wrong state: the condition is true but we take the false branch.
And that will be an issue which is much more difficult to debug.
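
To make the failure mode concrete, here is what a typical static-key user looks like
(the key and the printed strings are made up purely for illustration; the static key
API calls are the generic kernel ones):

#include <linux/jump_label.h>
#include <linux/printk.h>

DEFINE_STATIC_KEY_FALSE(my_feature_key);	/* hypothetical key */

static void hot_path(void)
{
	/* expected to take the feature leg only after static_branch_enable(&my_feature_key) */
	if (static_branch_unlikely(&my_feature_key))
		pr_info("feature path\n");
	else
		pr_info("default path\n");
}

If the NOP/branch at that site is patched wrongly, hot_path() silently keeps taking
the wrong leg even though the key value says otherwise.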

Does that sound reasonable?

If you really don't want to have BUG() here, I can make it conditionally enabled,
depending on CONFIG_ARC_STATIC_KEYS_DEBUG, as I want to have it enabled at least on ARC development platforms.
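
I.e. something along these lines (a sketch only; CONFIG_ARC_DBG_STATIC_KEYS is the
symbol this patch actually adds to Kconfig):

#ifdef CONFIG_ARC_DBG_STATIC_KEYS
/* Halt system on fatal error to make debug easier */
#define arc_jl_fatal(format...)						\
({									\
	pr_err(JUMPLABEL_ERR format);					\
	BUG();								\
})
#else
/* Only warn; the branch site is left unpatched */
#define arc_jl_fatal(format...)	pr_err(JUMPLABEL_ERR format)
#endif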

[snip]
Vineet Gupta July 17, 2019, 5:45 p.m. UTC | #22
On 7/17/19 8:09 AM, Eugeniy Paltsev wrote:
>>> +/* Halt system on fatal error to make debug easier */
>>> +#define arc_jl_fatal(format...)						\
>>> +({									\
>>> +	pr_err(JUMPLABEL_ERR format);					\
>>> +	BUG();								\
>> Does it make sense to bring down the whole system vs. failing and returning.
>> I see there is no error propagation to core code, but still.
> I totally agree with Peter, and I prefer to stop the system on these errors. Here are a few arguments:
> None of these checks can trigger on a normally operating system; they can only be caused by bad code generation (or some code/data corruption):
> 1) We got the instruction to patch unaligned by 4 bytes (despite the fact that we used '.balign 4' to align it)
> 2) We got a branch offset that is unaligned (which means we calculated it wrong at build time or corrupted it at run time)
> 3) We got a branch offset which doesn't fit into s25. As this is an offset inside one function (inside one 'if' statement actually) we never get such a huge
> offset in real life if code generation is OK.

I understand that. But AFAIR, in the implementation arc_jl_fatal() gets called before
we have done the actual code patching and/or flushing the caches to that effect.
So no harm has been done just yet. We just need to make sure that any book-keeping of
true/false is not yet done or unrolled.
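
Roughly, I have in mind something like the sketch below (untested, just to
illustrate the idea; the s25/alignment checks inside arc_gen_branch() would need
similar treatment so they can report a failure instead of calling BUG()):

void arch_jump_label_transform(struct jump_entry *entry,
			       enum jump_label_type type)
{
	jump_label_t *instr_addr = (jump_label_t *)entry->code;
	u32 instr;

	/* Warn and bail out before anything has been patched or flushed */
	if (WARN(cross_l1_cache_line(instr_addr, JUMP_LABEL_NOP_SIZE),
		 "ARC: jump_label: instruction at %px crosses L1 cache line\n",
		 instr_addr))
		return;

	if (type == JUMP_LABEL_JMP)
		instr = arc_gen_branch(entry->code, entry->target);
	else
		instr = arc_gen_nop();

	WRITE_ONCE(*instr_addr, instr);
	flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
}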

> If we only warn to the log and return, we will hit compromised kernel control flow later.
> I.e. it could be an 'if' statement in kernel text which is switched to the wrong state: the condition is true but we take the false branch.
> And that will be an issue which is much more difficult to debug.
>
> Does that sound reasonable?

I'm still not convinced that by the time we hit _fatal() we are in some inconsistent
state already. But if you feel strongly, let's keep it.

>
> If you really don't want to have BUG() here, I can make it conditionally enabled,

Not a good idea. It should either be unconditionally present or not there at all, not something in between.

> depending on CONFIG_ARC_STATIC_KEYS_DEBUG, as I want to have it enabled at least on ARC development platforms.
Eugeniy Paltsev July 17, 2019, 6:54 p.m. UTC | #23
On Wed, 2019-07-17 at 17:45 +0000, Vineet Gupta wrote:
> On 7/17/19 8:09 AM, Eugeniy Paltsev wrote:
> > > > +/* Halt system on fatal error to make debug easier */
> > > > +#define arc_jl_fatal(format...)						\
> > > > +({									\
> > > > +	pr_err(JUMPLABEL_ERR format);					\
> > > > +	BUG();								\
> > > Does it make sense to bring down the whole system vs. failing and returning.
> > > I see there is no error propagation to core code, but still.
> > I totally agree with Peter, and I prefer to stop the system on these errors. Here are a few arguments:
> > None of these checks can trigger on a normally operating system; they can only be caused by bad code generation (or some code/data corruption):
> > 1) We got the instruction to patch unaligned by 4 bytes (despite the fact that we used '.balign 4' to align it)
> > 2) We got a branch offset that is unaligned (which means we calculated it wrong at build time or corrupted it at run time)
> > 3) We got a branch offset which doesn't fit into s25. As this is an offset inside one function (inside one 'if' statement actually) we never get such a huge
> > offset in real life if code generation is OK.
> 
> I understand that. But AFAIR, in the implementation arc_jl_fatal() gets called before
> we have done the actual code patching and/or flushing the caches

Correct.

>  to that effect.
> So no harm has been done just yet. We just need to make sure that any book-keeping of
> true/false is not yet done or unrolled.

Can you describe your proposal in more detail?

Note that in cases (2) and (3) we know that we are not able to generate a correct instruction to replace the existing one, and we don't know anything
about the real cause.
So what can we do here besides warning about it?

- We can halt the system to make debugging easy (which is what I propose).
- We can generate an invalid instruction.
- We can just skip the patching. In that case we'll end up with some 'if' statements [in kernel code] working incorrectly. I really don't want to debug
that.

Given that we'll never hit these asserts in any consistent state, I don't see any reason why we shouldn't simply call BUG() here.

> 
> > If we only warn to the log and return, we will hit compromised kernel control flow later.
> > I.e. it could be an 'if' statement in kernel text which is switched to the wrong state: the condition is true but we take the false branch.
> > And that will be an issue which is much more difficult to debug.
> > 
> > Does that sound reasonable?
> 
> I'm still not convinced that by the time we hit _fatal() we are in some inconsistent
> state already. But if you feel strongly, let's keep it.
> 
> > If you really don't want to have BUG() here, I can make it conditionally enabled,
> 
> Not a good idea. It should either be unconditionally present or not there at all, not something in between.
> 
> > depending on CONFIG_ARC_STATIC_KEYS_DEBUG, as I want to have it enabled at least on ARC development platforms.
> 
>
diff mbox series

Patch

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index c781e45d1d99..b1313e016c54 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -47,6 +47,7 @@  config ARC
 	select OF_EARLY_FLATTREE
 	select PCI_SYSCALL if PCI
 	select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING
+	select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
 
 config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
@@ -529,6 +530,13 @@  config ARC_DW2_UNWIND
 config ARC_DBG_TLB_PARANOIA
 	bool "Paranoia Checks in Low Level TLB Handlers"
 
+config ARC_DBG_STATIC_KEYS
+	bool "Paranoid checks in Static Keys code"
+	depends on JUMP_LABEL
+	select STATIC_KEYS_SELFTEST
+	help
+	  Enable paranoid checks and self-test of both ARC-specific and generic
+	  part of static-keys-related code.
 endif
 
 config ARC_BUILTIN_DTB_NAME
diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h
new file mode 100644
index 000000000000..8971d0608f2c
--- /dev/null
+++ b/arch/arc/include/asm/jump_label.h
@@ -0,0 +1,68 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARC_JUMP_LABEL_H
+#define _ASM_ARC_JUMP_LABEL_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+#define JUMP_LABEL_NOP_SIZE 4
+
+/*
+ * To make atomic update of patched instruction available we need to guarantee
+ * that this instruction doesn't cross an L1 cache line boundary.
+ *
+ * As of today we simply align the instruction to be patched by 4 bytes using
+ * the ".balign 4" directive. In that case the patched instruction is aligned
+ * with one 16-bit NOP_S if required.
+ * However, aligning by 4 is stricter than actually required: it is enough
+ * that our 32-bit instruction doesn't cross an L1 cache line boundary, which
+ * could be achieved with the ".bundle_align_mode" directive. That would save
+ * us from adding useless NOP_S padding in most cases.
+ *
+ * TODO: switch to the ".bundle_align_mode" directive when it is supported by
+ * the ARC toolchain.
+ */
+
+static __always_inline bool arch_static_branch(struct static_key *key,
+					       bool branch)
+{
+	asm_volatile_goto(".balign 4			\n"
+		 "1:					\n"
+		 "nop					\n"
+		 ".pushsection __jump_table, \"aw\"	\n"
+		 ".word 1b, %l[l_yes], %c0		\n"
+		 ".popsection				\n"
+		 : : "i" (&((char *)key)[branch]) : : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key,
+						    bool branch)
+{
+	asm_volatile_goto(".balign 4			\n"
+		 "1:					\n"
+		 "b %l[l_yes]				\n"
+		 ".pushsection __jump_table, \"aw\"	\n"
+		 ".word 1b, %l[l_yes], %c0		\n"
+		 ".popsection				\n"
+		 : : "i" (&((char *)key)[branch]) : : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+typedef u32 jump_label_t;
+
+struct jump_entry {
+	jump_label_t code;
+	jump_label_t target;
+	jump_label_t key;
+};
+
+#endif  /* __ASSEMBLY__ */
+#endif
diff --git a/arch/arc/kernel/Makefile b/arch/arc/kernel/Makefile
index 2dc5f4296d44..307f74156d99 100644
--- a/arch/arc/kernel/Makefile
+++ b/arch/arc/kernel/Makefile
@@ -22,6 +22,7 @@  obj-$(CONFIG_ARC_EMUL_UNALIGNED) 	+= unaligned.o
 obj-$(CONFIG_KGDB)			+= kgdb.o
 obj-$(CONFIG_ARC_METAWARE_HLINK)	+= arc_hostlink.o
 obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
+obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
 
 obj-$(CONFIG_ARC_FPU_SAVE_RESTORE)	+= fpu.o
 CFLAGS_fpu.o   += -mdpfp
diff --git a/arch/arc/kernel/jump_label.c b/arch/arc/kernel/jump_label.c
new file mode 100644
index 000000000000..93e3bc84288f
--- /dev/null
+++ b/arch/arc/kernel/jump_label.c
@@ -0,0 +1,168 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+
+#include "asm/cacheflush.h"
+
+#define JUMPLABEL_ERR	"ARC: jump_label: ERROR: "
+
+/* Halt system on fatal error to make debug easier */
+#define arc_jl_fatal(format...)						\
+({									\
+	pr_err(JUMPLABEL_ERR format);					\
+	BUG();								\
+})
+
+static inline u32 arc_gen_nop(void)
+{
+	/* 1x 32bit NOP in middle endian */
+	return 0x7000264a;
+}
+
+static inline bool cross_l1_cache_line(void *addr, int len)
+{
+	unsigned long a = (unsigned long)addr;
+
+	return (a >> L1_CACHE_SHIFT) != ((a + len - 1) >> L1_CACHE_SHIFT);
+}
+
+/*
+ * ARCv2 'Branch unconditionally' instruction:
+ * 00000ssssssssss1SSSSSSSSSSNRtttt
+ * s S[n:0] lower bits signed immediate (number is bitfield size)
+ * S S[m:n+1] upper bits signed immediate (number is bitfield size)
+ * t S[24:21] upper bits signed immediate (branch unconditionally far)
+ * N N <.d> delay slot mode
+ * R R Reserved
+ */
+static inline u32 arc_gen_branch(jump_label_t pc, jump_label_t target)
+{
+	u32 instruction_l, instruction_r;
+	u32 pcl = pc & GENMASK(31, 2);
+	u32 u_offset = target - pcl;
+	u32 s, S, t;
+
+	/*
+	 * The offset in a 32-bit branch instruction must fit into s25.
+	 * Something is terribly broken if we get such a huge offset within one
+	 * function.
+	 */
+	if ((s32)u_offset < -16777216 || (s32)u_offset > 16777214)
+		arc_jl_fatal("gen branch with offset (%d) not fit in s25\n",
+			     (s32)u_offset);
+
+	/*
+	 * All instructions are aligned by 2 bytes so we should never get an
+	 * offset here which is not 2-byte aligned.
+	 */
+	if (u_offset & 0x1)
+		arc_jl_fatal("gen branch with offset (%d) unaligned to 2 bytes\n",
+			     (s32)u_offset);
+
+	s = (u_offset >> 1)  & GENMASK(9, 0);
+	S = (u_offset >> 11) & GENMASK(9, 0);
+	t = (u_offset >> 21) & GENMASK(3, 0);
+
+	/* 00000ssssssssss1 */
+	instruction_l = (s << 1) | 0x1;
+	/* SSSSSSSSSSNRtttt */
+	instruction_r = (S << 6) | t;
+
+	return (instruction_r << 16) | (instruction_l & GENMASK(15, 0));
+}
+
+void arch_jump_label_transform(struct jump_entry *entry,
+			       enum jump_label_type type)
+{
+	jump_label_t *instr_addr = (jump_label_t *)entry->code;
+	u32 instr;
+
+	/*
+	 * Atomic update of the patched instruction is only possible if this
+	 * instruction doesn't cross an L1 cache line boundary. You can read
+	 * about the way we achieve this in arch/arc/include/asm/jump_label.h.
+	 */
+	if (cross_l1_cache_line(instr_addr, JUMP_LABEL_NOP_SIZE))
+		arc_jl_fatal("instruction (addr %px) cross L1 cache line",
+			     instr_addr);
+
+	if (type == JUMP_LABEL_JMP)
+		instr = arc_gen_branch(entry->code, entry->target);
+	else
+		instr = arc_gen_nop();
+
+	WRITE_ONCE(*instr_addr, instr);
+	flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
+}
+
+void arch_jump_label_transform_static(struct jump_entry *entry,
+				      enum jump_label_type type)
+{
+	/*
+	 * We use only one NOP type (1x, 4 byte) in arch_static_branch, so
+	 * there's no need to patch an identical NOP over the top of it here.
+	 * The generic code calls 'arch_jump_label_transform' if the NOP needs
+	 * to be replaced by a branch, so 'arch_jump_label_transform_static' is
+	 * never called with a type other than JUMP_LABEL_NOP.
+	 */
+	BUG_ON(type != JUMP_LABEL_NOP);
+}
+
+#ifdef CONFIG_ARC_DBG_STATIC_KEYS
+#define SELFTEST_MSG	"ARC: instruction generation self-test: "
+
+struct arc_gen_branch_testdata {
+	jump_label_t pc;
+	jump_label_t target_address;
+	u32 expected_instr;
+};
+
+static __init int branch_gen_test(struct arc_gen_branch_testdata *test_data)
+{
+	u32 instr_got;
+
+	instr_got = arc_gen_branch(test_data->pc, test_data->target_address);
+	if (instr_got != test_data->expected_instr) {
+		pr_err(SELFTEST_MSG "FAIL:\n arc_gen_branch(0x%08x, 0x%08x) != 0x%08x, got 0x%08x\n",
+		       test_data->pc, test_data->target_address,
+		       test_data->expected_instr, instr_got);
+
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static __init int instr_gen_test(void)
+{
+	int i, ret;
+
+	struct arc_gen_branch_testdata test_data[] = {
+		{0x90007548, 0x90007514, 0xffcf07cd}, /* tiny (-52) offs */
+		{0x9000c9c0, 0x9000c782, 0xffcf05c3}, /* tiny (-574) offs */
+		{0x9000cc1c, 0x9000c782, 0xffcf0367}, /* tiny (-1178) offs */
+		{0x9009dce0, 0x9009d106, 0xff8f0427}, /* small (-3034) offs */
+		{0x9000f5de, 0x90007d30, 0xfc0f0755}, /* big  (-30892) offs */
+		{0x900a2444, 0x90035f64, 0xc9cf0321}, /* huge (-443616) offs */
+		{0x90007514, 0x9000752c, 0x00000019}, /* tiny (+24) offs */
+		{0x9001a578, 0x9001a77a, 0x00000203}, /* tiny (+514) offs */
+		{0x90031ed8, 0x90032634, 0x0000075d}, /* tiny (+1884) offs */
+		{0x9008c7f2, 0x9008d3f0, 0x00400401}, /* small (+3072) offs */
+		{0x9000bb38, 0x9003b340, 0x17c00009}, /* big  (+194568) offs */
+		{0x90008f44, 0x90578d80, 0xb7c2063d}  /* huge (+5701180) offs */
+	};
+
+	for (i = 0; i < ARRAY_SIZE(test_data); i++) {
+		ret = branch_gen_test(&test_data[i]);
+		if (ret)
+			return ret;
+	}
+
+	pr_info(SELFTEST_MSG "OK\n");
+
+	return 0;
+}
+early_initcall(instr_gen_test);
+
+#endif /* CONFIG_ARC_DBG_STATIC_KEYS */