Patchwork [-V2,2/2] powerpc: Update kernel VSID range

login
register
mail settings
Submitter Aneesh Kumar K.V
Date March 12, 2013, 7:19 a.m.
Message ID <87vc8xrsp1.fsf@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/226827/
State Superseded
Headers show

Comments

Aneesh Kumar K.V - March 12, 2013, 7:19 a.m.
David Gibson <dwg@au1.ibm.com> writes:

> On Fri, Feb 15, 2013 at 09:09:38PM +0530, Aneesh Kumar K.V wrote:
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 

.... snip....

>> - *	- We allow for USER_ESID_BITS significant bits of ESID and
>> - * CONTEXT_BITS  bits of context for user addresses.
>> - *  i.e. 64T (46 bits) of address space for up to half a million contexts.
>> - *
>> - *	- The scramble function gives robust scattering in the hash
>> - * table (at least based on some initial results).  The previous
>> - * method was more susceptible to pathological cases giving excessive
>> - * hash collisions.
>> + * We also need to avoid the last segment of the last context, because that
>> + * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
>> + * because of the modulo operation in vsid scramble. But the vmemmap
>> + * (which is what uses region 0xf) will never be close to 64TB in size
>> + * (it's 56 bytes per page of system memory).
>>   */
>>  
>>  #define CONTEXT_BITS		19
>> @@ -386,15 +382,25 @@ extern void slb_set_size(u16 size);
>>  #define USER_ESID_BITS_1T	6
>
> USER_ESID_BITS should probably be renamed just ESID_BITS, since it's
> now relevant to kernel addresses too.

Done. I added this as a patch on top of the series. 

>
>>  /*
>> + * 256MB segment
>> + * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments
>> + * available for user + kernel mapping. The top 4 contexts are used for
>> + * kernel mapping. Each segment contains 2^28 bytes. Each
>> + * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
>> + * (19 == 37 + 28 - 46).
>> + */
>> +#define MAX_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 1)
>
> Hrm.  I think it would be clearer to have MAX_CONTEXT (still) be the
> maximum usable *user* context (i.e. 0x80000 - 5) and put the kernel
> ones above that still.
>

Done as

-#define MAX_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 1)
+#define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 5)

Also updated the reference of MAX_CONTEXT in the code to
MAX_USER_CONTEXT
 
>> +
>> +/*
>>   * This should be computed such that protovosid * vsid_mulitplier
>>   * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus
>>   */
>>  #define VSID_MULTIPLIER_256M	ASM_CONST(12538073)	/* 24-bit prime */
>> -#define VSID_BITS_256M		(CONTEXT_BITS + USER_ESID_BITS + 1)
>> +#define VSID_BITS_256M		(CONTEXT_BITS + USER_ESID_BITS)
>>  #define VSID_MODULUS_256M	((1UL<<VSID_BITS_256M)-1)
>>  
>>  #define VSID_MULTIPLIER_1T	ASM_CONST(12538073)	/* 24-bit prime */
>> -#define VSID_BITS_1T		(CONTEXT_BITS + USER_ESID_BITS_1T + 1)
>> +#define VSID_BITS_1T		(CONTEXT_BITS + USER_ESID_BITS_1T)
>>  #define VSID_MODULUS_1T		((1UL<<VSID_BITS_1T)-1)
>>  
>>  

.... snip......

>>  #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
>> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
>> index 4665e82..0e9c48c 100644
>> --- a/arch/powerpc/kernel/exceptions-64s.S
>> +++ b/arch/powerpc/kernel/exceptions-64s.S
>> @@ -1268,20 +1268,39 @@ do_ste_alloc:
>>  _GLOBAL(do_stab_bolted)
>
> The stab path certainly hasn't been tested, since we've been broken on
> stab machines for a long time.
>
>>  	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
>>  	std	r11,PACA_EXSLB+EX_SRR0(r13)	/* save SRR0 in exc. frame */
>> +	mfspr	r11,SPRN_DAR			/* ea */
>>  
>> +	/*
>> +	 * check for bad kernel/user address
>> +	 * (ea & ~REGION_MASK) >= PGTABLE_RANGE
>> +	 */
>> +	clrldi  r9,r11,4
>> +	li	r10,-1
>> +	clrldi  r10,r10,(64 - 46)
>> +	cmpld	cr7,r9,r10
>
> You can replace the above 4 instructions with just:
> 	rldicr. r9,r11,4,(64-46-4)
>

nice, updated as below

-	clrldi  r9,r11,4
-	li	r10,-1
-	clrldi  r10,r10,(64 - 46)
-	cmpld	cr7,r9,r10
+	rldicr. r9,r11,4,(64 - 46 - 4)
 	li	r9,0	/* VSID = 0 for bad address */
-	bgt	cr7,0f
+	bne-	0f


>> +	li	r9,0	/* VSID = 0 for bad address */
>> +	bgt	cr7,0f
>> +
>> +	/*


.... snip....

>> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
>> index 3a292be..bfeab83 100644
>> --- a/arch/powerpc/mm/hash_utils_64.c
>> +++ b/arch/powerpc/mm/hash_utils_64.c
>> @@ -194,6 +194,11 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
>>  		unsigned long vpn  = hpt_vpn(vaddr, vsid, ssize);
>>  		unsigned long tprot = prot;
>>  
>> +		/*
>> +		 * If we hit a bad address return error.
>> +		 */
>> +		if (!vsid)
>> +			return -1;
>>  		/* Make kernel text executable */
>>  		if (overlaps_kernel_text(vaddr, vaddr + step))
>>  			tprot &= ~HPTE_R_N;
>> @@ -921,11 +926,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
>>  	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
>>  		ea, access, trap);
>>  
>> -	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
>> -		DBG_LOW(" out of pgtable range !\n");
>> - 		return 1;
>> -	}
>> -
>
> Hrm.  This test is conceptually different, even if the logic is the
> same as the vsid availablility test you may have performed earlier.
> Perhaps add BUILD_BUG_ON()s to ensure that they really are the same.
>

can you elaborate that ? What BUILD_BUG_ON test are you suggesting ?



>
>>  	/* Get region & vsid */
>>   	switch (REGION_ID(ea)) {
>>  	case USER_REGION_ID:
>> @@ -956,6 +956,11 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
>>  	}
>>  	DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
>>  
>> +	/* Bad address. */
>> +	if (!vsid) {
>> +		DBG_LOW("Bad address!\n");
>> +		return 1;
>> +	}
>>  	/* Get pgdir */
>>  	pgdir = mm->pgd;
>>  	if (pgdir == NULL)
>> @@ -1125,6 +1130,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
>>  	/* Get VSID */
>>  	ssize = user_segment_size(ea);
>>  	vsid = get_vsid(mm->context.id, ea, ssize);
>> +	if (!vsid)
>> +		return;
>>  
>>  	/* Hash doesn't like irqs */
>>  	local_irq_save(flags);
>> @@ -1217,6 +1224,9 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
>>  	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
>>  	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
>>  
>> +	/* Don't create HPTE entries for bad address */
>> +	if (!vsid)
>> +		return;
>>  	ret = ppc_md.hpte_insert(hpteg, vpn, __pa(vaddr),
>>  				 mode, HPTE_V_BOLTED,
>>  				 mmu_linear_psize, mmu_kernel_ssize);
>> diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
>> index 40bc5b0..59cd773 100644
>> --- a/arch/powerpc/mm/mmu_context_hash64.c
>> +++ b/arch/powerpc/mm/mmu_context_hash64.c
>> @@ -29,15 +29,6 @@
>>  static DEFINE_SPINLOCK(mmu_context_lock);
>>  static DEFINE_IDA(mmu_context_ida);
>>  
>> -/*
>> - * 256MB segment
>> - * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments
>> - * available for user mappings. Each segment contains 2^28 bytes. Each
>> - * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
>> - * (19 == 37 + 28 - 46).
>> - */
>> -#define MAX_CONTEXT	((1UL << CONTEXT_BITS) - 1)
>> -
>>  int __init_new_context(void)
>>  {
>>  	int index;
>> @@ -56,7 +47,8 @@ again:
>>  	else if (err)
>>  		return err;
>>  
>> -	if (index > MAX_CONTEXT) {
>> +	if (index > (MAX_CONTEXT - 4)) {
>> +		/* Top 4 context id values are used for kernel */
>
> This change would not be necessary if you changed MAX_CONTEXT as
> suggested above.
>

done now have

-	if (index > (MAX_CONTEXT - 4)) {
-		/* Top 4 context id values are used for kernel */
+	if (index > (MAX_USER_CONTEXT) {


>>  		spin_lock(&mmu_context_lock);
>>  		ida_remove(&mmu_context_ida, index);
>>  		spin_unlock(&mmu_context_lock);
>> diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
>> index 1a16ca2..c066d00 100644
>> --- a/arch/powerpc/mm/slb_low.S
>> +++ b/arch/powerpc/mm/slb_low.S
>> @@ -31,13 +31,20 @@
>>   * No other registers are examined or changed.
>>   */
>>  _GLOBAL(slb_allocate_realmode)
>> -	/* r3 = faulting address */
>> +	/*
>> +	 * check for bad kernel/user address
>> +	 * (ea & ~REGION_MASK) >= PGTABLE_RANGE
>> +	 */
>> +	clrldi  r9,r3,4
>> +	li	r10,-1
>> +	clrldi  r10,r10,(64 - 46)
>> +	cmpld	cr7,r9,r10
>> +	bgt	cr7,8f
>
> As in the stab path, you can accomplish this with a single rldicr.
>
>>  
>>  	srdi	r9,r3,60		/* get region */
>> -	srdi	r10,r3,28		/* get esid */
>>  	cmpldi	cr7,r9,0xc		/* cmp PAGE_OFFSET for later use */
>>  
>> -	/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
>> +	/* r3 = address, cr7 = <> PAGE_OFFSET */
>>  	blt	cr7,0f			/* user or kernel? */
>>  
>>  	/* kernel address: proto-VSID = ESID */
>> @@ -56,18 +63,26 @@ _GLOBAL(slb_allocate_realmode)
>>  	 */
>>  _GLOBAL(slb_miss_kernel_load_linear)
>>  	li	r11,0
>> -	li	r9,0x1
>> +	/*
>> +	 * context = (MAX_CONTEXT - 3) + ((ea >> 60) - 0xc)
>> +	 */
>> +	srdi	r9,r3,60
>> +	subi	r9,r9,(0xc + 3 + 1)
>> +	lis	r10, 8
>> +	add	r9,r9,r10
>
> Hrm.  You can avoid clobbering r10, which I assume is why you removed
> the computation of esid from the common path by doing this instead:
>
> 	rldicl	r9,r3,4,62
> 	addis	r9,r9,8
> 	subi	r9,r9,4

nice. Updated. I am inlining the final patch below. 

>
>> +	srdi	r10,r3,SID_SHIFT	/* get esid */
>>  	/*
>>  	 * for 1T we shift 12 bits more.  slb_finish_load_1T will do
>>  	 * the necessary adjustment
>>  	 */
>> -	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>> +	rldimi  r10,r9,USER_ESID_BITS,0
>>  BEGIN_FTR_SECTION
>>  	b	slb_finish_load
>>  END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
>>  	b	slb_finish_load_1T
>>  
>>  1:
>> +	srdi	r10,r3,SID_SHIFT	/* get esid */
>>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>  	/* Check virtual memmap region. To be patches at kernel boot */
>>  	cmpldi	cr0,r9,0xf
>> @@ -91,23 +106,26 @@ _GLOBAL(slb_miss_kernel_load_vmemmap)
>>  	_GLOBAL(slb_miss_kernel_load_io)
>>  	li	r11,0
>>  6:
>> -	li	r9,0x1
>> +	/*
>> +	 * context = (MAX_CONTEXT - 3) + ((ea >> 60) - 0xc)
>> +	 */
>> +	srdi	r9,r3,60
>> +	subi	r9,r9,(0xc + 3 + 1)
>> +	lis	r10,8
>> +	add	r9,r9,r10
>> +	srdi	r10,r3,SID_SHIFT
>
> Same here.  Can you put the kernel context calculation into a common
> path?  In fact, now that kernel vsids, like user vsids are made of a
> context and vsid component, can you put the rldimi which combines them
> into a common path for both kernel and user addresses?


will do.
Geoff Levand - March 12, 2013, 8:11 p.m.
Hi Aneesh,

On Tue, 2013-03-12 at 12:49 +0530, Aneesh Kumar K.V wrote:
> Done. I added this as a patch on top of the series. 

I tried to test this against v3.8, but applying it failed:

 Applying: powerpc: Update kernel VSID range
 error: patch failed: arch/powerpc/mm/slb_low.S:66

Could you please make a set that applies to 3.8?  Thanks.

-Geoff

Patch

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ffa0c48..f0351b5 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1274,12 +1274,9 @@  _GLOBAL(do_stab_bolted)
 	 * check for bad kernel/user address
 	 * (ea & ~REGION_MASK) >= PGTABLE_RANGE
 	 */
-	clrldi  r9,r11,4
-	li	r10,-1
-	clrldi  r10,r10,(64 - 46)
-	cmpld	cr7,r9,r10
+	rldicr. r9,r11,4,(64 - 46 - 4)
 	li	r9,0	/* VSID = 0 for bad address */
-	bgt	cr7,0f
+	bne-	0f
 
 	/*
 	 * Calculate VSID:
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 80fd644..349411e 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -35,16 +35,14 @@  _GLOBAL(slb_allocate_realmode)
 	 * check for bad kernel/user address
 	 * (ea & ~REGION_MASK) >= PGTABLE_RANGE
 	 */
-	clrldi  r9,r3,4
-	li	r10,-1
-	clrldi  r10,r10,(64 - 46)
-	cmpld	cr7,r9,r10
-	bgt	cr7,8f
+	rldicr. r9,r3,4,(64 - 46 - 4)
+	bne-	8f
 
 	srdi	r9,r3,60		/* get region */
+	srdi	r10,r3,SID_SHIFT	/* get esid */
 	cmpldi	cr7,r9,0xc		/* cmp PAGE_OFFSET for later use */
 
-	/* r3 = address, cr7 = <> PAGE_OFFSET */
+	/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
 	blt	cr7,0f			/* user or kernel? */
 
 	/* kernel address: proto-VSID = ESID */
@@ -66,11 +64,10 @@  _GLOBAL(slb_miss_kernel_load_linear)
 	/*
 	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
 	 */
-	srdi	r9,r3,60
-	subi	r9,r9,(0xc + 4)
-	lis	r10, 8
-	add	r9,r9,r10
-	srdi	r10,r3,SID_SHIFT	/* get esid */
+	rldicl	r9,r3,4,62
+	addis	r9,r9,8
+	subi	r9,r9,4
+
 	/*
 	 * for 1T we shift 12 bits more.  slb_finish_load_1T will do
 	 * the necessary adjustment
@@ -82,7 +79,6 @@  END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 	b	slb_finish_load_1T
 
 1:
-	srdi	r10,r3,SID_SHIFT	/* get esid */
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 	/* Check virtual memmap region. To be patches at kernel boot */
 	cmpldi	cr0,r9,0xf
@@ -109,11 +105,9 @@  _GLOBAL(slb_miss_kernel_load_vmemmap)
 	/*
 	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
 	 */
-	srdi	r9,r3,60
-	subi	r9,r9,(0xc + 4)
-	lis	r10,8
-	add	r9,r9,r10
-	srdi	r10,r3,SID_SHIFT
+	rldicl	r9,r3,4,62
+	addis	r9,r9,8
+	subi	r9,r9,4
 	/*
 	 * for 1T we shift 12 bits more.  slb_finish_load_1T will do
 	 * the necessary adjustment
@@ -125,8 +119,6 @@  END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 	b	slb_finish_load_1T
 
 0:
-	srdi	r10,r3,SID_SHIFT	/* get esid */
-
 	/* when using slices, we extract the psize off the slice bitmaps
 	 * and then we need to get the sllp encoding off the mmu_psize_defs
 	 * array.