[3/4] sparc64: convert spinlock_t to raw_spinlock_t in mmu_context_t

Message ID 1388980510-10190-4-git-send-email-allen.pais@oracle.com
State Not Applicable
Delegated to: David Miller

Commit Message

Allen Jan. 6, 2014, 3:55 a.m. UTC
While attempting to get PREEMPT_RT working on sparc64 using
linux-stable-rt version 3.10.22-rt19+, the kernel crashes
with the following trace:

[ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
[ 1487.027885] Call Trace:
[ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
[ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
[ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
[ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
[ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
[ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
[ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
[ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
[ 1487.027922]  [0000000000000000]           (null)

Thomas debugged this issue and pointed to switch_mm

        spin_lock_irqsave(&mm->context.lock, flags);

context.lock needs to be a raw_spinlock.
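
On PREEMPT_RT a spinlock_t is backed by a sleeping rt_mutex, so (roughly
sketched, not the literal -rt sources):

	spin_lock_irqsave(&mm->context.lock, flags);
		/* RT: behaves like rt_spin_lock() and may schedule();
		 * fatal inside switch_mm()/__schedule(), see trace   */

	raw_spin_lock_irqsave(&mm->context.lock, flags);
		/* RT: remains a true spinning, irqs-off acquire */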

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Allen Pais <allen.pais@oracle.com>
---
 arch/sparc/include/asm/mmu_64.h         |    2 +-
 arch/sparc/include/asm/mmu_context_64.h |    8 ++++----
 arch/sparc/kernel/smp_64.c              |    4 ++--
 arch/sparc/mm/init_64.c                 |    4 ++--
 arch/sparc/mm/tsb.c                     |   16 ++++++++--------
 5 files changed, 17 insertions(+), 17 deletions(-)

Comments

Kirill Tkhai Feb. 11, 2014, 9:13 p.m. UTC | #1
06.01.2014, 07:56, "Allen Pais" <allen.pais@oracle.com>:
> While attempting to get PREEMPT_RT working on sparc64 using
> linux-stable-rt version 3.10.22-rt19+, the kernel crashes
> with the following trace:
>
> [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
> [ 1487.027885] Call Trace:
> [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
> [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
> [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
> [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
> [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
> [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
> [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
> [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
> [ 1487.027922]  [0000000000000000]           (null)
>
> Thomas debugged this issue and pointed to switch_mm
>
>         spin_lock_irqsave(&mm->context.lock, flags);
>
> context.lock needs to be a raw_spinlock.
>
> Acked-by: David S. Miller <davem@davemloft.net>
> Signed-off-by: Allen Pais <allen.pais@oracle.com>
> ---
>  arch/sparc/include/asm/mmu_64.h         |    2 +-
>  arch/sparc/include/asm/mmu_context_64.h |    8 ++++----
>  arch/sparc/kernel/smp_64.c              |    4 ++--
>  arch/sparc/mm/init_64.c                 |    4 ++--
>  arch/sparc/mm/tsb.c                     |   16 ++++++++--------
>  5 files changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h
> index 76092c4..e945ddb 100644
> --- a/arch/sparc/include/asm/mmu_64.h
> +++ b/arch/sparc/include/asm/mmu_64.h
> @@ -90,7 +90,7 @@ struct tsb_config {
>  #endif
>
>  typedef struct {
> - spinlock_t lock;
> + raw_spinlock_t lock;
>          unsigned long sparc64_ctx_val;
>          unsigned long huge_pte_count;
>          struct page *pgtable_page;
> diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
> index 3d528f0..3a85624 100644
> --- a/arch/sparc/include/asm/mmu_context_64.h
> +++ b/arch/sparc/include/asm/mmu_context_64.h
> @@ -77,7 +77,7 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
>          if (unlikely(mm == &init_mm))
>                  return;
>
> - spin_lock_irqsave(&mm->context.lock, flags);
> + raw_spin_lock_irqsave(&mm->context.lock, flags);
>          ctx_valid = CTX_VALID(mm->context);
>          if (!ctx_valid)
>                  get_new_mmu_context(mm);
> @@ -125,7 +125,7 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
>                  __flush_tlb_mm(CTX_HWBITS(mm->context),
>                                 SECONDARY_CONTEXT);
>          }
> - spin_unlock_irqrestore(&mm->context.lock, flags);
> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>  }
>
>  #define deactivate_mm(tsk,mm) do { } while (0)
> @@ -136,7 +136,7 @@ static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm
>          unsigned long flags;
>          int cpu;
>
> - spin_lock_irqsave(&mm->context.lock, flags);
> + raw_spin_lock_irqsave(&mm->context.lock, flags);
>          if (!CTX_VALID(mm->context))
>                  get_new_mmu_context(mm);
>          cpu = smp_processor_id();
> @@ -146,7 +146,7 @@ static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm
>          load_secondary_context(mm);
>          __flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT);
>          tsb_context_switch(mm);
> - spin_unlock_irqrestore(&mm->context.lock, flags);
> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>  }
>
>  #endif /* !(__ASSEMBLY__) */
> diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
> index 77539ed..f42e1a7 100644
> --- a/arch/sparc/kernel/smp_64.c
> +++ b/arch/sparc/kernel/smp_64.c
> @@ -975,12 +975,12 @@ void __irq_entry smp_new_mmu_context_version_client(int irq, struct pt_regs *reg
>          if (unlikely(!mm || (mm == &init_mm)))
>                  return;
>
> - spin_lock_irqsave(&mm->context.lock, flags);
> + raw_spin_lock_irqsave(&mm->context.lock, flags);
>
>          if (unlikely(!CTX_VALID(mm->context)))
>                  get_new_mmu_context(mm);
>
> - spin_unlock_irqrestore(&mm->context.lock, flags);
> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>
>          load_secondary_context(mm);
>          __flush_tlb_mm(CTX_HWBITS(mm->context),
> diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
> index 04fd55a..bd5253d 100644
> --- a/arch/sparc/mm/init_64.c
> +++ b/arch/sparc/mm/init_64.c
> @@ -350,7 +350,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
>
>          mm = vma->vm_mm;
>
> - spin_lock_irqsave(&mm->context.lock, flags);
> + raw_spin_lock_irqsave(&mm->context.lock, flags);
>
>  #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
>          if (mm->context.huge_pte_count && is_hugetlb_pte(pte))
> @@ -361,7 +361,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
>                  __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
>                                          address, pte_val(pte));
>
> - spin_unlock_irqrestore(&mm->context.lock, flags);
> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>  }

We should also do the same in update_mmu_cache_pmd().

>
>  void flush_dcache_page(struct page *page)
> diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
> index 2cc3bce..d84d4ea 100644
> --- a/arch/sparc/mm/tsb.c
> +++ b/arch/sparc/mm/tsb.c
> @@ -73,7 +73,7 @@ void flush_tsb_user(struct tlb_batch *tb)
>          struct mm_struct *mm = tb->mm;
>          unsigned long nentries, base, flags;
>
> - spin_lock_irqsave(&mm->context.lock, flags);
> + raw_spin_lock_irqsave(&mm->context.lock, flags);
>
>          base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
>          nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
> @@ -90,14 +90,14 @@ void flush_tsb_user(struct tlb_batch *tb)
>                  __flush_tsb_one(tb, HPAGE_SHIFT, base, nentries);
>          }
>  #endif
> - spin_unlock_irqrestore(&mm->context.lock, flags);
> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>  }
>
>  void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
>  {
>          unsigned long nentries, base, flags;
>
> - spin_lock_irqsave(&mm->context.lock, flags);
> + raw_spin_lock_irqsave(&mm->context.lock, flags);
>
>          base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
>          nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
> @@ -114,7 +114,7 @@ void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
>                  __flush_tsb_one_entry(base, vaddr, HPAGE_SHIFT, nentries);
>          }
>  #endif
> - spin_unlock_irqrestore(&mm->context.lock, flags);
> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>  }
>
>  #define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K
> @@ -392,7 +392,7 @@ retry_tsb_alloc:
>           * the lock and ask all other cpus running this address space
>           * to run tsb_context_switch() to see the new TSB table.
>           */
> - spin_lock_irqsave(&mm->context.lock, flags);
> + raw_spin_lock_irqsave(&mm->context.lock, flags);
>
>          old_tsb = mm->context.tsb_block[tsb_index].tsb;
>          old_cache_index =
> @@ -407,7 +407,7 @@ retry_tsb_alloc:
>           */
>          if (unlikely(old_tsb &&
>                       (rss < mm->context.tsb_block[tsb_index].tsb_rss_limit))) {
> - spin_unlock_irqrestore(&mm->context.lock, flags);
> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>
>                  kmem_cache_free(tsb_caches[new_cache_index], new_tsb);
>                  return;
> @@ -433,7 +433,7 @@ retry_tsb_alloc:
>          mm->context.tsb_block[tsb_index].tsb = new_tsb;
>          setup_tsb_params(mm, tsb_index, new_size);
>
> - spin_unlock_irqrestore(&mm->context.lock, flags);
> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>
>          /* If old_tsb is NULL, we're being invoked for the first time
>           * from init_new_context().
> @@ -459,7 +459,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>  #endif
>          unsigned int i;
>
> - spin_lock_init(&mm->context.lock);
> + raw_spin_lock_init(&mm->context.lock);
>
>          mm->context.sparc64_ctx_val = 0UL;
>
> --
> 1.7.10.4
>
Allen Feb. 12, 2014, 7:31 a.m. UTC | #2
On Wednesday 12 February 2014 02:43 AM, Kirill Tkhai wrote:
> 
> 
> 06.01.2014, 07:56, "Allen Pais" <allen.pais@oracle.com>:
>> While attempting to get PREEMPT_RT working on sparc64 using
>> linux-stable-rt version 3.10.22-rt19+, the kernel crashes
>> with the following trace:
>>
>> [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>> [ 1487.027885] Call Trace:
>> [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>> [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>> [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>> [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>> [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>> [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>> [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>> [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>> [ 1487.027922]  [0000000000000000]           (null)
>> - spin_unlock_irqrestore(&mm->context.lock, flags);
>> + raw_spin_unlock_irqrestore(&mm->context.lock, flags);
>>  }
> 
> We should also do the same in update_mmu_cache_pmd().
> 

I have already done this; I should have updated the patch. The issue still
persists, though.

- Allen
Allen Feb. 12, 2014, 7:48 a.m. UTC | #3
On Wednesday 12 February 2014 02:43 AM, Kirill Tkhai wrote:
> 
> 
> 06.01.2014, 07:56, "Allen Pais" <allen.pais@oracle.com>:
>> While attempting to get PREEMPT_RT working on sparc64 using
>> linux-stable-rt version 3.10.22-rt19+, the kernel crashes
>> with the following trace:
>>
>> [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>> [ 1487.027885] Call Trace:
>> [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>> [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>> [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>> [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>> [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>> [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>> [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>> [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>> [ 1487.027922]  [0000000000000000]           (null)
>>

Now, consistently I've been getting sun4v_data_access_exception.
Here's the trace:

[ 4673.360121] sun4v_data_access_exception: ADDR[0000080000000000] CTX[0000] TYPE[0004], going.
[ 4673.360124]               \|/ ____ \|/
[ 4673.360124]               "@'/ .. \`@"
[ 4673.360124]               /_| \__/ |_\
[ 4673.360124]                  \__U_/
[ 4673.360128] hackbench(4183): Dax [#1]
[ 4673.360137] CPU: 5 PID: 4183 Comm: hackbench Tainted: G        W    3.10.24-rt22+ #12
[ 4673.360141] task: fffff80f9c793840 ti: fffff80f9b270000 task.ti: fffff80f9b270000
[ 4673.360146] TSTATE: 0000004411e01606 TPC: 0000000000407b64 TNPC: 0000000000407b68 Y: 00000000    Tainted: G        W   
[ 4673.360157] TPC: <tsb_flush+0x4/0x40>
[ 4673.360160] g0: fffff80f9c7c54b8 g1: 0000000000000001 g2: 0000000000008000 g3: 0000000000000000
[ 4673.360163] g4: fffff80f9c793840 g5: fffff80fcfc9c000 g6: fffff80f9b270000 g7: 0000000000000000
[ 4673.360167] o0: 0000080000000130 o1: 000003ffffe00400 o2: 0000000000878e48 o3: 0000000000000000
[ 4673.360170] o4: 0000000000000002 o5: 0000000000000000 sp: fffff80f9b272ec1 ret_pc: 00000000004520d0
[ 4673.360177] RPC: <flush_tsb_user+0x70/0x120>
[ 4673.360180] l0: 0000000000000001 l1: fffff80fd0800870 l2: 0000080000000000 l3: 00000000000001ff
[ 4673.360183] l4: fffff80f9852ea00 l5: fffff80f9852ee10 l6: 0000000000a87000 l7: 0000000000000000
[ 4673.360185] i0: fffff80fd0800868 i1: 0000000000000000 i2: 0000000000000000 i3: 0000000000000000
[ 4673.360187] i4: 0000000000000002 i5: 0000000000000030 i6: fffff80f9b272f71 i7: 00000000004515a8
[ 4673.360192] I7: <flush_tlb_pending+0x68/0xe0>
[ 4673.360193] Call Trace:
[ 4673.360198]  [00000000004515a8] flush_tlb_pending+0x68/0xe0
[ 4673.360203]  [000000000045185c] arch_leave_lazy_mmu_mode+0x3c/0x60
[ 4673.360210]  [000000000052e520] unmap_single_vma+0x400/0x6c0
[ 4673.360213]  [000000000052e808] unmap_vmas+0x28/0x60
[ 4673.360220]  [0000000000530cc8] exit_mmap+0x88/0x160
[ 4673.360226]  [000000000045e0d4] mmput+0x34/0xe0
[ 4673.360236]  [00000000004669fc] do_exit+0x1fc/0xa40
[ 4673.360241]  [0000000000467270] do_group_exit+0x30/0xe0
[ 4673.360245]  [000000000046733c] SyS_exit_group+0x1c/0x40
[ 4673.360256]  [0000000000406234] linux_sparc_syscall+0x34/0x44
[ 4673.360260] Caller[00000000004515a8]: flush_tlb_pending+0x68/0xe0
[ 4673.360264] Caller[000000000045185c]: arch_leave_lazy_mmu_mode+0x3c/0x60
[ 4673.360267] Caller[000000000052e520]: unmap_single_vma+0x400/0x6c0
[ 4673.360270] Caller[000000000052e808]: unmap_vmas+0x28/0x60
[ 4673.360274] Caller[0000000000530cc8]: exit_mmap+0x88/0x160
[ 4673.360277] Caller[000000000045e0d4]: mmput+0x34/0xe0
[ 4673.360280] Caller[00000000004669fc]: do_exit+0x1fc/0xa40
[ 4673.360284] Caller[0000000000467270]: do_group_exit+0x30/0xe0
[ 4673.360287] Caller[000000000046733c]: SyS_exit_group+0x1c/0x40
[ 4673.360291] Caller[0000000000406234]: linux_sparc_syscall+0x34/0x44



- Allen
Kirill Tkhai Feb. 12, 2014, 8:33 a.m. UTC | #4
12.02.2014, 11:48, "Allen Pais" <allen.pais@oracle.com>:

>  On Wednesday 12 February 2014 02:43 AM, Kirill Tkhai wrote:
>>   06.01.2014, 07:56, "Allen Pais" <allen.pais@oracle.com>:
>>>   While attempting to get PREEMPT_RT working on sparc64 using
>>>   linux-stable-rt version 3.10.22-rt19+, the kernel crashes
>>>   with the following trace:
>>>
>>>   [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>>>   [ 1487.027885] Call Trace:
>>>   [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>>>   [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>>>   [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>>>   [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>>>   [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>>>   [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>>>   [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>>>   [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>>>   [ 1487.027922]  [0000000000000000]           (null)
>  Now, consistently I've been getting sun4v_data_access_exception.
>  Here's the trace:
>  [ 4673.360121] sun4v_data_access_exception: ADDR[0000080000000000] CTX[0000] TYPE[0004], going.

I've never dived into sparc's tlb code before, but it seems I'm starting to understand it now.

arch_enter_lazy_mmu_mode() makes delayed tlb flushing possible. In a !RT kernel
you collect flush requests before you actually flush all of them.

In RT you collect them too, but you can be preempted at any moment.
So you may switch to another process with an unflushed tlb, which is very bad.

Try not to set tb->active = 1; in arch_enter_lazy_mmu_mode(). Set it to zero instead.
We will see whether this robust fix helps.
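
For reference, a sketch of that experiment against the 3.10-era
arch/sparc/mm/tlb.c (from memory; the exact body in your tree may differ):

	void arch_enter_lazy_mmu_mode(void)
	{
		struct tlb_batch *tb = &get_cpu_var(tlb_batch);

		/* experiment: was tb->active = 1; -- leave batching off so
		 * every flush goes out immediately instead of being queued
		 * per-cpu while the task can still be preempted
		 */
		tb->active = 0;
	}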


>  [ 4673.360124]               \|/ ____ \|/
>  [ 4673.360124]               "@'/ .. \`@"
>  [ 4673.360124]               /_| \__/ |_\
>  [ 4673.360124]                  \__U_/
>  [ 4673.360128] hackbench(4183): Dax [#1]
>  [ 4673.360137] CPU: 5 PID: 4183 Comm: hackbench Tainted: G        W    3.10.24-rt22+ #12
>  [ 4673.360141] task: fffff80f9c793840 ti: fffff80f9b270000 task.ti: fffff80f9b270000
>  [ 4673.360146] TSTATE: 0000004411e01606 TPC: 0000000000407b64 TNPC: 0000000000407b68 Y: 00000000    Tainted: G        W
>  [ 4673.360157] TPC: <tsb_flush+0x4/0x40>
>  [ 4673.360160] g0: fffff80f9c7c54b8 g1: 0000000000000001 g2: 0000000000008000 g3: 0000000000000000
>  [ 4673.360163] g4: fffff80f9c793840 g5: fffff80fcfc9c000 g6: fffff80f9b270000 g7: 0000000000000000
>  [ 4673.360167] o0: 0000080000000130 o1: 000003ffffe00400 o2: 0000000000878e48 o3: 0000000000000000
>  [ 4673.360170] o4: 0000000000000002 o5: 0000000000000000 sp: fffff80f9b272ec1 ret_pc: 00000000004520d0
>  [ 4673.360177] RPC: <flush_tsb_user+0x70/0x120>
>  [ 4673.360180] l0: 0000000000000001 l1: fffff80fd0800870 l2: 0000080000000000 l3: 00000000000001ff
>  [ 4673.360183] l4: fffff80f9852ea00 l5: fffff80f9852ee10 l6: 0000000000a87000 l7: 0000000000000000
>  [ 4673.360185] i0: fffff80fd0800868 i1: 0000000000000000 i2: 0000000000000000 i3: 0000000000000000
>  [ 4673.360187] i4: 0000000000000002 i5: 0000000000000030 i6: fffff80f9b272f71 i7: 00000000004515a8
>  [ 4673.360192] I7: <flush_tlb_pending+0x68/0xe0>
>  [ 4673.360193] Call Trace:
>  [ 4673.360198]  [00000000004515a8] flush_tlb_pending+0x68/0xe0
>  [ 4673.360203]  [000000000045185c] arch_leave_lazy_mmu_mode+0x3c/0x60
>  [ 4673.360210]  [000000000052e520] unmap_single_vma+0x400/0x6c0
>  [ 4673.360213]  [000000000052e808] unmap_vmas+0x28/0x60
>  [ 4673.360220]  [0000000000530cc8] exit_mmap+0x88/0x160
>  [ 4673.360226]  [000000000045e0d4] mmput+0x34/0xe0
>  [ 4673.360236]  [00000000004669fc] do_exit+0x1fc/0xa40
>  [ 4673.360241]  [0000000000467270] do_group_exit+0x30/0xe0
>  [ 4673.360245]  [000000000046733c] SyS_exit_group+0x1c/0x40
>  [ 4673.360256]  [0000000000406234] linux_sparc_syscall+0x34/0x44
>  [ 4673.360260] Caller[00000000004515a8]: flush_tlb_pending+0x68/0xe0
>  [ 4673.360264] Caller[000000000045185c]: arch_leave_lazy_mmu_mode+0x3c/0x60
>  [ 4673.360267] Caller[000000000052e520]: unmap_single_vma+0x400/0x6c0
>  [ 4673.360270] Caller[000000000052e808]: unmap_vmas+0x28/0x60
>  [ 4673.360274] Caller[0000000000530cc8]: exit_mmap+0x88/0x160
>  [ 4673.360277] Caller[000000000045e0d4]: mmput+0x34/0xe0
>  [ 4673.360280] Caller[00000000004669fc]: do_exit+0x1fc/0xa40
>  [ 4673.360284] Caller[0000000000467270]: do_group_exit+0x30/0xe0
>  [ 4673.360287] Caller[000000000046733c]: SyS_exit_group+0x1c/0x40
>  [ 4673.360291] Caller[0000000000406234]: linux_sparc_syscall+0x34/0x44
>
>  - Allen
Allen Feb. 12, 2014, 11:28 a.m. UTC | #5
>>>>   [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>>>>   [ 1487.027885] Call Trace:
>>>>   [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>>>>   [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>>>>   [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>>>>   [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>>>>   [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>>>>   [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>>>>   [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>>>>   [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>>>>   [ 1487.027922]  [0000000000000000]           (null)
>>  Now, consistently I've been getting sun4v_data_access_exception.
>>  Here's the trace:
>>  [ 4673.360121] sun4v_data_access_exception: ADDR[0000080000000000] CTX[0000] TYPE[0004], going.
> 
> I've never dived into sparc's tlb code before, but it seems I'm starting to understand it now.
>
> arch_enter_lazy_mmu_mode() makes delayed tlb flushing possible. In a !RT kernel
> you collect flush requests before you actually flush all of them.
>
> In RT you collect them too, but you can be preempted at any moment.
> So you may switch to another process with an unflushed tlb, which is very bad.
>
> Try not to set tb->active = 1; in arch_enter_lazy_mmu_mode(). Set it to zero instead.
> We will see whether this robust fix helps.
> 

Kirill, well, the change works. So far the machine is up with no stalls or crashes
under hackbench. I'll run it for a longer period and check.

Thanks,

Allen

 

Kirill Tkhai Feb. 12, 2014, 11:43 a.m. UTC | #6
12.02.2014, 15:29, "Allen Pais" <allen.pais@oracle.com>:
>>>>>    [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>>>>>    [ 1487.027885] Call Trace:
>>>>>    [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>>>>>    [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>>>>>    [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>>>>>    [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>>>>>    [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>>>>>    [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>>>>>    [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>>>>>    [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>>>>>    [ 1487.027922]  [0000000000000000]           (null)
>>>   Now, consistently I've been getting sun4v_data_access_exception.
>>>   Here's the trace:
>>>   [ 4673.360121] sun4v_data_access_exception: ADDR[0000080000000000] CTX[0000] TYPE[0004], going.
>>  I've never dived into sparc's tlb code before, but it seems I'm starting to understand it now.
>>
>>  arch_enter_lazy_mmu_mode() makes delayed tlb flushing possible. In a !RT kernel
>>  you collect flush requests before you actually flush all of them.
>>
>>  In RT you collect them too, but you can be preempted at any moment.
>>  So you may switch to another process with an unflushed tlb, which is very bad.
>>
>>  Try not to set tb->active = 1; in arch_enter_lazy_mmu_mode(). Set it to zero instead.
>>  We will see whether this robust fix helps.
>
> Kirill, well, the change works. So far the machine is up with no stalls or crashes
> under hackbench. I'll run it for a longer period and check.

Ok, good.

But I don't know whether this is the best fix. Maybe we have to implement another
optimization for RT.

For example, collect only batches which do not require an smp call function. Or
was the main goal of lazy tlb to prevent smp calls?! It would be good to discover this.

The other serious thing is to find out whether __set_pte_at() executes in
preemption-disabled context on a !RT kernel, because the place is interesting.

If yes, we have to do the same for RT. If not, then no.
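
One hypothetical way to check that would be a debug assertion at the top of
__set_pte_at() (a throwaway check, not part of any posted patch):

	/* hypothetical debug check: warns if we ever get here preemptible */
	WARN_ON_ONCE(preemptible());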

Kirill

>
> Thanks,
>
> Allen
>
Allen Feb. 12, 2014, 12:14 p.m. UTC | #7
On Wednesday 12 February 2014 05:13 PM, Kirill Tkhai wrote:
> 12.02.2014, 15:29, "Allen Pais" <allen.pais@oracle.com>:
>>>>>>    [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>>>>>>    [ 1487.027885] Call Trace:
>>>>>>    [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>>>>>>    [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>>>>>>    [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>>>>>>    [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>>>>>>    [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>>>>>>    [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>>>>>>    [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>>>>>>    [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>>>>>>    [ 1487.027922]  [0000000000000000]           (null)

>>
>> Kirill, well, the change works. So far the machine is up with no stalls or crashes
>> under hackbench. I'll run it for a longer period and check.
> 
> Ok, good.
> 
> But I don't know whether this is the best fix. Maybe we have to implement another
> optimization for RT.

No; unfortunately, the system hit a stall on about 8 CPUs.
CPU: 31 PID: 28675 Comm: hackbench Tainted: G      D W    3.10.24-rt22+ #13
[ 5725.097645] task: fffff80f929da8c0 ti: fffff80f8a4fc000 task.ti: fffff80f8a4fc000
[ 5725.097649] TSTATE: 0000000011001604 TPC: 0000000000671e54 TNPC: 0000000000671e58 Y: 00000000    Tainted: G      D W   
TPC: <do_raw_spin_lock+0xb4/0x120>
[ 5725.097657] g0: 0000000000671e4c g1: 00000000000000ff g2: 0000000002625010 g3: 0000000000000000
[ 5725.097661] g4: fffff80f929da8c0 g5: fffff80fd649c000 g6: fffff80f8a4fc000 g7: 0000000000000000
[ 5725.097664] o0: 0000000000000001 o1: 00000000009dfc00 o2: 0000000000000000 o3: 0000000000000000
[ 5725.097667] o4: 0000000000000002 o5: 0000000000000000 sp: fffff80f8a4fee21 ret_pc: 0000000000671e58
[ 5725.097671] RPC: <do_raw_spin_lock+0xb8/0x120>
[ 5725.097675] l0: 000000000933b401 l1: 000000003b99d190 l2: 0000000000e25c00 l3: 0000000000000000
[ 5725.097678] l4: 0000000000000000 l5: 0000000000000000 l6: 0000000000000000 l7: fffff801001254c8
[ 5725.097682] i0: fffff80f89a367c8 i1: 0000000000878be4 i2: 0000000000000000 i3: 0000000000000000
[ 5725.097685] i4: 0000000000000002 i5: 0000000000000000 i6: fffff80f8a4feed1 i7: 0000000000879b14
[ 5725.097690] I7: <_raw_spin_lock+0x54/0x80>
[ 5725.097692] Call Trace:
[ 5725.097697]  [0000000000879b14] _raw_spin_lock+0x54/0x80
[ 5725.097702]  [0000000000878be4] rt_spin_lock_slowlock+0x24/0x340
[ 5725.097707]  [00000000008790ac] rt_spin_lock+0xc/0x40
[ 5725.097712]  [00000000008610bc] unix_stream_sendmsg+0x15c/0x380
[ 5725.097717]  [00000000007ac114] sock_aio_write+0xf4/0x120
[ 5725.097722]  [000000000055891c] do_sync_write+0x5c/0xa0
[ 5725.097727]  [0000000000559e1c] vfs_write+0x15c/0x180
[ 5725.097732]  [0000000000559ef8] SyS_write+0x38/0x80
[ 5725.097738]  [0000000000406234] linux_sparc_syscall+0x34/0x44

This (above) on a few CPUs, and this (below) on the others:

BUG: soft lockup - CPU#13 stuck for 22s! [hackbench:28701]
[ 5728.378345] Modules linked in: binfmt_misc usb_storage ehci_pci ehci_hcd sg n2_rng rng_core ext4 jbd2 crc16 sr_mod mpt2sas scsi_transport_sas raid_class sunvnet sunvdc dm_mirror dm_region_hash dm_log dm_mod be2iscsi iscsi_boot_sysfs bnx2i cnic uio ipv6 cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio libiscsi_tcp libiscsi scsi_transport_iscsi
[ 5728.378347] irq event stamp: 0
[ 5728.378350] hardirqs last  enabled at (0): [<          (null)>]           (null)
[ 5728.378356] hardirqs last disabled at (0): [<000000000045eb38>] copy_process+0x418/0x1080
[ 5728.378361] softirqs last  enabled at (0): [<000000000045eb38>] copy_process+0x418/0x1080
[ 5728.378364] softirqs last disabled at (0): [<          (null)>]           (null)
[ 5728.378368] CPU: 13 PID: 28701 Comm: hackbench Tainted: G      D W    3.10.24-rt22+ #13
[ 5728.378371] task: fffff80f90efbb80 ti: fffff80f925ac000 task.ti: fffff80f925ac000
[ 5728.378374] TSTATE: 0000000011001604 TPC: 00000000004668b4 TNPC: 00000000004668b8 Y: 00000000    Tainted: G      D W   
[ 5728.378378] TPC: <do_exit+0xb4/0xa40>
[ 5728.378380] g0: 0000000000003f40 g1: 00000000000000ff g2: fffff80f90efbeb0 g3: 0000000000000002
[ 5728.378383] g4: fffff80f90efbb80 g5: fffff80fd1c9c000 g6: fffff80f925ac000 g7: 0000000000000000
[ 5728.378385] o0: fffff80f90efbb80 o1: fffff80f925ac400 o2: 000000000087a654 o3: 0000000000000000
[ 5728.378387] o4: 0000000000000000 o5: fffff80f925aff40 sp: fffff80fff98f671 ret_pc: 000000000046689c
[ 5728.378390] RPC: <do_exit+0x9c/0xa40>
[ 5728.378393] l0: fffff80f90efbb80 l1: 0000004480001603 l2: 000000000087a650 l3: 0000000000000400
[ 5728.378395] l4: 0000000000000000 l5: 0000000000000003 l6: 0000000000000000 l7: 0000000000000008
[ 5728.378397] i0: 000000000000000a i1: 000000000000000d i2: 000000000042f608 i3: 0000000000000000
[ 5728.378400] i4: 000000000000004f i5: 0000000000000002 i6: fffff80fff98f741 i7: 000000000087a650
[ 5728.378405] I7: <perfctr_irq+0x3d0/0x420>
[ 5728.378406] Call Trace:
[ 5728.378410]  [000000000087a650] perfctr_irq+0x3d0/0x420
[ 5728.378415]  [00000000004209f4] tl0_irq15+0x14/0x20
[ 5728.378419]  [000000000042f608] stick_get_tick+0x8/0x20
[ 5728.378422]  [000000000042fa24] __delay+0x24/0x60
[ 5728.378426]  [0000000000671e58] do_raw_spin_lock+0xb8/0x120
[ 5728.378430]  [0000000000879b14] _raw_spin_lock+0x54/0x80
[ 5728.378435]  [00000000004a1978] load_balance+0x538/0x860
[ 5728.378438]  [00000000004a2154] idle_balance+0x134/0x1c0
[ 5728.378442]  [0000000000877d54] switch_to_pc+0x1f4/0x2c0
[ 5728.378445]  [0000000000877ec4] schedule+0x24/0xc0
[ 5728.378449]  [0000000000876860] schedule_timeout+0x1c0/0x2a0
[ 5728.378452]  [0000000000860ac0] unix_stream_recvmsg+0x240/0x6e0
[ 5728.378456]  [00000000007ac23c] sock_aio_read+0xfc/0x120
[ 5728.378460]  [0000000000558adc] do_sync_read+0x5c/0xa0
[ 5728.378464]  [000000000055a04c] vfs_read+0x10c/0x120
[ 5728.378467]  [000000000055a118] SyS_read+0x38/0x80

> 
> For example, collect only batches which do not require an smp call function. Or
> was the main goal of lazy tlb to prevent smp calls?! It would be good to discover this.
> 
> The other serious thing is to find out whether __set_pte_at() executes in
> preemption-disabled context on a !RT kernel, because the place is interesting.
> 
> If yes, we have to do the same for RT. If not, then no.

I am not convinced that I've covered all of the tlb/smp code. I guess I'll need to dig more.

Thanks,

Allen
Kirill Tkhai Feb. 12, 2014, 12:45 p.m. UTC | #8
12.02.2014, 16:15, "Allen Pais" <allen.pais@oracle.com>:
> On Wednesday 12 February 2014 05:13 PM, Kirill Tkhai wrote:
>
>>  12.02.2014, 15:29, "Allen Pais" <allen.pais@oracle.com>:
>>>>>>>     [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>>>>>>>     [ 1487.027885] Call Trace:
>>>>>>>     [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>>>>>>>     [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>>>>>>>     [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>>>>>>>     [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>>>>>>>     [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>>>>>>>     [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>>>>>>>     [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>>>>>>>     [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>>>>>>>     [ 1487.027922]  [0000000000000000]           (null)
>>>  Kirill, well, the change works. So far the machine is up with no stalls or crashes
>>>  under hackbench. I'll run it for a longer period and check.
>>  Ok, good.
>>
>>  But I don't know whether this is the best fix. Maybe we have to implement another
>>  optimization for RT.
>
> No; unfortunately, the system hit a stall on about 8 CPUs.
> CPU: 31 PID: 28675 Comm: hackbench Tainted: G      D W    3.10.24-rt22+ #13
> [ 5725.097645] task: fffff80f929da8c0 ti: fffff80f8a4fc000 task.ti: fffff80f8a4fc000
> [ 5725.097649] TSTATE: 0000000011001604 TPC: 0000000000671e54 TNPC: 0000000000671e58 Y: 00000000    Tainted: G      D W
> TPC: <do_raw_spin_lock+0xb4/0x120>
> [ 5725.097657] g0: 0000000000671e4c g1: 00000000000000ff g2: 0000000002625010 g3: 0000000000000000
> [ 5725.097661] g4: fffff80f929da8c0 g5: fffff80fd649c000 g6: fffff80f8a4fc000 g7: 0000000000000000
> [ 5725.097664] o0: 0000000000000001 o1: 00000000009dfc00 o2: 0000000000000000 o3: 0000000000000000
> [ 5725.097667] o4: 0000000000000002 o5: 0000000000000000 sp: fffff80f8a4fee21 ret_pc: 0000000000671e58
> [ 5725.097671] RPC: <do_raw_spin_lock+0xb8/0x120>
> [ 5725.097675] l0: 000000000933b401 l1: 000000003b99d190 l2: 0000000000e25c00 l3: 0000000000000000
> [ 5725.097678] l4: 0000000000000000 l5: 0000000000000000 l6: 0000000000000000 l7: fffff801001254c8
> [ 5725.097682] i0: fffff80f89a367c8 i1: 0000000000878be4 i2: 0000000000000000 i3: 0000000000000000
> [ 5725.097685] i4: 0000000000000002 i5: 0000000000000000 i6: fffff80f8a4feed1 i7: 0000000000879b14
> [ 5725.097690] I7: <_raw_spin_lock+0x54/0x80>
> [ 5725.097692] Call Trace:
> [ 5725.097697]  [0000000000879b14] _raw_spin_lock+0x54/0x80
> [ 5725.097702]  [0000000000878be4] rt_spin_lock_slowlock+0x24/0x340
> [ 5725.097707]  [00000000008790ac] rt_spin_lock+0xc/0x40
> [ 5725.097712]  [00000000008610bc] unix_stream_sendmsg+0x15c/0x380
> [ 5725.097717]  [00000000007ac114] sock_aio_write+0xf4/0x120
> [ 5725.097722]  [000000000055891c] do_sync_write+0x5c/0xa0
> [ 5725.097727]  [0000000000559e1c] vfs_write+0x15c/0x180
> [ 5725.097732]  [0000000000559ef8] SyS_write+0x38/0x80
> [ 5725.097738]  [0000000000406234] linux_sparc_syscall+0x34/0x44

No ideas right now.

> This(above) on a few cpu's and this(below) on the other
>
> BUG: soft lockup - CPU#13 stuck for 22s! [hackbench:28701]
> [ 5728.378345] Modules linked in: binfmt_misc usb_storage ehci_pci ehci_hcd sg n2_rng rng_core ext4 jbd2 crc16 sr_mod mpt2sas scsi_transport_sas raid_class sunvnet sunvdc dm_mirror dm_region_hash dm_log dm_mod be2iscsi iscsi_boot_sysfs bnx2i cnic uio ipv6 cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio libiscsi_tcp libiscsi scsi_transport_iscsi
> [ 5728.378347] irq event stamp: 0
> [ 5728.378350] hardirqs last  enabled at (0): [<          (null)>]           (null)
> [ 5728.378356] hardirqs last disabled at (0): [<000000000045eb38>] copy_process+0x418/0x1080
> [ 5728.378361] softirqs last  enabled at (0): [<000000000045eb38>] copy_process+0x418/0x1080
> [ 5728.378364] softirqs last disabled at (0): [<          (null)>]           (null)
> [ 5728.378368] CPU: 13 PID: 28701 Comm: hackbench Tainted: G      D W    3.10.24-rt22+ #13
> [ 5728.378371] task: fffff80f90efbb80 ti: fffff80f925ac000 task.ti: fffff80f925ac000
> [ 5728.378374] TSTATE: 0000000011001604 TPC: 00000000004668b4 TNPC: 00000000004668b8 Y: 00000000    Tainted: G      D W
> [ 5728.378378] TPC: <do_exit+0xb4/0xa40>
> [ 5728.378380] g0: 0000000000003f40 g1: 00000000000000ff g2: fffff80f90efbeb0 g3: 0000000000000002
> [ 5728.378383] g4: fffff80f90efbb80 g5: fffff80fd1c9c000 g6: fffff80f925ac000 g7: 0000000000000000
> [ 5728.378385] o0: fffff80f90efbb80 o1: fffff80f925ac400 o2: 000000000087a654 o3: 0000000000000000
> [ 5728.378387] o4: 0000000000000000 o5: fffff80f925aff40 sp: fffff80fff98f671 ret_pc: 000000000046689c
> [ 5728.378390] RPC: <do_exit+0x9c/0xa40>
> [ 5728.378393] l0: fffff80f90efbb80 l1: 0000004480001603 l2: 000000000087a650 l3: 0000000000000400
> [ 5728.378395] l4: 0000000000000000 l5: 0000000000000003 l6: 0000000000000000 l7: 0000000000000008
> [ 5728.378397] i0: 000000000000000a i1: 000000000000000d i2: 000000000042f608 i3: 0000000000000000
> [ 5728.378400] i4: 000000000000004f i5: 0000000000000002 i6: fffff80fff98f741 i7: 000000000087a650
> [ 5728.378405] I7: <perfctr_irq+0x3d0/0x420>
> [ 5728.378406] Call Trace:
> [ 5728.378410]  [000000000087a650] perfctr_irq+0x3d0/0x420
> [ 5728.378415]  [00000000004209f4] tl0_irq15+0x14/0x20
> [ 5728.378419]  [000000000042f608] stick_get_tick+0x8/0x20
> [ 5728.378422]  [000000000042fa24] __delay+0x24/0x60
> [ 5728.378426]  [0000000000671e58] do_raw_spin_lock+0xb8/0x120
> [ 5728.378430]  [0000000000879b14] _raw_spin_lock+0x54/0x80
> [ 5728.378435]  [00000000004a1978] load_balance+0x538/0x860
> [ 5728.378438]  [00000000004a2154] idle_balance+0x134/0x1c0
> [ 5728.378442]  [0000000000877d54] switch_to_pc+0x1f4/0x2c0
> [ 5728.378445]  [0000000000877ec4] schedule+0x24/0xc0
> [ 5728.378449]  [0000000000876860] schedule_timeout+0x1c0/0x2a0
> [ 5728.378452]  [0000000000860ac0] unix_stream_recvmsg+0x240/0x6e0
> [ 5728.378456]  [00000000007ac23c] sock_aio_read+0xfc/0x120
> [ 5728.378460]  [0000000000558adc] do_sync_read+0x5c/0xa0
> [ 5728.378464]  [000000000055a04c] vfs_read+0x10c/0x120
> [ 5728.378467]  [000000000055a118] SyS_read+0x38/0x80
>
>>  For example, collect only batches which do not require an smp call function. Or
>>  was the main goal of lazy tlb to prevent smp calls?! It would be good to discover this.
>>
>>  The other serious thing is to find out whether __set_pte_at() executes in
>>  preemption-disabled context on a !RT kernel, because the place is interesting.
>>
>>  If yes, we have to do the same for RT. If not, then no.
>
> I am not convinced that I've covered all of the tlb/smp code. I guess I'll need to dig more.

++ to all of the above. Maybe we have to add one more crutch... put preempt_disable() at the
beginning of __set_pte_at() and enable it at the end...
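
A sketch of that crutch (hypothetical; the elided body stands for the
existing PTE store and tlb batch logic in arch/sparc/include/asm/pgtable_64.h):

	static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
					pte_t *ptep, pte_t pte, int fullmm)
	{
		preempt_disable();	/* keep the per-cpu tlb_batch ours */
		/* ... existing PTE store and tlb_batch_add() calls ... */
		preempt_enable();
	}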


> Thanks,
>
> Allen
Allen Feb. 12, 2014, 1:05 p.m. UTC | #9
On Wednesday 12 February 2014 06:15 PM, Kirill Tkhai wrote:
> 
> 
> 12.02.2014, 16:15, "Allen Pais" <allen.pais@oracle.com>:
>> On Wednesday 12 February 2014 05:13 PM, Kirill Tkhai wrote:
>>
>>>  12.02.2014, 15:29, "Allen Pais" <allen.pais@oracle.com>:
>>>>>>>>     [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>>>>>>>>     [ 1487.027885] Call Trace:
>>>>>>>>     [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>>>>>>>>     [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>>>>>>>>     [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>>>>>>>>     [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>>>>>>>>     [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>>>>>>>>     [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>>>>>>>>     [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>>>>>>>>     [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>>>>>>>>     [ 1487.027922]  [0000000000000000]           (null)

>>
>> I am not convinced that I've covered all of the tlb/smp code. I guess I'll need to dig more.
> 
> ++ to all of the above. Maybe we have to add one more crutch... put preempt_disable() at the
> beginning of __set_pte_at() and enable it at the end...
> 
I'll look into it. Thanks again.

Allen

David Miller March 4, 2014, 7:55 p.m. UTC | #10
From: Kirill Tkhai <tkhai@yandex.ru>
Date: Wed, 12 Feb 2014 12:33:58 +0400

> 12.02.2014, 11:48, "Allen Pais" <allen.pais@oracle.com>:
> 
>>  On Wednesday 12 February 2014 02:43 AM, Kirill Tkhai wrote:
>>>   06.01.2014, 07:56, "Allen Pais" <allen.pais@oracle.com>:
>>>>   While attempting to get PREEMPT_RT working on sparc64 using
>>>>   linux-stable-rt version 3.10.22-rt19+, the kernel crashes
>>>>   with the following trace:
>>>>
>>>>   [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>>>>   [ 1487.027885] Call Trace:
>>>>   [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>>>>   [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>>>>   [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>>>>   [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>>>>   [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>>>>   [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>>>>   [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>>>>   [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>>>>   [ 1487.027922]  [0000000000000000]           (null)
>>  Now, consistently I've been getting sun4v_data_access_exception.
>>  Here's the trace:
>>  [ 4673.360121] sun4v_data_access_exception: ADDR[0000080000000000] CTX[0000] TYPE[0004], going.
> 
> I've never dived into sparc's tlb code before, but it seems I'm starting to understand it now.
>
> arch_enter_lazy_mmu_mode() makes delayed tlb flushing possible. In a !RT kernel
> you collect flush requests before you actually flush all of them.
>
> In RT you collect them too, but you can be preempted at any moment.
> So you may switch to another process with an unflushed tlb, which is very bad.
>
> Try not to set tb->active = 1; in arch_enter_lazy_mmu_mode(). Set it to zero instead.
> We will see whether this robust fix helps.

Sorry for coming into this discussion so late.

Indeed, the pending flushes are per-cpu and we must flush them out in the
event of a preemption.

PowerPC does the same exact thing with arch_enter_lazy_mmu_mode(), in
fact that's where I copied the logic from.  Does PowerPC not work with
-rt? :-)
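
For reference, ppc64 Book3S flushes the pending batch at context-switch
time -- roughly the following, recalled from 3.10's __switch_to() in
arch/powerpc/kernel/process.c (details may differ):

	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);

	if (batch->active) {
		current_thread_info()->local_flags |= _TLF_LAZY_MMU;
		if (batch->index)
			__flush_tlb_pending(batch);
		batch->active = 0;
	}
	/* ... _switch() runs; afterwards the incoming task re-arms its
	 * batch via the _TLF_LAZY_MMU flag ... */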
David Miller March 4, 2014, 7:59 p.m. UTC | #11
From: Kirill Tkhai <tkhai@yandex.ru>
Date: Wed, 12 Feb 2014 15:43:06 +0400

> For example, collect only batches which do not require an smp call function. Or
> was the main goal of lazy tlb to prevent smp calls?! It would be good to discover this.

The goal of the lazy tlb stuff is to only have one (potential)
synchronization point with other cpus.

Then we sweep away the TSB entries, after which any TLB miss must
enter the full fault path and synchronize with the current thread
doing the flush.

Then we kill the TLB entries, both local and remote.
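
In rough pseudocode, that sequence is (a sketch of the description above,
not the literal source):

	flush_tsb_user(tb);		/* 1: sweep the TSB entries; any TLB
					 *    miss now takes the full fault
					 *    path and synchronizes with the
					 *    flushing thread               */
	smp_flush_tlb_pending(mm, nr, vaddrs);
					/* 2: kill the TLB entries, local
					 *    and remote                    */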

The tsb_grow() code path is quite the animal, as any of you who have
read the comment above its implementation can attest :-) It took 6
months to get that code right in a non-RT context back when it was
originally written.
Kirill Tkhai March 4, 2014, 8:44 p.m. UTC | #12
On 04.03.2014 23:55, David Miller wrote:
> From: Kirill Tkhai <tkhai@yandex.ru>
> Date: Wed, 12 Feb 2014 12:33:58 +0400
> 
>> 12.02.2014, 11:48, "Allen Pais" <allen.pais@oracle.com>:
>>
>>>  On Wednesday 12 February 2014 02:43 AM, Kirill Tkhai wrote:
>>>>   06.01.2014, 07:56, "Allen Pais" <allen.pais@oracle.com>:
>>>>>   While attempting to get PREEMPT_RT working on sparc64 using
>>>>>   linux-stable-rt version 3.10.22-rt19+, the kernel crashes
>>>>>   with the following trace:
>>>>>
>>>>>   [ 1487.027884] I7: <rt_mutex_setprio+0x3c/0x2c0>
>>>>>   [ 1487.027885] Call Trace:
>>>>>   [ 1487.027887]  [00000000004967dc] rt_mutex_setprio+0x3c/0x2c0
>>>>>   [ 1487.027892]  [00000000004afe20] task_blocks_on_rt_mutex+0x180/0x200
>>>>>   [ 1487.027895]  [0000000000819114] rt_spin_lock_slowlock+0x94/0x300
>>>>>   [ 1487.027897]  [0000000000817ebc] __schedule+0x39c/0x53c
>>>>>   [ 1487.027899]  [00000000008185fc] schedule+0x1c/0xc0
>>>>>   [ 1487.027908]  [000000000048fff4] smpboot_thread_fn+0x154/0x2e0
>>>>>   [ 1487.027913]  [000000000048753c] kthread+0x7c/0xa0
>>>>>   [ 1487.027920]  [00000000004060c4] ret_from_syscall+0x1c/0x2c
>>>>>   [ 1487.027922]  [0000000000000000]           (null)
>>>  Now, consistently I've been getting sun4v_data_access_exception.
>>>  Here's the trace:
>>>  [ 4673.360121] sun4v_data_access_exception: ADDR[0000080000000000] CTX[0000] TYPE[0004], going.
>>
>> I've never dived into sparc's tlb code before, but it seems I'm starting to understand it now.
>>
>> arch_enter_lazy_mmu_mode() makes delayed tlb flushing possible. In a !RT kernel
>> you collect flush requests before you actually flush all of them.
>>
>> In RT you collect them too, but you can be preempted at any moment.
>> So you may switch to another process with an unflushed tlb, which is very bad.
>>
>> Try not to set tb->active = 1; in arch_enter_lazy_mmu_mode(). Set it to zero instead.
>> We will see whether this robust fix helps.
> 
> Sorry for coming into this discussion so late.
> 
> Indeed, the pending flushes are per-cpu and we must flush them out in the
> event of a preemption.
> 
> PowerPC does the same exact thing with arch_enter_lazy_mmu_mode(), in
> fact that's where I copied the logic from.  Does PowerPC not work with
> -rt? :-)
> 

It does not work, but we will :)
Sebastian Andrzej Siewior March 7, 2014, 2:29 p.m. UTC | #13
* David Miller | 2014-03-04 14:55:23 [-0500]:

>PowerPC does the same exact thing with arch_enter_lazy_mmu_mode(), in
>fact that's where I copied the logic from.  Does PowerPC not work with
>-rt? :-)

It is only PPC64 as far as I can see, and I have mostly 32-bit Book-E
hardware around. That preempt-lazy stuff also touches PPC64 bits, so I assume
PPC64 works.

Sebastian

Patch

diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h
index 76092c4..e945ddb 100644
--- a/arch/sparc/include/asm/mmu_64.h
+++ b/arch/sparc/include/asm/mmu_64.h
@@ -90,7 +90,7 @@  struct tsb_config {
 #endif
 
 typedef struct {
-	spinlock_t		lock;
+	raw_spinlock_t		lock;
 	unsigned long		sparc64_ctx_val;
 	unsigned long		huge_pte_count;
 	struct page		*pgtable_page;
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index 3d528f0..3a85624 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -77,7 +77,7 @@  static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 	if (unlikely(mm == &init_mm))
 		return;
 
-	spin_lock_irqsave(&mm->context.lock, flags);
+	raw_spin_lock_irqsave(&mm->context.lock, flags);
 	ctx_valid = CTX_VALID(mm->context);
 	if (!ctx_valid)
 		get_new_mmu_context(mm);
@@ -125,7 +125,7 @@  static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 		__flush_tlb_mm(CTX_HWBITS(mm->context),
 			       SECONDARY_CONTEXT);
 	}
-	spin_unlock_irqrestore(&mm->context.lock, flags);
+	raw_spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 #define deactivate_mm(tsk,mm)	do { } while (0)
@@ -136,7 +136,7 @@  static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm
 	unsigned long flags;
 	int cpu;
 
-	spin_lock_irqsave(&mm->context.lock, flags);
+	raw_spin_lock_irqsave(&mm->context.lock, flags);
 	if (!CTX_VALID(mm->context))
 		get_new_mmu_context(mm);
 	cpu = smp_processor_id();
@@ -146,7 +146,7 @@  static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm
 	load_secondary_context(mm);
 	__flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT);
 	tsb_context_switch(mm);
-	spin_unlock_irqrestore(&mm->context.lock, flags);
+	raw_spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 #endif /* !(__ASSEMBLY__) */
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 77539ed..f42e1a7 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -975,12 +975,12 @@  void __irq_entry smp_new_mmu_context_version_client(int irq, struct pt_regs *reg
 	if (unlikely(!mm || (mm == &init_mm)))
 		return;
 
-	spin_lock_irqsave(&mm->context.lock, flags);
+	raw_spin_lock_irqsave(&mm->context.lock, flags);
 
 	if (unlikely(!CTX_VALID(mm->context)))
 		get_new_mmu_context(mm);
 
-	spin_unlock_irqrestore(&mm->context.lock, flags);
+	raw_spin_unlock_irqrestore(&mm->context.lock, flags);
 
 	load_secondary_context(mm);
 	__flush_tlb_mm(CTX_HWBITS(mm->context),
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 04fd55a..bd5253d 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -350,7 +350,7 @@  void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
 
 	mm = vma->vm_mm;
 
-	spin_lock_irqsave(&mm->context.lock, flags);
+	raw_spin_lock_irqsave(&mm->context.lock, flags);
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
 	if (mm->context.huge_pte_count && is_hugetlb_pte(pte))
@@ -361,7 +361,7 @@  void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
 		__update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
 					address, pte_val(pte));
 
-	spin_unlock_irqrestore(&mm->context.lock, flags);
+	raw_spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 void flush_dcache_page(struct page *page)
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index 2cc3bce..d84d4ea 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -73,7 +73,7 @@  void flush_tsb_user(struct tlb_batch *tb)
 	struct mm_struct *mm = tb->mm;
 	unsigned long nentries, base, flags;
 
-	spin_lock_irqsave(&mm->context.lock, flags);
+	raw_spin_lock_irqsave(&mm->context.lock, flags);
 
 	base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
 	nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
@@ -90,14 +90,14 @@  void flush_tsb_user(struct tlb_batch *tb)
 		__flush_tsb_one(tb, HPAGE_SHIFT, base, nentries);
 	}
 #endif
-	spin_unlock_irqrestore(&mm->context.lock, flags);
+	raw_spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
 {
 	unsigned long nentries, base, flags;
 
-	spin_lock_irqsave(&mm->context.lock, flags);
+	raw_spin_lock_irqsave(&mm->context.lock, flags);
 
 	base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
 	nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
@@ -114,7 +114,7 @@  void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
 		__flush_tsb_one_entry(base, vaddr, HPAGE_SHIFT, nentries);
 	}
 #endif
-	spin_unlock_irqrestore(&mm->context.lock, flags);
+	raw_spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 #define HV_PGSZ_IDX_BASE	HV_PGSZ_IDX_8K
@@ -392,7 +392,7 @@  retry_tsb_alloc:
 	 * the lock and ask all other cpus running this address space
 	 * to run tsb_context_switch() to see the new TSB table.
 	 */
-	spin_lock_irqsave(&mm->context.lock, flags);
+	raw_spin_lock_irqsave(&mm->context.lock, flags);
 
 	old_tsb = mm->context.tsb_block[tsb_index].tsb;
 	old_cache_index =
@@ -407,7 +407,7 @@  retry_tsb_alloc:
 	 */
 	if (unlikely(old_tsb &&
 		     (rss < mm->context.tsb_block[tsb_index].tsb_rss_limit))) {
-		spin_unlock_irqrestore(&mm->context.lock, flags);
+		raw_spin_unlock_irqrestore(&mm->context.lock, flags);
 
 		kmem_cache_free(tsb_caches[new_cache_index], new_tsb);
 		return;
@@ -433,7 +433,7 @@  retry_tsb_alloc:
 	mm->context.tsb_block[tsb_index].tsb = new_tsb;
 	setup_tsb_params(mm, tsb_index, new_size);
 
-	spin_unlock_irqrestore(&mm->context.lock, flags);
+	raw_spin_unlock_irqrestore(&mm->context.lock, flags);
 
 	/* If old_tsb is NULL, we're being invoked for the first time
 	 * from init_new_context().
@@ -459,7 +459,7 @@  int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 #endif
 	unsigned int i;
 
-	spin_lock_init(&mm->context.lock);
+	raw_spin_lock_init(&mm->context.lock);
 
 	mm->context.sparc64_ctx_val = 0UL;