
Multicast packet loss

Message ID 49B72956.9050504@cosmosbay.com
State RFC, archived
Delegated to: David Miller

Commit Message

Eric Dumazet March 11, 2009, 3 a.m. UTC
Brian Bloniarz wrote:
> Hi Eric,
> 
> FYI: with your patch applied and lockdep enabled, I see:
> [   39.114628] ================================================
> [   39.121964] [ BUG: lock held when returning to user space! ]
> [   39.127704] ------------------------------------------------
> [   39.133461] msgtest/5242 is leaving the kernel with locks still held!
> [   39.140132] 1 lock held by msgtest/5242:
> [   39.144287]  #0:  (clock-AF_INET){-.-?}, at: [<ffffffff8041f5b9>]
> sock_def_readable+0x19/0xb0

And you told me you were not a kernel hacker ;)

> 
> I can't reproduce this with the mcasttest program yet; it
> was with an internal test program which does some userspace
> processing on the messages. I'll let you know if I find a way
> to reproduce it with a simple program I can share.

I reproduced it here quite easily as well with a tcpdump of a TCP session;
thanks for the report.

It seems "if (in_softirq())" doesn't do what I thought.

I wanted to test whether we were called from the __do_softirq() handler,
since only that function calls softirq_delay_exec()
to dequeue events.

It appears I have to make current->softirq_context available
even if !CONFIG_TRACE_IRQFLAGS.
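
For reference, these are the definitions from include/linux/hardirq.h
at the time. The SOFTIRQ bits of preempt_count are also raised by
local_bh_disable(), so in_softirq() is true in any bottom-half-disabled
section (for instance TCP backlog processing in process context), not
only inside __do_softirq():

#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
#define in_softirq()	(softirq_count())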

Here is an updated version of the patch.

I also made sure that the call to softirq_delay_exec() is performed
with interrupts enabled, and that the preempt count won't overflow
if many events are queued (each deferred socket keeps its
sk_callback_lock read-locked, which would otherwise leave
preempt_count raised once per queued event).

[PATCH] softirq: Introduce mechanism to defer wakeups

Some network workloads need to call the scheduler too many times. For example,
each received multicast frame can wake up many threads. ksoftirqd is then
unable to drain the NIC RX queues, and we get frame losses and high latencies.

This patch adds an infrastructure to delay part of the work done in
sock_def_readable() until the end of do_softirq(). This requires making
current->softirq_context available even if !CONFIG_TRACE_IRQFLAGS.
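
To illustrate the intended usage pattern, here is a minimal sketch
(my_ctx, my_ctx_init and my_rx_handler are invented names; the real
user in this patch is struct sock, which additionally juggles
sk_callback_lock and the preempt count, omitted here):

struct my_ctx {
	wait_queue_head_t	wq;
	struct softirq_delay	delay;
};

/* Runs once at the end of __do_softirq(), however many frames arrived */
static void my_deferred_wakeup(struct softirq_delay *sdel)
{
	struct my_ctx *ctx = container_of(sdel, struct my_ctx, delay);

	wake_up_interruptible_sync(&ctx->wq);
}

static void my_ctx_init(struct my_ctx *ctx)
{
	init_waitqueue_head(&ctx->wq);
	softirq_delay_init(&ctx->delay, my_deferred_wakeup);
}

/* Per-frame receive path */
static void my_rx_handler(struct my_ctx *ctx)
{
	if (current->softirq_context) {
		/*
		 * Called from __do_softirq(): queue the wakeup.
		 * softirq_delay_queue() returns 0 if this entry is
		 * already pending, so wakeups are coalesced.
		 */
		softirq_delay_queue(&ctx->delay);
	} else {
		/* Process context: wake up the sleeper directly */
		wake_up_interruptible_sync(&ctx->wq);
	}
}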


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/interrupt.h |   18 +++++++++++++++++
 include/linux/irqflags.h  |    7 ++----
 include/linux/sched.h     |    2 -
 include/net/sock.h        |    1
 kernel/softirq.c          |   34 +++++++++++++++++++++++++++++++++
 net/core/sock.c           |   37 ++++++++++++++++++++++++++++++++++--
 6 files changed, 92 insertions(+), 7 deletions(-)




Comments

Brian Bloniarz March 12, 2009, 3:47 p.m. UTC | #1
Eric Dumazet wrote:
> Here is an updated version of the patch.

This works great in all my tests so far.

Thanks,
Brian Bloniarz

Eric Dumazet March 12, 2009, 4:34 p.m. UTC | #2
Brian Bloniarz wrote:
> Eric Dumazet wrote:
>> Here is an updated version of the patch.
> 
> This works great in all my tests so far.
> 
> Thanks,
> Brian Bloniarz

Cool

I am wondering if we should extend the mechanism and change
softirq_delay_exec() to wake up a workqueue instead of
running the loop from the softirq handler, once a given
level of stress / load is hit.

This could help machines with several CPUs and one NIC (without
multiple RX queues) flooded by messages (not necessarily multicast traffic).
Imagine a media/chat server receiving XXX.000 frames per second.

One CPU could be dedicated to pure softirq/network handling,
and the other CPUs could participate and handle the scheduler part, if any.

The conditions could be (see the sketch after this list):

- we run __do_softirq() from ksoftirqd, and
- we have queued more than N 'struct softirq_delay' in softirq_delay_head, and
- we have more than one CPU online.
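
A rough, untested sketch of that handoff, as a variant of
softirq_delay_exec() in kernel/softirq.c (running_from_ksoftirqd() and
SOFTIRQ_DELAY_THRESH are invented names here; a real version would also
have to cope with the sk_callback_lock / preempt count trick used by
sock_def_readable()):

#define SOFTIRQ_DELAY_THRESH	32	/* the 'N' above, to be tuned */

struct softirq_delay_batch {
	struct work_struct	work;
	struct softirq_delay	*head;
};

/* Runs from keventd: drain a detached list, as softirq_delay_exec() does */
static void softirq_delay_work(struct work_struct *work)
{
	struct softirq_delay_batch *batch =
		container_of(work, struct softirq_delay_batch, work);
	struct softirq_delay *sdel;

	while ((sdel = batch->head) != SOFTIRQ_DELAY_END) {
		batch->head = sdel->next;
		sdel->next = NULL;
		sdel->func(sdel);
	}
	kfree(batch);
}

static void softirq_delay_exec(void)
{
	struct softirq_delay *sdel;
	unsigned int queued = 0;

	for (sdel = __get_cpu_var(softirq_delay_head);
	     sdel != SOFTIRQ_DELAY_END; sdel = sdel->next)
		queued++;

	if (running_from_ksoftirqd() &&		/* invented helper */
	    queued > SOFTIRQ_DELAY_THRESH && num_online_cpus() > 1) {
		struct softirq_delay_batch *batch =
			kmalloc(sizeof(*batch), GFP_ATOMIC);

		if (batch) {
			int cpu = cpumask_any_but(cpu_online_mask,
						  smp_processor_id());

			/* Detach the whole per-cpu list and let keventd
			 * on another CPU run the callbacks */
			batch->head = __get_cpu_var(softirq_delay_head);
			__get_cpu_var(softirq_delay_head) = SOFTIRQ_DELAY_END;
			INIT_WORK(&batch->work, softirq_delay_work);
			schedule_work_on(cpu, &batch->work);
			return;
		}
		/* allocation failed: fall through and run inline */
	}

	while ((sdel = __get_cpu_var(softirq_delay_head)) != SOFTIRQ_DELAY_END) {
		__get_cpu_var(softirq_delay_head) = sdel->next;
		sdel->next = NULL;
		sdel->func(sdel);
	}
}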


Patch

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b..a773d0c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -295,6 +295,24 @@  extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softir
 extern void __send_remote_softirq(struct call_single_data *cp, int cpu,
 				  int this_cpu, int softirq);
 
+/*
+ * softirq delayed works : should be delayed at do_softirq() end
+ */
+struct softirq_delay {
+	struct softirq_delay	*next;
+	void 			(*func)(struct softirq_delay *);
+};
+
+int softirq_delay_queue(struct softirq_delay *sdel);
+
+static inline void softirq_delay_init(struct softirq_delay *sdel,
+				      void (*func)(struct softirq_delay *))
+{
+	sdel->next = NULL;
+	sdel->func = func;
+}
+
+
 /* Tasklets --- multithreaded analogue of BHs.
 
    Main feature differing them of generic softirqs: tasklet
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 74bde13..fe55ec4 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -13,6 +13,9 @@ 
 
 #include <linux/typecheck.h>
 
+#define trace_softirq_enter()	do { current->softirq_context++; } while (0)
+#define trace_softirq_exit()	do { current->softirq_context--; } while (0)
+
 #ifdef CONFIG_TRACE_IRQFLAGS
   extern void trace_softirqs_on(unsigned long ip);
   extern void trace_softirqs_off(unsigned long ip);
@@ -24,8 +27,6 @@ 
 # define trace_softirqs_enabled(p)	((p)->softirqs_enabled)
 # define trace_hardirq_enter()	do { current->hardirq_context++; } while (0)
 # define trace_hardirq_exit()	do { current->hardirq_context--; } while (0)
-# define trace_softirq_enter()	do { current->softirq_context++; } while (0)
-# define trace_softirq_exit()	do { current->softirq_context--; } while (0)
 # define INIT_TRACE_IRQFLAGS	.softirqs_enabled = 1,
 #else
 # define trace_hardirqs_on()		do { } while (0)
@@ -38,8 +39,6 @@ 
 # define trace_softirqs_enabled(p)	0
 # define trace_hardirq_enter()		do { } while (0)
 # define trace_hardirq_exit()		do { } while (0)
-# define trace_softirq_enter()		do { } while (0)
-# define trace_softirq_exit()		do { } while (0)
 # define INIT_TRACE_IRQFLAGS
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c216e0..5dd8487 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1320,8 +1320,8 @@  struct task_struct {
 	unsigned long softirq_enable_ip;
 	unsigned int softirq_enable_event;
 	int hardirq_context;
-	int softirq_context;
 #endif
+	int softirq_context;
 #ifdef CONFIG_LOCKDEP
 # define MAX_LOCK_DEPTH 48UL
 	u64 curr_chain_key;
diff --git a/include/net/sock.h b/include/net/sock.h
index eefeeaf..1bfd9b8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -260,6 +260,7 @@  struct sock {
 	unsigned long	        sk_lingertime;
 	struct sk_buff_head	sk_error_queue;
 	struct proto		*sk_prot_creator;
+	struct softirq_delay	sk_delay;
 	rwlock_t		sk_callback_lock;
 	int			sk_err,
 				sk_err_soft;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9041ea7..c601730 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -158,6 +158,38 @@  void local_bh_enable_ip(unsigned long ip)
 }
 EXPORT_SYMBOL(local_bh_enable_ip);
 
+
+#define SOFTIRQ_DELAY_END (struct softirq_delay *)1L
+
+static DEFINE_PER_CPU(struct softirq_delay *, softirq_delay_head) = {
+	SOFTIRQ_DELAY_END
+};
+
+/*
+ * Preemption is disabled by caller
+ */
+int softirq_delay_queue(struct softirq_delay *sdel)
+{
+	if (cmpxchg(&sdel->next, NULL, __get_cpu_var(softirq_delay_head)) == NULL) {
+		__get_cpu_var(softirq_delay_head) = sdel;
+		return 1;
+	}
+	return 0;
+}
+
+static void softirq_delay_exec(void)
+{
+	struct softirq_delay *sdel;
+
+	while ((sdel = __get_cpu_var(softirq_delay_head)) != SOFTIRQ_DELAY_END) {
+		__get_cpu_var(softirq_delay_head) = sdel->next;
+		sdel->next = NULL;
+		sdel->func(sdel);
+	}
+}
+
+
+
 /*
  * We restart softirq processing MAX_SOFTIRQ_RESTART times,
  * and we fall back to softirqd after that.
@@ -211,6 +243,8 @@  restart:
 		pending >>= 1;
 	} while (pending);
 
+	softirq_delay_exec();
+
 	local_irq_disable();
 
 	pending = local_softirq_pending();
diff --git a/net/core/sock.c b/net/core/sock.c
index 5f97caa..d51d57d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -212,6 +212,8 @@  __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 /* Maximal space eaten by iovec or ancilliary data plus some space */
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 
+static void sock_readable_defer(struct softirq_delay *sdel);
+
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
 	struct timeval tv;
@@ -1026,6 +1028,7 @@  struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 #endif
 
 		rwlock_init(&newsk->sk_dst_lock);
+		softirq_delay_init(&newsk->sk_delay, sock_readable_defer);
 		rwlock_init(&newsk->sk_callback_lock);
 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
 				af_callback_keys + newsk->sk_family,
@@ -1634,12 +1637,41 @@  static void sock_def_error_report(struct sock *sk)
 	read_unlock(&sk->sk_callback_lock);
 }
 
+static void sock_readable_defer(struct softirq_delay *sdel)
+{
+	struct sock *sk = container_of(sdel, struct sock, sk_delay);
+
+	wake_up_interruptible_sync(sk->sk_sleep);
+	/*
+	 * Before unlocking, we increase preempt_count,
+	 * as it was decreased in sock_def_readable()
+	 */
+	preempt_disable();
+	read_unlock(&sk->sk_callback_lock);
+}
+
 static void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible_sync(sk->sk_sleep);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
+		if (current->softirq_context) {
+			/*
+			 * If called from __do_softirq(), we want to delay
+			 * calls to wake_up_interruptible_sync()
+			 */
+			if (!softirq_delay_queue(&sk->sk_delay))
+				goto unlock;
+			/*
+			 * We keep sk->sk_callback_lock read locked,
+			 * but decrease preempt_count to avoid an overflow
+			 */
+			preempt_enable_no_resched();
+			return;
+		}
+		wake_up_interruptible_sync(sk->sk_sleep);
+	}
+unlock:
 	read_unlock(&sk->sk_callback_lock);
 }
 
@@ -1720,6 +1752,7 @@  void sock_init_data(struct socket *sock, struct sock *sk)
 		sk->sk_sleep	=	NULL;
 
 	rwlock_init(&sk->sk_dst_lock);
+	softirq_delay_init(&sk->sk_delay, sock_readable_defer);
 	rwlock_init(&sk->sk_callback_lock);
 	lockdep_set_class_and_name(&sk->sk_callback_lock,
 			af_callback_keys + sk->sk_family,