diff mbox

[v2,3/3] pppoatm: protect against freeing of vcc

Message ID 1354141115.21562.101.camel@shinybook.infradead.org
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

David Woodhouse Nov. 28, 2012, 10:18 p.m. UTC
On Wed, 2012-11-28 at 09:21 +0000, David Laight wrote:
> Even when it might make sense to sleep in close until tx drains
> there needs to be a finite timeout before it become abortive.

You are, of course, right. We should never wait for hardware for ever.
And just to serve me right, I seem to have hit a bug in the latest Solos
firmware (1.11) which makes it sometimes lock up when I reboot. So it
never responds to the PKT_PCLOSE packet... and thus it deadlocks when I
try to kill pppd and unload the module to reset it :)

New version...

From 53dd01c08fec5b26006a009b25e4210127fdb27a Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 27 Nov 2012 23:49:24 +0000
Subject: [PATCH] solos-pci: Wait for pending TX to complete when releasing
 vcc

We should no longer be calling the old pop routine for the vcc, after
vcc_release() has completed. Make sure we wait for any pending TX skbs
to complete, by waiting for our own PKT_PCLOSE control skb to be sent.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/atm/solos-pci.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

Comments

Krzysztof Mazur Nov. 29, 2012, 10:57 a.m. UTC | #1
On Wed, Nov 28, 2012 at 10:18:35PM +0000, David Woodhouse wrote:
> On Wed, 2012-11-28 at 09:21 +0000, David Laight wrote:
> > Even when it might make sense to sleep in close until tx drains
> > there needs to be a finite timeout before it become abortive.
> 
> You are, of course, right. We should never wait for hardware for ever.
> And just to serve me right, I seem to have hit a bug in the latest Solos
> firmware (1.11) which makes it sometimes lock up when I reboot. So it
> never responds to the PKT_PCLOSE packet... and thus it deadlocks when I
> try to kill pppd and unload the module to reset it :)
> 
> New version...
> 
> From 53dd01c08fec5b26006a009b25e4210127fdb27a Mon Sep 17 00:00:00 2001
> From: David Woodhouse <David.Woodhouse@intel.com>
> Date: Tue, 27 Nov 2012 23:49:24 +0000
> Subject: [PATCH] solos-pci: Wait for pending TX to complete when releasing
>  vcc
> 
> We should no longer be calling the old pop routine for the vcc, after
> vcc_release() has completed. Make sure we wait for any pending TX skbs
> to complete, by waiting for our own PKT_PCLOSE control skb to be sent.
> 
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> ---
>  drivers/atm/solos-pci.c | 17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
> index 9851093..3720670 100644
> --- a/drivers/atm/solos-pci.c
> +++ b/drivers/atm/solos-pci.c
> @@ -92,6 +92,7 @@ struct pkt_hdr {
>  };
>  
>  struct solos_skb_cb {
> +	struct completion c;
>  	struct atm_vcc *vcc;
>  	uint32_t dma_addr;
>  };
> @@ -881,11 +882,18 @@ static void pclose(struct atm_vcc *vcc)
>  	header->vci = cpu_to_le16(vcc->vci);
>  	header->type = cpu_to_le16(PKT_PCLOSE);
>  
> +	init_completion(&SKB_CB(skb)->c);
> +
>  	fpga_queue(card, SOLOS_CHAN(vcc->dev), skb, NULL);
>  
>  	clear_bit(ATM_VF_ADDR, &vcc->flags);
>  	clear_bit(ATM_VF_READY, &vcc->flags);
>  
> +	if (!wait_for_completion_timeout(&SKB_CB(skb)->c,
> +					 jiffies + msecs_to_jiffies(5000)))
> +		dev_warn(&card->dev->dev, "Timeout waiting for VCC close on port %d\n",
> +			 SOLOS_CHAN(vcc->dev));
> +

do we really need to wait here?

Why don't just do something like that:

	tasklet_disable(&card->tlet);
	spin_lock(&card->tx_queue_lock);
	for each skb in queue
		SKB_CB(skb)->vcc = NULL;
	spin_unlock(&card->tx_queue_lock);
	tasklet_enable(&card->tlet);

or if we really want to call vcc->pop() for such skbs:

	tasklet_disable(&card->tlet);
	spin_lock(&card->tx_queue_lock);
	for each skb in queue {
		skb_get(skb);
		solos_pop(SKB_CB(skb)->vcc, skb);
		SKB_CB(skb)->vcc = NULL;
	}
	spin_unlock(&card->tx_queue_lock);
	tasklet_enable(&card->tlet);

Krzysiek
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Woodhouse Nov. 29, 2012, 11:55 a.m. UTC | #2
On Thu, 2012-11-29 at 11:57 +0100, Krzysztof Mazur wrote:
> do we really need to wait here?
> Why don't just do something like that:
> 
> 	tasklet_disable(&card->tlet);
> 	spin_lock(&card->tx_queue_lock);
> 	for each skb in queue
> 		SKB_CB(skb)->vcc = NULL;
> 	spin_unlock(&card->tx_queue_lock);
> 	tasklet_enable(&card->tlet);
> 
> or if we really want to call vcc->pop() for such skbs:
> 
> 	tasklet_disable(&card->tlet);
> 	spin_lock(&card->tx_queue_lock);
> 	for each skb in queue {
> 		skb_get(skb);
> 		solos_pop(SKB_CB(skb)->vcc, skb);
> 		SKB_CB(skb)->vcc = NULL;
> 	}
> 	spin_unlock(&card->tx_queue_lock);
> 	tasklet_enable(&card->tlet);

Yes, we could certainly remove the packets from the tx_queue first.

However, in the card->using_dma case there might be a skb for this vcc
*currently* being DMA'd, and we'd still need to wait for that one.

I suppose we could just have a waitqueue in *every* TX skb, and under
card->tx_lock we could add ourselves to *that* waitqueue. Or just a
global waitqueue for DMA tx_done, perhaps. But waiting for our own
PKT_PCLOSE skb is just 'cleaner' in my view. It's simpler, and it's much
easier to test. Even if I had DMA-capable hardware, I'd have to get the
right timing to properly test that TX-pending-DMA case.

So dequeuing the packets would only serve to make pclose() slightly
faster, rather than simplifying it. It's hardly a fast path that we care
about, and I've also already ensured that there should only be one or
two packets queued per vcc *anyway*. So I'm mostly inclined not to
bother.

(I did fix the timeout argument to wait_for_completion_timeout())
chas williams - CONTRACTOR Nov. 29, 2012, 2:29 p.m. UTC | #3
On Thu, 29 Nov 2012 11:57:15 +0100
Krzysztof Mazur <krzysiek@podlesie.net> wrote:

> or if we really want to call vcc->pop() for such skbs:

you need to call ->pop() to clean up the wmem_alloc accounting.
otherwise you will get complaints from the atm stack later about
freeing a vcc that had outstanding data.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krzysztof Mazur Nov. 29, 2012, 3:09 p.m. UTC | #4
On Wed, Nov 28, 2012 at 10:18:35PM +0000, David Woodhouse wrote:
> On Wed, 2012-11-28 at 09:21 +0000, David Laight wrote:
> > Even when it might make sense to sleep in close until tx drains
> > there needs to be a finite timeout before it become abortive.
> 
> You are, of course, right. We should never wait for hardware for ever.
> And just to serve me right, I seem to have hit a bug in the latest Solos
> firmware (1.11) which makes it sometimes lock up when I reboot. So it
> never responds to the PKT_PCLOSE packet... and thus it deadlocks when I
> try to kill pppd and unload the module to reset it :)
> 
> New version...
> 
> From 53dd01c08fec5b26006a009b25e4210127fdb27a Mon Sep 17 00:00:00 2001
> From: David Woodhouse <David.Woodhouse@intel.com>
> Date: Tue, 27 Nov 2012 23:49:24 +0000
> Subject: [PATCH] solos-pci: Wait for pending TX to complete when releasing
>  vcc
> 
> We should no longer be calling the old pop routine for the vcc, after
> vcc_release() has completed. Make sure we wait for any pending TX skbs
> to complete, by waiting for our own PKT_PCLOSE control skb to be sent.
> 
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> ---
>  drivers/atm/solos-pci.c | 17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
> index 9851093..3720670 100644
> --- a/drivers/atm/solos-pci.c
> +++ b/drivers/atm/solos-pci.c
> @@ -92,6 +92,7 @@ struct pkt_hdr {
>  };
>  
>  struct solos_skb_cb {
> +	struct completion c;
>  	struct atm_vcc *vcc;
>  	uint32_t dma_addr;
>  };
> @@ -881,11 +882,18 @@ static void pclose(struct atm_vcc *vcc)
>  	header->vci = cpu_to_le16(vcc->vci);
>  	header->type = cpu_to_le16(PKT_PCLOSE);
>  
> +	init_completion(&SKB_CB(skb)->c);
> +
>  	fpga_queue(card, SOLOS_CHAN(vcc->dev), skb, NULL);
>  
>  	clear_bit(ATM_VF_ADDR, &vcc->flags);
>  	clear_bit(ATM_VF_READY, &vcc->flags);
>  
> +	if (!wait_for_completion_timeout(&SKB_CB(skb)->c,
> +					 jiffies + msecs_to_jiffies(5000)))
> +		dev_warn(&card->dev->dev, "Timeout waiting for VCC close on port %d\n",
> +			 SOLOS_CHAN(vcc->dev));
> +

I don't like two things about this patch:

	- if alloc_skb(sizeof(*header), GFP_ATOMIC) at the beginning of
	  pclose() fails we will crash

	- if card wakes up after this timeout we will probably crash too

That's why I proposed a different approach, but it has other problems.

Krzysiek
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
chas williams - CONTRACTOR Nov. 29, 2012, 3:37 p.m. UTC | #5
On Wed, 28 Nov 2012 22:18:35 +0000
David Woodhouse <dwmw2@infradead.org> wrote:


> diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
> index 9851093..3720670 100644
> --- a/drivers/atm/solos-pci.c
> +++ b/drivers/atm/solos-pci.c
> @@ -92,6 +92,7 @@ struct pkt_hdr {
>  };
>  
>  struct solos_skb_cb {
> +	struct completion c;
>  	struct atm_vcc *vcc;
>  	uint32_t dma_addr;
>  };
> @@ -881,11 +882,18 @@ static void pclose(struct atm_vcc *vcc)
>  	header->vci = cpu_to_le16(vcc->vci);
>  	header->type = cpu_to_le16(PKT_PCLOSE);
>  
> +	init_completion(&SKB_CB(skb)->c);
> +
>  	fpga_queue(card, SOLOS_CHAN(vcc->dev), skb, NULL);
>  
>  	clear_bit(ATM_VF_ADDR, &vcc->flags);
>  	clear_bit(ATM_VF_READY, &vcc->flags);

you shouldnt clear ATM_VF_ADDR until the vpi/vci is actually closed and
ready for reuse.  at this point, it isnt. ATM_VF_READY should already
be clear at this point but you should set it before you queue your
PKT_CLOSE.  these flags probably should be handled outside the drivers
since the context for them is pretty clear.  just another patch i never
got around to writing...

checking for ATM_VF_READY in find_vcc() is probably going to give you
grief as well since ATM_VF_READY isnt entirely under your control.  you
need to be able to find the vcc until after pclose() is finished since
your tasklet might have a few packets it is still processing?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Woodhouse Nov. 29, 2012, 3:59 p.m. UTC | #6
On Thu, 2012-11-29 at 10:37 -0500, chas williams - CONTRACTOR wrote:
> you shouldnt clear ATM_VF_ADDR until the vpi/vci is actually closed and
> ready for reuse.  at this point, it isnt.

So I should always wait for the completion of my PKT_CLOSE and only
clear ATM_VF_ADDR when it's actually done?

But can you define 'ready for reuse'? From the moment I clear
ATM_VF_ADDR, another CPU may enter my popen() function to set up another
VCC with the same parameters, and everything should work fine. The
PKT_POPEN will end up on the queue *after* my PKT_PCLOSE for the old
VCC. Any received packets will be dropped until the new VCC gets
ATM_VF_READY set (by the popen function).

What's the actual failure mode, caused by me clearing ATM_VF_ADDR "too
early"?

>  ATM_VF_READY should already be clear at this point but you should set
> it before you queue your PKT_CLOSE. 

I should *set* it? Do you mean clear it? Yes, I see it's cleared by
vcc_destroy_socket()... but all the other ATM drivers also seem to clear
it for themselves, and that would appear to be harmless.

> checking for ATM_VF_READY in find_vcc() is probably going to give you
> grief as well since ATM_VF_READY isnt entirely under your control. 

That's fine. If *anyone* has cleared ATM_VF_READY, I stop sending
packets up it. Or, more to the point, I stop using the damn thing at
all. See commit 1f6ea6e511e5ec730d8e88651da1b7b6e8fd1333.

>  you need to be able to find the vcc until after pclose() is finished since
> your tasklet might have a few packets it is still processing?

The whole point of that check is that the tasklet *won't* be able to
find it any more, and it'll just discard incoming packets for the
obsolescent VCC.
chas williams - CONTRACTOR Nov. 29, 2012, 4:11 p.m. UTC | #7
On Thu, 29 Nov 2012 15:59:08 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> On Thu, 2012-11-29 at 10:37 -0500, chas williams - CONTRACTOR wrote:
> > you shouldnt clear ATM_VF_ADDR until the vpi/vci is actually closed and
> > ready for reuse.  at this point, it isnt.
> 
> So I should always wait for the completion of my PKT_CLOSE and only
> clear ATM_VF_ADDR when it's actually done?
> 
> But can you define 'ready for reuse'? From the moment I clear
> ATM_VF_ADDR, another CPU may enter my popen() function to set up another
> VCC with the same parameters, and everything should work fine. The
> PKT_POPEN will end up on the queue *after* my PKT_PCLOSE for the old
> VCC. Any received packets will be dropped until the new VCC gets
> ATM_VF_READY set (by the popen function).
> 
> What's the actual failure mode, caused by me clearing ATM_VF_ADDR "too
> early"?

there may not be one (due to serialization from other parts of the
atm stack) but you "shouldn't" clear ATM_VF_ADDR until the vpi/vci pair
is ready for reuse.  by reuse, i mean that any previous rx/tx data in
the vpi/vci segmentation hardware has been removed/cleared.

> >  ATM_VF_READY should already be clear at this point but you should set
> > it before you queue your PKT_CLOSE. 
> 
> I should *set* it? Do you mean clear it? Yes, I see it's cleared by

sorry, i did mean clear it.

> vcc_destroy_socket()... but all the other ATM drivers also seem to clear
> it for themselves, and that would appear to be harmless.

yeah, like i said, it is spuriously cleared in the drivers and should
probably just be moved to under the control of the next layer up
completely.  drivers/atm should just handle the hardware side, not the
software side.

> > checking for ATM_VF_READY in find_vcc() is probably going to give you
> > grief as well since ATM_VF_READY isnt entirely under your control. 
> 
> That's fine. If *anyone* has cleared ATM_VF_READY, I stop sending
> packets up it. Or, more to the point, I stop using the damn thing at
> all. See commit 1f6ea6e511e5ec730d8e88651da1b7b6e8fd1333.
> 
> >  you need to be able to find the vcc until after pclose() is finished since
> > your tasklet might have a few packets it is still processing?
> 
> The whole point of that check is that the tasklet *won't* be able to
> find it any more, and it'll just discard incoming packets for the
> obsolescent VCC.

that's fine as long as you understand this.  in the case of the he, i
needed to be able to find the vcc until close() is finished so that i
can wakeup the sleeper in the close() routine that is waiting for the
reassembly queue to be cleared/reset.  also, i still needed to find the
vcc for the tx side during close() since i still might need to pop()
skb's that are being sent during the close() while i am still trying to
get the hardware to shutdown the transmit dma engine.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
index 9851093..3720670 100644
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -92,6 +92,7 @@  struct pkt_hdr {
 };
 
 struct solos_skb_cb {
+	struct completion c;
 	struct atm_vcc *vcc;
 	uint32_t dma_addr;
 };
@@ -881,11 +882,18 @@  static void pclose(struct atm_vcc *vcc)
 	header->vci = cpu_to_le16(vcc->vci);
 	header->type = cpu_to_le16(PKT_PCLOSE);
 
+	init_completion(&SKB_CB(skb)->c);
+
 	fpga_queue(card, SOLOS_CHAN(vcc->dev), skb, NULL);
 
 	clear_bit(ATM_VF_ADDR, &vcc->flags);
 	clear_bit(ATM_VF_READY, &vcc->flags);
 
+	if (!wait_for_completion_timeout(&SKB_CB(skb)->c,
+					 jiffies + msecs_to_jiffies(5000)))
+		dev_warn(&card->dev->dev, "Timeout waiting for VCC close on port %d\n",
+			 SOLOS_CHAN(vcc->dev));
+
 	/* Hold up vcc_destroy_socket() (our caller) until solos_bh() in the
 	   tasklet has finished processing any incoming packets (and, more to
 	   the point, using the vcc pointer). */
@@ -1011,9 +1019,12 @@  static uint32_t fpga_tx(struct solos_card *card)
 			if (vcc) {
 				atomic_inc(&vcc->stats->tx);
 				solos_pop(vcc, oldskb);
-			} else
+			} else {
+				struct pkt_hdr *header = (void *)oldskb->data;
+				if (le16_to_cpu(header->type) == PKT_PCLOSE)
+					complete(&SKB_CB(oldskb)->c);
 				dev_kfree_skb_irq(oldskb);
-
+			}
 		}
 	}
 	/* For non-DMA TX, write the 'TX start' bit for all four ports simultaneously */
@@ -1345,6 +1356,8 @@  static struct pci_driver fpga_driver = {
 
 static int __init solos_pci_init(void)
 {
+	BUILD_BUG_ON(sizeof(struct solos_skb_cb) > sizeof(((struct sk_buff *)0)->cb));
+
 	printk(KERN_INFO "Solos PCI Driver Version %s\n", VERSION);
 	return pci_register_driver(&fpga_driver);
 }