[7/7] um: simplify IRQ handling code

Message ID 20201123205446.6370687fa983.I811873233b8d71fbde9154b84c85b498521e3b12@changeid
State Changes Requested
Series [1/7] um: support dynamic IRQ allocation

Commit Message

Johannes Berg Nov. 23, 2020, 7:56 p.m. UTC
From: Johannes Berg <johannes.berg@intel.com>

Reduce dynamic allocations (and thereby cache misses) by simply
embedding the registration data for IRQs in the irq_entry, we
never supported these being really dynamic anyway as only one
was ever allowed ("Trying to reregister ...").

Lockless behaviour is preserved by removing the FD from the poll
set appropriately, but we use reg->events to indicate whether or
not this entry is used, rather than dynamically allocating them.

Also port the list of IRQ entries to list_head instead of the
current open-coded singly-linked list implementation, just for
sanity.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/kernel/irq.c | 369 +++++++++++++++----------------------------
 1 file changed, 127 insertions(+), 242 deletions(-)

Comments

Anton Ivanov Nov. 24, 2020, 9:50 p.m. UTC | #1
On 23/11/2020 19:56, Johannes Berg wrote:
> From: Johannes Berg <johannes.berg@intel.com>
>
> Reduce dynamic allocations (and thereby cache misses) by simply
> embedding the registration data for IRQs in the irq_entry, we
> never supported these being really dynamic anyway as only one
> was ever allowed ("Trying to reregister ...").
>
> Lockless behaviour is preserved by removing the FD from the poll
> set appropriately, but we use reg->events to indicate whether or
> not this entry is used, rather than dynamically allocating them.
>
> Also port the list of IRQ entries to list_head instead of the
> current open-coded singly-linked list implementation, just for
> sanity.
>
> Signed-off-by: Johannes Berg <johannes.berg@intel.com>
> ---
>   arch/um/kernel/irq.c | 369 +++++++++++++++----------------------------
>   1 file changed, 127 insertions(+), 242 deletions(-)
>
> diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
> index 9e8f776bb43a..bbf5a466b44c 100644
> --- a/arch/um/kernel/irq.c
> +++ b/arch/um/kernel/irq.c
> @@ -29,26 +29,23 @@ extern void free_irqs(void);
>    * This is why we keep a small irq_reg array for each fd -
>    * one entry per IRQ type
>    */
> -
>   struct irq_reg {
>   	void *id;
> -	enum um_irq_type type;
>   	int irq;
> +	/* it's cheaper to store this than to query it */
>   	int events;
>   	bool active;
>   	bool pending;
> -	bool purge;
>   };
>   
>   struct irq_entry {
> -	struct irq_entry *next;
> +	struct list_head list;
>   	int fd;
> -	struct irq_reg *irq_array[NUM_IRQ_TYPES];
> +	struct irq_reg reg[NUM_IRQ_TYPES];
>   };
>   
> -static struct irq_entry *active_fds;
> -
>   static DEFINE_SPINLOCK(irq_lock);
> +static LIST_HEAD(active_fds);
>   static DECLARE_BITMAP(irqs_allocated, NR_IRQS);
>   
>   static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs)
> @@ -61,12 +58,13 @@ static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs)
>    */
>   	if (irq->active) {
>   		irq->active = false;
> +
>   		do {
>   			irq->pending = false;
>   			do_IRQ(irq->irq, regs);
> -		} while (irq->pending && (!irq->purge));
> -		if (!irq->purge)
> -			irq->active = true;
> +		} while (irq->pending);
> +
> +		irq->active = true;
>   	} else {
>   		irq->pending = true;
>   	}
> @@ -75,9 +73,7 @@ static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs)
>   void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
>   {
>   	struct irq_entry *irq_entry;
> -	struct irq_reg *irq;
> -
> -	int n, i, j;
> +	int n, i;
>   
>   	while (1) {
> 		/* This is now lockless - epoll keeps back-references to the irqs
> @@ -96,21 +92,18 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
>   		}
>   
>   		for (i = 0; i < n ; i++) {
> -			/* Epoll back reference is the entry with 2 irq_reg
> -			 * leaves - one for each irq type.
> -			 */
> -			irq_entry = (struct irq_entry *)
> -				os_epoll_get_data_pointer(i);
> -			for (j = 0; j < NUM_IRQ_TYPES ; j++) {
> -				irq = irq_entry->irq_array[j];
> -				if (irq == NULL)
> +			enum um_irq_type t;
> +
> +			irq_entry = os_epoll_get_data_pointer(i);
> +
> +			for (t = 0; t < NUM_IRQ_TYPES; t++) {
> +				int events = irq_entry->reg[t].events;
> +
> +				if (!events)
>   					continue;
> -				if (os_epoll_triggered(i, irq->events) > 0)
> -					irq_io_loop(irq, regs);
> -				if (irq->purge) {
> -					irq_entry->irq_array[j] = NULL;
> -					kfree(irq);
> -				}
> +
> +				if (os_epoll_triggered(i, events) > 0)
> +					irq_io_loop(&irq_entry->reg[t], regs);
>   			}
>   		}
>   	}
> @@ -118,32 +111,59 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
>   	free_irqs();
>   }
>   
> -static int assign_epoll_events_to_irq(struct irq_entry *irq_entry)
> +static struct irq_entry *get_irq_entry_by_fd(int fd)
>   {
> -	int i;
> -	int events = 0;
> -	struct irq_reg *irq;
> +	struct irq_entry *walk;
>   
> -	for (i = 0; i < NUM_IRQ_TYPES ; i++) {
> -		irq = irq_entry->irq_array[i];
> -		if (irq != NULL)
> -			events = irq->events | events;
> -	}
> -	if (events > 0) {
> -	/* os_add_epoll will call os_mod_epoll if this already exists */
> -		return os_add_epoll_fd(events, irq_entry->fd, irq_entry);
> +	lockdep_assert_held(&irq_lock);
> +
> +	list_for_each_entry(walk, &active_fds, list) {
> +		if (walk->fd == fd)
> +			return walk;
>   	}
> -	/* No events - delete */
> -	return os_del_epoll_fd(irq_entry->fd);
> +
> +	return NULL;
> +}
> +
> +static void free_irq_entry(struct irq_entry *to_free, bool remove)
> +{
> +	if (!to_free)
> +		return;
> +
> +	if (remove)
> +		os_del_epoll_fd(to_free->fd);
> +	list_del(&to_free->list);
> +	kfree(to_free);
>   }
>   
> +static bool update_irq_entry(struct irq_entry *entry)
> +{
> +	enum um_irq_type i;
> +	int events = 0;
> +
> +	for (i = 0; i < NUM_IRQ_TYPES; i++)
> +		events |= entry->reg[i].events;
>   
> +	if (events) {
> +		/* will modify (instead of add) if needed */
> +		os_add_epoll_fd(events, entry->fd, entry);
> +		return true;
> +	}
> +
> +	os_del_epoll_fd(entry->fd);
> +	return false;
> +}
> +
> +static void update_or_free_irq_entry(struct irq_entry *entry)
> +{
> +	if (!update_irq_entry(entry))
> +		free_irq_entry(entry, false);
> +}
>   
>   static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
>   {
> -	struct irq_reg *new_fd;
>   	struct irq_entry *irq_entry;
> -	int i, err, events;
> +	int err, events = os_event_mask(type);
>   	unsigned long flags;
>   
>   	err = os_set_fd_async(fd);
> @@ -151,73 +171,33 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
>   		goto out;
>   
>   	spin_lock_irqsave(&irq_lock, flags);
> -
> -	/* Check if we have an entry for this fd */
> -
> -	err = -EBUSY;
> -	for (irq_entry = active_fds;
> -		irq_entry != NULL; irq_entry = irq_entry->next) {
> -		if (irq_entry->fd == fd)
> -			break;
> -	}
> -
> -	if (irq_entry == NULL) {
> -		/* This needs to be atomic as it may be called from an
> -		 * IRQ context.
> -		 */
> -		irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC);
> -		if (irq_entry == NULL) {
> -			printk(KERN_ERR
> -				"Failed to allocate new IRQ entry\n");
> +	irq_entry = get_irq_entry_by_fd(fd);
> +	if (irq_entry) {

This is not right.

You can re-register the same IRQ/fd, but with a different mask - e.g.
turn write or read on/off on the same fd. For example: you have
registered a read IRQ, you then register the same IRQ for write, and
you can alter the mask.

Though I have managed to get rid of most of the code which does that
(e.g. serials now have separate IRQs for read and write), there may be
cases where it will be needed.

> +		/* cannot register the same FD twice with the same type */
> +		if (WARN_ON(irq_entry->reg[type].events)) {
> +			err = -EALREADY;
>   			goto out_unlock;
>   		}
> -		irq_entry->fd = fd;
> -		for (i = 0; i < NUM_IRQ_TYPES; i++)
> -			irq_entry->irq_array[i] = NULL;
> -		irq_entry->next = active_fds;
> -		active_fds = irq_entry;
> -	}
> -
> -	/* Check if we are trying to re-register an interrupt for a
> -	 * particular fd
> -	 */
>   
> -	if (irq_entry->irq_array[type] != NULL) {
> -		printk(KERN_ERR
> -			"Trying to reregister IRQ %d FD %d TYPE %d ID %p\n",
> -			irq, fd, type, dev_id
> -		);
> -		goto out_unlock;
> +		/* temporarily disable to avoid IRQ-side locking */
> +		os_del_epoll_fd(fd);
>   	} else {
> -		/* New entry for this fd */
> -
> -		err = -ENOMEM;
> -		new_fd = kmalloc(sizeof(struct irq_reg), GFP_ATOMIC);
> -		if (new_fd == NULL)
> +		irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC);
> +		if (!irq_entry) {
> +			err = -ENOMEM;
>   			goto out_unlock;
> -
> -		events = os_event_mask(type);
> -
> -		*new_fd = ((struct irq_reg) {
> -			.id		= dev_id,
> -			.irq		= irq,
> -			.type		= type,
> -			.events		= events,
> -			.active		= true,
> -			.pending	= false,
> -			.purge		= false
> -		});
> -		/* Turn off any IO on this fd - allows us to
> -		 * avoid locking the IRQ loop
> -		 */
> -		os_del_epoll_fd(irq_entry->fd);
> -		irq_entry->irq_array[type] = new_fd;
> +		}
> +		irq_entry->fd = fd;
> +		list_add_tail(&irq_entry->list, &active_fds);
> +		maybe_sigio_broken(fd);
>   	}
>   
> -	/* Turn back IO on with the correct (new) IO event mask */
> -	assign_epoll_events_to_irq(irq_entry);
> +	irq_entry->reg[type].irq = irq;
> +	irq_entry->reg[type].active = true;
> +	irq_entry->reg[type].events = events;
> +
> +	WARN_ON(!update_irq_entry(irq_entry));
>   	spin_unlock_irqrestore(&irq_lock, flags);
> -	maybe_sigio_broken(fd);
>   
>   	return 0;
>   out_unlock:
> @@ -227,104 +207,10 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
>   }
>   
>   /*
> - * Walk the IRQ list and dispose of any unused entries.
> - * Should be done under irq_lock.
> + * Remove the entry or entries for a specific FD, if you
> + * don't want to remove all the possible entries then use
> + * um_free_irq() or deactivate_fd() instead.
>    */
> -
> -static void garbage_collect_irq_entries(void)
> -{
> -	int i;
> -	bool reap;
> -	struct irq_entry *walk;
> -	struct irq_entry *previous = NULL;
> -	struct irq_entry *to_free;
> -
> -	if (active_fds == NULL)
> -		return;
> -	walk = active_fds;
> -	while (walk != NULL) {
> -		reap = true;
> -		for (i = 0; i < NUM_IRQ_TYPES ; i++) {
> -			if (walk->irq_array[i] != NULL) {
> -				reap = false;
> -				break;
> -			}
> -		}
> -		if (reap) {
> -			if (previous == NULL)
> -				active_fds = walk->next;
> -			else
> -				previous->next = walk->next;
> -			to_free = walk;
> -		} else {
> -			to_free = NULL;
> -		}
> -		walk = walk->next;
> -		kfree(to_free);
> -	}
> -}
> -
> -/*
> - * Walk the IRQ list and get the descriptor for our FD
> - */
> -
> -static struct irq_entry *get_irq_entry_by_fd(int fd)
> -{
> -	struct irq_entry *walk = active_fds;
> -
> -	while (walk != NULL) {
> -		if (walk->fd == fd)
> -			return walk;
> -		walk = walk->next;
> -	}
> -	return NULL;
> -}
> -
> -
> -/*
> - * Walk the IRQ list and dispose of an entry for a specific
> - * device and number. Note - if sharing an IRQ for read
> - * and write for the same FD it will be disposed in either case.
> - * If this behaviour is undesirable use different IRQ ids.
> - */
> -
> -#define IGNORE_IRQ 1
> -#define IGNORE_DEV (1<<1)
> -
> -static void do_free_by_irq_and_dev(
> -	struct irq_entry *irq_entry,
> -	unsigned int irq,
> -	void *dev,
> -	int flags
> -)
> -{
> -	int i;
> -	struct irq_reg *to_free;
> -
> -	for (i = 0; i < NUM_IRQ_TYPES ; i++) {
> -		if (irq_entry->irq_array[i] != NULL) {
> -			if (
> -			((flags & IGNORE_IRQ) ||
> -				(irq_entry->irq_array[i]->irq == irq)) &&
> -			((flags & IGNORE_DEV) ||
> -				(irq_entry->irq_array[i]->id == dev))
> -			) {
> -				/* Turn off any IO on this fd - allows us to
> -				 * avoid locking the IRQ loop
> -				 */
> -				os_del_epoll_fd(irq_entry->fd);
> -				to_free = irq_entry->irq_array[i];
> -				irq_entry->irq_array[i] = NULL;
> -				assign_epoll_events_to_irq(irq_entry);
> -				if (to_free->active)
> -					to_free->purge = true;
> -				else
> -					kfree(to_free);
> -			}
> -		}
> -	}
> -}
> -
>   void free_irq_by_fd(int fd)
>   {
>   	struct irq_entry *to_free;
> @@ -332,58 +218,64 @@ void free_irq_by_fd(int fd)
>   
>   	spin_lock_irqsave(&irq_lock, flags);
>   	to_free = get_irq_entry_by_fd(fd);
> -	if (to_free != NULL) {
> -		do_free_by_irq_and_dev(
> -			to_free,
> -			-1,
> -			NULL,
> -			IGNORE_IRQ | IGNORE_DEV
> -		);
> -	}
> -	garbage_collect_irq_entries();
> +	free_irq_entry(to_free, true);
>   	spin_unlock_irqrestore(&irq_lock, flags);
>   }
>   EXPORT_SYMBOL(free_irq_by_fd);
>   
>   static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
>   {
> -	struct irq_entry *to_free;
> +	struct irq_entry *entry;
>   	unsigned long flags;
>   
>   	spin_lock_irqsave(&irq_lock, flags);
> -	to_free = active_fds;
> -	while (to_free != NULL) {
> -		do_free_by_irq_and_dev(
> -			to_free,
> -			irq,
> -			dev,
> -			0
> -		);
> -		to_free = to_free->next;
> +	list_for_each_entry(entry, &active_fds, list) {
> +		enum um_irq_type i;
> +
> +		for (i = 0; i < NUM_IRQ_TYPES; i++) {
> +			struct irq_reg *reg = &entry->reg[i];
> +
> +			if (!reg->events)
> +				continue;
> +			if (reg->irq != irq)
> +				continue;
> +			if (reg->id != dev)
> +				continue;
> +
> +			os_del_epoll_fd(entry->fd);
> +			reg->events = 0;
> +			update_or_free_irq_entry(entry);
> +			goto out;
> +		}
>   	}
> -	garbage_collect_irq_entries();
> +out:
>   	spin_unlock_irqrestore(&irq_lock, flags);
>   }
>   
> -
>   void deactivate_fd(int fd, int irqnum)
>   {
> -	struct irq_entry *to_free;
> +	struct irq_entry *entry;
>   	unsigned long flags;
> +	enum um_irq_type i;
>   
>   	os_del_epoll_fd(fd);
> +
>   	spin_lock_irqsave(&irq_lock, flags);
> -	to_free = get_irq_entry_by_fd(fd);
> -	if (to_free != NULL) {
> -		do_free_by_irq_and_dev(
> -			to_free,
> -			irqnum,
> -			NULL,
> -			IGNORE_DEV
> -		);
> +	entry = get_irq_entry_by_fd(fd);
> +	if (!entry)
> +		goto out;
> +
> +	for (i = 0; i < NUM_IRQ_TYPES; i++) {
> +		if (!entry->reg[i].events)
> +			continue;
> +		if (entry->reg[i].irq == irqnum)
> +			entry->reg[i].events = 0;
>   	}
> -	garbage_collect_irq_entries();
> +
> +	update_or_free_irq_entry(entry);
> +out:
>   	spin_unlock_irqrestore(&irq_lock, flags);
> +
>   	ignore_sigio_fd(fd);
>   }
>   EXPORT_SYMBOL(deactivate_fd);
> @@ -396,24 +288,17 @@ EXPORT_SYMBOL(deactivate_fd);
>    */
>   int deactivate_all_fds(void)
>   {
> -	struct irq_entry *to_free;
> +	struct irq_entry *entry;
>   
>   	/* Stop IO. The IRQ loop has no lock so this is our
>   	 * only way of making sure we are safe to dispose
>   	 * of all IRQ handlers
>   	 */
>   	os_set_ioignore();
> -	to_free = active_fds;
> -	while (to_free != NULL) {
> -		do_free_by_irq_and_dev(
> -			to_free,
> -			-1,
> -			NULL,
> -			IGNORE_IRQ | IGNORE_DEV
> -		);
> -		to_free = to_free->next;
> -	}
> -	/* don't garbage collect - we can no longer call kfree() here */
> +
> +	/* we can no longer call kfree() here so just deactivate */
> +	list_for_each_entry(entry, &active_fds, list)
> +		os_del_epoll_fd(entry->fd);
>   	os_close_epoll_fd();
>   	return 0;
>   }
Johannes Berg Nov. 24, 2020, 9:58 p.m. UTC | #2
On Tue, 2020-11-24 at 21:50 +0000, Anton Ivanov wrote:
> 
> >   static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
> >   {
> > -	struct irq_reg *new_fd;
> >   	struct irq_entry *irq_entry;
> > -	int i, err, events;
> > +	int err, events = os_event_mask(type);
> >   	unsigned long flags;
> >   
> >   	err = os_set_fd_async(fd);
> > @@ -151,73 +171,33 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
> >   		goto out;
> >   
> >   	spin_lock_irqsave(&irq_lock, flags);
> > -
> > -	/* Check if we have an entry for this fd */
> > -
> > -	err = -EBUSY;
> > -	for (irq_entry = active_fds;
> > -		irq_entry != NULL; irq_entry = irq_entry->next) {
> > -		if (irq_entry->fd == fd)
> > -			break;
> > -	}
> > -
> > -	if (irq_entry == NULL) {
> > -		/* This needs to be atomic as it may be called from an
> > -		 * IRQ context.
> > -		 */
> > -		irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC);
> > -		if (irq_entry == NULL) {
> > -			printk(KERN_ERR
> > -				"Failed to allocate new IRQ entry\n");
> > +	irq_entry = get_irq_entry_by_fd(fd);
> > +	if (irq_entry) {
> 
> This is not right.
> 
> You can re-register the same IRQ/fd, but with a different mask - e.g.
> turn write or read on/off on the same fd. For example: you have
> registered a read IRQ, you then register the same IRQ for write, and
> you can alter the mask.

Hmm. You snipped some code, and it continued like this:

        irq_entry = get_irq_entry_by_fd(fd);
        if (irq_entry) {
                /* cannot register the same FD twice with the same type */
                if (WARN_ON(irq_entry->reg[type].events)) {
                        // basically return -EALREADY


I'm not sure I see how this is different from what it was before. If the
irq_entry was found before (by fd, which is equivalent to
get_irq_entry_by_fd(), just open-coded), then previously it would
have gone into

-       if (irq_entry->irq_array[type] != NULL) {
-               printk(KERN_ERR
-                       "Trying to reregister IRQ %d FD %d TYPE %d ID %p\n",
-                       irq, fd, type, dev_id
-               );
-               goto out_unlock;

which was a failure?

But you meant a *different* type (IRQ_READ vs. IRQ_WRITE), and then in
my new code we also accept that:

        irq_entry = get_irq_entry_by_fd(fd);
        if (irq_entry) {
                /* cannot register the same FD twice with the same type */
                if (WARN_ON(irq_entry->reg[type].events)) {
                        err = -EALREADY;
                        goto out_unlock;
                }

                /* temporarily disable to avoid IRQ-side locking */
                os_del_epoll_fd(fd);
        } else {
[...]
        }

        irq_entry->reg[type].irq = irq;
        irq_entry->reg[type].active = true;
        irq_entry->reg[type].events = events;


So not sure I see the difference wrt. this?


You can't really *alter* the mask, even in the old code, you can just
register a new IRQ (or even the same) for the other type (read/write),
and then unregister the old type or such.

Ok ... what am I missing?

johannes
Anton Ivanov Nov. 24, 2020, 10:36 p.m. UTC | #3
On 24/11/2020 21:58, Johannes Berg wrote:
> On Tue, 2020-11-24 at 21:50 +0000, Anton Ivanov wrote:
>>
>>>    static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
>>>    {
>>> -	struct irq_reg *new_fd;
>>>    	struct irq_entry *irq_entry;
>>> -	int i, err, events;
>>> +	int err, events = os_event_mask(type);
>>>    	unsigned long flags;
>>>    
>>>    	err = os_set_fd_async(fd);
>>> @@ -151,73 +171,33 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
>>>    		goto out;
>>>    
>>>    	spin_lock_irqsave(&irq_lock, flags);
>>> -
>>> -	/* Check if we have an entry for this fd */
>>> -
>>> -	err = -EBUSY;
>>> -	for (irq_entry = active_fds;
>>> -		irq_entry != NULL; irq_entry = irq_entry->next) {
>>> -		if (irq_entry->fd == fd)
>>> -			break;
>>> -	}
>>> -
>>> -	if (irq_entry == NULL) {
>>> -		/* This needs to be atomic as it may be called from an
>>> -		 * IRQ context.
>>> -		 */
>>> -		irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC);
>>> -		if (irq_entry == NULL) {
>>> -			printk(KERN_ERR
>>> -				"Failed to allocate new IRQ entry\n");
>>> +	irq_entry = get_irq_entry_by_fd(fd);
>>> +	if (irq_entry) {
>>
>> This is not right.
>>
>> You can re-register the same IRQ/fd, but with a different mask - e.g.
>> turn write or read on/off on the same fd. For example: you have
>> registered a read IRQ, you then register the same IRQ for write, and
>> you can alter the mask.
> 
> Hmm. You snipped some code, and it continued like this:
> 
>          irq_entry = get_irq_entry_by_fd(fd);
>          if (irq_entry) {
>                  /* cannot register the same FD twice with the same type */
>                  if (WARN_ON(irq_entry->reg[type].events)) {
>                          // basically return -EALREADY
> 
> 
> I'm not sure I see how this is different from what it was before. If the

The original intention was to be able to do it :)

If there is no code to use that and/or if it was not working properly
anyway, we may abandon that for the time being.

A.

> irq_entry was found before (by fd, which is equivalent to
> get_irq_entry_by_fd(), just open-coded), then previously it would
> have gone into
> 
> -       if (irq_entry->irq_array[type] != NULL) {
> -               printk(KERN_ERR
> -                       "Trying to reregister IRQ %d FD %d TYPE %d ID %p\n",
> -                       irq, fd, type, dev_id
> -               );
> -               goto out_unlock;
> 
> which was a failure?
> 
> But you meant a *different* type (IRQ_READ vs. IRQ_WRITE), and then in
> my new code we also accept that:
> 
>          irq_entry = get_irq_entry_by_fd(fd);
>          if (irq_entry) {
>                  /* cannot register the same FD twice with the same type */
>                  if (WARN_ON(irq_entry->reg[type].events)) {
>                          err = -EALREADY;
>                          goto out_unlock;
>                  }
> 
>                  /* temporarily disable to avoid IRQ-side locking */
>                  os_del_epoll_fd(fd);
>          } else {
> [...]
>          }
> 
>          irq_entry->reg[type].irq = irq;
>          irq_entry->reg[type].active = true;
>          irq_entry->reg[type].events = events;
> 
> 
> So not sure I see the difference wrt. this?
> 
> 
> You can't really *alter* the mask, even in the old code, you can just
> register a new IRQ (or even the same) for the other type (read/write),
> and then unregister the old type or such.
> 
> Ok ... what am I missing?
> 
> johannes
> 
> 
> _______________________________________________
> linux-um mailing list
> linux-um@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-um
>
Johannes Berg Nov. 30, 2020, noon UTC | #4
Sorry, looks like I forgot to reply to this earlier.

On Tue, 2020-11-24 at 22:36 +0000, Anton Ivanov wrote:

> > > > @@ -151,73 +171,33 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
> > > >    		goto out;
> > > >    
> > > >    	spin_lock_irqsave(&irq_lock, flags);
> > > > -
> > > > -	/* Check if we have an entry for this fd */
> > > > -
> > > > -	err = -EBUSY;
> > > > -	for (irq_entry = active_fds;
> > > > -		irq_entry != NULL; irq_entry = irq_entry->next) {
> > > > -		if (irq_entry->fd == fd)
> > > > -			break;
> > > > -	}
> > > > -
> > > > -	if (irq_entry == NULL) {
> > > > -		/* This needs to be atomic as it may be called from an
> > > > -		 * IRQ context.
> > > > -		 */
> > > > -		irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC);
> > > > -		if (irq_entry == NULL) {
> > > > -			printk(KERN_ERR
> > > > -				"Failed to allocate new IRQ entry\n");
> > > > +	irq_entry = get_irq_entry_by_fd(fd);
> > > > +	if (irq_entry) {
> > > 
> > > This is not right.
> > > 
> > > You can re-register the same IRQ/fd, but with a different mask - e.g.
> > > turn write or read on/off on the same fd. For example: you have
> > > registered a read IRQ, you then register the same IRQ for write, and
> > > you can alter the mask.
> > 
> > Hmm. You snipped some code, and it continued like this:
> > 
> >          irq_entry = get_irq_entry_by_fd(fd);
> >          if (irq_entry) {
> >                  /* cannot register the same FD twice with the same type */
> >                  if (WARN_ON(irq_entry->reg[type].events)) {
> >                          // basically return -EALREADY
> > 
> > 
> > I'm not sure I see how this is different from what it was before. If the
> 
> The original intention was to be able to do it :)

To do _what_ exactly? You said to re-register the same FD, but you could
do that before by unregistering?

And read/write are completely separate entries anyway. You cannot do
both at the same time (they're not bitmasks, just enum values.)

In fact, you *can* still use the same FD for both, with and without this
patch. We find the irq_entry here (by FD), and then check if this type
of event is already used. That means you cannot register for READ for
the same FD twice, but using the other event (WRITE) is still fine.
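
For illustration, something like this works with and without the patch
(just a sketch: the MY_*_IRQ numbers, handlers and dev pointer are made
up, only the um_request_irq() signature is real):

        /* one fd, two separate registrations - one per IRQ type */
        err = um_request_irq(MY_READ_IRQ, fd, IRQ_READ,
                             my_read_handler, 0, "demo-rx", dev);
        if (err < 0)
                return err;

        /* finds the existing irq_entry by fd, fills reg[IRQ_WRITE] */
        err = um_request_irq(MY_WRITE_IRQ, fd, IRQ_WRITE,
                             my_write_handler, 0, "demo-tx", dev);

        /*
         * a second IRQ_READ registration for the same fd, by
         * contrast, would now hit the WARN_ON() and fail with
         * -EALREADY
         */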

johannes
Anton Ivanov Nov. 30, 2020, 1:40 p.m. UTC | #5
On 30/11/2020 12:00, Johannes Berg wrote:
> Sorry, looks like I forgot to reply to this earlier.
>
> On Tue, 2020-11-24 at 22:36 +0000, Anton Ivanov wrote:
>
>>>>> @@ -151,73 +171,33 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
>>>>>     		goto out;
>>>>>     
>>>>>     	spin_lock_irqsave(&irq_lock, flags);
>>>>> -
>>>>> -	/* Check if we have an entry for this fd */
>>>>> -
>>>>> -	err = -EBUSY;
>>>>> -	for (irq_entry = active_fds;
>>>>> -		irq_entry != NULL; irq_entry = irq_entry->next) {
>>>>> -		if (irq_entry->fd == fd)
>>>>> -			break;
>>>>> -	}
>>>>> -
>>>>> -	if (irq_entry == NULL) {
>>>>> -		/* This needs to be atomic as it may be called from an
>>>>> -		 * IRQ context.
>>>>> -		 */
>>>>> -		irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC);
>>>>> -		if (irq_entry == NULL) {
>>>>> -			printk(KERN_ERR
>>>>> -				"Failed to allocate new IRQ entry\n");
>>>>> +	irq_entry = get_irq_entry_by_fd(fd);
>>>>> +	if (irq_entry) {
>>>> This is not right.
>>>>
>>>> You can re-register the same IRQ/fd, but with a different mask - e.g.
>>>> turn write or read on/off on the same fd. For example: you have
>>>> registered a read IRQ, you then register the same IRQ for write, and
>>>> you can alter the mask.
>>> Hmm. You snipped some code, and it continued like this:
>>>
>>>           irq_entry = get_irq_entry_by_fd(fd);
>>>           if (irq_entry) {
>>>                   /* cannot register the same FD twice with the same type */
>>>                   if (WARN_ON(irq_entry->reg[type].events)) {
>>>                           // basically return -EALREADY
>>>
>>>
>>> I'm not sure I see how this is different from what it was before. If the
>> The original intention was to be able to do it :)
> To do _what_ exactly? You said to re-register the same FD, but you could
> do that before by unregistering?

I need to remember what I thought in 2011 when I wrote the first version :)

>
> And read/write are completely separate entries anyway. You cannot do
> both at the same time (they're not bitmasks, just enum values.)

Yeah, we reached that point over time anyway. I forgot exactly why I dropped the idea of sharing read and write IRQs, but I dropped it :)

>
> In fact, you *can* still use the same FD for both, with and without this
> patch. We find the irq_entry here (by FD), and then check if this type
> of event is already used. That means you cannot register for READ for
> the same FD twice, but using the other event (WRITE) is still fine.

I am going through the patchset. As I said, we ended up with a design where the original idea was dropped. Probably it was dropped for good, too: it was making things overly complicated.

If it is dropped for good and not used anywhere, we might as well simplify the code.


Brgds,

>
> johannes
>
>
Anton Ivanov Nov. 30, 2020, 4:30 p.m. UTC | #6
On 23/11/2020 19:56, Johannes Berg wrote:
> From: Johannes Berg <johannes.berg@intel.com>
>
> Reduce dynamic allocations (and thereby cache misses) by simply
> embedding the registration data for IRQs in the irq_entry, we
> never supported these being really dynamic anyway as only one
> was ever allowed ("Trying to reregister ...").
>
> Lockless behaviour is preserved by removing the FD from the poll
> set appropriately, but we use reg->events to indicate whether or
> not this entry is used, rather than dynamically allocating them.
>
> Also port the list of IRQ entries to list_head instead of the
> current open-coded singly-linked list implementation, just for
> sanity.


This one is broken. I will look into exactly where. It is somewhere in the IRQ delete/re-request logic.

How to test

run iperf -s in the UML with a vector network driver.

run iperf -c to the UML instance from the host in a loop

Try to ifup/ifdown the vecX interface inside the UML.

With master it works fine. With this patch it fails.

A.

> [...]
Johannes Berg Dec. 1, 2020, 8:15 p.m. UTC | #7
On Mon, 2020-11-30 at 16:30 +0000, Anton Ivanov wrote:
> On 23/11/2020 19:56, Johannes Berg wrote:
> > From: Johannes Berg <johannes.berg@intel.com>
> > 
> > Reduce dynamic allocations (and thereby cache misses) by simply
> > embedding the registration data for IRQs in the irq_entry, we
> > never supported these being really dynamic anyway as only one
> > was ever allowed ("Trying to reregister ...").
> > 
> > Lockless behaviour is preserved by removing the FD from the poll
> > set appropriately, but we use reg->events to indicate whether or
> > not this entry is used, rather than dynamically allocating them.
> > 
> > Also port the list of IRQ entries to list_head instead of the
> > current open-coded singly-linked list implementation, just for
> > sanity.
> 
> This one is broken. I will look exactly where. It is somewhere in the IRQ delete/re-request logic.
> 
> How to test
> 
> run iperf -s in the UML with a vector network driver.
> 
> run iperf -c to the UML instance from the host in a loop

How do you usually use it? tap device?

> Try to ifup/ifdown the vecX interface inside the UML.
> 
> With master it works fine. With this patch it fails.

How does it fail?

johannes
Anton Ivanov Dec. 2, 2020, 9:03 a.m. UTC | #8
On 01/12/2020 20:15, Johannes Berg wrote:
> On Mon, 2020-11-30 at 16:30 +0000, Anton Ivanov wrote:
>> On 23/11/2020 19:56, Johannes Berg wrote:
>>> From: Johannes Berg <johannes.berg@intel.com>
>>>
>>> Reduce dynamic allocations (and thereby cache misses) by simply
>>> embedding the registration data for IRQs in the irq_entry, we
>>> never supported these being really dynamic anyway as only one
>>> was ever allowed ("Trying to reregister ...").
>>>
>>> Lockless behaviour is preserved by removing the FD from the poll
>>> set appropriately, but we use reg->events to indicate whether or
>>> not this entry is used, rather than dynamically allocating them.
>>>
>>> Also port the list of IRQ entries to list_head instead of the
>>> current open-coded singly-linked list implementation, just for
>>> sanity.
>> This one is broken. I will look exactly where. It is somewhere in the IRQ delete/re-request logic.
>>
>> How to test
>>
>> run iperf -s in the UML with a vector network driver.
>>
>> run iperf -c to the UML instance from the host in a loop
> How do you usually use it? tap device?

raw sockets on a vEth pair.

I avoid tap as it does not exercise the full vector IO code for now. One day I will find a way to extract from the kernel the underlying socket which tap uses internally and it will become 100% equivalent to the other transports.

>
>> Try to ifup/ifdown the vecX interface inside the UML.
>>
>> With master it works fine. With this patch it fails.
> How does it fail?

------------[ cut here ]------------
WARNING: CPU: 0 PID: 401 at arch/um/kernel/irq.c:177 um_request_irq+0xff/0x22e
Modules linked in:
CPU: 0 PID: 401 Comm: ip Tainted: G        W 5.10.0-rc6-00007-g6a51713e7f43 #377
Stack:
  60312019 00000000 65bc31d0 60312019
  00000009 00000000 00000000 ffffff8e
  65bc31e0 60313228 65bc3220 6003e2d2
Call Trace:
  [<60312019>] ? printk+0x0/0x94
  [<6001f34b>] show_stack+0x13e/0x14d
  [<60312019>] ? printk+0x0/0x94
  [<60312019>] ? printk+0x0/0x94
  [<60313228>] dump_stack+0x34/0x36
  [<6003e2d2>] __warn+0xf0/0x137
  [<60035374>] ? set_signals+0x0/0x36
  [<60311606>] warn_slowpath_fmt+0xd1/0xdf
  [<60028624>] ? uml_raw_enable_vnet_headers+0x0/0x5d
  [<60227c06>] ? strlen+0x0/0x11
  [<60314bb7>] ? netdev_info+0xad/0xaf
  [<60311535>] ? warn_slowpath_fmt+0x0/0xdf
  [<600353a4>] ? set_signals+0x30/0x36
  [<600353a4>] ? set_signals+0x30/0x36
  [<6001d7e1>] um_request_irq+0xff/0x22e
  [<60027f05>] ? vector_rx_interrupt+0x0/0x2a
  [<60026288>] ? get_depth+0x0/0x4b
  [<6001d6e2>] ? um_request_irq+0x0/0x22e
  [<60027cd8>] vector_net_open+0x1e9/0x416
  [<60272059>] __dev_open+0x100/0x15a
  [<60271f20>] ? dev_set_rx_mode+0x0/0x39
  [<602723d7>] __dev_change_flags+0x125/0x1cc
  [<602724a8>] dev_change_flags+0x2a/0x67
  [<60282502>] do_setlink+0x375/0xe90
  [<6021c4e5>] ? __nla_validate_parse+0x6d/0x8cf
  [<6021cd88>] ? __nla_parse+0x2b/0x2d
  [<602836db>] __rtnl_newlink+0x3f2/0x81f
  [<600353a4>] ? set_signals+0x30/0x36
  [<6027c23f>] ? __nlmsg_parse+0x58/0x60
  [<600cf37a>] ? get_page_from_freelist+0x0/0x973
  [<600cd71b>] ? first_zones_zonelist+0x0/0x20
  [<600d0393>] ? __alloc_pages_nodemask+0x128/0x81c
  [<600377fe>] ? wait_stub_done+0x40/0x10c
  [<60283b63>] rtnl_newlink+0x5b/0x7b
  [<6029af9a>] ? __netlink_ns_capable+0x25/0x53
  [<6027bc5b>] ? rtnl_get_link+0x0/0x2b
  [<6027e0ae>] rtnetlink_rcv_msg+0x229/0x25a
  [<6002135c>] ? copy_chunk_from_user+0x0/0x2e
  [<6002135c>] ? copy_chunk_from_user+0x0/0x2e
  [<60021644>] ? buffer_op+0x4a/0xdb
  [<6027de85>] ? rtnetlink_rcv_msg+0x0/0x25a
  [<6029f289>] netlink_rcv_skb+0x67/0xcb
  [<6025c08d>] ? kfree_skb+0x0/0x2c
  [<6027c66a>] rtnetlink_rcv+0x1a/0x1c
  [<6029eace>] netlink_unicast+0x139/0x1df
  [<6029eec9>] netlink_sendmsg+0x355/0x37a
  [<602156b4>] ? __import_iovec+0xe1/0x104
  [<60250913>] ? sock_sendmsg_nosec+0x0/0x65
  [<60250925>] sock_sendmsg_nosec+0x12/0x65
  [<60250b65>] ____sys_sendmsg+0x10b/0x16b
  [<60252966>] ___sys_sendmsg+0x79/0xa8
  [<60037179>] ? do_syscall_stub+0xff/0x22f
  [<60037347>] ? run_syscall_stub+0x9e/0xa0
  [<60037441>] ? map+0x4a/0x4c
  [<600f9ed7>] ? __fget_light+0x36/0x63
  [<600f9f19>] ? __fdget+0x15/0x17
  [<602503fb>] ? fdget+0x10/0x1c
  [<60252a18>] __sys_sendmsg+0x54/0x82
  [<60252a5b>] sys_sendmsg+0x15/0x17
  [<6002132e>] handle_syscall+0x79/0xa7
  [<60037f0e>] userspace+0x483/0x510
  [<6001e166>] fork_handler+0x94/0x96
---[ end trace 95ba15d7fc338465 ]---
uml-vector uml-vector.0 vec0: vector_open: failed to get rx irq(-114)

>
> johannes
>
>
Johannes Berg Dec. 2, 2020, 11:29 a.m. UTC | #9
On Wed, 2020-12-02 at 09:03 +0000, Anton Ivanov wrote:
> 
> raw sockets on a vEth pair.
> 
> I avoid tap as it does not exercise the full vector IO code for now.
> One day I will find a way to extract from the kernel the underlying
> socket which tap uses internally and it will become 100% equivalent to
> the other transports.

OK, thanks.

> > > Try to ifup/ifdown the vecX interface inside the UML.
> > > 
> > > With master it works fine. With this patch it fails.
> > How does it fail?
> 
> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 401 at arch/um/kernel/irq.c:177 um_request_irq+0xff/0x22e

Hm, interesting. So that means the IRQ was still busy after
vector_net_close()?
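
(For reference, irq.c:177 in this tree should be the re-registration
check activate_fd() now does:

        if (WARN_ON(irq_entry->reg[type].events)) {
                err = -EALREADY;
                goto out_unlock;
        }

and -114 is -EALREADY, so the old registration's events were apparently
never cleared.)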

I'm still digging and trying to build a test case, but if you have a
test case at hand .. I wonder if IRQ aliasing happened - if you remove
the "goto out;" in free_irq_by_irq_and_dev(), does that help?


johannes
Anton Ivanov Dec. 2, 2020, 11:31 a.m. UTC | #10
On 02/12/2020 11:29, Johannes Berg wrote:
> On Wed, 2020-12-02 at 09:03 +0000, Anton Ivanov wrote:
>>
>> raw sockets on a vEth pair.
>>
>> I avoid tap as it does not exercise the full vector IO code for now.
>> One day I will find a way to extract from the kernel the underlying
>> socket which tap uses internally and it will become 100% equivalent to
>> the other transports.
> 
> OK, thanks.
> 
>>>> Try to ifup/ifdown the vecX interface inside the UML.
>>>>
>>>> With master it works fine. With this patch it fails.
>>> How does it fail?
>>
>> ------------[ cut here ]------------
>> WARNING: CPU: 0 PID: 401 at arch/um/kernel/irq.c:177 um_request_irq+0xff/0x22e
> 
> Hm, interesting. So that means the IRQ was still busy after
> vector_net_close()?
> 
> I'm still digging and trying to build a test case, but if you have a
> test case at hand .. I wonder if IRQ aliasing happened - if you remove
> the "goto out;" in free_irq_by_irq_and_dev(), does that help?
> 
> 
> johannes
> 
> 

I think we should just handle EPOLLHUP and do an IRQ + fd disable if it HUPs.

This way a close will always be handled correctly regardless of where and how it was closed.

I am about to try a patch on top of your 7 which does that.
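
Roughly along these lines (an untested sketch; the actual freeing would
still have to happen outside of the IRQ context):

        /* in sigio_handler(), after dispatching the triggered events */
        if (os_epoll_triggered(i, EPOLLHUP) > 0) {
                /* the fd is dead - stop polling it straight away */
                os_del_epoll_fd(irq_entry->fd);
        }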
Johannes Berg Dec. 2, 2020, 11:32 a.m. UTC | #11
On Wed, 2020-12-02 at 11:31 +0000, Anton Ivanov wrote:
> 
> I think we should just handle EPOLLHUP and do an IRQ + fd disable if it HUPs.
> 
> This way a close will always be handled correctly regardless of where and how it was closed.

Yes, but that's sort of a separate thing?

Also, we probably should disable SIGIO if the IRQ is freed, otherwise
the FD can keep interrupting us but we don't find anything in the epoll
set ... But again, a separate cleanup?
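
Something like making free_irq_by_fd() do what deactivate_fd() already
does after dropping the lock - a sketch:

        spin_lock_irqsave(&irq_lock, flags);
        to_free = get_irq_entry_by_fd(fd);
        free_irq_entry(to_free, true);
        spin_unlock_irqrestore(&irq_lock, flags);

        ignore_sigio_fd(fd);    /* stop SIGIO for this fd, too */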

johannes
Johannes Berg Dec. 2, 2020, 11:43 a.m. UTC | #12
On Wed, 2020-12-02 at 12:29 +0100, Johannes Berg wrote:
> 
> I'm still digging and trying to build a test case, but if you have a
> test case at hand .. I wonder if IRQ aliasing happened - if you remove
> the "goto out;" in free_irq_by_irq_and_dev(), does that help?

That wasn't it, but I can reproduce now.

johannes
Johannes Berg Dec. 2, 2020, 11:49 a.m. UTC | #13
> run iperf -s in the UML with a vector network driver.

Btw, 'ip link set vec0 up' reports a lockdep issue.

Looks sort of legitimate, though we're not SMP, so ...


======================================================
WARNING: possible circular locking dependency detected
5.10.0-rc4-00392-gedc4ff2eb69b-dirty #147 Not tainted
------------------------------------------------------
swapper/0 is trying to acquire lock:
0000000063701138 (&result->tail_lock){+.-.}-{2:2}, at: vector_send.isra.0+0x1ba/0x280

but task is already holding lock:
00000000637010f0 (&result->head_lock){+.-.}-{2:2}, at: vector_send.isra.0+0x30/0x280

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (&result->head_lock){+.-.}-{2:2}:
       save_stack_trace+0x2e/0x30
       stack_trace_save+0x34/0x39
       save_trace+0x8c/0x261
       __lock_acquire+0x1480/0x16b8
       lock_acquire+0x335/0x414
       _raw_spin_lock+0x31/0x85
       vector_net_start_xmit+0x164/0x340
       netdev_start_xmit+0x1f/0x47
       dev_hard_start_xmit+0x14b/0x230
       sch_direct_xmit+0xb0/0x248
       __qdisc_run+0x4f3/0x532
       qdisc_run+0x3d/0x51
       __dev_queue_xmit+0x2e0/0x6e8
       dev_queue_xmit+0x12/0x14
       neigh_resolve_output+0x13f/0x162
       ip6_finish_output2+0x642/0x713
       __ip6_finish_output+0xac/0xb5
       ip6_output+0xcf/0x113
       dst_output+0x72/0x7b
       NF_HOOK.constprop.0+0x115/0x124
       mld_sendpack+0x20e/0x2ba
       mld_ifc_timer_expire+0x259/0x2ab
       call_timer_fn+0x116/0x243
       __run_timers+0x207/0x246
       run_timer_softirq+0x1c/0x2c
       __do_softirq+0x1b9/0x43e
       irq_exit+0xc1/0x113
       do_IRQ+0x45/0x54
       timer_handler+0xce/0xed
       timer_real_alarm_handler+0x5c/0x5e
       unblock_signals+0x9c/0xd7
       arch_cpu_idle+0x5b/0x62
       default_idle_call+0x52/0x5e
       do_idle+0xd8/0x14b
       cpu_startup_entry+0x1e/0x20
       rest_init+0x135/0x141
       0x600016d2
       0x60001dfe
       0x60003845
       new_thread_handler+0x81/0xb2
       uml_finishsetup+0x54/0x59

-> #0 (&result->tail_lock){+.-.}-{2:2}:
       save_stack_trace+0x2e/0x30
       stack_trace_save+0x34/0x39
       save_trace+0x8c/0x261
       print_circular_bug+0x64/0x24b
       check_noncircular+0xd3/0xe3
       __lock_acquire+0x12ca/0x16b8
       lock_acquire+0x335/0x414
       _raw_spin_lock+0x31/0x85
       vector_send.isra.0+0x1ba/0x280
       vector_net_start_xmit+0x32b/0x340
       netdev_start_xmit+0x1f/0x47
       dev_hard_start_xmit+0x14b/0x230
       sch_direct_xmit+0xb0/0x248
       __qdisc_run+0x4f3/0x532
       qdisc_run+0x3d/0x51
       __dev_queue_xmit+0x2e0/0x6e8
       dev_queue_xmit+0x12/0x14
       neigh_resolve_output+0x13f/0x162
       ip6_finish_output2+0x642/0x713
       __ip6_finish_output+0xac/0xb5
       ip6_output+0xcf/0x113
       dst_output+0x72/0x7b
       NF_HOOK.constprop.0+0x115/0x124
       mld_sendpack+0x20e/0x2ba
       mld_ifc_timer_expire+0x259/0x2ab
       call_timer_fn+0x116/0x243
       __run_timers+0x207/0x246
       run_timer_softirq+0x1c/0x2c
       __do_softirq+0x1b9/0x43e
       irq_exit+0xc1/0x113
       do_IRQ+0x45/0x54
       timer_handler+0xce/0xed
       timer_real_alarm_handler+0x5c/0x5e
       unblock_signals+0x9c/0xd7
       arch_cpu_idle+0x5b/0x62
       default_idle_call+0x52/0x5e
       do_idle+0xd8/0x14b
       cpu_startup_entry+0x1e/0x20
       rest_init+0x135/0x141
       0x600016d2
       0x60001dfe
       0x60003845
       new_thread_handler+0x81/0xb2
       uml_finishsetup+0x54/0x59

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&result->head_lock);
                               lock(&result->tail_lock);
                               lock(&result->head_lock);
  lock(&result->tail_lock);

 *** DEADLOCK ***

8 locks held by swapper/0:
 #0: 0000000060687050 ((&idev->mc_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0x0/0x243
 #1: 0000000060887e40 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire.constprop.0+0x0/0x39
 #2: 0000000060887e00 (rcu_read_lock_bh){....}-{1:2}, at: rcu_lock_acquire+0x0/0x2f
 #3: 0000000060887e00 (rcu_read_lock_bh){....}-{1:2}, at: rcu_lock_acquire+0x0/0x2f
 #4: 0000000063fa9258 (&sch->seqlock){+...}-{2:2}, at: qdisc_run_begin+0x22/0x76
 #5: 0000000063fa9140 (dev->qdisc_running_key ?: &qdisc_running_key){+...}-{0:0}, at: qdisc_run+0x16/0x51
 #6: 0000000063636e88 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x84/0x248
 #7: 00000000637010f0 (&result->head_lock){+.-.}-{2:2}, at: vector_send.isra.0+0x30/0x280

stack backtrace:
CPU: 0 PID: 0 Comm: swapper Not tainted 5.10.0-rc4-00392-gedc4ff2eb69b-dirty #147
Stack:
 605d20dd 60c14280 60686830 604765ed
 60475ac7 60c14280 60c14180 60c14280
 60686840 60477bcd 606868a0 6007f8e4
Call Trace:
 [<604765ed>] ? printk+0x0/0x94
 [<60025c1b>] show_stack+0x13e/0x14d
 [<604765ed>] ? printk+0x0/0x94
 [<60475ac7>] ? __print_lock_name+0x0/0x90
 [<60477bcd>] dump_stack+0x34/0x36
 [<6007f8e4>] print_circular_bug+0x23c/0x24b
 [<6007d8de>] ? save_trace+0x8c/0x261
 [<6007f9c6>] check_noncircular+0xd3/0xe3
 [<6007dda1>] ? hlock_class+0x1e/0x9e
 [<60080880>] ? mark_lock.part.0+0x0/0x410
 [<6007dd83>] ? hlock_class+0x0/0x9e
 [<6008304e>] __lock_acquire+0x12ca/0x16b8
 [<6003b4b5>] ? set_signals+0x37/0x3f
 [<6007dd83>] ? hlock_class+0x0/0x9e
 [<60081515>] lock_acquire+0x335/0x414
 [<6002db48>] ? vector_send.isra.0+0x1ba/0x280
 [<6047a506>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<6047f337>] _raw_spin_lock+0x31/0x85
 [<6002db48>] ? vector_send.isra.0+0x1ba/0x280
 [<6047f306>] ? _raw_spin_lock+0x0/0x85
 [<6002db48>] vector_send.isra.0+0x1ba/0x280
 [<6047f7cb>] ? _raw_spin_unlock+0x0/0x32
 [<6002e06d>] vector_net_start_xmit+0x32b/0x340
 [<6032845a>] netdev_start_xmit+0x1f/0x47
 [<60479fa2>] ? match_held_lock+0x0/0x1de
 [<6047a506>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<60330d44>] dev_hard_start_xmit+0x14b/0x230
 [<6047a506>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<60367250>] sch_direct_xmit+0xb0/0x248
 [<60365ee5>] ? qdisc_qstats_cpu_qlen_dec+0x2a/0x2e
 [<60365ebb>] ? qdisc_qstats_cpu_qlen_dec+0x0/0x2e
 [<603678db>] __qdisc_run+0x4f3/0x532
 [<6047a506>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<6032d233>] qdisc_run+0x3d/0x51
 [<603311af>] __dev_queue_xmit+0x2e0/0x6e8
 [<6047a506>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<60338c91>] ? read_seqbegin+0x0/0xc9
 [<603315c9>] dev_queue_xmit+0x12/0x14
 [<6033d0d9>] neigh_resolve_output+0x13f/0x162
 [<6047a506>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<603f8e94>] ip6_finish_output2+0x642/0x713
 [<603fa9ef>] __ip6_finish_output+0xac/0xb5
 [<603faad9>] ip6_output+0xcf/0x113
 [<60424cac>] ? dst_output+0x0/0x7b
 [<60424d1e>] dst_output+0x72/0x7b
 [<6042571a>] NF_HOOK.constprop.0+0x115/0x124
 [<6041058a>] ? icmp6_dst_alloc+0xea/0x103
 [<6003b47e>] ? set_signals+0x0/0x3f
 [<60425937>] mld_sendpack+0x20e/0x2ba
 [<60427902>] ? add_grec+0x0/0x3d0
 [<6047f493>] ? _raw_spin_lock_bh+0x0/0x9b
 [<60428340>] mld_ifc_timer_expire+0x259/0x2ab
 [<604280e7>] ? mld_ifc_timer_expire+0x0/0x2ab
 [<6047a506>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<6009a795>] call_timer_fn+0x116/0x243
 [<6009a67f>] ? call_timer_fn+0x0/0x243
 [<6009b02e>] __run_timers+0x207/0x246
 [<604280e7>] ? mld_ifc_timer_expire+0x0/0x2ab
 [<6047f52e>] ? _raw_spin_lock_irq+0x0/0xb2
 [<6003b4a7>] ? set_signals+0x29/0x3f
 [<6047a4f4>] ? lock_is_held_type+0x135/0x147
 [<6047a362>] ? lockdep_hardirqs_on+0x1e2/0x23f
 [<6009ae27>] ? __run_timers+0x0/0x246
 [<6009b089>] run_timer_softirq+0x1c/0x2c
 [<60480331>] __do_softirq+0x1b9/0x43e
 [<600953e3>] ? unmask_irq+0x0/0x37
 [<6047f7cb>] ? _raw_spin_unlock+0x0/0x32
 [<60022c6b>] ? rcu_lock_acquire.constprop.0+0x0/0x39
 [<600492c8>] irq_exit+0xc1/0x113
 [<60049205>] ? irq_enter+0x10/0x12
 [<60024138>] do_IRQ+0x45/0x54
 [<6003b47e>] ? set_signals+0x0/0x3f
 [<60026ba5>] timer_handler+0xce/0xed
 [<600838e7>] ? lock_release+0x0/0x36c
 [<60061e0a>] ? find_task_by_pid_ns+0x0/0x97
 [<6003b03a>] timer_real_alarm_handler+0x5c/0x5e
 [<6003b439>] unblock_signals+0x9c/0xd7
 [<60024dac>] arch_cpu_idle+0x5b/0x62
 [<60097200>] ? rcu_read_lock_sched_held+0x2d/0x34
 [<600709f6>] ? trace_cpu_idle.constprop.0+0x0/0xa1
 [<6047f2aa>] default_idle_call+0x52/0x5e
 [<600838e7>] ? lock_release+0x0/0x36c
 [<60061e0a>] ? find_task_by_pid_ns+0x0/0x97
 [<6003b474>] ? get_signals+0x0/0xa
 [<60070c21>] do_idle+0xd8/0x14b
 [<6047ba87>] ? schedule+0x95/0xd6
 [<60070b49>] ? do_idle+0x0/0x14b
 [<6007101f>] cpu_startup_entry+0x1e/0x20
 [<60071001>] ? cpu_startup_entry+0x0/0x20
 [<6047a506>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<6047a6c5>] rest_init+0x135/0x141
 [<6047a590>] ? rest_init+0x0/0x141
 [<6003b474>] ? get_signals+0x0/0xa
 [<604765ed>] ? printk+0x0/0x94
 [<602b569e>] ? strlen+0x0/0x11
 [<600016d2>] 0x600016d2
 [<60001dfe>] 0x60001dfe
 [<6003b38c>] ? block_signals+0x0/0x11
 [<60003845>] 0x60003845
 [<60024816>] new_thread_handler+0x81/0xb2
 [<600037fa>] ? 0x600037fa
 [<600284b7>] uml_finishsetup+0x54/0x59

johannes
Johannes Berg Dec. 2, 2020, 11:54 a.m. UTC | #14
Oh, I found the bug ...


>  static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
>  {

[...]

> +	irq_entry->reg[type].irq = irq;
> +	irq_entry->reg[type].active = true;
> +	irq_entry->reg[type].events = events;

Obviously, here we need

	irq_entry->reg[type].id = dev_id;

:-)
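
i.e. the assignment block in activate_fd() then reads, with the fix applied:

	irq_entry->reg[type].id = dev_id;
	irq_entry->reg[type].irq = irq;
	irq_entry->reg[type].active = true;
	irq_entry->reg[type].events = events;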

johannes
Anton Ivanov Dec. 2, 2020, 11:56 a.m. UTC | #15
On 02/12/2020 11:32, Johannes Berg wrote:
> On Wed, 2020-12-02 at 11:31 +0000, Anton Ivanov wrote:
>> I think we should just handle EPOLLHUP and do an IRQ + fd disable if it HUPS.
>>
>> This way a close will always be handled correctly, regardless of where and how it was closed.
> Yes, but that's sort of a separate thing?

Yep, it is. Parking it for later, as it will allow us to remove the "free_irqs" kludge.
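
Roughly something like this in the per-fd loop of sigio_handler(), I imagine
(an untested sketch - assuming os_epoll_triggered() reports EPOLLHUP the same
way it reports the normal read/write event masks):

	irq_entry = os_epoll_get_data_pointer(i);

	/* other side closed the fd: disable its IRQs and drop the entry */
	if (os_epoll_triggered(i, EPOLLHUP) > 0) {
		free_irq_by_fd(irq_entry->fd);
		continue;
	}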

>
> Also, we probably should disable SIGIO if the IRQ is freed, otherwise
> the FD can keep interrupting us but we don't find anything in the epoll
> set ... But again, a separate cleanup?

I think that deleting the fd from the set should stop that.

Let me think on it. It was clearly being held together by duct tape and baling wire, and it needs fixing. You got most of it, which is grand; we just need to finish it.

>
> johannes
>
Anton Ivanov Dec. 2, 2020, 11:58 a.m. UTC | #16
On 02/12/2020 11:54, Johannes Berg wrote:
> Oh, I found the bug ...
>
>
>>   static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
>>   {
> [...]
>
>> +	irq_entry->reg[type].irq = irq;
>> +	irq_entry->reg[type].active = true;
>> +	irq_entry->reg[type].events = events;
> Obviously, here we need
>
> 	irq_entry->reg[type].id = dev_id;
>
> :-)

OK. Cool. I am waiting for the respin.

I will look at adding proper "close" handling and getting rid of the free_irqs kludge later.


>
> johannes
>
Johannes Berg Dec. 2, 2020, 11:58 a.m. UTC | #17
On Wed, 2020-12-02 at 11:56 +0000, Anton Ivanov wrote:
> 
> > Also, we probably should disable SIGIO if the IRQ is freed, otherwise
> > the FD can keep interrupting us but we don't find anything in the epoll
> > set ... But again, a separate cleanup?
> 
> I think that deleting the fd from the set should stop that.

It doesn't. We'd have to separately call ignore_sigio_fd(), but we only
do that in deactivate_fd(), not in any of the other code paths ...
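
Something like this at the end of free_irq_by_fd(), then (a rough, untested
sketch that just mirrors what deactivate_fd() already does):

	void free_irq_by_fd(int fd)
	{
		struct irq_entry *to_free;
		unsigned long flags;

		spin_lock_irqsave(&irq_lock, flags);
		to_free = get_irq_entry_by_fd(fd);
		free_irq_entry(to_free, true);
		spin_unlock_irqrestore(&irq_lock, flags);

		/* also stop SIGIO for this fd, like deactivate_fd() does */
		ignore_sigio_fd(fd);
	}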

> Let me think on it. It was clearly being held together by duct tape
> and baling wire, and it needs fixing. You got most of it, which is
> grand; we just need to finish it.

Indeed.

Will send out this set again with the fix in a few minutes.

johannes
Patch

diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 9e8f776bb43a..bbf5a466b44c 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -29,26 +29,23 @@  extern void free_irqs(void);
  * This is why we keep a small irq_reg array for each fd -
  * one entry per IRQ type
  */
-
 struct irq_reg {
 	void *id;
-	enum um_irq_type type;
 	int irq;
+	/* it's cheaper to store this than to query it */
 	int events;
 	bool active;
 	bool pending;
-	bool purge;
 };
 
 struct irq_entry {
-	struct irq_entry *next;
+	struct list_head list;
 	int fd;
-	struct irq_reg *irq_array[NUM_IRQ_TYPES];
+	struct irq_reg reg[NUM_IRQ_TYPES];
 };
 
-static struct irq_entry *active_fds;
-
 static DEFINE_SPINLOCK(irq_lock);
+static LIST_HEAD(active_fds);
 static DECLARE_BITMAP(irqs_allocated, NR_IRQS);
 
 static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs)
@@ -61,12 +58,13 @@  static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs)
  */
 	if (irq->active) {
 		irq->active = false;
+
 		do {
 			irq->pending = false;
 			do_IRQ(irq->irq, regs);
-		} while (irq->pending && (!irq->purge));
-		if (!irq->purge)
-			irq->active = true;
+		} while (irq->pending);
+
+		irq->active = true;
 	} else {
 		irq->pending = true;
 	}
@@ -75,9 +73,7 @@  static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs)
 void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 {
 	struct irq_entry *irq_entry;
-	struct irq_reg *irq;
-
-	int n, i, j;
+	int n, i;
 
 	while (1) {
 		/* This is now lockless - epoll keeps back-referencesto the irqs
@@ -96,21 +92,18 @@  void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 		}
 
 		for (i = 0; i < n ; i++) {
-			/* Epoll back reference is the entry with 2 irq_reg
-			 * leaves - one for each irq type.
-			 */
-			irq_entry = (struct irq_entry *)
-				os_epoll_get_data_pointer(i);
-			for (j = 0; j < NUM_IRQ_TYPES ; j++) {
-				irq = irq_entry->irq_array[j];
-				if (irq == NULL)
+			enum um_irq_type t;
+
+			irq_entry = os_epoll_get_data_pointer(i);
+
+			for (t = 0; t < NUM_IRQ_TYPES; t++) {
+				int events = irq_entry->reg[t].events;
+
+				if (!events)
 					continue;
-				if (os_epoll_triggered(i, irq->events) > 0)
-					irq_io_loop(irq, regs);
-				if (irq->purge) {
-					irq_entry->irq_array[j] = NULL;
-					kfree(irq);
-				}
+
+				if (os_epoll_triggered(i, events) > 0)
+					irq_io_loop(&irq_entry->reg[t], regs);
 			}
 		}
 	}
@@ -118,32 +111,59 @@  void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 	free_irqs();
 }
 
-static int assign_epoll_events_to_irq(struct irq_entry *irq_entry)
+static struct irq_entry *get_irq_entry_by_fd(int fd)
 {
-	int i;
-	int events = 0;
-	struct irq_reg *irq;
+	struct irq_entry *walk;
 
-	for (i = 0; i < NUM_IRQ_TYPES ; i++) {
-		irq = irq_entry->irq_array[i];
-		if (irq != NULL)
-			events = irq->events | events;
-	}
-	if (events > 0) {
-	/* os_add_epoll will call os_mod_epoll if this already exists */
-		return os_add_epoll_fd(events, irq_entry->fd, irq_entry);
+	lockdep_assert_held(&irq_lock);
+
+	list_for_each_entry(walk, &active_fds, list) {
+		if (walk->fd == fd)
+			return walk;
 	}
-	/* No events - delete */
-	return os_del_epoll_fd(irq_entry->fd);
+
+	return NULL;
+}
+
+static void free_irq_entry(struct irq_entry *to_free, bool remove)
+{
+	if (!to_free)
+		return;
+
+	if (remove)
+		os_del_epoll_fd(to_free->fd);
+	list_del(&to_free->list);
+	kfree(to_free);
 }
 
+static bool update_irq_entry(struct irq_entry *entry)
+{
+	enum um_irq_type i;
+	int events = 0;
+
+	for (i = 0; i < NUM_IRQ_TYPES; i++)
+		events |= entry->reg[i].events;
 
+	if (events) {
+		/* will modify (instead of add) if needed */
+		os_add_epoll_fd(events, entry->fd, entry);
+		return true;
+	}
+
+	os_del_epoll_fd(entry->fd);
+	return false;
+}
+
+static void update_or_free_irq_entry(struct irq_entry *entry)
+{
+	if (!update_irq_entry(entry))
+		free_irq_entry(entry, false);
+}
 
 static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
 {
-	struct irq_reg *new_fd;
 	struct irq_entry *irq_entry;
-	int i, err, events;
+	int err, events = os_event_mask(type);
 	unsigned long flags;
 
 	err = os_set_fd_async(fd);
@@ -151,73 +171,33 @@  static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
 		goto out;
 
 	spin_lock_irqsave(&irq_lock, flags);
-
-	/* Check if we have an entry for this fd */
-
-	err = -EBUSY;
-	for (irq_entry = active_fds;
-		irq_entry != NULL; irq_entry = irq_entry->next) {
-		if (irq_entry->fd == fd)
-			break;
-	}
-
-	if (irq_entry == NULL) {
-		/* This needs to be atomic as it may be called from an
-		 * IRQ context.
-		 */
-		irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC);
-		if (irq_entry == NULL) {
-			printk(KERN_ERR
-				"Failed to allocate new IRQ entry\n");
+	irq_entry = get_irq_entry_by_fd(fd);
+	if (irq_entry) {
+		/* cannot register the same FD twice with the same type */
+		if (WARN_ON(irq_entry->reg[type].events)) {
+			err = -EALREADY;
 			goto out_unlock;
 		}
-		irq_entry->fd = fd;
-		for (i = 0; i < NUM_IRQ_TYPES; i++)
-			irq_entry->irq_array[i] = NULL;
-		irq_entry->next = active_fds;
-		active_fds = irq_entry;
-	}
-
-	/* Check if we are trying to re-register an interrupt for a
-	 * particular fd
-	 */
 
-	if (irq_entry->irq_array[type] != NULL) {
-		printk(KERN_ERR
-			"Trying to reregister IRQ %d FD %d TYPE %d ID %p\n",
-			irq, fd, type, dev_id
-		);
-		goto out_unlock;
+		/* temporarily disable to avoid IRQ-side locking */
+		os_del_epoll_fd(fd);
 	} else {
-		/* New entry for this fd */
-
-		err = -ENOMEM;
-		new_fd = kmalloc(sizeof(struct irq_reg), GFP_ATOMIC);
-		if (new_fd == NULL)
+		irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC);
+		if (!irq_entry) {
+			err = -ENOMEM;
 			goto out_unlock;
-
-		events = os_event_mask(type);
-
-		*new_fd = ((struct irq_reg) {
-			.id		= dev_id,
-			.irq		= irq,
-			.type		= type,
-			.events		= events,
-			.active		= true,
-			.pending	= false,
-			.purge		= false
-		});
-		/* Turn off any IO on this fd - allows us to
-		 * avoid locking the IRQ loop
-		 */
-		os_del_epoll_fd(irq_entry->fd);
-		irq_entry->irq_array[type] = new_fd;
+		}
+		irq_entry->fd = fd;
+		list_add_tail(&irq_entry->list, &active_fds);
+		maybe_sigio_broken(fd);
 	}
 
-	/* Turn back IO on with the correct (new) IO event mask */
-	assign_epoll_events_to_irq(irq_entry);
+	irq_entry->reg[type].irq = irq;
+	irq_entry->reg[type].active = true;
+	irq_entry->reg[type].events = events;
+
+	WARN_ON(!update_irq_entry(irq_entry));
 	spin_unlock_irqrestore(&irq_lock, flags);
-	maybe_sigio_broken(fd);
 
 	return 0;
 out_unlock:
@@ -227,104 +207,10 @@  static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id)
 }
 
 /*
- * Walk the IRQ list and dispose of any unused entries.
- * Should be done under irq_lock.
+ * Remove the entry or entries for a specific FD, if you
+ * don't want to remove all the possible entries then use
+ * um_free_irq() or deactivate_fd() instead.
  */
-
-static void garbage_collect_irq_entries(void)
-{
-	int i;
-	bool reap;
-	struct irq_entry *walk;
-	struct irq_entry *previous = NULL;
-	struct irq_entry *to_free;
-
-	if (active_fds == NULL)
-		return;
-	walk = active_fds;
-	while (walk != NULL) {
-		reap = true;
-		for (i = 0; i < NUM_IRQ_TYPES ; i++) {
-			if (walk->irq_array[i] != NULL) {
-				reap = false;
-				break;
-			}
-		}
-		if (reap) {
-			if (previous == NULL)
-				active_fds = walk->next;
-			else
-				previous->next = walk->next;
-			to_free = walk;
-		} else {
-			to_free = NULL;
-		}
-		walk = walk->next;
-		kfree(to_free);
-	}
-}
-
-/*
- * Walk the IRQ list and get the descriptor for our FD
- */
-
-static struct irq_entry *get_irq_entry_by_fd(int fd)
-{
-	struct irq_entry *walk = active_fds;
-
-	while (walk != NULL) {
-		if (walk->fd == fd)
-			return walk;
-		walk = walk->next;
-	}
-	return NULL;
-}
-
-
-/*
- * Walk the IRQ list and dispose of an entry for a specific
- * device and number. Note - if sharing an IRQ for read
- * and write for the same FD it will be disposed in either case.
- * If this behaviour is undesirable use different IRQ ids.
- */
-
-#define IGNORE_IRQ 1
-#define IGNORE_DEV (1<<1)
-
-static void do_free_by_irq_and_dev(
-	struct irq_entry *irq_entry,
-	unsigned int irq,
-	void *dev,
-	int flags
-)
-{
-	int i;
-	struct irq_reg *to_free;
-
-	for (i = 0; i < NUM_IRQ_TYPES ; i++) {
-		if (irq_entry->irq_array[i] != NULL) {
-			if (
-			((flags & IGNORE_IRQ) ||
-				(irq_entry->irq_array[i]->irq == irq)) &&
-			((flags & IGNORE_DEV) ||
-				(irq_entry->irq_array[i]->id == dev))
-			) {
-				/* Turn off any IO on this fd - allows us to
-				 * avoid locking the IRQ loop
-				 */
-				os_del_epoll_fd(irq_entry->fd);
-				to_free = irq_entry->irq_array[i];
-				irq_entry->irq_array[i] = NULL;
-				assign_epoll_events_to_irq(irq_entry);
-				if (to_free->active)
-					to_free->purge = true;
-				else
-					kfree(to_free);
-			}
-		}
-	}
-}
-
 void free_irq_by_fd(int fd)
 {
 	struct irq_entry *to_free;
@@ -332,58 +218,64 @@  void free_irq_by_fd(int fd)
 
 	spin_lock_irqsave(&irq_lock, flags);
 	to_free = get_irq_entry_by_fd(fd);
-	if (to_free != NULL) {
-		do_free_by_irq_and_dev(
-			to_free,
-			-1,
-			NULL,
-			IGNORE_IRQ | IGNORE_DEV
-		);
-	}
-	garbage_collect_irq_entries();
+	free_irq_entry(to_free, true);
 	spin_unlock_irqrestore(&irq_lock, flags);
 }
 EXPORT_SYMBOL(free_irq_by_fd);
 
 static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
 {
-	struct irq_entry *to_free;
+	struct irq_entry *entry;
 	unsigned long flags;
 
 	spin_lock_irqsave(&irq_lock, flags);
-	to_free = active_fds;
-	while (to_free != NULL) {
-		do_free_by_irq_and_dev(
-			to_free,
-			irq,
-			dev,
-			0
-		);
-		to_free = to_free->next;
+	list_for_each_entry(entry, &active_fds, list) {
+		enum um_irq_type i;
+
+		for (i = 0; i < NUM_IRQ_TYPES; i++) {
+			struct irq_reg *reg = &entry->reg[i];
+
+			if (!reg->events)
+				continue;
+			if (reg->irq != irq)
+				continue;
+			if (reg->id != dev)
+				continue;
+
+			os_del_epoll_fd(entry->fd);
+			reg->events = 0;
+			update_or_free_irq_entry(entry);
+			goto out;
+		}
 	}
-	garbage_collect_irq_entries();
+out:
 	spin_unlock_irqrestore(&irq_lock, flags);
 }
 
-
 void deactivate_fd(int fd, int irqnum)
 {
-	struct irq_entry *to_free;
+	struct irq_entry *entry;
 	unsigned long flags;
+	enum um_irq_type i;
 
 	os_del_epoll_fd(fd);
+
 	spin_lock_irqsave(&irq_lock, flags);
-	to_free = get_irq_entry_by_fd(fd);
-	if (to_free != NULL) {
-		do_free_by_irq_and_dev(
-			to_free,
-			irqnum,
-			NULL,
-			IGNORE_DEV
-		);
+	entry = get_irq_entry_by_fd(fd);
+	if (!entry)
+		goto out;
+
+	for (i = 0; i < NUM_IRQ_TYPES; i++) {
+		if (!entry->reg[i].events)
+			continue;
+		if (entry->reg[i].irq == irqnum)
+			entry->reg[i].events = 0;
 	}
-	garbage_collect_irq_entries();
+
+	update_or_free_irq_entry(entry);
+out:
 	spin_unlock_irqrestore(&irq_lock, flags);
+
 	ignore_sigio_fd(fd);
 }
 EXPORT_SYMBOL(deactivate_fd);
@@ -396,24 +288,17 @@  EXPORT_SYMBOL(deactivate_fd);
  */
 int deactivate_all_fds(void)
 {
-	struct irq_entry *to_free;
+	struct irq_entry *entry;
 
 	/* Stop IO. The IRQ loop has no lock so this is our
 	 * only way of making sure we are safe to dispose
 	 * of all IRQ handlers
 	 */
 	os_set_ioignore();
-	to_free = active_fds;
-	while (to_free != NULL) {
-		do_free_by_irq_and_dev(
-			to_free,
-			-1,
-			NULL,
-			IGNORE_IRQ | IGNORE_DEV
-		);
-		to_free = to_free->next;
-	}
-	/* don't garbage collect - we can no longer call kfree() here */
+
+	/* we can no longer call kfree() here so just deactivate */
+	list_for_each_entry(entry, &active_fds, list)
+		os_del_epoll_fd(entry->fd);
 	os_close_epoll_fd();
 	return 0;
 }