[rdma-next,v1,11/12] IB/mlx5: Implement DEVX dispatching event

Message ID 20190618171540.11729-12-leon@kernel.org
State Awaiting Upstream
Delegated to: David Miller
Series DEVX asynchronous events

Commit Message

Leon Romanovsky June 18, 2019, 5:15 p.m. UTC
From: Yishai Hadas <yishaih@mellanox.com>

Implement DEVX event dispatching by looking up the applicable
subscriptions for the reported event and using their target fd to
signal/set the event.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c         | 362 +++++++++++++++++++++-
 include/uapi/rdma/mlx5_user_ioctl_verbs.h |   5 +
 2 files changed, 357 insertions(+), 10 deletions(-)

Comments

Jason Gunthorpe June 24, 2019, 12:03 p.m. UTC | #1
On Tue, Jun 18, 2019 at 08:15:39PM +0300, Leon Romanovsky wrote:
> From: Yishai Hadas <yishaih@mellanox.com>
> 
> Implement DEVX event dispatching by looking up the applicable
> subscriptions for the reported event and using their target fd to
> signal/set the event.
> 
> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
>  drivers/infiniband/hw/mlx5/devx.c         | 362 +++++++++++++++++++++-
>  include/uapi/rdma/mlx5_user_ioctl_verbs.h |   5 +
>  2 files changed, 357 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
> index 304b13e7a265..49fdce95d6d9 100644
> +++ b/drivers/infiniband/hw/mlx5/devx.c
> @@ -34,6 +34,11 @@ struct devx_async_data {
>  	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
>  };
>  
> +struct devx_async_event_data {
> +	struct list_head list; /* headed in ev_queue->event_list */
> +	struct mlx5_ib_uapi_devx_async_event_hdr hdr;
> +};
> +
>  /* first level XA value data structure */
>  struct devx_event {
>  	struct xarray object_ids; /* second XA level, Key = object id */
> @@ -54,7 +59,9 @@ struct devx_event_subscription {
>  				   * devx_obj_event->obj_sub_list
>  				   */
>  	struct list_head obj_list; /* headed in devx_object */
> +	struct list_head event_list; /* headed in ev_queue->event_list */
>  
> +	u8  is_cleaned:1;

There is a loose bool 'is_obj_related' that should be combined with
this bool bitfield as well.
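For illustration only, a minimal sketch of what folding the loose bool into
the existing bitfield could look like (member placement is an assumption,
the other fields are as in the patch):

	struct devx_event_subscription {
		/* ... list heads as in the patch ... */
		u8 is_cleaned:1;
		u8 is_obj_related:1;	/* previously a standalone bool */
		u32 xa_key_level1;
		u32 xa_key_level2;
		struct rcu_head rcu;
		/* ... */
	};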

>  static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
> -				      struct devx_event_subscription *sub)
> +				      struct devx_event_subscription *sub,
> +				      bool file_close)
>  {
> -	list_del_rcu(&sub->file_list);
> +	if (sub->is_cleaned)
> +		goto end;
> +
> +	sub->is_cleaned = 1;
>  	list_del_rcu(&sub->xa_list);
>  
>  	if (sub->is_obj_related) {
> @@ -1303,10 +1355,15 @@ static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
>  		}
>  	}
>  
> -	if (sub->eventfd)
> -		eventfd_ctx_put(sub->eventfd);
> +end:
> +	if (file_close) {
> +		if (sub->eventfd)
> +			eventfd_ctx_put(sub->eventfd);
>  
> -	kfree_rcu(sub, rcu);
> +		list_del_rcu(&sub->file_list);
> +		/* subscription may not be used by the read API any more */
> +		kfree_rcu(sub, rcu);
> +	}

I dislike this confusing file_close stuff; just put this in the single place
that calls this with the true bool
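A sketch of one way to read this suggestion, with the FD-only teardown moved
into the close path and the file_close parameter dropped (names are taken
from the patch, the exact shape is an assumption):

	/* in devx_async_event_close(), sketch only */
	mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
	list_for_each_entry_safe(event_sub, event_sub_tmp,
				 &ev_file->subscribed_events_list, file_list) {
		devx_cleanup_subscription(ev_file->dev, event_sub);
		if (event_sub->eventfd)
			eventfd_ctx_put(event_sub->eventfd);
		list_del_rcu(&event_sub->file_list);
		kfree_rcu(event_sub, rcu);
	}
	mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);

devx_cleanup_subscription() itself would then keep only the common xa/object
cleanup.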

> +static int deliver_event(struct devx_event_subscription *event_sub,
> +			 const void *data)
> +{
> +	struct ib_uobject *fd_uobj = event_sub->fd_uobj;
> +	struct devx_async_event_file *ev_file;
> +	struct devx_async_event_queue *ev_queue;
> +	struct devx_async_event_data *event_data;
> +	unsigned long flags;
> +	bool omit_data;
> +
> +	ev_file = container_of(fd_uobj, struct devx_async_event_file,
> +			       uobj);
> +	ev_queue = &ev_file->ev_queue;
> +	omit_data = ev_queue->flags &
> +		MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA;
> +
> +	if (omit_data) {
> +		spin_lock_irqsave(&ev_queue->lock, flags);
> +		if (!list_empty(&event_sub->event_list)) {
> +			spin_unlock_irqrestore(&ev_queue->lock, flags);
> +			return 0;
> +		}
> +
> +		list_add_tail(&event_sub->event_list, &ev_queue->event_list);
> +		spin_unlock_irqrestore(&ev_queue->lock, flags);
> +		wake_up_interruptible(&ev_queue->poll_wait);
> +		return 0;
> +	}
> +
> +	event_data = kzalloc(sizeof(*event_data) +
> +			     (omit_data ? 0 : sizeof(struct mlx5_eqe)),
> +			     GFP_ATOMIC);

omit_data is always false here
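So on this path the allocation could simply be (sketch):

	event_data = kzalloc(sizeof(*event_data) + sizeof(struct mlx5_eqe),
			     GFP_ATOMIC);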

> +	if (!event_data) {
> +		spin_lock_irqsave(&ev_queue->lock, flags);
> +		ev_queue->is_overflow_err = 1;
> +		spin_unlock_irqrestore(&ev_queue->lock, flags);
> +		return -ENOMEM;
> +	}
> +
> +	event_data->hdr.cookie = event_sub->cookie;
> +	memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
> +
> +	spin_lock_irqsave(&ev_queue->lock, flags);
> +	list_add_tail(&event_data->list, &ev_queue->event_list);
> +	spin_unlock_irqrestore(&ev_queue->lock, flags);
> +	wake_up_interruptible(&ev_queue->poll_wait);
> +
> +	return 0;
> +}
> +
> +static void dispatch_event_fd(struct list_head *fd_list,
> +			      const void *data)
> +{
> +	struct devx_event_subscription *item;
> +
> +	list_for_each_entry_rcu(item, fd_list, xa_list) {
> +		if (!get_file_rcu((struct file *)item->object))
> +			continue;
> +
> +		if (item->eventfd) {
> +			eventfd_signal(item->eventfd, 1);
> +			fput(item->object);
> +			continue;
> +		}
> +
> +		deliver_event(item, data);
> +		fput(item->object);
> +	}
> +}
> +
>  static int devx_event_notifier(struct notifier_block *nb,
>  			       unsigned long event_type, void *data)
>  {
> -	return NOTIFY_DONE;
> +	struct mlx5_devx_event_table *table;
> +	struct mlx5_ib_dev *dev;
> +	struct devx_event *event;
> +	struct devx_obj_event *obj_event;
> +	u16 obj_type = 0;
> +	bool is_unaffiliated;
> +	u32 obj_id;
> +
> +	/* Explicit filtering to kernel events which may occur frequently */
> +	if (event_type == MLX5_EVENT_TYPE_CMD ||
> +	    event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
> +		return NOTIFY_OK;
> +
> +	table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
> +	dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
> +	is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
> +
> +	if (!is_unaffiliated)
> +		obj_type = get_event_obj_type(event_type, data);
> +	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
> +	if (!event)
> +		return NOTIFY_DONE;

event should be in the rcu as well
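A sketch of what taking the RCU read lock before the first xa_load() could
look like here (assuming the subscription lists stay RCU-protected as in the
patch):

	rcu_read_lock();
	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
	if (!event) {
		rcu_read_unlock();
		return NOTIFY_DONE;
	}

	if (is_unaffiliated) {
		dispatch_event_fd(&event->unaffiliated_list, data);
		rcu_read_unlock();
		return NOTIFY_OK;
	}
	/* ... object id lookup and dispatch as before, then rcu_read_unlock() */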

> +	if (is_unaffiliated) {
> +		dispatch_event_fd(&event->unaffiliated_list, data);
> +		return NOTIFY_OK;
> +	}
> +
> +	obj_id = devx_get_obj_id_from_event(event_type, data);
> +	rcu_read_lock();
> +	obj_event = xa_load(&event->object_ids, obj_id);
> +	if (!obj_event) {
> +		rcu_read_unlock();
> +		return NOTIFY_DONE;
> +	}
> +
> +	dispatch_event_fd(&obj_event->obj_sub_list, data);
> +
> +	rcu_read_unlock();
> +	return NOTIFY_OK;
>  }
>  
>  void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
> @@ -2221,7 +2444,7 @@ void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
>  		event = entry;
>  		list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
>  					 xa_list)
> -			devx_cleanup_subscription(dev, sub);
> +			devx_cleanup_subscription(dev, sub, false);
>  		kfree(entry);
>  	}
>  	mutex_unlock(&dev->devx_event_table.event_xa_lock);
> @@ -2329,18 +2552,126 @@ static const struct file_operations devx_async_cmd_event_fops = {
>  static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
>  				     size_t count, loff_t *pos)
>  {
> -	return -EINVAL;
> +	struct devx_async_event_file *ev_file = filp->private_data;
> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
> +	struct devx_event_subscription *event_sub;
> +	struct devx_async_event_data *uninitialized_var(event);
> +	int ret = 0;
> +	size_t eventsz;
> +	bool omit_data;
> +	void *event_data;
> +
> +	omit_data = ev_queue->flags &
> +		MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA;
> +
> +	spin_lock_irq(&ev_queue->lock);
> +
> +	if (ev_queue->is_overflow_err) {
> +		ev_queue->is_overflow_err = 0;
> +		spin_unlock_irq(&ev_queue->lock);
> +		return -EOVERFLOW;
> +	}
> +
> +	while (list_empty(&ev_queue->event_list)) {
> +		spin_unlock_irq(&ev_queue->lock);
> +
> +		if (filp->f_flags & O_NONBLOCK)
> +			return -EAGAIN;
> +
> +		if (wait_event_interruptible(ev_queue->poll_wait,
> +			    (!list_empty(&ev_queue->event_list) ||
> +			     ev_queue->is_destroyed))) {
> +			return -ERESTARTSYS;
> +		}
> +
> +		if (list_empty(&ev_queue->event_list) &&
> +		    ev_queue->is_destroyed)
> +			return -EIO;

All these tests should be under the lock.

Why don't we return EIO as soon as is-destroyed happens? What is the
point of flushing out the accumulated events?
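A possible shape of the wait loop with both checks done under the lock and an
immediate -EIO once the queue is marked destroyed (sketch only;
wait_event_interruptible() itself still has to run unlocked):

	/* lock already held here, as in the patch */
	while (list_empty(&ev_queue->event_list)) {
		if (ev_queue->is_destroyed) {
			spin_unlock_irq(&ev_queue->lock);
			return -EIO;
		}
		spin_unlock_irq(&ev_queue->lock);

		if (filp->f_flags & O_NONBLOCK)
			return -EAGAIN;

		if (wait_event_interruptible(ev_queue->poll_wait,
			    (!list_empty(&ev_queue->event_list) ||
			     ev_queue->is_destroyed)))
			return -ERESTARTSYS;

		spin_lock_irq(&ev_queue->lock);
	}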

> +
> +		spin_lock_irq(&ev_queue->lock);
> +	}
> +
> +	if (omit_data) {
> +		event_sub = list_first_entry(&ev_queue->event_list,
> +					struct devx_event_subscription,
> +					event_list);
> +		eventsz = sizeof(event_sub->cookie);
> +		event_data = &event_sub->cookie;
> +	} else {
> +		event = list_first_entry(&ev_queue->event_list,
> +				      struct devx_async_event_data, list);
> +		eventsz = sizeof(struct mlx5_eqe) +
> +			sizeof(struct mlx5_ib_uapi_devx_async_event_hdr);
> +		event_data = &event->hdr;
> +	}
> +
> +	if (eventsz > count) {
> +		spin_unlock_irq(&ev_queue->lock);
> +		return -ENOSPC;

This is probably the wrong errno

> +	}
> +
> +	if (omit_data)
> +		list_del_init(&event_sub->event_list);
> +	else
> +		list_del(&event->list);
> +
> +	spin_unlock_irq(&ev_queue->lock);
> +
> +	if (copy_to_user(buf, event_data, eventsz))
> +		ret = -EFAULT;
> +	else
> +		ret = eventsz;

This is really poorly ordered, EFAULT will cause the event to be lost. :(

Maybe the event should be re-added on error? Tricky.
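One possible shape of the re-add idea, putting the entry back at the head of
the list and bailing out on -EFAULT (sketch only, using the local variable
names from the patch):

	if (copy_to_user(buf, event_data, eventsz)) {
		spin_lock_irq(&ev_queue->lock);
		/* put the event back at the head so it is not lost */
		if (omit_data)
			list_add(&event_sub->event_list, &ev_queue->event_list);
		else
			list_add(&event->list, &ev_queue->event_list);
		spin_unlock_irq(&ev_queue->lock);
		return -EFAULT;
	}

	if (!omit_data)
		kfree(event);
	return eventsz;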

> +	if (!omit_data)
> +		kfree(event);
> +	return ret;
>  }
>  
>  static __poll_t devx_async_event_poll(struct file *filp,
>  				      struct poll_table_struct *wait)
>  {
> -	return 0;
> +	struct devx_async_event_file *ev_file = filp->private_data;
> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
> +	__poll_t pollflags = 0;
> +
> +	poll_wait(filp, &ev_queue->poll_wait, wait);
> +
> +	spin_lock_irq(&ev_queue->lock);
> +	if (ev_queue->is_destroyed)
> +		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
> +	else if (!list_empty(&ev_queue->event_list))
> +		pollflags = EPOLLIN | EPOLLRDNORM;
> +	spin_unlock_irq(&ev_queue->lock);
> +
> +	return pollflags;
>  }
>  
>  static int devx_async_event_close(struct inode *inode, struct file *filp)
>  {
> +	struct ib_uobject *uobj = filp->private_data;
> +	struct devx_async_event_file *ev_file =
> +		container_of(uobj, struct devx_async_event_file, uobj);
> +	struct devx_event_subscription *event_sub, *event_sub_tmp;
> +	struct devx_async_event_data *entry, *tmp;
> +
> +	mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
> +	/* delete the subscriptions which are related to this FD */
> +	list_for_each_entry_safe(event_sub, event_sub_tmp,
> +				 &ev_file->subscribed_events_list, file_list)
> +		devx_cleanup_subscription(ev_file->dev, event_sub, true);
> +	mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);
> +
> +	/* free the pending events allocation */
> +	if (!(ev_file->ev_queue.flags &
> +	    MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA)) {
> +		spin_lock_irq(&ev_file->ev_queue.lock);
> +		list_for_each_entry_safe(entry, tmp,
> +					 &ev_file->ev_queue.event_list, list)
> +			kfree(entry); /* read can't come any nore */

spelling

> +		spin_unlock_irq(&ev_file->ev_queue.lock);
> +	}
>  	uverbs_close_fd(filp);
> +	put_device(&ev_file->dev->ib_dev.dev);
>  	return 0;
>  }
>  
> @@ -2374,6 +2705,17 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
>  static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
>  					    enum rdma_remove_reason why)
>  {
> +	struct devx_async_event_file *ev_file =
> +		container_of(uobj, struct devx_async_event_file,
> +			     uobj);
> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
> +
> +	spin_lock_irq(&ev_queue->lock);
> +	ev_queue->is_destroyed = 1;
> +	spin_unlock_irq(&ev_queue->lock);
> +
> +	if (why == RDMA_REMOVE_DRIVER_REMOVE)
> +		wake_up_interruptible(&ev_queue->poll_wait);

Why isn't this wakeup always done?
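i.e. the tail of the unplug handler could simply be (sketch):

	spin_lock_irq(&ev_queue->lock);
	ev_queue->is_destroyed = 1;
	spin_unlock_irq(&ev_queue->lock);

	wake_up_interruptible(&ev_queue->poll_wait);
	return 0;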

Jason
Yishai Hadas June 24, 2019, 4:55 p.m. UTC | #2
On 6/24/2019 3:03 PM, Jason Gunthorpe wrote:
> On Tue, Jun 18, 2019 at 08:15:39PM +0300, Leon Romanovsky wrote:
>> From: Yishai Hadas <yishaih@mellanox.com>
>>
>> Implement DEVX event dispatching by looking up the applicable
>> subscriptions for the reported event and using their target fd to
>> signal/set the event.
>>
>> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
>> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
>>   drivers/infiniband/hw/mlx5/devx.c         | 362 +++++++++++++++++++++-
>>   include/uapi/rdma/mlx5_user_ioctl_verbs.h |   5 +
>>   2 files changed, 357 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
>> index 304b13e7a265..49fdce95d6d9 100644
>> +++ b/drivers/infiniband/hw/mlx5/devx.c
>> @@ -34,6 +34,11 @@ struct devx_async_data {
>>   	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
>>   };
>>   
>> +struct devx_async_event_data {
>> +	struct list_head list; /* headed in ev_queue->event_list */
>> +	struct mlx5_ib_uapi_devx_async_event_hdr hdr;
>> +};
>> +
>>   /* first level XA value data structure */
>>   struct devx_event {
>>   	struct xarray object_ids; /* second XA level, Key = object id */
>> @@ -54,7 +59,9 @@ struct devx_event_subscription {
>>   				   * devx_obj_event->obj_sub_list
>>   				   */
>>   	struct list_head obj_list; /* headed in devx_object */
>> +	struct list_head event_list; /* headed in ev_queue->event_list */
>>   
>> +	u8  is_cleaned:1;
> 
> There is a loose bool 'is_obj_related' that should be combined with
> this bool bitfield as well.
> 

OK

>>   static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
>> -				      struct devx_event_subscription *sub)
>> +				      struct devx_event_subscription *sub,
>> +				      bool file_close)
>>   {
>> -	list_del_rcu(&sub->file_list);
>> +	if (sub->is_cleaned)
>> +		goto end;
>> +
>> +	sub->is_cleaned = 1;
>>   	list_del_rcu(&sub->xa_list);
>>   
>>   	if (sub->is_obj_related) {
>> @@ -1303,10 +1355,15 @@ static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
>>   		}
>>   	}
>>   
>> -	if (sub->eventfd)
>> -		eventfd_ctx_put(sub->eventfd);
>> +end:
>> +	if (file_close) {
>> +		if (sub->eventfd)
>> +			eventfd_ctx_put(sub->eventfd);
>>   
>> -	kfree_rcu(sub, rcu);
>> +		list_del_rcu(&sub->file_list);
>> +		/* subscription may not be used by the read API any more */
>> +		kfree_rcu(sub, rcu);
>> +	}
> 
> I dislike this confusing file_close stuff; just put this in the single place
> that calls this with the true bool
> 

OK, will do.

>> +static int deliver_event(struct devx_event_subscription *event_sub,
>> +			 const void *data)
>> +{
>> +	struct ib_uobject *fd_uobj = event_sub->fd_uobj;
>> +	struct devx_async_event_file *ev_file;
>> +	struct devx_async_event_queue *ev_queue;
>> +	struct devx_async_event_data *event_data;
>> +	unsigned long flags;
>> +	bool omit_data;
>> +
>> +	ev_file = container_of(fd_uobj, struct devx_async_event_file,
>> +			       uobj);
>> +	ev_queue = &ev_file->ev_queue;
>> +	omit_data = ev_queue->flags &
>> +		MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA;
>> +
>> +	if (omit_data) {
>> +		spin_lock_irqsave(&ev_queue->lock, flags);
>> +		if (!list_empty(&event_sub->event_list)) {
>> +			spin_unlock_irqrestore(&ev_queue->lock, flags);
>> +			return 0;
>> +		}
>> +
>> +		list_add_tail(&event_sub->event_list, &ev_queue->event_list);
>> +		spin_unlock_irqrestore(&ev_queue->lock, flags);
>> +		wake_up_interruptible(&ev_queue->poll_wait);
>> +		return 0;
>> +	}
>> +
>> +	event_data = kzalloc(sizeof(*event_data) +
>> +			     (omit_data ? 0 : sizeof(struct mlx5_eqe)),
>> +			     GFP_ATOMIC);
> 
> omit_data is always false here
> 

Correct, will clean it up.

>> +	if (!event_data) {
>> +		spin_lock_irqsave(&ev_queue->lock, flags);
>> +		ev_queue->is_overflow_err = 1;
>> +		spin_unlock_irqrestore(&ev_queue->lock, flags);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	event_data->hdr.cookie = event_sub->cookie;
>> +	memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
>> +
>> +	spin_lock_irqsave(&ev_queue->lock, flags);
>> +	list_add_tail(&event_data->list, &ev_queue->event_list);
>> +	spin_unlock_irqrestore(&ev_queue->lock, flags);
>> +	wake_up_interruptible(&ev_queue->poll_wait);
>> +
>> +	return 0;
>> +}
>> +
>> +static void dispatch_event_fd(struct list_head *fd_list,
>> +			      const void *data)
>> +{
>> +	struct devx_event_subscription *item;
>> +
>> +	list_for_each_entry_rcu(item, fd_list, xa_list) {
>> +		if (!get_file_rcu((struct file *)item->object))
>> +			continue;
>> +
>> +		if (item->eventfd) {
>> +			eventfd_signal(item->eventfd, 1);
>> +			fput(item->object);
>> +			continue;
>> +		}
>> +
>> +		deliver_event(item, data);
>> +		fput(item->object);
>> +	}
>> +}
>> +
>>   static int devx_event_notifier(struct notifier_block *nb,
>>   			       unsigned long event_type, void *data)
>>   {
>> -	return NOTIFY_DONE;
>> +	struct mlx5_devx_event_table *table;
>> +	struct mlx5_ib_dev *dev;
>> +	struct devx_event *event;
>> +	struct devx_obj_event *obj_event;
>> +	u16 obj_type = 0;
>> +	bool is_unaffiliated;
>> +	u32 obj_id;
>> +
>> +	/* Explicit filtering to kernel events which may occur frequently */
>> +	if (event_type == MLX5_EVENT_TYPE_CMD ||
>> +	    event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
>> +		return NOTIFY_OK;
>> +
>> +	table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
>> +	dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
>> +	is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
>> +
>> +	if (!is_unaffiliated)
>> +		obj_type = get_event_obj_type(event_type, data);
>> +	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
>> +	if (!event)
>> +		return NOTIFY_DONE;
> 
> event should be in the rcu as well

Do we really need this ? I didn't see a flow that really requires that.
> 
>> +	if (is_unaffiliated) {
>> +		dispatch_event_fd(&event->unaffiliated_list, data);
>> +		return NOTIFY_OK;
>> +	}
>> +
>> +	obj_id = devx_get_obj_id_from_event(event_type, data);
>> +	rcu_read_lock();
>> +	obj_event = xa_load(&event->object_ids, obj_id);
>> +	if (!obj_event) {
>> +		rcu_read_unlock();
>> +		return NOTIFY_DONE;
>> +	}
>> +
>> +	dispatch_event_fd(&obj_event->obj_sub_list, data);
>> +
>> +	rcu_read_unlock();
>> +	return NOTIFY_OK;
>>   }
>>   
>>   void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
>> @@ -2221,7 +2444,7 @@ void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
>>   		event = entry;
>>   		list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
>>   					 xa_list)
>> -			devx_cleanup_subscription(dev, sub);
>> +			devx_cleanup_subscription(dev, sub, false);
>>   		kfree(entry);
>>   	}
>>   	mutex_unlock(&dev->devx_event_table.event_xa_lock);
>> @@ -2329,18 +2552,126 @@ static const struct file_operations devx_async_cmd_event_fops = {
>>   static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
>>   				     size_t count, loff_t *pos)
>>   {
>> -	return -EINVAL;
>> +	struct devx_async_event_file *ev_file = filp->private_data;
>> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
>> +	struct devx_event_subscription *event_sub;
>> +	struct devx_async_event_data *uninitialized_var(event);
>> +	int ret = 0;
>> +	size_t eventsz;
>> +	bool omit_data;
>> +	void *event_data;
>> +
>> +	omit_data = ev_queue->flags &
>> +		MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA;
>> +
>> +	spin_lock_irq(&ev_queue->lock);
>> +
>> +	if (ev_queue->is_overflow_err) {
>> +		ev_queue->is_overflow_err = 0;
>> +		spin_unlock_irq(&ev_queue->lock);
>> +		return -EOVERFLOW;
>> +	}
>> +
>> +	while (list_empty(&ev_queue->event_list)) {
>> +		spin_unlock_irq(&ev_queue->lock);
>> +
>> +		if (filp->f_flags & O_NONBLOCK)
>> +			return -EAGAIN;
>> +
>> +		if (wait_event_interruptible(ev_queue->poll_wait,
>> +			    (!list_empty(&ev_queue->event_list) ||
>> +			     ev_queue->is_destroyed))) {
>> +			return -ERESTARTSYS;
>> +		}
>> +
>> +		if (list_empty(&ev_queue->event_list) &&
>> +		    ev_queue->is_destroyed)
>> +			return -EIO;
> 
> All these tests should be under the lock.

We can't call wait_event_interruptible() above under the lock, since it may
sleep, correct? Are you referring to the list_empty() and is_destroyed checks?

By the way, looking at the uverbs code [1], similar code there is not done
under the lock either.

[1] 
https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L244


> 
> Why don't we return EIO as soon as is-destroyed happens? What is the
> point of flushing out the accumulated events?

It follows the above uverbs code/logic, which returns existing events even
in that case; the async command events in this file follow that logic as
well, so I suggest staying consistent.
> 
>> +
>> +		spin_lock_irq(&ev_queue->lock);
>> +	}
>> +
>> +	if (omit_data) {
>> +		event_sub = list_first_entry(&ev_queue->event_list,
>> +					struct devx_event_subscription,
>> +					event_list);
>> +		eventsz = sizeof(event_sub->cookie);
>> +		event_data = &event_sub->cookie;
>> +	} else {
>> +		event = list_first_entry(&ev_queue->event_list,
>> +				      struct devx_async_event_data, list);
>> +		eventsz = sizeof(struct mlx5_eqe) +
>> +			sizeof(struct mlx5_ib_uapi_devx_async_event_hdr);
>> +		event_data = &event->hdr;
>> +	}
>> +
>> +	if (eventsz > count) {
>> +		spin_unlock_irq(&ev_queue->lock);
>> +		return -ENOSPC;
> 
> This is probably the wrong errno

OK, will change to -EINVAL as in uverbs
https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L254

> 
>> +	}
>> +
>> +	if (omit_data)
>> +		list_del_init(&event_sub->event_list);
>> +	else
>> +		list_del(&event->list);
>> +
>> +	spin_unlock_irq(&ev_queue->lock);
>> +
>> +	if (copy_to_user(buf, event_data, eventsz))
>> +		ret = -EFAULT;
>> +	else
>> +		ret = eventsz;
> 
> This is really poorly ordered, EFAULT will cause the event to be lost. :(

Agreed, but it's apparently a rare case; see also the notes below.

> 
> Maybe the event should be re-added on error? Tricky.

What will happen if another copy_to_user() then fails again (a loop?)...
not sure that we want to get into this tricky handling.

As above, it follows the uverbs logic in that area:
https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L267

> 
>> +	if (!omit_data)
>> +		kfree(event);
>> +	return ret;
>>   }
>>   
>>   static __poll_t devx_async_event_poll(struct file *filp,
>>   				      struct poll_table_struct *wait)
>>   {
>> -	return 0;
>> +	struct devx_async_event_file *ev_file = filp->private_data;
>> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
>> +	__poll_t pollflags = 0;
>> +
>> +	poll_wait(filp, &ev_queue->poll_wait, wait);
>> +
>> +	spin_lock_irq(&ev_queue->lock);
>> +	if (ev_queue->is_destroyed)
>> +		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
>> +	else if (!list_empty(&ev_queue->event_list))
>> +		pollflags = EPOLLIN | EPOLLRDNORM;
>> +	spin_unlock_irq(&ev_queue->lock);
>> +
>> +	return pollflags;
>>   }
>>   
>>   static int devx_async_event_close(struct inode *inode, struct file *filp)
>>   {
>> +	struct ib_uobject *uobj = filp->private_data;
>> +	struct devx_async_event_file *ev_file =
>> +		container_of(uobj, struct devx_async_event_file, uobj);
>> +	struct devx_event_subscription *event_sub, *event_sub_tmp;
>> +	struct devx_async_event_data *entry, *tmp;
>> +
>> +	mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
>> +	/* delete the subscriptions which are related to this FD */
>> +	list_for_each_entry_safe(event_sub, event_sub_tmp,
>> +				 &ev_file->subscribed_events_list, file_list)
>> +		devx_cleanup_subscription(ev_file->dev, event_sub, true);
>> +	mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);
>> +
>> +	/* free the pending events allocation */
>> +	if (!(ev_file->ev_queue.flags &
>> +	    MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA)) {
>> +		spin_lock_irq(&ev_file->ev_queue.lock);
>> +		list_for_each_entry_safe(entry, tmp,
>> +					 &ev_file->ev_queue.event_list, list)
>> +			kfree(entry); /* read can't come any nore */
> 
> spelling

OK

> 
>> +		spin_unlock_irq(&ev_file->ev_queue.lock);
>> +	}
>>   	uverbs_close_fd(filp);
>> +	put_device(&ev_file->dev->ib_dev.dev);
>>   	return 0;
>>   }
>>   
>> @@ -2374,6 +2705,17 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
>>   static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
>>   					    enum rdma_remove_reason why)
>>   {
>> +	struct devx_async_event_file *ev_file =
>> +		container_of(uobj, struct devx_async_event_file,
>> +			     uobj);
>> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
>> +
>> +	spin_lock_irq(&ev_queue->lock);
>> +	ev_queue->is_destroyed = 1;
>> +	spin_unlock_irq(&ev_queue->lock);
>> +
>> +	if (why == RDMA_REMOVE_DRIVER_REMOVE)
>> +		wake_up_interruptible(&ev_queue->poll_wait);
> 
> Why isn't this wakeup always done?

Maybe you are right, and this can always be done to wake up any readers
once 'is_destroyed' is set.

By the way, any idea why it was done this way in uverbs [1] for a similar
flow? The command events follow that as well.


[1]
https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_std_types.c#L207
Jason Gunthorpe June 24, 2019, 6:06 p.m. UTC | #3
On Mon, Jun 24, 2019 at 07:55:32PM +0300, Yishai Hadas wrote:

> > > +	/* Explicit filtering to kernel events which may occur frequently */
> > > +	if (event_type == MLX5_EVENT_TYPE_CMD ||
> > > +	    event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
> > > +		return NOTIFY_OK;
> > > +
> > > +	table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
> > > +	dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
> > > +	is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
> > > +
> > > +	if (!is_unaffiliated)
> > > +		obj_type = get_event_obj_type(event_type, data);
> > > +	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
> > > +	if (!event)
> > > +		return NOTIFY_DONE;
> > 
> > event should be in the rcu as well
> 
> Do we really need this ? I didn't see a flow that really requires
> that.

I think there are no frees left? Even so it makes much more sense to
include the event in the rcu as if we ever did need to kfree it would
have to be via rcu

> > > +	while (list_empty(&ev_queue->event_list)) {
> > > +		spin_unlock_irq(&ev_queue->lock);
> > > +
> > > +		if (filp->f_flags & O_NONBLOCK)
> > > +			return -EAGAIN;
> > > +
> > > +		if (wait_event_interruptible(ev_queue->poll_wait,
> > > +			    (!list_empty(&ev_queue->event_list) ||
> > > +			     ev_queue->is_destroyed))) {
> > > +			return -ERESTARTSYS;
> > > +		}
> > > +
> > > +		if (list_empty(&ev_queue->event_list) &&
> > > +		    ev_queue->is_destroyed)
> > > +			return -EIO;
> > 
> > All these tests should be under the lock.
> 
> We can't call wait_event_interruptible() above under the lock, since it may
> sleep, correct? Are you referring to the list_empty() and is_destroyed
> checks?

yes

> By the way, looking at the uverbs code [1], similar code there is not done
> under the lock either.
> 
> [1] https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L244

Also not a good idea

> > Why don't we return EIO as soon as is-destroyed happens? What is the
> > point of flushing out the accumulated events?
> 
> It follows the above uverbs code/logic, which returns existing events even in
> that case; the async command events in this file follow that logic as well,
> so I suggest staying consistent.

Don't follow broken uverbs stuff...

> > Maybe the event should be re-added on error? Tricky.
> 
> What will happen if another copy_to_user() then fails again (a loop?)...
> not sure that we want to get into this tricky handling.
> 
> As above, it follows the uverbs logic in that area:
> https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L267

again it is wrong...

There is no loop if you just stick the item back on the head of the
list and exit, which is probably the right thing to do..

> > > @@ -2374,6 +2705,17 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
> > >   static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
> > >   					    enum rdma_remove_reason why)
> > >   {
> > > +	struct devx_async_event_file *ev_file =
> > > +		container_of(uobj, struct devx_async_event_file,
> > > +			     uobj);
> > > +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
> > > +
> > > +	spin_lock_irq(&ev_queue->lock);
> > > +	ev_queue->is_destroyed = 1;
> > > +	spin_unlock_irq(&ev_queue->lock);
> > > +
> > > +	if (why == RDMA_REMOVE_DRIVER_REMOVE)
> > > +		wake_up_interruptible(&ev_queue->poll_wait);
> > 
> > Why isn't this wakeup always done?
> 
> Maybe you are right, and this can always be done to wake up any readers
> once 'is_destroyed' is set.
> 
> By the way, any idea why it was done this way in uverbs [1] for a similar
> flow? The command events follow that as well.

I don't know, it is probably pointless too.

If we don't need it here then we shouldn't have it.

These random pointless ifs bother me as we have to spend time trying
to figure out that they are pointless down the road.

Jason
Yishai Hadas June 25, 2019, 2:41 p.m. UTC | #4
On 6/24/2019 9:06 PM, Jason Gunthorpe wrote:
> On Mon, Jun 24, 2019 at 07:55:32PM +0300, Yishai Hadas wrote:
> 
>>>> +	/* Explicit filtering to kernel events which may occur frequently */
>>>> +	if (event_type == MLX5_EVENT_TYPE_CMD ||
>>>> +	    event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
>>>> +		return NOTIFY_OK;
>>>> +
>>>> +	table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
>>>> +	dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
>>>> +	is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
>>>> +
>>>> +	if (!is_unaffiliated)
>>>> +		obj_type = get_event_obj_type(event_type, data);
>>>> +	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
>>>> +	if (!event)
>>>> +		return NOTIFY_DONE;
>>>
>>> event should be in the rcu as well
>>
>> Do we really need this ? I didn't see a flow that really requires
>> that.
> 
> I think there are no frees left? Even so it makes much more sense to
> include the event in the rcu as if we ever did need to kfree it would
> have to be via rcu
> 

OK

>>>> +	while (list_empty(&ev_queue->event_list)) {
>>>> +		spin_unlock_irq(&ev_queue->lock);
>>>> +
>>>> +		if (filp->f_flags & O_NONBLOCK)
>>>> +			return -EAGAIN;
>>>> +
>>>> +		if (wait_event_interruptible(ev_queue->poll_wait,
>>>> +			    (!list_empty(&ev_queue->event_list) ||
>>>> +			     ev_queue->is_destroyed))) {
>>>> +			return -ERESTARTSYS;
>>>> +		}
>>>> +
>>>> +		if (list_empty(&ev_queue->event_list) &&
>>>> +		    ev_queue->is_destroyed)
>>>> +			return -EIO;
>>>
>>> All these tests should be under the lock.
>>
>> We can't call wait_event_interruptible() above under the lock, since it may
>> sleep, correct? Are you referring to the list_empty() and is_destroyed
>> checks?
> 
> yes
> 
>> By the way, looking at the uverbs code [1], similar code there is not done
>> under the lock either.
>>
>> [1] https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L244
> 
> Also not a good idea
> 
>>> Why don't we return EIO as soon as is-destroyed happens? What is the
>>> point of flushing out the accumulated events?
>>
>> It follows the above uverbs code/logic, which returns existing events even in
>> that case; the async command events in this file follow that logic as well,
>> so I suggest staying consistent.
> 
> Don't follow broken uverbs stuff...

Could it be that there is some event that we still want to deliver post
unbind/hot-unplug? For example, IB_EVENT_DEVICE_FATAL in uverbs, and
others from the driver code.

I'm not sure that we want to change this logic.
What do you think?

> 
>>> Maybe the event should be re-added on error? Tricky.
>>
>> What will happen if another copy_to_user() then fails again (a loop?)...
>> not sure that we want to get into this tricky handling.
>>
>> As above, it follows the uverbs logic in that area:
>> https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L267
> 
> again it is wrong...
> 
> There is no loop if you just stick the item back on the head of the
> list and exit, which is probably the right thing to do..
> 

What if copy_to_user() fails again just later on? We might end up with a
loop of read()s that always find an event because it was put back.
I suggest leaving this flow as it is now, at least for this series
submission.

Agree?


>>>> @@ -2374,6 +2705,17 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
>>>>    static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
>>>>    					    enum rdma_remove_reason why)
>>>>    {
>>>> +	struct devx_async_event_file *ev_file =
>>>> +		container_of(uobj, struct devx_async_event_file,
>>>> +			     uobj);
>>>> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
>>>> +
>>>> +	spin_lock_irq(&ev_queue->lock);
>>>> +	ev_queue->is_destroyed = 1;
>>>> +	spin_unlock_irq(&ev_queue->lock);
>>>> +
>>>> +	if (why == RDMA_REMOVE_DRIVER_REMOVE)
>>>> +		wake_up_interruptible(&ev_queue->poll_wait);
>>>
>>> Why isn't this wakeup always done?
>>
>> Maybe you are right, and this can always be done to wake up any readers
>> once 'is_destroyed' is set.
>>
>> By the way, any idea why it was done this way in uverbs [1] for a similar
>> flow? The command events follow that as well.
> 
> I don't know, it is probably pointless too.
> 
> If we don't need it here then we shouldn't have it.
> 
> These random pointless ifs bother me as we have to spend time trying
> to figure out that they are pointless down the road.
> 

OK, will drop this if.
Jason Gunthorpe June 25, 2019, 8:23 p.m. UTC | #5
On Tue, Jun 25, 2019 at 05:41:49PM +0300, Yishai Hadas wrote:
> > > > Why don't we return EIO as soon as is-destroyed happens? What is the
> > > > point of flushing out the accumulated events?
> > > 
> > > It follows the above uverb code/logic that returns existing events even in
> > > that case, also the async command events in this file follows that logic, I
> > > suggest to stay consistent.
> > 
> > Don't follow broken uverbs stuff...
> 
> Could it be that there is some event that we still want to deliver post
> unbind/hot-unplug? For example, IB_EVENT_DEVICE_FATAL in uverbs, and others
> from the driver code.

EIO is DEVICE_FATAL.

> I'm not sure that we want to change this logic.
> What do you think?

I think this code should exit immediately with EIO if the device is
disassociated.

> > > > Maybe the event should be re-added on error? Tricky.
> > > 
> > > What will happen if another copy_to_user may then fail again (loop ?) ...
> > > not sure that we want to get into this tricky handling ...
> > > 
> > > As of above, It follows the logic from uverbs at that area.
> > > https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L267
> > 
> > again it is wrong...
> > 
> > There is no loop if you just stick the item back on the head of the
> > list and exit, which is probably the right thing to do..
> > 
> 
> What if copy_to_user() fails again just later on? We might end up with a
> loop of read()s that always find an event because it was put back.

That is clearly an application bug and is not the concern of the
kernel..

> I suggest leaving this flow as it is now, at least for this series
> submission.
>
> Agree?

I don't think you can actually fix this, so maybe we have to leave
it. But add a comment explaining 
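For example, a comment along these lines (illustrative wording only, not
taken from the thread) could capture the reasoning at the copy_to_user()
call:

	/* The event is dropped if copy_to_user() fails; it is deliberately
	 * not re-queued, since a persistently bad user buffer would
	 * otherwise make the same entry readable forever.
	 */
	if (copy_to_user(buf, event_data, eventsz))
		ret = -EFAULT;
	else
		ret = eventsz;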

Jason

Patch

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 304b13e7a265..49fdce95d6d9 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -34,6 +34,11 @@  struct devx_async_data {
 	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
 };
 
+struct devx_async_event_data {
+	struct list_head list; /* headed in ev_queue->event_list */
+	struct mlx5_ib_uapi_devx_async_event_hdr hdr;
+};
+
 /* first level XA value data structure */
 struct devx_event {
 	struct xarray object_ids; /* second XA level, Key = object id */
@@ -54,7 +59,9 @@  struct devx_event_subscription {
 				   * devx_obj_event->obj_sub_list
 				   */
 	struct list_head obj_list; /* headed in devx_object */
+	struct list_head event_list; /* headed in ev_queue->event_list */
 
+	u8  is_cleaned:1;
 	u32 xa_key_level1;
 	u32 xa_key_level2;
 	struct rcu_head	rcu;
@@ -71,6 +78,7 @@  struct devx_async_event_queue {
 	struct list_head	event_list;
 	atomic_t		bytes_in_use;
 	u8			is_destroyed:1;
+	u8			is_overflow_err:1;
 	u32			flags;
 };
 
@@ -294,6 +302,46 @@  static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num)
 	}
 }
 
+/* Any future affiliated event should have a fixed header to get the obj
+ * type and id including events on legacy objects.
+ */
+static u32 get_affiliated_event_obj_id(struct mlx5_eqe *eqe)
+{
+	u32 obj_id = MLX5_GET(affiliated_event_header, eqe, obj_id);
+
+	return obj_id;
+}
+
+static u16 get_affiliated_event_obj_type(struct mlx5_eqe *eqe)
+{
+	u16 obj_type = MLX5_GET(affiliated_event_header, eqe, obj_type);
+
+	return obj_type;
+}
+
+static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe)
+{
+	switch (event_type) {
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+		return eqe->data.qp_srq.type;
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+		return 0;
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+		return MLX5_EVENT_QUEUE_TYPE_DCT;
+	default:
+		return get_affiliated_event_obj_type(eqe);
+	}
+}
+
 static u32 get_dec_obj_id(u64 obj_id)
 {
 	return (obj_id & 0xffffffff);
@@ -1276,9 +1324,13 @@  static void devx_cleanup_mkey(struct devx_obj *obj)
 }
 
 static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
-				      struct devx_event_subscription *sub)
+				      struct devx_event_subscription *sub,
+				      bool file_close)
 {
-	list_del_rcu(&sub->file_list);
+	if (sub->is_cleaned)
+		goto end;
+
+	sub->is_cleaned = 1;
 	list_del_rcu(&sub->xa_list);
 
 	if (sub->is_obj_related) {
@@ -1303,10 +1355,15 @@  static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
 		}
 	}
 
-	if (sub->eventfd)
-		eventfd_ctx_put(sub->eventfd);
+end:
+	if (file_close) {
+		if (sub->eventfd)
+			eventfd_ctx_put(sub->eventfd);
 
-	kfree_rcu(sub, rcu);
+		list_del_rcu(&sub->file_list);
+		/* subscription may not be used by the read API any more */
+		kfree_rcu(sub, rcu);
+	}
 }
 
 static int devx_obj_cleanup(struct ib_uobject *uobject,
@@ -1338,7 +1395,7 @@  static int devx_obj_cleanup(struct ib_uobject *uobject,
 
 	mutex_lock(&devx_event_table->event_xa_lock);
 	list_for_each_entry_safe(sub_entry, tmp, &obj->event_sub, obj_list)
-		devx_cleanup_subscription(dev, sub_entry);
+		devx_cleanup_subscription(dev, sub_entry, false);
 	mutex_unlock(&devx_event_table->event_xa_lock);
 
 	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
@@ -1588,6 +1645,7 @@  static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)(
 	devx_init_event_queue(&ev_file->ev_queue, flags);
 	INIT_LIST_HEAD(&ev_file->subscribed_events_list);
 	ev_file->dev = dev;
+	get_device(&dev->ib_dev.dev);
 	return 0;
 }
 
@@ -1981,6 +2039,7 @@  static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
 		event_sub_arr[i]->xa_key_level1 = key_level1;
 		event_sub_arr[i]->xa_key_level2 = obj_id;
 		event_sub_arr[i]->is_obj_related = obj ? true : false;
+		INIT_LIST_HEAD(&event_sub_arr[i]->event_list);
 	}
 
 	/* Once all the allocations and the reservations for level 2
@@ -2191,10 +2250,174 @@  static int devx_umem_cleanup(struct ib_uobject *uobject,
 	return 0;
 }
 
+static bool is_unaffiliated_event(struct mlx5_core_dev *dev,
+				  unsigned long event_type)
+{
+	__be64 *unaff_events;
+	int mask_entry;
+	int mask_bit;
+
+	if (!MLX5_CAP_GEN(dev, event_cap))
+		return is_legacy_unaffiliated_event_num(event_type);
+
+	unaff_events = MLX5_CAP_DEV_EVENT(dev,
+					  user_unaffiliated_events);
+	WARN_ON(event_type > MAX_SUPP_EVENT_NUM);
+
+	mask_entry = event_type / 64;
+	mask_bit = event_type % 64;
+
+	if (!(be64_to_cpu(unaff_events[mask_entry]) & (1ull << mask_bit)))
+		return false;
+
+	return true;
+}
+
+static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data)
+{
+	struct mlx5_eqe *eqe = data;
+	u32 obj_id = 0;
+
+	switch (event_type) {
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+		obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+		break;
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+		obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+		break;
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+		obj_id = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
+		break;
+	default:
+		obj_id = get_affiliated_event_obj_id(data);
+		break;
+	}
+
+	return obj_id;
+}
+
+static int deliver_event(struct devx_event_subscription *event_sub,
+			 const void *data)
+{
+	struct ib_uobject *fd_uobj = event_sub->fd_uobj;
+	struct devx_async_event_file *ev_file;
+	struct devx_async_event_queue *ev_queue;
+	struct devx_async_event_data *event_data;
+	unsigned long flags;
+	bool omit_data;
+
+	ev_file = container_of(fd_uobj, struct devx_async_event_file,
+			       uobj);
+	ev_queue = &ev_file->ev_queue;
+	omit_data = ev_queue->flags &
+		MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA;
+
+	if (omit_data) {
+		spin_lock_irqsave(&ev_queue->lock, flags);
+		if (!list_empty(&event_sub->event_list)) {
+			spin_unlock_irqrestore(&ev_queue->lock, flags);
+			return 0;
+		}
+
+		list_add_tail(&event_sub->event_list, &ev_queue->event_list);
+		spin_unlock_irqrestore(&ev_queue->lock, flags);
+		wake_up_interruptible(&ev_queue->poll_wait);
+		return 0;
+	}
+
+	event_data = kzalloc(sizeof(*event_data) +
+			     (omit_data ? 0 : sizeof(struct mlx5_eqe)),
+			     GFP_ATOMIC);
+	if (!event_data) {
+		spin_lock_irqsave(&ev_queue->lock, flags);
+		ev_queue->is_overflow_err = 1;
+		spin_unlock_irqrestore(&ev_queue->lock, flags);
+		return -ENOMEM;
+	}
+
+	event_data->hdr.cookie = event_sub->cookie;
+	memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
+
+	spin_lock_irqsave(&ev_queue->lock, flags);
+	list_add_tail(&event_data->list, &ev_queue->event_list);
+	spin_unlock_irqrestore(&ev_queue->lock, flags);
+	wake_up_interruptible(&ev_queue->poll_wait);
+
+	return 0;
+}
+
+static void dispatch_event_fd(struct list_head *fd_list,
+			      const void *data)
+{
+	struct devx_event_subscription *item;
+
+	list_for_each_entry_rcu(item, fd_list, xa_list) {
+		if (!get_file_rcu((struct file *)item->object))
+			continue;
+
+		if (item->eventfd) {
+			eventfd_signal(item->eventfd, 1);
+			fput(item->object);
+			continue;
+		}
+
+		deliver_event(item, data);
+		fput(item->object);
+	}
+}
+
 static int devx_event_notifier(struct notifier_block *nb,
 			       unsigned long event_type, void *data)
 {
-	return NOTIFY_DONE;
+	struct mlx5_devx_event_table *table;
+	struct mlx5_ib_dev *dev;
+	struct devx_event *event;
+	struct devx_obj_event *obj_event;
+	u16 obj_type = 0;
+	bool is_unaffiliated;
+	u32 obj_id;
+
+	/* Explicit filtering to kernel events which may occur frequently */
+	if (event_type == MLX5_EVENT_TYPE_CMD ||
+	    event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
+		return NOTIFY_OK;
+
+	table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
+	dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
+	is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
+
+	if (!is_unaffiliated)
+		obj_type = get_event_obj_type(event_type, data);
+	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
+	if (!event)
+		return NOTIFY_DONE;
+
+	if (is_unaffiliated) {
+		dispatch_event_fd(&event->unaffiliated_list, data);
+		return NOTIFY_OK;
+	}
+
+	obj_id = devx_get_obj_id_from_event(event_type, data);
+	rcu_read_lock();
+	obj_event = xa_load(&event->object_ids, obj_id);
+	if (!obj_event) {
+		rcu_read_unlock();
+		return NOTIFY_DONE;
+	}
+
+	dispatch_event_fd(&obj_event->obj_sub_list, data);
+
+	rcu_read_unlock();
+	return NOTIFY_OK;
 }
 
 void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
@@ -2221,7 +2444,7 @@  void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
 		event = entry;
 		list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
 					 xa_list)
-			devx_cleanup_subscription(dev, sub);
+			devx_cleanup_subscription(dev, sub, false);
 		kfree(entry);
 	}
 	mutex_unlock(&dev->devx_event_table.event_xa_lock);
@@ -2329,18 +2552,126 @@  static const struct file_operations devx_async_cmd_event_fops = {
 static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
 				     size_t count, loff_t *pos)
 {
-	return -EINVAL;
+	struct devx_async_event_file *ev_file = filp->private_data;
+	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
+	struct devx_event_subscription *event_sub;
+	struct devx_async_event_data *uninitialized_var(event);
+	int ret = 0;
+	size_t eventsz;
+	bool omit_data;
+	void *event_data;
+
+	omit_data = ev_queue->flags &
+		MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA;
+
+	spin_lock_irq(&ev_queue->lock);
+
+	if (ev_queue->is_overflow_err) {
+		ev_queue->is_overflow_err = 0;
+		spin_unlock_irq(&ev_queue->lock);
+		return -EOVERFLOW;
+	}
+
+	while (list_empty(&ev_queue->event_list)) {
+		spin_unlock_irq(&ev_queue->lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(ev_queue->poll_wait,
+			    (!list_empty(&ev_queue->event_list) ||
+			     ev_queue->is_destroyed))) {
+			return -ERESTARTSYS;
+		}
+
+		if (list_empty(&ev_queue->event_list) &&
+		    ev_queue->is_destroyed)
+			return -EIO;
+
+		spin_lock_irq(&ev_queue->lock);
+	}
+
+	if (omit_data) {
+		event_sub = list_first_entry(&ev_queue->event_list,
+					struct devx_event_subscription,
+					event_list);
+		eventsz = sizeof(event_sub->cookie);
+		event_data = &event_sub->cookie;
+	} else {
+		event = list_first_entry(&ev_queue->event_list,
+				      struct devx_async_event_data, list);
+		eventsz = sizeof(struct mlx5_eqe) +
+			sizeof(struct mlx5_ib_uapi_devx_async_event_hdr);
+		event_data = &event->hdr;
+	}
+
+	if (eventsz > count) {
+		spin_unlock_irq(&ev_queue->lock);
+		return -ENOSPC;
+	}
+
+	if (omit_data)
+		list_del_init(&event_sub->event_list);
+	else
+		list_del(&event->list);
+
+	spin_unlock_irq(&ev_queue->lock);
+
+	if (copy_to_user(buf, event_data, eventsz))
+		ret = -EFAULT;
+	else
+		ret = eventsz;
+
+	if (!omit_data)
+		kfree(event);
+	return ret;
 }
 
 static __poll_t devx_async_event_poll(struct file *filp,
 				      struct poll_table_struct *wait)
 {
-	return 0;
+	struct devx_async_event_file *ev_file = filp->private_data;
+	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
+	__poll_t pollflags = 0;
+
+	poll_wait(filp, &ev_queue->poll_wait, wait);
+
+	spin_lock_irq(&ev_queue->lock);
+	if (ev_queue->is_destroyed)
+		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+	else if (!list_empty(&ev_queue->event_list))
+		pollflags = EPOLLIN | EPOLLRDNORM;
+	spin_unlock_irq(&ev_queue->lock);
+
+	return pollflags;
 }
 
 static int devx_async_event_close(struct inode *inode, struct file *filp)
 {
+	struct ib_uobject *uobj = filp->private_data;
+	struct devx_async_event_file *ev_file =
+		container_of(uobj, struct devx_async_event_file, uobj);
+	struct devx_event_subscription *event_sub, *event_sub_tmp;
+	struct devx_async_event_data *entry, *tmp;
+
+	mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
+	/* delete the subscriptions which are related to this FD */
+	list_for_each_entry_safe(event_sub, event_sub_tmp,
+				 &ev_file->subscribed_events_list, file_list)
+		devx_cleanup_subscription(ev_file->dev, event_sub, true);
+	mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);
+
+	/* free the pending events allocation */
+	if (!(ev_file->ev_queue.flags &
+	    MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA)) {
+		spin_lock_irq(&ev_file->ev_queue.lock);
+		list_for_each_entry_safe(entry, tmp,
+					 &ev_file->ev_queue.event_list, list)
+			kfree(entry); /* read can't come any nore */
+		spin_unlock_irq(&ev_file->ev_queue.lock);
+	}
 	uverbs_close_fd(filp);
+	put_device(&ev_file->dev->ib_dev.dev);
 	return 0;
 }
 
@@ -2374,6 +2705,17 @@  static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
 static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
 					    enum rdma_remove_reason why)
 {
+	struct devx_async_event_file *ev_file =
+		container_of(uobj, struct devx_async_event_file,
+			     uobj);
+	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
+
+	spin_lock_irq(&ev_queue->lock);
+	ev_queue->is_destroyed = 1;
+	spin_unlock_irq(&ev_queue->lock);
+
+	if (why == RDMA_REMOVE_DRIVER_REMOVE)
+		wake_up_interruptible(&ev_queue->poll_wait);
 	return 0;
 };
 
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index 57beea4589e4..9500ff7363ef 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -67,5 +67,10 @@  enum mlx5_ib_uapi_devx_create_event_channel_flags {
 	MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA = 1 << 0,
 };
 
+struct mlx5_ib_uapi_devx_async_event_hdr {
+	__aligned_u64	cookie;
+	__u8		out_data[];
+};
+
 #endif
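For reference, a userspace sketch of consuming this header from an event
channel FD. Assumptions: the channel was created without the OMIT_EV_DATA
flag, so each read() returns a cookie followed by the raw EQE; the 64-byte
payload size and the installed header path mirror this series and may differ.

	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <rdma/mlx5_user_ioctl_verbs.h>

	static int read_one_event(int fd)
	{
		/* header plus one 64-byte EQE, as written by the kernel side */
		char buf[sizeof(struct mlx5_ib_uapi_devx_async_event_hdr) + 64];
		struct mlx5_ib_uapi_devx_async_event_hdr *hdr = (void *)buf;
		ssize_t n = read(fd, buf, sizeof(buf));

		if (n < (ssize_t)sizeof(*hdr))
			return -1;

		printf("cookie 0x%llx, %zd bytes of event data\n",
		       (unsigned long long)hdr->cookie, n - sizeof(*hdr));
		return 0;
	}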