diff mbox series

[RFC,06/15] fanotify: Support submission through ring buffer

Message ID 20210426184201.4177978-7-krisman@collabora.com
State Superseded
Headers show
Series File system wide monitoring | expand

Commit Message

Gabriel Krisman Bertazi April 26, 2021, 6:41 p.m. UTC
This adds support for the ring buffer mode in fanotify.  It is enabled
by a new flag FAN_PREALLOC_QUEUE passed to fanotify_init.  If this flag
is enabled, the group only allows marks that support the ring buffer
submission.  In a following patch, FAN_ERROR will make use of this
mechanism.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 fs/notify/fanotify/fanotify.c      | 77 +++++++++++++++++++---------
 fs/notify/fanotify/fanotify_user.c | 81 ++++++++++++++++++------------
 include/linux/fanotify.h           |  5 +-
 include/uapi/linux/fanotify.h      |  1 +
 4 files changed, 105 insertions(+), 59 deletions(-)

Comments

Amir Goldstein April 27, 2021, 6:02 a.m. UTC | #1
On Mon, Apr 26, 2021 at 9:42 PM Gabriel Krisman Bertazi
<krisman@collabora.com> wrote:
>
> This adds support for the ring buffer mode in fanotify.  It is enabled
> by a new flag FAN_PREALLOC_QUEUE passed to fanotify_init.  If this flag
> is enabled, the group only allows marks that support the ring buffer

I don't like this limitation.
I think FAN_PREALLOC_QUEUE can work with other events, why not?

In any case if we keep ring buffer, please use a different set of
fanotify_ring_buffer_ops struct instead of spraying if/else all over the
event queue implementation.

> submission.  In a following patch, FAN_ERROR will make use of this
> mechanism.
>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> ---
>  fs/notify/fanotify/fanotify.c      | 77 +++++++++++++++++++---------
>  fs/notify/fanotify/fanotify_user.c | 81 ++++++++++++++++++------------
>  include/linux/fanotify.h           |  5 +-
>  include/uapi/linux/fanotify.h      |  1 +
>  4 files changed, 105 insertions(+), 59 deletions(-)
>
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index e3669d8a4a64..98591a8155a7 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -612,6 +612,26 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
>         return event;
>  }
>
> +static struct fanotify_event *fanotify_ring_get_slot(struct fsnotify_group *group,
> +                                                    u32 mask, const void *data,
> +                                                    int data_type)
> +{
> +       size_t size = 0;
> +
> +       pr_debug("%s: group=%p mask=%x size=%lu\n", __func__, group, mask, size);
> +
> +       return FANOTIFY_E(fsnotify_ring_alloc_event_slot(group, size));
> +}
> +
> +static void fanotify_ring_write_event(struct fsnotify_group *group,
> +                                     struct fanotify_event *event, u32 mask,
> +                                     const void *data, __kernel_fsid_t *fsid)
> +{
> +       fanotify_init_event(group, event, 0, mask);
> +
> +       event->pid = get_pid(task_tgid(current));
> +}
> +
>  /*
>   * Get cached fsid of the filesystem containing the object from any connector.
>   * All connectors are supposed to have the same fsid, but we do not verify that
> @@ -701,31 +721,38 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>                         return 0;
>         }
>
> -       event = fanotify_alloc_event(group, mask, data, data_type, dir,
> -                                    file_name, &fsid);
> -       ret = -ENOMEM;
> -       if (unlikely(!event)) {
> -               /*
> -                * We don't queue overflow events for permission events as
> -                * there the access is denied and so no event is in fact lost.
> -                */
> -               if (!fanotify_is_perm_event(mask))
> -                       fsnotify_queue_overflow(group);
> -               goto finish;
> -       }
> -
> -       fsn_event = &event->fse;
> -       ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
> -       if (ret) {
> -               /* Permission events shouldn't be merged */
> -               BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
> -               /* Our event wasn't used in the end. Free it. */
> -               fsnotify_destroy_event(group, fsn_event);
> -
> -               ret = 0;
> -       } else if (fanotify_is_perm_event(mask)) {
> -               ret = fanotify_get_response(group, FANOTIFY_PERM(event),
> -                                           iter_info);
> +       if (group->flags & FSN_SUBMISSION_RING_BUFFER) {
> +               event = fanotify_ring_get_slot(group, mask, data, data_type);
> +               if (IS_ERR(event))
> +                       return PTR_ERR(event);

So no FAN_OVERFLOW with the ring buffer implementation?
This will be unexpected for fanotify users and frankly, less useful IMO.
I also don't see the technical reason to omit the overflow event.

> +               fanotify_ring_write_event(group, event, mask, data, &fsid);
> +               fsnotify_ring_commit_slot(group, &event->fse);
> +       } else {
> +               event = fanotify_alloc_event(group, mask, data, data_type, dir,
> +                                            file_name, &fsid);
> +               ret = -ENOMEM;
> +               if (unlikely(!event)) {
> +                       /*
> +                        * We don't queue overflow events for permission events as
> +                        * there the access is denied and so no event is in fact lost.
> +                        */
> +                       if (!fanotify_is_perm_event(mask))
> +                               fsnotify_queue_overflow(group);
> +                       goto finish;
> +               }
> +               fsn_event = &event->fse;
> +               ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
> +               if (ret) {
> +                       /* Permission events shouldn't be merged */
> +                       BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
> +                       /* Our event wasn't used in the end. Free it. */
> +                       fsnotify_destroy_event(group, fsn_event);
> +
> +                       ret = 0;
> +               } else if (fanotify_is_perm_event(mask)) {
> +                       ret = fanotify_get_response(group, FANOTIFY_PERM(event),
> +                                                   iter_info);
> +               }
>         }
>  finish:
>         if (fanotify_is_perm_event(mask))
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index fe605359af88..5031198bf7db 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -521,7 +521,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
>                  * Permission events get queued to wait for response.  Other
>                  * events can be destroyed now.
>                  */
> -               if (!fanotify_is_perm_event(event->mask)) {
> +               if (group->fanotify_data.flags & FAN_PREALLOC_QUEUE) {
> +                       fsnotify_ring_buffer_consume_event(group, &event->fse);
> +               } else if (!fanotify_is_perm_event(event->mask)) {
>                         fsnotify_destroy_event(group, &event->fse);
>                 } else {
>                         if (ret <= 0) {
> @@ -587,40 +589,39 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>          */
>         fsnotify_group_stop_queueing(group);
>
> -       /*
> -        * Process all permission events on access_list and notification queue
> -        * and simulate reply from userspace.
> -        */
> -       spin_lock(&group->notification_lock);
> -       while (!list_empty(&group->fanotify_data.access_list)) {
> -               struct fanotify_perm_event *event;
> -
> -               event = list_first_entry(&group->fanotify_data.access_list,
> -                               struct fanotify_perm_event, fae.fse.list);
> -               list_del_init(&event->fae.fse.list);
> -               finish_permission_event(group, event, FAN_ALLOW);
> +       if (!(group->flags & FSN_SUBMISSION_RING_BUFFER)) {
> +               /*
> +                * Process all permission events on access_list and notification queue
> +                * and simulate reply from userspace.
> +                */
>                 spin_lock(&group->notification_lock);
> -       }
> -
> -       /*
> -        * Destroy all non-permission events. For permission events just
> -        * dequeue them and set the response. They will be freed once the
> -        * response is consumed and fanotify_get_response() returns.
> -        */
> -       while (!fsnotify_notify_queue_is_empty(group)) {
> -               struct fanotify_event *event;
> -
> -               event = FANOTIFY_E(fsnotify_remove_first_event(group));
> -               if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
> -                       spin_unlock(&group->notification_lock);
> -                       fsnotify_destroy_event(group, &event->fse);
> -               } else {
> -                       finish_permission_event(group, FANOTIFY_PERM(event),
> -                                               FAN_ALLOW);
> +               while (!list_empty(&group->fanotify_data.access_list)) {
> +                       struct fanotify_perm_event *event;
> +                       event = list_first_entry(&group->fanotify_data.access_list,
> +                                                struct fanotify_perm_event, fae.fse.list);
> +                       list_del_init(&event->fae.fse.list);
> +                       finish_permission_event(group, event, FAN_ALLOW);
> +                       spin_lock(&group->notification_lock);
>                 }
> -               spin_lock(&group->notification_lock);
> +               /*
> +                * Destroy all non-permission events. For permission events just
> +                * dequeue them and set the response. They will be freed once the
> +                * response is consumed and fanotify_get_response() returns.
> +                */
> +               while (!fsnotify_notify_queue_is_empty(group)) {
> +                       struct fanotify_event *event;
> +                       event = FANOTIFY_E(fsnotify_remove_first_event(group));
> +                       if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
> +                               spin_unlock(&group->notification_lock);
> +                               fsnotify_destroy_event(group, &event->fse);
> +                       } else {
> +                               finish_permission_event(group, FANOTIFY_PERM(event),
> +                                                       FAN_ALLOW);
> +                       }
> +                       spin_lock(&group->notification_lock);
> +               }
> +               spin_unlock(&group->notification_lock);
>         }
> -       spin_unlock(&group->notification_lock);
>
>         /* Response for all permission events it set, wakeup waiters */
>         wake_up(&group->fanotify_data.access_waitq);
> @@ -981,6 +982,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>         if (flags & FAN_NONBLOCK)
>                 f_flags |= O_NONBLOCK;
>
> +       if (flags & FAN_PREALLOC_QUEUE) {
> +               if (!capable(CAP_SYS_ADMIN))
> +                       return -EPERM;
> +
> +               if (flags & FAN_UNLIMITED_QUEUE)
> +                       return -EINVAL;
> +
> +               fsn_flags = FSN_SUBMISSION_RING_BUFFER;
> +       }
> +
>         /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
>         group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops, fsn_flags);
>         if (IS_ERR(group)) {
> @@ -1223,6 +1234,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>                 goto fput_and_out;
>         }
>
> +       if ((group->flags & FSN_SUBMISSION_RING_BUFFER) &&
> +           (mask & ~FANOTIFY_SUBMISSION_BUFFER_EVENTS))
> +               goto fput_and_out;
> +
>         ret = fanotify_find_path(dfd, pathname, &path, flags,
>                         (mask & ALL_FSNOTIFY_EVENTS), obj_type);
>         if (ret)
> @@ -1327,7 +1342,7 @@ SYSCALL32_DEFINE6(fanotify_mark,
>   */
>  static int __init fanotify_user_setup(void)
>  {
> -       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
> +       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
>         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
>
>         fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
> diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
> index 3e9c56ee651f..5a4cefb4b1c3 100644
> --- a/include/linux/fanotify.h
> +++ b/include/linux/fanotify.h
> @@ -23,7 +23,8 @@
>  #define FANOTIFY_INIT_FLAGS    (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \
>                                  FAN_REPORT_TID | \
>                                  FAN_CLOEXEC | FAN_NONBLOCK | \
> -                                FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
> +                                FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS | \
> +                                FAN_PREALLOC_QUEUE)
>
>  #define FANOTIFY_MARK_TYPE_BITS        (FAN_MARK_INODE | FAN_MARK_MOUNT | \
>                                  FAN_MARK_FILESYSTEM)
> @@ -71,6 +72,8 @@
>                                          FANOTIFY_PERM_EVENTS | \
>                                          FAN_Q_OVERFLOW | FAN_ONDIR)
>
> +#define FANOTIFY_SUBMISSION_BUFFER_EVENTS 0

FANOTIFY_RING_BUFFER_EVENTS? FANOTIFY_PREALLOC_EVENTS?

Please leave a comment above to state what this group means.
I *think* there is no reason to limit the set of events, only the sort of
information that is possible with FAN_PREALLOC_QUEUE.

Perhaps FAN_REPORT_FID cannot be allowed and as a result
FANOTIFY_INODE_EVENTS will not be allowed, but I am not even
sure if that limitation is needed.

Thanks,
Amir.
Gabriel Krisman Bertazi April 29, 2021, 6:36 p.m. UTC | #2
Amir Goldstein <amir73il@gmail.com> writes:

> On Mon, Apr 26, 2021 at 9:42 PM Gabriel Krisman Bertazi
> <krisman@collabora.com> wrote:
>>
>> This adds support for the ring buffer mode in fanotify.  It is enabled
>> by a new flag FAN_PREALLOC_QUEUE passed to fanotify_init.  If this flag
>> is enabled, the group only allows marks that support the ring buffer
>
> I don't like this limitation.
> I think FAN_PREALLOC_QUEUE can work with other events, why not?

The only complications I see are permission events and mergeable events,
which would no longer be merged.  The merging problem is not big,
except it changes the existing expectations.  Other than that, it should
be trivial to have every FAN_CLASS_NOTIF events in the ring buffer.
>
> In any case if we keep ring buffer, please use a different set of
> fanotify_ring_buffer_ops struct instead of spraying if/else all over the
> event queue implementation.
>
>> submission.  In a following patch, FAN_ERROR will make use of this
>> mechanism.
>>
>> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>> ---
>>  fs/notify/fanotify/fanotify.c      | 77 +++++++++++++++++++---------
>>  fs/notify/fanotify/fanotify_user.c | 81 ++++++++++++++++++------------
>>  include/linux/fanotify.h           |  5 +-
>>  include/uapi/linux/fanotify.h      |  1 +
>>  4 files changed, 105 insertions(+), 59 deletions(-)
>>
>> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
>> index e3669d8a4a64..98591a8155a7 100644
>> --- a/fs/notify/fanotify/fanotify.c
>> +++ b/fs/notify/fanotify/fanotify.c
>> @@ -612,6 +612,26 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
>>         return event;
>>  }
>>
>> +static struct fanotify_event *fanotify_ring_get_slot(struct fsnotify_group *group,
>> +                                                    u32 mask, const void *data,
>> +                                                    int data_type)
>> +{
>> +       size_t size = 0;
>> +
>> +       pr_debug("%s: group=%p mask=%x size=%lu\n", __func__, group, mask, size);
>> +
>> +       return FANOTIFY_E(fsnotify_ring_alloc_event_slot(group, size));
>> +}
>> +
>> +static void fanotify_ring_write_event(struct fsnotify_group *group,
>> +                                     struct fanotify_event *event, u32 mask,
>> +                                     const void *data, __kernel_fsid_t *fsid)
>> +{
>> +       fanotify_init_event(group, event, 0, mask);
>> +
>> +       event->pid = get_pid(task_tgid(current));
>> +}
>> +
>>  /*
>>   * Get cached fsid of the filesystem containing the object from any connector.
>>   * All connectors are supposed to have the same fsid, but we do not verify that
>> @@ -701,31 +721,38 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>>                         return 0;
>>         }
>>
>> -       event = fanotify_alloc_event(group, mask, data, data_type, dir,
>> -                                    file_name, &fsid);
>> -       ret = -ENOMEM;
>> -       if (unlikely(!event)) {
>> -               /*
>> -                * We don't queue overflow events for permission events as
>> -                * there the access is denied and so no event is in fact lost.
>> -                */
>> -               if (!fanotify_is_perm_event(mask))
>> -                       fsnotify_queue_overflow(group);
>> -               goto finish;
>> -       }
>> -
>> -       fsn_event = &event->fse;
>> -       ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
>> -       if (ret) {
>> -               /* Permission events shouldn't be merged */
>> -               BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
>> -               /* Our event wasn't used in the end. Free it. */
>> -               fsnotify_destroy_event(group, fsn_event);
>> -
>> -               ret = 0;
>> -       } else if (fanotify_is_perm_event(mask)) {
>> -               ret = fanotify_get_response(group, FANOTIFY_PERM(event),
>> -                                           iter_info);
>> +       if (group->flags & FSN_SUBMISSION_RING_BUFFER) {
>> +               event = fanotify_ring_get_slot(group, mask, data, data_type);
>> +               if (IS_ERR(event))
>> +                       return PTR_ERR(event);
>
> So no FAN_OVERFLOW with the ring buffer implementation?
> This will be unexpected for fanotify users and frankly, less useful IMO.
> I also don't see the technical reason to omit the overflow event.
>
>> +               fanotify_ring_write_event(group, event, mask, data, &fsid);
>> +               fsnotify_ring_commit_slot(group, &event->fse);
>> +       } else {
>> +               event = fanotify_alloc_event(group, mask, data, data_type, dir,
>> +                                            file_name, &fsid);
>> +               ret = -ENOMEM;
>> +               if (unlikely(!event)) {
>> +                       /*
>> +                        * We don't queue overflow events for permission events as
>> +                        * there the access is denied and so no event is in fact lost.
>> +                        */
>> +                       if (!fanotify_is_perm_event(mask))
>> +                               fsnotify_queue_overflow(group);
>> +                       goto finish;
>> +               }
>> +               fsn_event = &event->fse;
>> +               ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
>> +               if (ret) {
>> +                       /* Permission events shouldn't be merged */
>> +                       BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
>> +                       /* Our event wasn't used in the end. Free it. */
>> +                       fsnotify_destroy_event(group, fsn_event);
>> +
>> +                       ret = 0;
>> +               } else if (fanotify_is_perm_event(mask)) {
>> +                       ret = fanotify_get_response(group, FANOTIFY_PERM(event),
>> +                                                   iter_info);
>> +               }
>>         }
>>  finish:
>>         if (fanotify_is_perm_event(mask))
>> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
>> index fe605359af88..5031198bf7db 100644
>> --- a/fs/notify/fanotify/fanotify_user.c
>> +++ b/fs/notify/fanotify/fanotify_user.c
>> @@ -521,7 +521,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
>>                  * Permission events get queued to wait for response.  Other
>>                  * events can be destroyed now.
>>                  */
>> -               if (!fanotify_is_perm_event(event->mask)) {
>> +               if (group->fanotify_data.flags & FAN_PREALLOC_QUEUE) {
>> +                       fsnotify_ring_buffer_consume_event(group, &event->fse);
>> +               } else if (!fanotify_is_perm_event(event->mask)) {
>>                         fsnotify_destroy_event(group, &event->fse);
>>                 } else {
>>                         if (ret <= 0) {
>> @@ -587,40 +589,39 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>>          */
>>         fsnotify_group_stop_queueing(group);
>>
>> -       /*
>> -        * Process all permission events on access_list and notification queue
>> -        * and simulate reply from userspace.
>> -        */
>> -       spin_lock(&group->notification_lock);
>> -       while (!list_empty(&group->fanotify_data.access_list)) {
>> -               struct fanotify_perm_event *event;
>> -
>> -               event = list_first_entry(&group->fanotify_data.access_list,
>> -                               struct fanotify_perm_event, fae.fse.list);
>> -               list_del_init(&event->fae.fse.list);
>> -               finish_permission_event(group, event, FAN_ALLOW);
>> +       if (!(group->flags & FSN_SUBMISSION_RING_BUFFER)) {
>> +               /*
>> +                * Process all permission events on access_list and notification queue
>> +                * and simulate reply from userspace.
>> +                */
>>                 spin_lock(&group->notification_lock);
>> -       }
>> -
>> -       /*
>> -        * Destroy all non-permission events. For permission events just
>> -        * dequeue them and set the response. They will be freed once the
>> -        * response is consumed and fanotify_get_response() returns.
>> -        */
>> -       while (!fsnotify_notify_queue_is_empty(group)) {
>> -               struct fanotify_event *event;
>> -
>> -               event = FANOTIFY_E(fsnotify_remove_first_event(group));
>> -               if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
>> -                       spin_unlock(&group->notification_lock);
>> -                       fsnotify_destroy_event(group, &event->fse);
>> -               } else {
>> -                       finish_permission_event(group, FANOTIFY_PERM(event),
>> -                                               FAN_ALLOW);
>> +               while (!list_empty(&group->fanotify_data.access_list)) {
>> +                       struct fanotify_perm_event *event;
>> +                       event = list_first_entry(&group->fanotify_data.access_list,
>> +                                                struct fanotify_perm_event, fae.fse.list);
>> +                       list_del_init(&event->fae.fse.list);
>> +                       finish_permission_event(group, event, FAN_ALLOW);
>> +                       spin_lock(&group->notification_lock);
>>                 }
>> -               spin_lock(&group->notification_lock);
>> +               /*
>> +                * Destroy all non-permission events. For permission events just
>> +                * dequeue them and set the response. They will be freed once the
>> +                * response is consumed and fanotify_get_response() returns.
>> +                */
>> +               while (!fsnotify_notify_queue_is_empty(group)) {
>> +                       struct fanotify_event *event;
>> +                       event = FANOTIFY_E(fsnotify_remove_first_event(group));
>> +                       if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
>> +                               spin_unlock(&group->notification_lock);
>> +                               fsnotify_destroy_event(group, &event->fse);
>> +                       } else {
>> +                               finish_permission_event(group, FANOTIFY_PERM(event),
>> +                                                       FAN_ALLOW);
>> +                       }
>> +                       spin_lock(&group->notification_lock);
>> +               }
>> +               spin_unlock(&group->notification_lock);
>>         }
>> -       spin_unlock(&group->notification_lock);
>>
>>         /* Response for all permission events it set, wakeup waiters */
>>         wake_up(&group->fanotify_data.access_waitq);
>> @@ -981,6 +982,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>>         if (flags & FAN_NONBLOCK)
>>                 f_flags |= O_NONBLOCK;
>>
>> +       if (flags & FAN_PREALLOC_QUEUE) {
>> +               if (!capable(CAP_SYS_ADMIN))
>> +                       return -EPERM;
>> +
>> +               if (flags & FAN_UNLIMITED_QUEUE)
>> +                       return -EINVAL;
>> +
>> +               fsn_flags = FSN_SUBMISSION_RING_BUFFER;
>> +       }
>> +
>>         /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
>>         group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops, fsn_flags);
>>         if (IS_ERR(group)) {
>> @@ -1223,6 +1234,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>>                 goto fput_and_out;
>>         }
>>
>> +       if ((group->flags & FSN_SUBMISSION_RING_BUFFER) &&
>> +           (mask & ~FANOTIFY_SUBMISSION_BUFFER_EVENTS))
>> +               goto fput_and_out;
>> +
>>         ret = fanotify_find_path(dfd, pathname, &path, flags,
>>                         (mask & ALL_FSNOTIFY_EVENTS), obj_type);
>>         if (ret)
>> @@ -1327,7 +1342,7 @@ SYSCALL32_DEFINE6(fanotify_mark,
>>   */
>>  static int __init fanotify_user_setup(void)
>>  {
>> -       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
>> +       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
>>         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
>>
>>         fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
>> diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
>> index 3e9c56ee651f..5a4cefb4b1c3 100644
>> --- a/include/linux/fanotify.h
>> +++ b/include/linux/fanotify.h
>> @@ -23,7 +23,8 @@
>>  #define FANOTIFY_INIT_FLAGS    (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \
>>                                  FAN_REPORT_TID | \
>>                                  FAN_CLOEXEC | FAN_NONBLOCK | \
>> -                                FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
>> +                                FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS | \
>> +                                FAN_PREALLOC_QUEUE)
>>
>>  #define FANOTIFY_MARK_TYPE_BITS        (FAN_MARK_INODE | FAN_MARK_MOUNT | \
>>                                  FAN_MARK_FILESYSTEM)
>> @@ -71,6 +72,8 @@
>>                                          FANOTIFY_PERM_EVENTS | \
>>                                          FAN_Q_OVERFLOW | FAN_ONDIR)
>>
>> +#define FANOTIFY_SUBMISSION_BUFFER_EVENTS 0
>
> FANOTIFY_RING_BUFFER_EVENTS? FANOTIFY_PREALLOC_EVENTS?
>
> Please leave a comment above to state what this group means.
> I *think* there is no reason to limit the set of events, only the sort of
> information that is possible with FAN_PREALLOC_QUEUE.
>
> Perhaps FAN_REPORT_FID cannot be allowed and as a result
> FANOTIFY_INODE_EVENTS will not be allowed, but I am not even
> sure if that limitation is needed.
>
> Thanks,
> Amir.
diff mbox series

Patch

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index e3669d8a4a64..98591a8155a7 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -612,6 +612,26 @@  static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
 	return event;
 }
 
+static struct fanotify_event *fanotify_ring_get_slot(struct fsnotify_group *group,
+						     u32 mask, const void *data,
+						     int data_type)
+{
+	size_t size = 0;
+
+	pr_debug("%s: group=%p mask=%x size=%lu\n", __func__, group, mask, size);
+
+	return FANOTIFY_E(fsnotify_ring_alloc_event_slot(group, size));
+}
+
+static void fanotify_ring_write_event(struct fsnotify_group *group,
+				      struct fanotify_event *event, u32 mask,
+				      const void *data, __kernel_fsid_t *fsid)
+{
+	fanotify_init_event(group, event, 0, mask);
+
+	event->pid = get_pid(task_tgid(current));
+}
+
 /*
  * Get cached fsid of the filesystem containing the object from any connector.
  * All connectors are supposed to have the same fsid, but we do not verify that
@@ -701,31 +721,38 @@  static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
 			return 0;
 	}
 
-	event = fanotify_alloc_event(group, mask, data, data_type, dir,
-				     file_name, &fsid);
-	ret = -ENOMEM;
-	if (unlikely(!event)) {
-		/*
-		 * We don't queue overflow events for permission events as
-		 * there the access is denied and so no event is in fact lost.
-		 */
-		if (!fanotify_is_perm_event(mask))
-			fsnotify_queue_overflow(group);
-		goto finish;
-	}
-
-	fsn_event = &event->fse;
-	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
-	if (ret) {
-		/* Permission events shouldn't be merged */
-		BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
-		/* Our event wasn't used in the end. Free it. */
-		fsnotify_destroy_event(group, fsn_event);
-
-		ret = 0;
-	} else if (fanotify_is_perm_event(mask)) {
-		ret = fanotify_get_response(group, FANOTIFY_PERM(event),
-					    iter_info);
+	if (group->flags & FSN_SUBMISSION_RING_BUFFER) {
+		event = fanotify_ring_get_slot(group, mask, data, data_type);
+		if (IS_ERR(event))
+			return PTR_ERR(event);
+		fanotify_ring_write_event(group, event, mask, data, &fsid);
+		fsnotify_ring_commit_slot(group, &event->fse);
+	} else {
+		event = fanotify_alloc_event(group, mask, data, data_type, dir,
+					     file_name, &fsid);
+		ret = -ENOMEM;
+		if (unlikely(!event)) {
+			/*
+			 * We don't queue overflow events for permission events as
+			 * there the access is denied and so no event is in fact lost.
+			 */
+			if (!fanotify_is_perm_event(mask))
+				fsnotify_queue_overflow(group);
+			goto finish;
+		}
+		fsn_event = &event->fse;
+		ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
+		if (ret) {
+			/* Permission events shouldn't be merged */
+			BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
+			/* Our event wasn't used in the end. Free it. */
+			fsnotify_destroy_event(group, fsn_event);
+
+			ret = 0;
+		} else if (fanotify_is_perm_event(mask)) {
+			ret = fanotify_get_response(group, FANOTIFY_PERM(event),
+						    iter_info);
+		}
 	}
 finish:
 	if (fanotify_is_perm_event(mask))
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index fe605359af88..5031198bf7db 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -521,7 +521,9 @@  static ssize_t fanotify_read(struct file *file, char __user *buf,
 		 * Permission events get queued to wait for response.  Other
 		 * events can be destroyed now.
 		 */
-		if (!fanotify_is_perm_event(event->mask)) {
+		if (group->fanotify_data.flags & FAN_PREALLOC_QUEUE) {
+			fsnotify_ring_buffer_consume_event(group, &event->fse);
+		} else if (!fanotify_is_perm_event(event->mask)) {
 			fsnotify_destroy_event(group, &event->fse);
 		} else {
 			if (ret <= 0) {
@@ -587,40 +589,39 @@  static int fanotify_release(struct inode *ignored, struct file *file)
 	 */
 	fsnotify_group_stop_queueing(group);
 
-	/*
-	 * Process all permission events on access_list and notification queue
-	 * and simulate reply from userspace.
-	 */
-	spin_lock(&group->notification_lock);
-	while (!list_empty(&group->fanotify_data.access_list)) {
-		struct fanotify_perm_event *event;
-
-		event = list_first_entry(&group->fanotify_data.access_list,
-				struct fanotify_perm_event, fae.fse.list);
-		list_del_init(&event->fae.fse.list);
-		finish_permission_event(group, event, FAN_ALLOW);
+	if (!(group->flags & FSN_SUBMISSION_RING_BUFFER)) {
+		/*
+		 * Process all permission events on access_list and notification queue
+		 * and simulate reply from userspace.
+		 */
 		spin_lock(&group->notification_lock);
-	}
-
-	/*
-	 * Destroy all non-permission events. For permission events just
-	 * dequeue them and set the response. They will be freed once the
-	 * response is consumed and fanotify_get_response() returns.
-	 */
-	while (!fsnotify_notify_queue_is_empty(group)) {
-		struct fanotify_event *event;
-
-		event = FANOTIFY_E(fsnotify_remove_first_event(group));
-		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
-			spin_unlock(&group->notification_lock);
-			fsnotify_destroy_event(group, &event->fse);
-		} else {
-			finish_permission_event(group, FANOTIFY_PERM(event),
-						FAN_ALLOW);
+		while (!list_empty(&group->fanotify_data.access_list)) {
+			struct fanotify_perm_event *event;
+			event = list_first_entry(&group->fanotify_data.access_list,
+						 struct fanotify_perm_event, fae.fse.list);
+			list_del_init(&event->fae.fse.list);
+			finish_permission_event(group, event, FAN_ALLOW);
+			spin_lock(&group->notification_lock);
 		}
-		spin_lock(&group->notification_lock);
+		/*
+		 * Destroy all non-permission events. For permission events just
+		 * dequeue them and set the response. They will be freed once the
+		 * response is consumed and fanotify_get_response() returns.
+		 */
+		while (!fsnotify_notify_queue_is_empty(group)) {
+			struct fanotify_event *event;
+			event = FANOTIFY_E(fsnotify_remove_first_event(group));
+			if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
+				spin_unlock(&group->notification_lock);
+				fsnotify_destroy_event(group, &event->fse);
+			} else {
+				finish_permission_event(group, FANOTIFY_PERM(event),
+							FAN_ALLOW);
+			}
+			spin_lock(&group->notification_lock);
+		}
+		spin_unlock(&group->notification_lock);
 	}
-	spin_unlock(&group->notification_lock);
 
 	/* Response for all permission events it set, wakeup waiters */
 	wake_up(&group->fanotify_data.access_waitq);
@@ -981,6 +982,16 @@  SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	if (flags & FAN_NONBLOCK)
 		f_flags |= O_NONBLOCK;
 
+	if (flags & FAN_PREALLOC_QUEUE) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (flags & FAN_UNLIMITED_QUEUE)
+			return -EINVAL;
+
+		fsn_flags = FSN_SUBMISSION_RING_BUFFER;
+	}
+
 	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
 	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops, fsn_flags);
 	if (IS_ERR(group)) {
@@ -1223,6 +1234,10 @@  static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 		goto fput_and_out;
 	}
 
+	if ((group->flags & FSN_SUBMISSION_RING_BUFFER) &&
+	    (mask & ~FANOTIFY_SUBMISSION_BUFFER_EVENTS))
+		goto fput_and_out;
+
 	ret = fanotify_find_path(dfd, pathname, &path, flags,
 			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
 	if (ret)
@@ -1327,7 +1342,7 @@  SYSCALL32_DEFINE6(fanotify_mark,
  */
 static int __init fanotify_user_setup(void)
 {
-	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
 
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 3e9c56ee651f..5a4cefb4b1c3 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -23,7 +23,8 @@ 
 #define FANOTIFY_INIT_FLAGS	(FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \
 				 FAN_REPORT_TID | \
 				 FAN_CLOEXEC | FAN_NONBLOCK | \
-				 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
+				 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS | \
+				 FAN_PREALLOC_QUEUE)
 
 #define FANOTIFY_MARK_TYPE_BITS	(FAN_MARK_INODE | FAN_MARK_MOUNT | \
 				 FAN_MARK_FILESYSTEM)
@@ -71,6 +72,8 @@ 
 					 FANOTIFY_PERM_EVENTS | \
 					 FAN_Q_OVERFLOW | FAN_ONDIR)
 
+#define FANOTIFY_SUBMISSION_BUFFER_EVENTS 0
+
 #define ALL_FANOTIFY_EVENT_BITS		(FANOTIFY_OUTGOING_EVENTS | \
 					 FANOTIFY_EVENT_FLAGS)
 
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index fbf9c5c7dd59..b283531549f1 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -49,6 +49,7 @@ 
 #define FAN_UNLIMITED_QUEUE	0x00000010
 #define FAN_UNLIMITED_MARKS	0x00000020
 #define FAN_ENABLE_AUDIT	0x00000040
+#define FAN_PREALLOC_QUEUE	0x00000080
 
 /* Flags to determine fanotify event format */
 #define FAN_REPORT_TID		0x00000100	/* event->pid is thread id */