diff mbox series

[ovs-dev,RFC] dpdk: Allow retaining cap_sys_rawio privileges

Message ID 20230222170728.1698916-1-aconole@redhat.com
State Changes Requested
Headers show
Series [ovs-dev,RFC] dpdk: Allow retaining cap_sys_rawio privileges | expand

Checks

Context Check Description
ovsrobot/apply-robot warning apply and check: warning
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Aaron Conole Feb. 22, 2023, 5:07 p.m. UTC
Open vSwitch generally tries to let the underlying operating system
manage the low level details of hardware, for example DMA mapping,
bus arbitration, etc.  However, when using DPDK, the underlying
operating system yields control of many of these details to userspace
for management.

In the case of some DPDK port drivers, configuring rte_flow or even
allocating resources may require access to iopl/ioperm calls, which
are guarded by the CAP_SYS_RAWIO privilege on linux systems.  These
calls are dangerous, and can allow a process to completely compromise
a system.  However, they are needed in the case of some userspace
driver code which manages the hardware (for example, the mlx
implementation of backend support for rte_flow).

Here, we create an opt-in flag passed to the command line to allow
this access.  We need to do this before ever accessing the database,
because we want to drop all privileges asap, and cannot wait for
a connection to the database to be established and functional before
dropping.  There may be distribution specific ways to do capability
management as well (using for example, systemd), but they are not
as universal to the vswitchd as a flag.

Signed-off-by: Aaron Conole <aconole@redhat.com>
---
 NEWS                           |  4 ++++
 lib/daemon-unix.c              | 31 ++++++++++++++++++++++---------
 lib/daemon.c                   |  2 +-
 lib/daemon.h                   |  4 ++--
 ovsdb/ovsdb-client.c           |  6 +++---
 ovsdb/ovsdb-server.c           |  4 ++--
 tests/test-netflow.c           |  2 +-
 tests/test-sflow.c             |  2 +-
 tests/test-unixctl.c           |  2 +-
 utilities/ovs-ofctl.c          |  4 ++--
 utilities/ovs-testcontroller.c |  4 ++--
 vswitchd/ovs-vswitchd.8.in     |  8 ++++++++
 vswitchd/ovs-vswitchd.c        | 11 ++++++++++-
 13 files changed, 59 insertions(+), 25 deletions(-)

Comments

Aaron Conole Feb. 22, 2023, 5:11 p.m. UTC | #1
Apologies - I mis-typed Gaetan's email when I entered it into my mail
file.  CC'd correctly on this email (but I can resend the patch, if you
think it is better).

Aaron Conole <aconole@redhat.com> writes:

> Open vSwitch generally tries to let the underlying operating system
> managed the low level details of hardware, for example DMA mapping,
> bus arbitration, etc.  However, when using DPDK, the underlying
> operating system yields control of many of these details to userspace
> for management.
>
> In the case of some DPDK port drivers, configuring rte_flow or even
> allocating resources may require access to iopl/ioperm calls, which
> are guarded by the CAP_SYS_RAWIO privilege on linux systems.  These
> calls are dangerous, and can allow a process to completely compromise
> a system.  However, they are needed in the case of some userspace
> driver code which manages the hardware (for example, the mlx
> implementation of backend support for rte_flow).
>
> Here, we create an opt-in flag passed to the command line to allow
> this access.  We need to do this before ever accessing the database,
> because we want to drop all privileges asap, and cannot wait for
> a connection to the database to be established and functional before
> dropping.  There may be distribution specific ways to do capability
> management as well (using for example, systemd), but they are not
> as universal to the vswitchd as a flag.
>
> Signed-off-by: Aaron Conole <aconole@redhat.com>
> ---
>  NEWS                           |  4 ++++
>  lib/daemon-unix.c              | 31 ++++++++++++++++++++++---------
>  lib/daemon.c                   |  2 +-
>  lib/daemon.h                   |  4 ++--
>  ovsdb/ovsdb-client.c           |  6 +++---
>  ovsdb/ovsdb-server.c           |  4 ++--
>  tests/test-netflow.c           |  2 +-
>  tests/test-sflow.c             |  2 +-
>  tests/test-unixctl.c           |  2 +-
>  utilities/ovs-ofctl.c          |  4 ++--
>  utilities/ovs-testcontroller.c |  4 ++--
>  vswitchd/ovs-vswitchd.8.in     |  8 ++++++++
>  vswitchd/ovs-vswitchd.c        | 11 ++++++++++-
>  13 files changed, 59 insertions(+), 25 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index 85b3496214..65f35dcdd5 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -10,6 +10,10 @@ Post-v3.1.0
>         in order to create OVSDB sockets with access mode of 0770.
>     - QoS:
>       * Added new configuration option 'jitter' for a linux-netem QoS type.
> +   - DPDK:
> +     * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started
> +       with the --hw-rawio-access command line option.  This allows the
> +       process extra privileges when mapping physical interconnect memory.
>  
>  
>  v3.1.0 - 16 Feb 2023
> diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c
> index 1a7ba427d7..8b895a48de 100644
> --- a/lib/daemon-unix.c
> +++ b/lib/daemon-unix.c
> @@ -88,7 +88,8 @@ static bool switch_user = false;
>  static uid_t uid;
>  static gid_t gid;
>  static char *user = NULL;
> -static void daemon_become_new_user__(bool access_datapath);
> +static void daemon_become_new_user__(bool access_datapath,
> +                                     bool access_hardware_ports);
>  
>  static void check_already_running(void);
>  static int lock_pidfile(FILE *, int command);
> @@ -443,13 +444,13 @@ monitor_daemon(pid_t daemon_pid)
>   * daemonize_complete()) or that it failed to start up (by exiting with a
>   * nonzero exit code). */
>  void
> -daemonize_start(bool access_datapath)
> +daemonize_start(bool access_datapath, bool access_hardware_ports)
>  {
>      assert_single_threaded();
>      daemonize_fd = -1;
>  
>      if (switch_user) {
> -        daemon_become_new_user__(access_datapath);
> +        daemon_become_new_user__(access_datapath, access_hardware_ports);
>          switch_user = false;
>      }
>  
> @@ -807,7 +808,8 @@ daemon_become_new_user_unix(void)
>  /* Linux specific implementation of daemon_become_new_user()
>   * using libcap-ng.   */
>  static void
> -daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
> +daemon_become_new_user_linux(bool access_datapath OVS_UNUSED,
> +                             bool access_hardware_ports OVS_UNUSED)
>  {
>  #if defined __linux__ &&  HAVE_LIBCAPNG
>      int ret;
> @@ -826,7 +828,17 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
>              if (access_datapath && !ret) {
>                  ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN)
>                        || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW)
> -                      || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST);
> +                      || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST)
> +#ifdef DPDK_NETDEV
> +                      || (access_hardware_ports &&
> +                          capng_update(CAPNG_ADD, cap_sets, CAP_SYS_RAWIO))
> +#else
> +                    ;
> +                if (access_hardware_ports) {
> +                    VLOG_WARN("hw port access requested, but no userspace ioport support.  Dropping.");
> +                }
> +#endif
> +                    ;
>              }
>          } else {
>              ret = -1;
> @@ -854,7 +866,7 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
>  }
>  
>  static void
> -daemon_become_new_user__(bool access_datapath)
> +daemon_become_new_user__(bool access_datapath, bool access_hardware_ports)
>  {
>      /* If vlog file has been created, change its owner to the non-root user
>       * as specifed by the --user option.  */
> @@ -862,7 +874,8 @@ daemon_become_new_user__(bool access_datapath)
>  
>      if (LINUX) {
>          if (LIBCAPNG) {
> -            daemon_become_new_user_linux(access_datapath);
> +            daemon_become_new_user_linux(access_datapath,
> +                                         access_hardware_ports);
>          } else {
>              VLOG_FATAL("%s: fail to downgrade user using libcap-ng. "
>                         "(libcap-ng is not configured at compile time), "
> @@ -877,11 +890,11 @@ daemon_become_new_user__(bool access_datapath)
>   * However, there in case the user switch needs to be done
>   * before daemonize_start(), the following API can be used.  */
>  void
> -daemon_become_new_user(bool access_datapath)
> +daemon_become_new_user(bool access_datapath, bool access_hardware_ports)
>  {
>      assert_single_threaded();
>      if (switch_user) {
> -        daemon_become_new_user__(access_datapath);
> +        daemon_become_new_user__(access_datapath, access_hardware_ports);
>          /* daemonize_start() should not switch user again. */
>          switch_user = false;
>      }
> diff --git a/lib/daemon.c b/lib/daemon.c
> index 3249c5ab4b..1e1c019eb1 100644
> --- a/lib/daemon.c
> +++ b/lib/daemon.c
> @@ -48,7 +48,7 @@ get_detach(void)
>  void
>  daemonize(void)
>  {
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>      daemonize_complete();
>  }
>  
> diff --git a/lib/daemon.h b/lib/daemon.h
> index 0941574963..42372d1463 100644
> --- a/lib/daemon.h
> +++ b/lib/daemon.h
> @@ -167,10 +167,10 @@ void set_detach(void);
>  bool get_detach(void);
>  void daemon_save_fd(int fd);
>  void daemonize(void);
> -void daemonize_start(bool access_datapath);
> +void daemonize_start(bool access_datapath, bool access_hardware_ports);
>  void daemonize_complete(void);
>  void daemon_set_new_user(const char * user_spec);
> -void daemon_become_new_user(bool access_datapath);
> +void daemon_become_new_user(bool access_datapath, bool access_hardware_ports);
>  void daemon_usage(void);
>  void daemon_disable_self_confinement(void);
>  bool daemon_should_self_confine(void);
> diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c
> index f1b8d64910..bae2c5f041 100644
> --- a/ovsdb/ovsdb-client.c
> +++ b/ovsdb/ovsdb-client.c
> @@ -250,7 +250,7 @@ main(int argc, char *argv[])
>      parse_options(argc, argv);
>      fatal_ignore_sigpipe();
>  
> -    daemon_become_new_user(false);
> +    daemon_become_new_user(false, false);
>      if (optind >= argc) {
>          ovs_fatal(0, "missing command name; use --help for help");
>      }
> @@ -1392,7 +1392,7 @@ do_monitor__(struct jsonrpc *rpc, const char *database,
>  
>      daemon_save_fd(STDOUT_FILENO);
>      daemon_save_fd(STDERR_FILENO);
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>      if (get_detach()) {
>          int error;
>  
> @@ -2276,7 +2276,7 @@ do_lock(struct jsonrpc *rpc, const char *method, const char *lock)
>                                          getting a reply of the previous
>                                          request. */
>      daemon_save_fd(STDOUT_FILENO);
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>      lock_req_init(&lock_req, method, lock);
>  
>      if (get_detach()) {
> diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
> index 33ca4910d7..4fea2dbda7 100644
> --- a/ovsdb/ovsdb-server.c
> +++ b/ovsdb/ovsdb-server.c
> @@ -341,7 +341,7 @@ main(int argc, char *argv[])
>                    &run_command, &sync_from, &sync_exclude, &active);
>      is_backup = sync_from && !active;
>  
> -    daemon_become_new_user(false);
> +    daemon_become_new_user(false, false);
>  
>      /* Create and initialize 'config_tmpfile' as a temporary file to hold
>       * ovsdb-server's most basic configuration, and then save our initial
> @@ -359,7 +359,7 @@ main(int argc, char *argv[])
>      save_config__(config_tmpfile, &remotes, &db_filenames, sync_from,
>                    sync_exclude, is_backup);
>  
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>  
>      /* Load the saved config. */
>      load_config(config_tmpfile, &remotes, &db_filenames, &sync_from,
> diff --git a/tests/test-netflow.c b/tests/test-netflow.c
> index d2322d4509..7f89cfcae0 100644
> --- a/tests/test-netflow.c
> +++ b/tests/test-netflow.c
> @@ -195,7 +195,7 @@ test_netflow_main(int argc, char *argv[])
>      }
>  
>      daemon_save_fd(STDOUT_FILENO);
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>  
>      error = unixctl_server_create(NULL, &server);
>      if (error) {
> diff --git a/tests/test-sflow.c b/tests/test-sflow.c
> index 460d4d6c54..3c617bdd16 100644
> --- a/tests/test-sflow.c
> +++ b/tests/test-sflow.c
> @@ -709,7 +709,7 @@ test_sflow_main(int argc, char *argv[])
>      }
>  
>      daemon_save_fd(STDOUT_FILENO);
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>  
>      error = unixctl_server_create(NULL, &server);
>      if (error) {
> diff --git a/tests/test-unixctl.c b/tests/test-unixctl.c
> index 3eadf54cd9..9e89827895 100644
> --- a/tests/test-unixctl.c
> +++ b/tests/test-unixctl.c
> @@ -83,7 +83,7 @@ test_unixctl_main(int argc, char *argv[])
>      fatal_ignore_sigpipe();
>      parse_options(&argc, &argv, &unixctl_path);
>  
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>      int retval = unixctl_server_create(unixctl_path, &unixctl);
>      if (retval) {
>          exit(EXIT_FAILURE);
> diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c
> index eabec18a36..f81f5f759a 100644
> --- a/utilities/ovs-ofctl.c
> +++ b/utilities/ovs-ofctl.c
> @@ -173,7 +173,7 @@ main(int argc, char *argv[])
>      ctx.argc = argc - optind;
>      ctx.argv = argv + optind;
>  
> -    daemon_become_new_user(false);
> +    daemon_become_new_user(false, false);
>      if (read_only) {
>          ovs_cmdl_run_command_read_only(&ctx, get_all_commands());
>      } else {
> @@ -2127,7 +2127,7 @@ monitor_vconn(struct vconn *vconn, bool reply_to_echo_requests,
>      int error;
>  
>      daemon_save_fd(STDERR_FILENO);
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>      error = unixctl_server_create(unixctl_path, &server);
>      if (error) {
>          ovs_fatal(error, "failed to create unixctl server");
> diff --git a/utilities/ovs-testcontroller.c b/utilities/ovs-testcontroller.c
> index b489ff5fc7..9f2fbfdf51 100644
> --- a/utilities/ovs-testcontroller.c
> +++ b/utilities/ovs-testcontroller.c
> @@ -109,7 +109,7 @@ main(int argc, char *argv[])
>      parse_options(argc, argv);
>      fatal_ignore_sigpipe();
>  
> -    daemon_become_new_user(false);
> +    daemon_become_new_user(false, false);
>  
>      if (argc - optind < 1) {
>          ovs_fatal(0, "at least one vconn argument required; "
> @@ -148,7 +148,7 @@ main(int argc, char *argv[])
>          ovs_fatal(0, "no active or passive switch connections");
>      }
>  
> -    daemonize_start(false);
> +    daemonize_start(false, false);
>  
>      retval = unixctl_server_create(unixctl_path, &unixctl);
>      if (retval) {
> diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in
> index 9569265fcb..a6a4a24606 100644
> --- a/vswitchd/ovs-vswitchd.8.in
> +++ b/vswitchd/ovs-vswitchd.8.in
> @@ -81,6 +81,14 @@ unavailable or unsuccessful.
>  .SS "DPDK Options"
>  For details on initializing \fBovs\-vswitchd\fR to use DPDK ports,
>  refer to the documentation or \fBovs\-vswitchd.conf.db\fR(5).
> +.SS "DPDK HW Access Options"
> +.IP "\fB\-\-hw\-rawio\-access\fR"
> +Tells \fBovs\-vswitchd\fR to retain the \fBCAP_SYS_RAWIO\fR capability,
> +to allow userspace drivers access to raw hardware memory.  This will
> +also allow the \fBovs\-vswitchd\fR daemon to call \fBiopl()\fR and
> +\fBioperm()\fR functions to set port access.  This is a \fBvery\fR
> +powerful capability, so generally only enable as needed for specific
> +hardware.
>  .SS "Daemon Options"
>  .ds DD \
>  \fBovs\-vswitchd\fR detaches only after it has connected to the \
> diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
> index 407bfc60eb..f62d1ad751 100644
> --- a/vswitchd/ovs-vswitchd.c
> +++ b/vswitchd/ovs-vswitchd.c
> @@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd);
>   * the kernel from paging any of its memory to disk. */
>  static bool want_mlockall;
>  
> +/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges.  */
> +static bool hw_access;
> +
>  static unixctl_cb_func ovs_vswitchd_exit;
>  
>  static char *parse_options(int argc, char *argv[], char **unixctl_path);
> @@ -89,7 +92,7 @@ main(int argc, char *argv[])
>      remote = parse_options(argc, argv, &unixctl_path);
>      fatal_ignore_sigpipe();
>  
> -    daemonize_start(true);
> +    daemonize_start(true, true);
>  
>      if (want_mlockall) {
>  #ifdef HAVE_MLOCKALL
> @@ -169,6 +172,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
>          OPT_DPDK,
>          SSL_OPTION_ENUMS,
>          OPT_DUMMY_NUMA,
> +        OPT_HW_ACCESS,
>      };
>      static const struct option long_options[] = {
>          {"help",        no_argument, NULL, 'h'},
> @@ -185,6 +189,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
>          {"disable-system-route", no_argument, NULL, OPT_DISABLE_SYSTEM_ROUTE},
>          {"dpdk", optional_argument, NULL, OPT_DPDK},
>          {"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA},
> +        {"hw-rawio-access", no_argument, NULL, OPT_HW_ACCESS},
>          {NULL, 0, NULL, 0},
>      };
>      char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
> @@ -249,6 +254,10 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
>              ovs_numa_set_dummy(optarg);
>              break;
>  
> +        case OPT_HW_ACCESS:
> +            hw_access = true;
> +            break;
> +
>          default:
>              abort();
>          }
0-day Robot Feb. 22, 2023, 6:19 p.m. UTC | #2
Bleep bloop.  Greetings Aaron Conole, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Line is 103 characters long (recommended limit is 79)
#112 FILE: lib/daemon-unix.c:838:
                    VLOG_WARN("hw port access requested, but no userspace ioport support.  Dropping.");

Lines checked: 390, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot
Robin Jarry Feb. 23, 2023, 5:03 p.m. UTC | #3
Aaron Conole, Feb 22, 2023 at 18:07:
> Open vSwitch generally tries to let the underlying operating system
> managed the low level details of hardware, for example DMA mapping,
> bus arbitration, etc.  However, when using DPDK, the underlying
> operating system yields control of many of these details to userspace
> for management.
>
> In the case of some DPDK port drivers, configuring rte_flow or even
> allocating resources may require access to iopl/ioperm calls, which
> are guarded by the CAP_SYS_RAWIO privilege on linux systems.  These
> calls are dangerous, and can allow a process to completely compromise
> a system.  However, they are needed in the case of some userspace
> driver code which manages the hardware (for example, the mlx
> implementation of backend support for rte_flow).
>
> Here, we create an opt-in flag passed to the command line to allow
> this access.  We need to do this before ever accessing the database,
> because we want to drop all privileges asap, and cannot wait for
> a connection to the database to be established and functional before
> dropping.  There may be distribution specific ways to do capability
> management as well (using for example, systemd), but they are not
> as universal to the vswitchd as a flag.
>
> Signed-off-by: Aaron Conole <aconole@redhat.com>

Hi Aaron,

I briefly tested the injection of a basic RTE flow rule (see my control
plane protection patch here for more details[1]). With a non-root user,
there are no permission issues that I can see. I have tested with both
the i40e (X710) and mlx5 (ConnectX-5 Ex) drivers.

Maybe CAP_SYS_RAWIO is only required for more advanced flows but I am
surprised that I didn't encounter the issue with either a vfio-pci based
driver or a bifurcated driver.

[1]: http://patchwork.ozlabs.org/project/openvswitch/patch/20230222154321.23421-1-rjarry@redhat.com/
Aaron Conole Feb. 23, 2023, 9:09 p.m. UTC | #4
"Robin Jarry" <rjarry@redhat.com> writes:

> Aaron Conole, Feb 22, 2023 at 18:07:
>> Open vSwitch generally tries to let the underlying operating system
>> managed the low level details of hardware, for example DMA mapping,
>> bus arbitration, etc.  However, when using DPDK, the underlying
>> operating system yields control of many of these details to userspace
>> for management.
>>
>> In the case of some DPDK port drivers, configuring rte_flow or even
>> allocating resources may require access to iopl/ioperm calls, which
>> are guarded by the CAP_SYS_RAWIO privilege on linux systems.  These
>> calls are dangerous, and can allow a process to completely compromise
>> a system.  However, they are needed in the case of some userspace
>> driver code which manages the hardware (for example, the mlx
>> implementation of backend support for rte_flow).
>>
>> Here, we create an opt-in flag passed to the command line to allow
>> this access.  We need to do this before ever accessing the database,
>> because we want to drop all privileges asap, and cannot wait for
>> a connection to the database to be established and functional before
>> dropping.  There may be distribution specific ways to do capability
>> management as well (using for example, systemd), but they are not
>> as universal to the vswitchd as a flag.
>>
>> Signed-off-by: Aaron Conole <aconole@redhat.com>
>
> Hi Aaron,
>
> I briefly tested the injection of a basic RTE flow rule (see my control
> plane protection patch here for more details[1]). With a non-root user,
> there are no permission issues that I can see. I have tested with both
> the i40e (X710) and mlx5 (ConnectX-5 Ex) drivers.

Thanks for taking a look.  You're saying that you tested without this
patch applied, yes?  That could be.  I only know of one hardware which
requires CAP_SYS_RAWIO for rte_flow to function.

> Maybe CAP_SYS_RAWIO is only required for more advanced flows but I am
> surprised that I didn't encounter the issue for neither a vfio-pci based
> driver nor with a bifurcated driver.
>
> [1]: http://patchwork.ozlabs.org/project/openvswitch/patch/20230222154321.23421-1-rjarry@redhat.com/

I'm surprised as well.  Maybe someone from Mellanox can comment?
Robin Jarry Feb. 23, 2023, 9:14 p.m. UTC | #5
Aaron Conole, Feb 23, 2023 at 22:09:
> Thanks for taking a look.  You're saying that you tested without this
> patch applied, yes?  That could be.  I only know of one hardware which
> requires CAP_SYS_RAWIO for rte_flow to function.

Yes that is correct, I tested *without* this patch applied and with
a non-root user (ovs-vswitchd linked with libcap-ng).

  ovs-ctl --ovs-user="openvswitch:hugetlbfs" start

The basic RTE flow rules (matching of the ether type field and redirect
to a specific queue) were created without errors returned with both NICs
I had available (Intel X710 and Mellanox ConnectX-5 Ex)

 cp-protection: redirected lacp traffic to rx queue 1
 cp-protection: redirected other traffic to rx queue 0
Gaetan Rivet Feb. 23, 2023, 9:33 p.m. UTC | #6
> -----Original Message-----
> From: Robin Jarry <rjarry@redhat.com <mailto:rjarry@redhat.com>>
> Date: Thursday 23 February 2023 at 22:14
> To: Aaron Conole <aconole@redhat.com <mailto:aconole@redhat.com>>
> Cc: "dev@openvswitch.org <mailto:dev@openvswitch.org>" <dev@openvswitch.org <mailto:dev@openvswitch.org>>, Eli Britstein <elibr@nvidia.com <mailto:elibr@nvidia.com>>, Gaetan Rivet <gaetanr@nvidia.com <mailto:gaetanr@nvidia.com>>, Ilya Maximets <i.maximets@ovn.org <mailto:i.maximets@ovn.org>>, Maxime Coquelin <maxime.coquelin@redhat.com <mailto:maxime.coquelin@redhat.com>>, Jason Gunthorpe <jgg@nvidia.com <mailto:jgg@nvidia.com>>, Majd Dibbiny <majd@nvidia.com <mailto:majd@nvidia.com>>, David Marchand <david.marchand@redhat.com <mailto:david.marchand@redhat.com>>, Gaetan Rivet <grive@u256.net <mailto:grive@u256.net>>, Eelco Chaudron <echaudro@redhat.com <mailto:echaudro@redhat.com>>
> Subject: Re: [ovs-dev] [RFC] dpdk: Allow retaining cap_sys_rawio privileges
>
>
> External email: Use caution opening links or attachments
>
>
>
>
> Aaron Conole, Feb 23, 2023 at 22:09:
> > Thanks for taking a look. You're saying that you tested without this
> > patch applied, yes? That could be. I only know of one hardware which
> > requires CAP_SYS_RAWIO for rte_flow to function.
>
>
> Yes that is correct, I tested *without* this patch applied and with
> a non-root user (ovs-vswitchd linked with libcap-ng).
>
>
> ovs-ctl --ovs-user="openvswitch:hugetlbfs" start
>
>
> The basic RTE flow rules (matching of the ether type field and redirect
> to a specific queue) were created without errors returned with both NICs
> I had available (Intel X710 and Mellanox ConnectX-5 Ex)
>
>
> cp-protection: redirected lacp traffic to rx queue 1
> cp-protection: redirected other traffic to rx queue 0

Hello,

I've looked at your patch Robin and the offloads you insert in dpdk_cp_prot_add_flow
use the following:

    const struct rte_flow_attr attr = { .ingress = 1 };

implicitly setting transfer and group to 0. If either of those had been non-zero instead,
cap_sys_rawio would be required.

Otherwise thank you very much Aaron for your patch, I was reading it and will
comment directly on it.

Best regards,
Gaetan
Robin Jarry Feb. 23, 2023, 9:42 p.m. UTC | #7
Salut Gaëtan,

Gaetan Rivet, Feb 23, 2023 at 22:33:
> I've looked at your patch Robin and the offloads you insert in
> dpdk_cp_prot_add_flow use the following:
>
>     const struct rte_flow_attr attr = { .ingress = 1 };
>
> implicitly setting transfer and group to 0. If either of those had
> been non-zero instead, cap_sys_rawio would be required.

Oh I was not aware that this would change anything. Is there some
document/code snippet/anything that explains why is that so? Is that
specific to the mlx5 driver?

Thanks!
Gaetan Rivet Feb. 23, 2023, 10:09 p.m. UTC | #8
> -----Original Message-----
> From: Robin Jarry <rjarry@redhat.com <mailto:rjarry@redhat.com>>
> Date: Thursday 23 February 2023 at 22:43
> To: Gaetan Rivet <gaetanr@nvidia.com <mailto:gaetanr@nvidia.com>>, Aaron Conole <aconole@redhat.com <mailto:aconole@redhat.com>>
> Cc: "dev@openvswitch.org <mailto:dev@openvswitch.org>" <dev@openvswitch.org <mailto:dev@openvswitch.org>>, Eli Britstein <elibr@nvidia.com <mailto:elibr@nvidia.com>>, Ilya Maximets <i.maximets@ovn.org <mailto:i.maximets@ovn.org>>, Maxime Coquelin <maxime.coquelin@redhat.com <mailto:maxime.coquelin@redhat.com>>, Jason Gunthorpe <jgg@nvidia.com <mailto:jgg@nvidia.com>>, Majd Dibbiny <majd@nvidia.com <mailto:majd@nvidia.com>>, David Marchand <david.marchand@redhat.com <mailto:david.marchand@redhat.com>>, Gaetan Rivet <grive@u256.net <mailto:grive@u256.net>>, Eelco Chaudron <echaudro@redhat.com <mailto:echaudro@redhat.com>>
> Subject: Re: [ovs-dev] [RFC] dpdk: Allow retaining cap_sys_rawio privileges
>
>
> Salut Gaëtan,
>
>
> Gaetan Rivet, Feb 23, 2023 at 22:33:
> > I've looked at your patch Robin and the offloads you insert in
> > dpdk_cp_prot_add_flow use the following:
> >
> > const struct rte_flow_attr attr = { .ingress = 1 };
> >
> > implicitly setting transfer and group to 0. If either of those had
> > been non-zero instead, cap_sys_rawio would be required.
>
>
> Oh I was not aware that this would change anything. Is there some
> document/code snippet/anything that explains why is that so? Is that
> specific to the mlx5 driver?
>
>
> Thanks!

You can find some scarce info there: https://doc.dpdk.org/guides/platform/mlx5.html#linux-environment
Check out section 5.5.1.5. "Run as Non-Root".

This doc is incomplete, which is one of the root causes of these threads.
Gaetan Rivet Feb. 23, 2023, 10:29 p.m. UTC | #9
> -----Original Message-----
> From: Aaron Conole <aconole@redhat.com <mailto:aconole@redhat.com>>
> Date: Wednesday 22 February 2023 at 18:11
> To: "dev@openvswitch.org <mailto:dev@openvswitch.org>" <dev@openvswitch.org <mailto:dev@openvswitch.org>>
> Cc: Eli Britstein <elibr@nvidia.com <mailto:elibr@nvidia.com>>, Gaetan Rivet <gaetanr@nvidia.com <mailto:gaetanr@nvidia.com>>, Ilya Maximets <i.maximets@ovn.org <mailto:i.maximets@ovn.org>>, Maxime Coquelin <maxime.coquelin@redhat.com <mailto:maxime.coquelin@redhat.com>>, Jason Gunthorpe <jgg@nvidia.com <mailto:jgg@nvidia.com>>, Majd Dibbiny <majd@nvidia.com <mailto:majd@nvidia.com>>, David Marchand <david.marchand@redhat.com <mailto:david.marchand@redhat.com>>
> Subject: Re: [ovs-dev] [RFC] dpdk: Allow retaining cap_sys_rawio privileges
>
> Apologies - I mis-typed Gaetan's email when I entered it into my mail
> file. CC'd correctly on this email (but I can resend the patch, if you
> think it is better).
>
> Aaron Conole <aconole@redhat.com <mailto:aconole@redhat.com>> writes:
>
> > Open vSwitch generally tries to let the underlying operating system
> > managed the low level details of hardware, for example DMA mapping,
> > bus arbitration, etc. However, when using DPDK, the underlying
> > operating system yields control of many of these details to userspace
> > for management.
> >
> > In the case of some DPDK port drivers, configuring rte_flow or even
> > allocating resources may require access to iopl/ioperm calls, which
> > are guarded by the CAP_SYS_RAWIO privilege on linux systems. These
> > calls are dangerous, and can allow a process to completely compromise
> > a system. However, they are needed in the case of some userspace
> > driver code which manages the hardware (for example, the mlx
> > implementation of backend support for rte_flow).
> >
> > Here, we create an opt-in flag passed to the command line to allow
> > this access. We need to do this before ever accessing the database,
> > because we want to drop all privileges asap, and cannot wait for
> > a connection to the database to be established and functional before
> > dropping. There may be distribution specific ways to do capability
> > management as well (using for example, systemd), but they are not
> > as universal to the vswitchd as a flag.
> >
> > Signed-off-by: Aaron Conole <aconole@redhat.com <mailto:aconole@redhat.com>>
> > ---

Hello Aaron,

Thank you for proposing this change.

If users want to use mlx5 ports with OVS without being root, this capability will be required.
Adding a vswitchd option to enable it seems the simplest way to offer some control.

If vendor-specific logic was allowed, I could add a function to detect Mellanox ports
and enable this option in that case. Otherwise we can document as much as possible,
but hopefully the errors will be made clear from the DPDK side because it will
be hard to explain those errors without vendor-specific code.

Regarding the implementation, I had a few comments:
> @@ -877,11 +890,11 @@ daemon_become_new_user__(bool access_datapath)
>   * However, there in case the user switch needs to be done
>   * before daemonize_start(), the following API can be used.  */
>  void
> -daemon_become_new_user(bool access_datapath)
> +daemon_become_new_user(bool access_datapath, bool access_hardware_ports)
>  {
>      assert_single_threaded();
>      if (switch_user) {
> -        daemon_become_new_user__(access_datapath);
> +        daemon_become_new_user__(access_datapath, access_hardware_ports);
>          /* daemonize_start() should not switch user again. */
>          switch_user = false;
>      }

Grepping for daemon_become_new_user, I see the following that might need a change:
lib/daemon-windows.c:529:daemon_become_new_user(bool access_datapath OVS_UNUSED)

> diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
> index 407bfc60e..f62d1ad75 100644
> --- a/vswitchd/ovs-vswitchd.c
> +++ b/vswitchd/ovs-vswitchd.c
> @@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd);
>   * the kernel from paging any of its memory to disk. */
>  static bool want_mlockall;
>
> +/* --hw-access: If set, retains CAP_SYS_RAWIO privileges.  */
> +static bool hw_access;
> +
>  static unixctl_cb_func ovs_vswitchd_exit;
>
>  static char *parse_options(int argc, char *argv[], char **unixctl_path);
> @@ -89,7 +92,7 @@ main(int argc, char *argv[])
>      remote = parse_options(argc, argv, &unixctl_path);
>      fatal_ignore_sigpipe();
>
> -    daemonize_start(true);
> +    daemonize_start(true, true);
 
Here I think it should be daemonize_start(true, hw_access);

Best regards,
Aaron Conole Feb. 27, 2023, 6:26 p.m. UTC | #10
Gaetan Rivet <gaetanr@nvidia.com> writes:

>> -----Original Message-----
>> From: Aaron Conole <aconole@redhat.com <mailto:aconole@redhat.com>>
>> Date: Wednesday 22 February 2023 at 18:11
>> To: "dev@openvswitch.org <mailto:dev@openvswitch.org>"
>> <dev@openvswitch.org <mailto:dev@openvswitch.org>>
>> Cc: Eli Britstein <elibr@nvidia.com <mailto:elibr@nvidia.com>>,
>> Gaetan Rivet <gaetanr@nvidia.com <mailto:gaetanr@nvidia.com>>, Ilya
>> Maximets <i.maximets@ovn.org <mailto:i.maximets@ovn.org>>, Maxime
>> Coquelin <maxime.coquelin@redhat.com
>> <mailto:maxime.coquelin@redhat.com>>, Jason Gunthorpe
>> <jgg@nvidia.com <mailto:jgg@nvidia.com>>, Majd Dibbiny
>> <majd@nvidia.com <mailto:majd@nvidia.com>>, David Marchand
>> <david.marchand@redhat.com <mailto:david.marchand@redhat.com>>
>> Subject: Re: [ovs-dev] [RFC] dpdk: Allow retaining cap_sys_rawio privileges
>>
>> Apologies - I mis-typed Gaetan's email when I entered it into my mail
>> file. CC'd correctly on this email (but I can resend the patch, if you
>> think it is better).
>>
>> Aaron Conole <aconole@redhat.com <mailto:aconole@redhat.com>> writes:
>>
>> > Open vSwitch generally tries to let the underlying operating system
>> > managed the low level details of hardware, for example DMA mapping,
>> > bus arbitration, etc. However, when using DPDK, the underlying
>> > operating system yields control of many of these details to userspace
>> > for management.
>> >
>> > In the case of some DPDK port drivers, configuring rte_flow or even
>> > allocating resources may require access to iopl/ioperm calls, which
>> > are guarded by the CAP_SYS_RAWIO privilege on linux systems. These
>> > calls are dangerous, and can allow a process to completely compromise
>> > a system. However, they are needed in the case of some userspace
>> > driver code which manages the hardware (for example, the mlx
>> > implementation of backend support for rte_flow).
>> >
>> > Here, we create an opt-in flag passed to the command line to allow
>> > this access. We need to do this before ever accessing the database,
>> > because we want to drop all privileges asap, and cannot wait for
>> > a connection to the database to be established and functional before
>> > dropping. There may be distribution specific ways to do capability
>> > management as well (using for example, systemd), but they are not
>> > as universal to the vswitchd as a flag.
>> >
>> > Signed-off-by: Aaron Conole <aconole@redhat.com <mailto:aconole@redhat.com>>
>> > ---
>
> Hello Aaron,
>
> Thank you for proposing this change.
>
> If users want to use mlx5 ports with OVS without being root, this capability will be required.
> Adding a vswitchd option to enable it seems the simplest way to offer some control.

Yes, this seems to be the best way as far as I can tell.

> If vendor-specific logic was allowed, I could add a function to detect Mellanox ports
> and enable this option in that case. Otherwise we can document as much as possible,
> but hopefully the errors will be made clear from the DPDK side because it will
> be hard to explain those errors without vendor-specific code.

Unfortunately, there isn't a good place to put such vendor-specific
logic.  I thought about it a bit.  The vswitchd can't assume that, just
because an mlx5 card is present, it will be used as part of a DPDK
port.  We cannot wait until connecting to the DB to preserve this
capability, either, since the privilege drop would then happen *after* a
stable DB connection and read (which is far too long to wait).

> Regarding the implementation, I had a few comments:

Great!

>> @@ -877,11 +890,11 @@ daemon_become_new_user__(bool access_datapath)
>>   * However, there in case the user switch needs to be done
>>   * before daemonize_start(), the following API can be used.  */
>>  void
>> -daemon_become_new_user(bool access_datapath)
>> +daemon_become_new_user(bool access_datapath, bool access_hardware_ports)
>>  {
>>      assert_single_threaded();
>>      if (switch_user) {
>> -        daemon_become_new_user__(access_datapath);
>> +        daemon_become_new_user__(access_datapath, access_hardware_ports);
>>          /* daemonize_start() should not switch user again. */
>>          switch_user = false;
>>      }
>
> Grepping for daemon_become_new_user, I see the following that might need a change:
> lib/daemon-windows.c:529:daemon_become_new_user(bool access_datapath OVS_UNUSED)

Yes, I think it needs to change here as well.  My appveyor run for some
reason passed
(https://ci.appveyor.com/project/orgcandman/ovs/builds/46307713) and
that is concerning.

>> diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
>> index 407bfc60e..f62d1ad75 100644
>> --- a/vswitchd/ovs-vswitchd.c
>> +++ b/vswitchd/ovs-vswitchd.c
>> @@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd);
>>   * the kernel from paging any of its memory to disk. */
>>  static bool want_mlockall;
>>
>> +/* --hw-access: If set, retains CAP_SYS_RAWIO privileges.  */
>> +static bool hw_access;
>> +
>>  static unixctl_cb_func ovs_vswitchd_exit;
>>
>>  static char *parse_options(int argc, char *argv[], char **unixctl_path);
>> @@ -89,7 +92,7 @@ main(int argc, char *argv[])
>>      remote = parse_options(argc, argv, &unixctl_path);
>>      fatal_ignore_sigpipe();
>>
>> -    daemonize_start(true);
>> +    daemonize_start(true, true);
>  
> Here I think it should be daemonize_start(true, hw_access);

Yes - you're correct.  I'll fix it when I submit as PATCH.

> Best regards,
Ilya Maximets Feb. 27, 2023, 7:05 p.m. UTC | #11
On 2/22/23 18:07, Aaron Conole wrote:
> Open vSwitch generally tries to let the underlying operating system
> managed the low level details of hardware, for example DMA mapping,
> bus arbitration, etc.  However, when using DPDK, the underlying
> operating system yields control of many of these details to userspace
> for management.
> 
> In the case of some DPDK port drivers, configuring rte_flow or even
> allocating resources may require access to iopl/ioperm calls, which
> are guarded by the CAP_SYS_RAWIO privilege on linux systems.  These
> calls are dangerous, and can allow a process to completely compromise
> a system.  However, they are needed in the case of some userspace
> driver code which manages the hardware (for example, the mlx
> implementation of backend support for rte_flow).
> 
> Here, we create an opt-in flag passed to the command line to allow
> this access.  We need to do this before ever accessing the database,
> because we want to drop all privileges asap, and cannot wait for
> a connection to the database to be established and functional before
> dropping.  There may be distribution specific ways to do capability
> management as well (using for example, systemd), but they are not
> as universal to the vswitchd as a flag.
> 
> Signed-off-by: Aaron Conole <aconole@redhat.com>
> ---
>  NEWS                           |  4 ++++
>  lib/daemon-unix.c              | 31 ++++++++++++++++++++++---------
>  lib/daemon.c                   |  2 +-
>  lib/daemon.h                   |  4 ++--
>  ovsdb/ovsdb-client.c           |  6 +++---
>  ovsdb/ovsdb-server.c           |  4 ++--
>  tests/test-netflow.c           |  2 +-
>  tests/test-sflow.c             |  2 +-
>  tests/test-unixctl.c           |  2 +-
>  utilities/ovs-ofctl.c          |  4 ++--
>  utilities/ovs-testcontroller.c |  4 ++--
>  vswitchd/ovs-vswitchd.8.in     |  8 ++++++++
>  vswitchd/ovs-vswitchd.c        | 11 ++++++++++-
>  13 files changed, 59 insertions(+), 25 deletions(-)
> 

...

> diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
> index 407bfc60eb..f62d1ad751 100644
> --- a/vswitchd/ovs-vswitchd.c
> +++ b/vswitchd/ovs-vswitchd.c
> @@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd);
>   * the kernel from paging any of its memory to disk. */
>  static bool want_mlockall;
>  
> +/* --hw-access: If set, retains CAP_SYS_RAWIO privileges.  */
> +static bool hw_access;

The comment is outdated.  And we may also want to rename the variable
itself to match the option.

Best regards, Ilya Maximets.
diff mbox series

Patch

diff --git a/NEWS b/NEWS
index 85b3496214..65f35dcdd5 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,10 @@  Post-v3.1.0
        in order to create OVSDB sockets with access mode of 0770.
    - QoS:
      * Added new configuration option 'jitter' for a linux-netem QoS type.
+   - DPDK:
+     * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started
+       with the --hw-rawio-access command line option.  This allows the
+       process extra privileges when mapping physical interconnect memory.
 
 
 v3.1.0 - 16 Feb 2023
diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c
index 1a7ba427d7..8b895a48de 100644
--- a/lib/daemon-unix.c
+++ b/lib/daemon-unix.c
@@ -88,7 +88,8 @@  static bool switch_user = false;
 static uid_t uid;
 static gid_t gid;
 static char *user = NULL;
-static void daemon_become_new_user__(bool access_datapath);
+static void daemon_become_new_user__(bool access_datapath,
+                                     bool access_hardware_ports);
 
 static void check_already_running(void);
 static int lock_pidfile(FILE *, int command);
@@ -443,13 +444,13 @@  monitor_daemon(pid_t daemon_pid)
  * daemonize_complete()) or that it failed to start up (by exiting with a
  * nonzero exit code). */
 void
-daemonize_start(bool access_datapath)
+daemonize_start(bool access_datapath, bool access_hardware_ports)
 {
     assert_single_threaded();
     daemonize_fd = -1;
 
     if (switch_user) {
-        daemon_become_new_user__(access_datapath);
+        daemon_become_new_user__(access_datapath, access_hardware_ports);
         switch_user = false;
     }
 
@@ -807,7 +808,8 @@  daemon_become_new_user_unix(void)
 /* Linux specific implementation of daemon_become_new_user()
  * using libcap-ng.   */
 static void
-daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
+daemon_become_new_user_linux(bool access_datapath OVS_UNUSED,
+                             bool access_hardware_ports OVS_UNUSED)
 {
 #if defined __linux__ &&  HAVE_LIBCAPNG
     int ret;
@@ -826,7 +828,17 @@  daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
             if (access_datapath && !ret) {
                 ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN)
                       || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW)
-                      || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST);
+                      || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST)
+#ifdef DPDK_NETDEV
+                      || (access_hardware_ports &&
+                          capng_update(CAPNG_ADD, cap_sets, CAP_SYS_RAWIO))
+#else
+                    ;
+                if (access_hardware_ports) {
+                    VLOG_WARN("hw port access requested, but no userspace ioport support.  Dropping.");
+                }
+#endif
+                    ;
             }
         } else {
             ret = -1;
@@ -854,7 +866,7 @@  daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
 }
 
 static void
-daemon_become_new_user__(bool access_datapath)
+daemon_become_new_user__(bool access_datapath, bool access_hardware_ports)
 {
     /* If vlog file has been created, change its owner to the non-root user
      * as specifed by the --user option.  */
@@ -862,7 +874,8 @@  daemon_become_new_user__(bool access_datapath)
 
     if (LINUX) {
         if (LIBCAPNG) {
-            daemon_become_new_user_linux(access_datapath);
+            daemon_become_new_user_linux(access_datapath,
+                                         access_hardware_ports);
         } else {
             VLOG_FATAL("%s: fail to downgrade user using libcap-ng. "
                        "(libcap-ng is not configured at compile time), "
@@ -877,11 +890,11 @@  daemon_become_new_user__(bool access_datapath)
  * However, there in case the user switch needs to be done
  * before daemonize_start(), the following API can be used.  */
 void
-daemon_become_new_user(bool access_datapath)
+daemon_become_new_user(bool access_datapath, bool access_hardware_ports)
 {
     assert_single_threaded();
     if (switch_user) {
-        daemon_become_new_user__(access_datapath);
+        daemon_become_new_user__(access_datapath, access_hardware_ports);
         /* daemonize_start() should not switch user again. */
         switch_user = false;
     }
diff --git a/lib/daemon.c b/lib/daemon.c
index 3249c5ab4b..1e1c019eb1 100644
--- a/lib/daemon.c
+++ b/lib/daemon.c
@@ -48,7 +48,7 @@  get_detach(void)
 void
 daemonize(void)
 {
-    daemonize_start(false);
+    daemonize_start(false, false);
     daemonize_complete();
 }
 
diff --git a/lib/daemon.h b/lib/daemon.h
index 0941574963..42372d1463 100644
--- a/lib/daemon.h
+++ b/lib/daemon.h
@@ -167,10 +167,10 @@  void set_detach(void);
 bool get_detach(void);
 void daemon_save_fd(int fd);
 void daemonize(void);
-void daemonize_start(bool access_datapath);
+void daemonize_start(bool access_datapath, bool access_hardware_ports);
 void daemonize_complete(void);
 void daemon_set_new_user(const char * user_spec);
-void daemon_become_new_user(bool access_datapath);
+void daemon_become_new_user(bool access_datapath, bool access_hardware_ports);
 void daemon_usage(void);
 void daemon_disable_self_confinement(void);
 bool daemon_should_self_confine(void);
diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c
index f1b8d64910..bae2c5f041 100644
--- a/ovsdb/ovsdb-client.c
+++ b/ovsdb/ovsdb-client.c
@@ -250,7 +250,7 @@  main(int argc, char *argv[])
     parse_options(argc, argv);
     fatal_ignore_sigpipe();
 
-    daemon_become_new_user(false);
+    daemon_become_new_user(false, false);
     if (optind >= argc) {
         ovs_fatal(0, "missing command name; use --help for help");
     }
@@ -1392,7 +1392,7 @@  do_monitor__(struct jsonrpc *rpc, const char *database,
 
     daemon_save_fd(STDOUT_FILENO);
     daemon_save_fd(STDERR_FILENO);
-    daemonize_start(false);
+    daemonize_start(false, false);
     if (get_detach()) {
         int error;
 
@@ -2276,7 +2276,7 @@  do_lock(struct jsonrpc *rpc, const char *method, const char *lock)
                                         getting a reply of the previous
                                         request. */
     daemon_save_fd(STDOUT_FILENO);
-    daemonize_start(false);
+    daemonize_start(false, false);
     lock_req_init(&lock_req, method, lock);
 
     if (get_detach()) {
diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
index 33ca4910d7..4fea2dbda7 100644
--- a/ovsdb/ovsdb-server.c
+++ b/ovsdb/ovsdb-server.c
@@ -341,7 +341,7 @@  main(int argc, char *argv[])
                   &run_command, &sync_from, &sync_exclude, &active);
     is_backup = sync_from && !active;
 
-    daemon_become_new_user(false);
+    daemon_become_new_user(false, false);
 
     /* Create and initialize 'config_tmpfile' as a temporary file to hold
      * ovsdb-server's most basic configuration, and then save our initial
@@ -359,7 +359,7 @@  main(int argc, char *argv[])
     save_config__(config_tmpfile, &remotes, &db_filenames, sync_from,
                   sync_exclude, is_backup);
 
-    daemonize_start(false);
+    daemonize_start(false, false);
 
     /* Load the saved config. */
     load_config(config_tmpfile, &remotes, &db_filenames, &sync_from,
diff --git a/tests/test-netflow.c b/tests/test-netflow.c
index d2322d4509..7f89cfcae0 100644
--- a/tests/test-netflow.c
+++ b/tests/test-netflow.c
@@ -195,7 +195,7 @@  test_netflow_main(int argc, char *argv[])
     }
 
     daemon_save_fd(STDOUT_FILENO);
-    daemonize_start(false);
+    daemonize_start(false, false);
 
     error = unixctl_server_create(NULL, &server);
     if (error) {
diff --git a/tests/test-sflow.c b/tests/test-sflow.c
index 460d4d6c54..3c617bdd16 100644
--- a/tests/test-sflow.c
+++ b/tests/test-sflow.c
@@ -709,7 +709,7 @@  test_sflow_main(int argc, char *argv[])
     }
 
     daemon_save_fd(STDOUT_FILENO);
-    daemonize_start(false);
+    daemonize_start(false, false);
 
     error = unixctl_server_create(NULL, &server);
     if (error) {
diff --git a/tests/test-unixctl.c b/tests/test-unixctl.c
index 3eadf54cd9..9e89827895 100644
--- a/tests/test-unixctl.c
+++ b/tests/test-unixctl.c
@@ -83,7 +83,7 @@  test_unixctl_main(int argc, char *argv[])
     fatal_ignore_sigpipe();
     parse_options(&argc, &argv, &unixctl_path);
 
-    daemonize_start(false);
+    daemonize_start(false, false);
     int retval = unixctl_server_create(unixctl_path, &unixctl);
     if (retval) {
         exit(EXIT_FAILURE);
diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c
index eabec18a36..f81f5f759a 100644
--- a/utilities/ovs-ofctl.c
+++ b/utilities/ovs-ofctl.c
@@ -173,7 +173,7 @@  main(int argc, char *argv[])
     ctx.argc = argc - optind;
     ctx.argv = argv + optind;
 
-    daemon_become_new_user(false);
+    daemon_become_new_user(false, false);
     if (read_only) {
         ovs_cmdl_run_command_read_only(&ctx, get_all_commands());
     } else {
@@ -2127,7 +2127,7 @@  monitor_vconn(struct vconn *vconn, bool reply_to_echo_requests,
     int error;
 
     daemon_save_fd(STDERR_FILENO);
-    daemonize_start(false);
+    daemonize_start(false, false);
     error = unixctl_server_create(unixctl_path, &server);
     if (error) {
         ovs_fatal(error, "failed to create unixctl server");
diff --git a/utilities/ovs-testcontroller.c b/utilities/ovs-testcontroller.c
index b489ff5fc7..9f2fbfdf51 100644
--- a/utilities/ovs-testcontroller.c
+++ b/utilities/ovs-testcontroller.c
@@ -109,7 +109,7 @@  main(int argc, char *argv[])
     parse_options(argc, argv);
     fatal_ignore_sigpipe();
 
-    daemon_become_new_user(false);
+    daemon_become_new_user(false, false);
 
     if (argc - optind < 1) {
         ovs_fatal(0, "at least one vconn argument required; "
@@ -148,7 +148,7 @@  main(int argc, char *argv[])
         ovs_fatal(0, "no active or passive switch connections");
     }
 
-    daemonize_start(false);
+    daemonize_start(false, false);
 
     retval = unixctl_server_create(unixctl_path, &unixctl);
     if (retval) {
diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in
index 9569265fcb..a6a4a24606 100644
--- a/vswitchd/ovs-vswitchd.8.in
+++ b/vswitchd/ovs-vswitchd.8.in
@@ -81,6 +81,14 @@  unavailable or unsuccessful.
 .SS "DPDK Options"
 For details on initializing \fBovs\-vswitchd\fR to use DPDK ports,
 refer to the documentation or \fBovs\-vswitchd.conf.db\fR(5).
+.SS "DPDK HW Access Options"
+.IP "\fB\-\-hw\-rawio\-access\fR"
+Tells \fBovs\-vswitchd\fR to retain the \fBCAP_SYS_RAWIO\fR capability,
+to allow userspace drivers access to raw hardware memory.  This will
+also allow the \fBovs\-vswitchd\fR daemon to call \fBiopl()\fR and
+\fBioperm()\fR functions to set port access.  This is a \fBvery\fR
+powerful capability, so generally only enable as needed for specific
+hardware.
 .SS "Daemon Options"
 .ds DD \
 \fBovs\-vswitchd\fR detaches only after it has connected to the \
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
index 407bfc60eb..f62d1ad751 100644
--- a/vswitchd/ovs-vswitchd.c
+++ b/vswitchd/ovs-vswitchd.c
@@ -60,6 +60,9 @@  VLOG_DEFINE_THIS_MODULE(vswitchd);
  * the kernel from paging any of its memory to disk. */
 static bool want_mlockall;
 
+/* --hw-access: If set, retains CAP_SYS_RAWIO privileges.  */
+static bool hw_access;
+
 static unixctl_cb_func ovs_vswitchd_exit;
 
 static char *parse_options(int argc, char *argv[], char **unixctl_path);
@@ -89,7 +92,7 @@  main(int argc, char *argv[])
     remote = parse_options(argc, argv, &unixctl_path);
     fatal_ignore_sigpipe();
 
-    daemonize_start(true);
+    daemonize_start(true, true);
 
     if (want_mlockall) {
 #ifdef HAVE_MLOCKALL
@@ -169,6 +172,7 @@  parse_options(int argc, char *argv[], char **unixctl_pathp)
         OPT_DPDK,
         SSL_OPTION_ENUMS,
         OPT_DUMMY_NUMA,
+        OPT_HW_ACCESS,
     };
     static const struct option long_options[] = {
         {"help",        no_argument, NULL, 'h'},
@@ -185,6 +189,7 @@  parse_options(int argc, char *argv[], char **unixctl_pathp)
         {"disable-system-route", no_argument, NULL, OPT_DISABLE_SYSTEM_ROUTE},
         {"dpdk", optional_argument, NULL, OPT_DPDK},
         {"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA},
+        {"hw-rawio-access", no_argument, NULL, OPT_HW_ACCESS},
         {NULL, 0, NULL, 0},
     };
     char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
@@ -249,6 +254,10 @@  parse_options(int argc, char *argv[], char **unixctl_pathp)
             ovs_numa_set_dummy(optarg);
             break;
 
+        case OPT_HW_ACCESS:
+            hw_access = true;
+            break;
+
         default:
             abort();
         }