diff mbox series

[focal/linux-azure] UBUNTU: SAUCE: Add sunrpc module parameters for NFSv3 nconnect

Message ID 20220125135455.14221-2-tim.gardner@canonical.com
State New
Headers show
Series [focal/linux-azure] UBUNTU: SAUCE: Add sunrpc module parameters for NFSv3 nconnect | expand

Commit Message

Tim Gardner Jan. 25, 2022, 1:54 p.m. UTC
From: Nagendra Tomar <Nagendra.Tomar@microsoft.com>

BugLink: https://bugs.launchpad.net/bugs/1958990

Add module parameters and code to allow pinning a TCP connection
to a specific server. See upstream discussion at:

https://www.spinics.net/lists/linux-nfs/msg83074.html

This patch is a consolidated and simplified backport of the logic required to
implement pinning to one connection. With the author's permission I have changed the
default sense for the module parameter 'enable_azure_nconnect' to false in
order to preserve existing behavior.

This logic isn't enabled unless the module parameter enable_azure_nconnect=true.

Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
---
 net/sunrpc/clnt.c | 237 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 235 insertions(+), 2 deletions(-)

Comments

Kleber Sacilotto de Souza Jan. 27, 2022, 2:18 p.m. UTC | #1
On 1/25/22 14:54, Tim Gardner wrote:
> From: Nagendra Tomar <Nagendra.Tomar@microsoft.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/1958990
> 
> Add module parameters and code to allow pinning a TCP connection
> to a specific server. See upstream discussion at:
> 
> https://www.spinics.net/lists/linux-nfs/msg83074.html
> 
> This patch is a consolidated and simplified backport of the logic required to
> implement pinning to one connection. With the author's permission I have changed the
> default sense for the module parameter 'enable_azure_nconnect' to false in
> order to preserve existing behavior.
> 
> This logic isn't enabled unless the module parameter enable_azure_nconnect=true.
> 
> Signed-off-by: Tim Gardner <tim.gardner@canonical.com>

The Signed-off-by of the original author is missing.

Tim, can you confirm that we could add this when applying the patches?


Thanks,
Kleber

> ---
>   net/sunrpc/clnt.c | 237 +++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 235 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index b6039642df67e..cdf4436ea9650 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -33,6 +33,23 @@
>   #include <linux/in6.h>
>   #include <linux/un.h>
>   
> +/*
> + * Note #1:
> + * Accessing NFS structures inside sunrpc code is layering violation, but
> + * that's the best we can do w/o making changes to existing structures,
> + * which would prevent the updated module from being loaded into existing
> + * pre-built kernels.
> + *
> + * Note #2:
> + * We define __LINUX_NFSACL_H to prevent nfsacl.h from being included o/w
> + * some of the rpc* methods get different modversion than the kernel, due
> + * to some nfs acl structures being forward declared.
> + */
> +#define __LINUX_NFSACL_H
> +#include <linux/nfs_fs.h>
> +#undef ifdebug
> +#define ifdebug(fac)		if (0)
> +
>   #include <linux/sunrpc/clnt.h>
>   #include <linux/sunrpc/addr.h>
>   #include <linux/sunrpc/rpc_pipe_fs.h>
> @@ -51,6 +68,28 @@
>   	dprintk("RPC: %5u %s (status %d)\n", t->tk_pid,		\
>   			__func__, t->tk_status)
>   
> +/*
> + * If enable_azure_nconnect is true, RPC requests for a file are sent over
> + * one connection. RPC requests for different files may be sent over different
> + * connections.
> + */
> +static bool enable_azure_nconnect __read_mostly = false;
> +module_param(enable_azure_nconnect, bool, 0644);
> +MODULE_PARM_DESC(enable_azure_nconnect,
> +	"Send RPC requests for one file over one connection (requests for different files go over different connections)");
> +
> +/*
> + * By default read requests to one file are sent over one connection.
> + * azure_nconnect_readscaling module parameter can be used to control that
> + * behavior. By distributing READ RPCs for one file over multiple connections
> + * we can get much higher single-file READ throughput. This can be used if
> + * we have a readonly mount or if files are mostly read and not written.
> + */
> +static bool azure_nconnect_readscaling __read_mostly = false;
> +module_param(azure_nconnect_readscaling, bool, 0644);
> +MODULE_PARM_DESC(azure_nconnect_readscaling,
> +	"Scale single file reads by sending them round-robin over all the available connections. Use only for readonly mounts or for read-mostly workloads");
> +
>   /*
>    * All RPC clients are linked into this list
>    */
> @@ -1055,6 +1094,200 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt)
>   	return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
>   }
>   
> +/*
> + * For the given rpc_task, compute the hash for the target filehandle.
> + */
> +static u32
> +rpc_task_fh_hash(const struct rpc_task *task)
> +{
> +	const struct rpc_message *rpc_message = &task->tk_msg;
> +	const struct rpc_procinfo *rpc_proc = rpc_message->rpc_proc;
> +	const u32 p_proc = (rpc_proc ? rpc_proc->p_proc : NFS3PROC_NULL);
> +	const struct nfs_fh *fh = NULL;
> +
> +	switch (p_proc) {
> +	case NFS3PROC_GETATTR:
> +	{
> +		fh = rpc_message->rpc_argp;
> +		break;
> +	}
> +	case NFS3PROC_SETATTR:
> +	{
> +		const struct nfs3_sattrargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_LOOKUP:
> +	case NFS3PROC_RMDIR:
> +	{
> +		const struct nfs3_diropargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_ACCESS:
> +	{
> +		const struct nfs3_accessargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_READLINK:
> +	{
> +		const struct nfs3_readlinkargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_READ:
> +	case NFS3PROC_WRITE:
> +	{
> +		const struct nfs_pgio_args *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_COMMIT:
> +	{
> +		const struct nfs_commitargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_CREATE:
> +	{
> +		const struct nfs3_createargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_MKDIR:
> +	{
> +		const struct nfs3_mkdirargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_SYMLINK:
> +	{
> +		const struct nfs3_symlinkargs *args = rpc_message->rpc_argp;
> +		fh = args->fromfh;
> +		break;
> +	}
> +	case NFS3PROC_MKNOD:
> +	{
> +		const struct nfs3_mknodargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_REMOVE:
> +	{
> +		const struct nfs_removeargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	case NFS3PROC_RENAME:
> +	{
> +		const struct nfs_renameargs *args = rpc_message->rpc_argp;
> +		/*
> +		 * In case of cross-dir rename, we have to choose between
> +		 * old and new dir to have the updated cache. We prefer
> +		 * new_dir as that's where the user expects the file to
> +		 * show up.
> +		 */
> +		fh = args->new_dir;
> +		if (!fh)
> +			fh = args->old_dir;
> +		break;
> +	}
> +	case NFS3PROC_LINK:
> +	{
> +		const struct nfs3_linkargs *args = rpc_message->rpc_argp;
> +		fh = args->tofh;
> +		break;
> +	}
> +	case NFS3PROC_READDIR:
> +	case NFS3PROC_READDIRPLUS:
> +	{
> +		const struct nfs3_readdirargs *args = rpc_message->rpc_argp;
> +		fh = args->fh;
> +		break;
> +	}
> +	/*
> +	 * Rest are not targeted to a file and map to the first
> +	 * transport connection.
> +	 */
> +	}
> +
> +	return (fh ? jhash(fh->data, fh->size, 0) : 0);
> +}
> +
> +static
> +bool xprt_is_active(const struct rpc_xprt *xprt)
> +{
> +	return kref_read(&xprt->kref) != 0;
> +}
> +
> +/*
> + * For the given rpc_task return the hashed xprt to use.
> + * This will ensure RPCs targeted to the same file get the same xprt.
> + */
> +static struct rpc_xprt *
> +rpc_task_get_hashed_xprt(struct rpc_clnt *clnt, const struct rpc_task *task)
> +{
> +	const struct rpc_xprt_switch *xps = NULL;
> +	struct rpc_xprt *xprt = NULL;
> +	const u32 hash = rpc_task_fh_hash(task);
> +
> +	rcu_read_lock();
> +	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
> +
> +	if (xps && hash) {
> +		const struct list_head *head = &xps->xps_xprt_list;
> +		struct rpc_xprt *pos;
> +		const u32 nactive = READ_ONCE(xps->xps_nactive);
> +		const u32 xprt_idx = (hash % nactive);
> +		u32 idx = 0;
> +
> +		list_for_each_entry_rcu(pos, head, xprt_switch) {
> +			if (xprt_idx > idx++)
> +				continue;
> +			if (xprt_is_active(pos)) {
> +				xprt = xprt_get(pos);
> +				break;
> +			} else {
> +				if (printk_ratelimit())
> +					printk(KERN_ERR "!xprt_is_active idx=%u, xprt_idx=%u, hash=%u\n",
> +						idx, xprt_idx, hash);
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * Use first transport, if not found any, or if RPC is not targeted
> +	 * to a specific file, e.g., FSINFO.
> +	 */
> +	if (!xprt)
> +		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
> +	rcu_read_unlock();
> +
> +	return rpc_task_get_xprt(clnt, xprt);
> +}
> +
> +static struct rpc_xprt *
> +rpc_task_get_azure_xprt(struct rpc_clnt *clnt, const struct rpc_task *task)
> +{
> +	/*
> +	 * Use special azure nconnect only for NFSv3 RPC requests.
> +	 */
> +	if (clnt->cl_prog != NFS_PROGRAM || clnt->cl_vers != NFS3_VERSION)
> +		return rpc_task_get_next_xprt(clnt);
> +
> +	if (enable_azure_nconnect) {
> +		if (azure_nconnect_readscaling) {
> +			const struct rpc_procinfo *rpc_proc =
> +				task->tk_msg.rpc_proc;
> +			if (rpc_proc && rpc_proc->p_proc == NFS3PROC_READ)
> +				return rpc_task_get_next_xprt(clnt);
> +		}
> +		return rpc_task_get_hashed_xprt(clnt, task);
> +	} else
> +		return rpc_task_get_next_xprt(clnt);
> +}
> +
>   static
>   void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
>   {
> @@ -1063,7 +1296,7 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
>   	if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
>   		task->tk_xprt = rpc_task_get_first_xprt(clnt);
>   	else
> -		task->tk_xprt = rpc_task_get_next_xprt(clnt);
> +		task->tk_xprt = rpc_task_get_azure_xprt(clnt, task);
>   }
>   
>   static
> @@ -1123,8 +1356,8 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
>   
>   	task = rpc_new_task(task_setup_data);
>   
> -	rpc_task_set_client(task, task_setup_data->rpc_client);
>   	rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
> +	rpc_task_set_client(task, task_setup_data->rpc_client);
>   
>   	if (task->tk_action == NULL)
>   		rpc_call_start(task);
Tim Gardner Jan. 27, 2022, 3:36 p.m. UTC | #2
On 1/27/22 7:18 AM, Kleber Souza wrote:
> On 1/25/22 14:54, Tim Gardner wrote:
>> From: Nagendra Tomar <Nagendra.Tomar@microsoft.com>
>>
>> BugLink: https://bugs.launchpad.net/bugs/1958990
>>
>> Add module parameters and code to allow pinning a TCP connection
>> to a specific server. See upstream discussion at:
>>
>> https://www.spinics.net/lists/linux-nfs/msg83074.html
>>
>> This patch is a consolidated and simplified backport of the logic 
>> required to
>> implement pinning to one connection. With the author's permission I 
>> have changed the
>> default sense for the module parameter 'enable_azure_nconnect' to 
>> false in
>> order to preserve existing behavior.
>>
>> This logic isn't enabled unless the module parameter 
>> enable_azure_nconnect=true.
>>
>> Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
> 
> The Signed-off-by of the original author is missing.
> 
> Tim, can you confirm that we could add this when applying the patches?
> 

I was hesitant to do that without the author's permission. All I have is 
proof that Nagendra was the author. I've added my S-o-b since I've 
thorougly reviewed the patch as well as made minor modifications.

> 
> Thanks,
> Kleber
> 
>> ---
>>   net/sunrpc/clnt.c | 237 +++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 235 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>> index b6039642df67e..cdf4436ea9650 100644
>> --- a/net/sunrpc/clnt.c
>> +++ b/net/sunrpc/clnt.c
>> @@ -33,6 +33,23 @@
>>   #include <linux/in6.h>
>>   #include <linux/un.h>
>> +/*
>> + * Note #1:
>> + * Accessing NFS structures inside sunrpc code is layering violation, 
>> but
>> + * that's the best we can do w/o making changes to existing structures,
>> + * which would prevent the updated module from being loaded into 
>> existing
>> + * pre-built kernels.
>> + *
>> + * Note #2:
>> + * We define __LINUX_NFSACL_H to prevent nfsacl.h from being included 
>> o/w
>> + * some of the rpc* methods get different modversion than the kernel, 
>> due
>> + * to some nfs acl structures being forward declared.
>> + */
>> +#define __LINUX_NFSACL_H
>> +#include <linux/nfs_fs.h>
>> +#undef ifdebug
>> +#define ifdebug(fac)        if (0)
>> +
>>   #include <linux/sunrpc/clnt.h>
>>   #include <linux/sunrpc/addr.h>
>>   #include <linux/sunrpc/rpc_pipe_fs.h>
>> @@ -51,6 +68,28 @@
>>       dprintk("RPC: %5u %s (status %d)\n", t->tk_pid,        \
>>               __func__, t->tk_status)
>> +/*
>> + * If enable_azure_nconnect is true, RPC requests for a file are sent 
>> over
>> + * one connection. RPC requests for different files may be sent over 
>> different
>> + * connections.
>> + */
>> +static bool enable_azure_nconnect __read_mostly = false;
>> +module_param(enable_azure_nconnect, bool, 0644);
>> +MODULE_PARM_DESC(enable_azure_nconnect,
>> +    "Send RPC requests for one file over one connection (requests for 
>> different files go over different connections)");
>> +
>> +/*
>> + * By default read requests to one file are sent over one connection.
>> + * azure_nconnect_readscaling module parameter can be used to control 
>> that
>> + * behavior. By distributing READ RPCs for one file over multiple 
>> connections
>> + * we can get much higher single-file READ throughput. This can be 
>> used if
>> + * we have a readonly mount or if files are mostly read and not written.
>> + */
>> +static bool azure_nconnect_readscaling __read_mostly = false;
>> +module_param(azure_nconnect_readscaling, bool, 0644);
>> +MODULE_PARM_DESC(azure_nconnect_readscaling,
>> +    "Scale single file reads by sending them round-robin over all the 
>> available connections. Use only for readonly mounts or for read-mostly 
>> workloads");
>> +
>>   /*
>>    * All RPC clients are linked into this list
>>    */
>> @@ -1055,6 +1094,200 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt)
>>       return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
>>   }
>> +/*
>> + * For the given rpc_task, compute the hash for the target filehandle.
>> + */
>> +static u32
>> +rpc_task_fh_hash(const struct rpc_task *task)
>> +{
>> +    const struct rpc_message *rpc_message = &task->tk_msg;
>> +    const struct rpc_procinfo *rpc_proc = rpc_message->rpc_proc;
>> +    const u32 p_proc = (rpc_proc ? rpc_proc->p_proc : NFS3PROC_NULL);
>> +    const struct nfs_fh *fh = NULL;
>> +
>> +    switch (p_proc) {
>> +    case NFS3PROC_GETATTR:
>> +    {
>> +        fh = rpc_message->rpc_argp;
>> +        break;
>> +    }
>> +    case NFS3PROC_SETATTR:
>> +    {
>> +        const struct nfs3_sattrargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_LOOKUP:
>> +    case NFS3PROC_RMDIR:
>> +    {
>> +        const struct nfs3_diropargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_ACCESS:
>> +    {
>> +        const struct nfs3_accessargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_READLINK:
>> +    {
>> +        const struct nfs3_readlinkargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_READ:
>> +    case NFS3PROC_WRITE:
>> +    {
>> +        const struct nfs_pgio_args *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_COMMIT:
>> +    {
>> +        const struct nfs_commitargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_CREATE:
>> +    {
>> +        const struct nfs3_createargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_MKDIR:
>> +    {
>> +        const struct nfs3_mkdirargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_SYMLINK:
>> +    {
>> +        const struct nfs3_symlinkargs *args = rpc_message->rpc_argp;
>> +        fh = args->fromfh;
>> +        break;
>> +    }
>> +    case NFS3PROC_MKNOD:
>> +    {
>> +        const struct nfs3_mknodargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_REMOVE:
>> +    {
>> +        const struct nfs_removeargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    case NFS3PROC_RENAME:
>> +    {
>> +        const struct nfs_renameargs *args = rpc_message->rpc_argp;
>> +        /*
>> +         * In case of cross-dir rename, we have to choose between
>> +         * old and new dir to have the updated cache. We prefer
>> +         * new_dir as that's where the user expects the file to
>> +         * show up.
>> +         */
>> +        fh = args->new_dir;
>> +        if (!fh)
>> +            fh = args->old_dir;
>> +        break;
>> +    }
>> +    case NFS3PROC_LINK:
>> +    {
>> +        const struct nfs3_linkargs *args = rpc_message->rpc_argp;
>> +        fh = args->tofh;
>> +        break;
>> +    }
>> +    case NFS3PROC_READDIR:
>> +    case NFS3PROC_READDIRPLUS:
>> +    {
>> +        const struct nfs3_readdirargs *args = rpc_message->rpc_argp;
>> +        fh = args->fh;
>> +        break;
>> +    }
>> +    /*
>> +     * Rest are not targeted to a file and map to the first
>> +     * transport connection.
>> +     */
>> +    }
>> +
>> +    return (fh ? jhash(fh->data, fh->size, 0) : 0);
>> +}
>> +
>> +static
>> +bool xprt_is_active(const struct rpc_xprt *xprt)
>> +{
>> +    return kref_read(&xprt->kref) != 0;
>> +}
>> +
>> +/*
>> + * For the given rpc_task return the hashed xprt to use.
>> + * This will ensure RPCs targeted to the same file get the same xprt.
>> + */
>> +static struct rpc_xprt *
>> +rpc_task_get_hashed_xprt(struct rpc_clnt *clnt, const struct rpc_task 
>> *task)
>> +{
>> +    const struct rpc_xprt_switch *xps = NULL;
>> +    struct rpc_xprt *xprt = NULL;
>> +    const u32 hash = rpc_task_fh_hash(task);
>> +
>> +    rcu_read_lock();
>> +    xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
>> +
>> +    if (xps && hash) {
>> +        const struct list_head *head = &xps->xps_xprt_list;
>> +        struct rpc_xprt *pos;
>> +        const u32 nactive = READ_ONCE(xps->xps_nactive);
>> +        const u32 xprt_idx = (hash % nactive);
>> +        u32 idx = 0;
>> +
>> +        list_for_each_entry_rcu(pos, head, xprt_switch) {
>> +            if (xprt_idx > idx++)
>> +                continue;
>> +            if (xprt_is_active(pos)) {
>> +                xprt = xprt_get(pos);
>> +                break;
>> +            } else {
>> +                if (printk_ratelimit())
>> +                    printk(KERN_ERR "!xprt_is_active idx=%u, 
>> xprt_idx=%u, hash=%u\n",
>> +                        idx, xprt_idx, hash);
>> +            }
>> +        }
>> +    }
>> +
>> +    /*
>> +     * Use first transport, if not found any, or if RPC is not targeted
>> +     * to a specific file, e.g., FSINFO.
>> +     */
>> +    if (!xprt)
>> +        xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
>> +    rcu_read_unlock();
>> +
>> +    return rpc_task_get_xprt(clnt, xprt);
>> +}
>> +
>> +static struct rpc_xprt *
>> +rpc_task_get_azure_xprt(struct rpc_clnt *clnt, const struct rpc_task 
>> *task)
>> +{
>> +    /*
>> +     * Use special azure nconnect only for NFSv3 RPC requests.
>> +     */
>> +    if (clnt->cl_prog != NFS_PROGRAM || clnt->cl_vers != NFS3_VERSION)
>> +        return rpc_task_get_next_xprt(clnt);
>> +
>> +    if (enable_azure_nconnect) {
>> +        if (azure_nconnect_readscaling) {
>> +            const struct rpc_procinfo *rpc_proc =
>> +                task->tk_msg.rpc_proc;
>> +            if (rpc_proc && rpc_proc->p_proc == NFS3PROC_READ)
>> +                return rpc_task_get_next_xprt(clnt);
>> +        }
>> +        return rpc_task_get_hashed_xprt(clnt, task);
>> +    } else
>> +        return rpc_task_get_next_xprt(clnt);
>> +}
>> +
>>   static
>>   void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt 
>> *clnt)
>>   {
>> @@ -1063,7 +1296,7 @@ void rpc_task_set_transport(struct rpc_task 
>> *task, struct rpc_clnt *clnt)
>>       if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
>>           task->tk_xprt = rpc_task_get_first_xprt(clnt);
>>       else
>> -        task->tk_xprt = rpc_task_get_next_xprt(clnt);
>> +        task->tk_xprt = rpc_task_get_azure_xprt(clnt, task);
>>   }
>>   static
>> @@ -1123,8 +1356,8 @@ struct rpc_task *rpc_run_task(const struct 
>> rpc_task_setup *task_setup_data)
>>       task = rpc_new_task(task_setup_data);
>> -    rpc_task_set_client(task, task_setup_data->rpc_client);
>>       rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
>> +    rpc_task_set_client(task, task_setup_data->rpc_client);
>>       if (task->tk_action == NULL)
>>           rpc_call_start(task);
>
Krzysztof Kozlowski Jan. 28, 2022, 10:40 a.m. UTC | #3
On 25/01/2022 14:54, Tim Gardner wrote:
> From: Nagendra Tomar <Nagendra.Tomar@microsoft.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/1958990
> 
> Add module parameters and code to allow pinning a TCP connection
> to a specific server. See upstream discussion at:
> 
> https://www.spinics.net/lists/linux-nfs/msg83074.html
> 
> This patch is a consolidated and simplified backport of the logic required to
> implement pinning to one connection. With the author's permission I have changed the
> default sense for the module parameter 'enable_azure_nconnect' to false in
> order to preserve existing behavior.
> 
> This logic isn't enabled unless the module parameter enable_azure_nconnect=true.

I understand why we want this patch in linux-azure kernel but this does
not justify taking it. Microsoft should work on this with upstream and
we should take either upstream patch or upstream-candidate-ready patch.

Not a sauce hacky-wacky-workaround which will bring only pain to us in
it's maintenance. Imagine next `cranky rebase`. And next, and the
following... If we do not push back on such stuff, they might never end
in upstream.

That said, understanding the business implications, I am against the
patch but I won't NAK it. :)

> 
> Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
> ---
>  net/sunrpc/clnt.c | 237 +++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 235 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index b6039642df67e..cdf4436ea9650 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -33,6 +33,23 @@
>  #include <linux/in6.h>
>  #include <linux/un.h>
>  

Best regards,
Krzysztof
diff mbox series

Patch

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b6039642df67e..cdf4436ea9650 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -33,6 +33,23 @@ 
 #include <linux/in6.h>
 #include <linux/un.h>
 
+/*
+ * Note #1:
+ * Accessing NFS structures inside sunrpc code is layering violation, but
+ * that's the best we can do w/o making changes to existing structures,
+ * which would prevent the updated module from being loaded into existing
+ * pre-built kernels.
+ *
+ * Note #2:
+ * We define __LINUX_NFSACL_H to prevent nfsacl.h from being included o/w
+ * some of the rpc* methods get different modversion than the kernel, due
+ * to some nfs acl structures being forward declared.
+ */
+#define __LINUX_NFSACL_H
+#include <linux/nfs_fs.h>
+#undef ifdebug
+#define ifdebug(fac)		if (0)
+
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
@@ -51,6 +68,28 @@ 
 	dprintk("RPC: %5u %s (status %d)\n", t->tk_pid,		\
 			__func__, t->tk_status)
 
+/*
+ * If enable_azure_nconnect is true, RPC requests for a file are sent over
+ * one connection. RPC requests for different files may be sent over different
+ * connections.
+ */
+static bool enable_azure_nconnect __read_mostly = false;
+module_param(enable_azure_nconnect, bool, 0644);
+MODULE_PARM_DESC(enable_azure_nconnect,
+	"Send RPC requests for one file over one connection (requests for different files go over different connections)");
+
+/*
+ * By default read requests to one file are sent over one connection.
+ * azure_nconnect_readscaling module parameter can be used to control that
+ * behavior. By distributing READ RPCs for one file over multiple connections
+ * we can get much higher single-file READ throughput. This can be used if
+ * we have a readonly mount or if files are mostly read and not written.
+ */
+static bool azure_nconnect_readscaling __read_mostly = false;
+module_param(azure_nconnect_readscaling, bool, 0644);
+MODULE_PARM_DESC(azure_nconnect_readscaling,
+	"Scale single file reads by sending them round-robin over all the available connections. Use only for readonly mounts or for read-mostly workloads");
+
 /*
  * All RPC clients are linked into this list
  */
@@ -1055,6 +1094,200 @@  rpc_task_get_next_xprt(struct rpc_clnt *clnt)
 	return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
 }
 
+/*
+ * For the given rpc_task, compute the hash for the target filehandle.
+ */
+static u32
+rpc_task_fh_hash(const struct rpc_task *task)
+{
+	const struct rpc_message *rpc_message = &task->tk_msg;
+	const struct rpc_procinfo *rpc_proc = rpc_message->rpc_proc;
+	const u32 p_proc = (rpc_proc ? rpc_proc->p_proc : NFS3PROC_NULL);
+	const struct nfs_fh *fh = NULL;
+
+	switch (p_proc) {
+	case NFS3PROC_GETATTR:
+	{
+		fh = rpc_message->rpc_argp;
+		break;
+	}
+	case NFS3PROC_SETATTR:
+	{
+		const struct nfs3_sattrargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_LOOKUP:
+	case NFS3PROC_RMDIR:
+	{
+		const struct nfs3_diropargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_ACCESS:
+	{
+		const struct nfs3_accessargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_READLINK:
+	{
+		const struct nfs3_readlinkargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_READ:
+	case NFS3PROC_WRITE:
+	{
+		const struct nfs_pgio_args *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_COMMIT:
+	{
+		const struct nfs_commitargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_CREATE:
+	{
+		const struct nfs3_createargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_MKDIR:
+	{
+		const struct nfs3_mkdirargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_SYMLINK:
+	{
+		const struct nfs3_symlinkargs *args = rpc_message->rpc_argp;
+		fh = args->fromfh;
+		break;
+	}
+	case NFS3PROC_MKNOD:
+	{
+		const struct nfs3_mknodargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_REMOVE:
+	{
+		const struct nfs_removeargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	case NFS3PROC_RENAME:
+	{
+		const struct nfs_renameargs *args = rpc_message->rpc_argp;
+		/*
+		 * In case of cross-dir rename, we have to choose between
+		 * old and new dir to have the updated cache. We prefer
+		 * new_dir as that's where the user expects the file to
+		 * show up.
+		 */
+		fh = args->new_dir;
+		if (!fh)
+			fh = args->old_dir;
+		break;
+	}
+	case NFS3PROC_LINK:
+	{
+		const struct nfs3_linkargs *args = rpc_message->rpc_argp;
+		fh = args->tofh;
+		break;
+	}
+	case NFS3PROC_READDIR:
+	case NFS3PROC_READDIRPLUS:
+	{
+		const struct nfs3_readdirargs *args = rpc_message->rpc_argp;
+		fh = args->fh;
+		break;
+	}
+	/*
+	 * Rest are not targeted to a file and map to the first
+	 * transport connection.
+	 */
+	}
+
+	return (fh ? jhash(fh->data, fh->size, 0) : 0);
+}
+
+static
+bool xprt_is_active(const struct rpc_xprt *xprt)
+{
+	return kref_read(&xprt->kref) != 0;
+}
+
+/*
+ * For the given rpc_task return the hashed xprt to use.
+ * This will ensure RPCs targeted to the same file get the same xprt.
+ */
+static struct rpc_xprt *
+rpc_task_get_hashed_xprt(struct rpc_clnt *clnt, const struct rpc_task *task)
+{
+	const struct rpc_xprt_switch *xps = NULL;
+	struct rpc_xprt *xprt = NULL;
+	const u32 hash = rpc_task_fh_hash(task);
+
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+
+	if (xps && hash) {
+		const struct list_head *head = &xps->xps_xprt_list;
+		struct rpc_xprt *pos;
+		const u32 nactive = READ_ONCE(xps->xps_nactive);
+		const u32 xprt_idx = (hash % nactive);
+		u32 idx = 0;
+
+		list_for_each_entry_rcu(pos, head, xprt_switch) {
+			if (xprt_idx > idx++)
+				continue;
+			if (xprt_is_active(pos)) {
+				xprt = xprt_get(pos);
+				break;
+			} else {
+				if (printk_ratelimit())
+					printk(KERN_ERR "!xprt_is_active idx=%u, xprt_idx=%u, hash=%u\n",
+						idx, xprt_idx, hash);
+			}
+		}
+	}
+
+	/*
+	 * Use first transport, if not found any, or if RPC is not targeted
+	 * to a specific file, e.g., FSINFO.
+	 */
+	if (!xprt)
+		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	rcu_read_unlock();
+
+	return rpc_task_get_xprt(clnt, xprt);
+}
+
+static struct rpc_xprt *
+rpc_task_get_azure_xprt(struct rpc_clnt *clnt, const struct rpc_task *task)
+{
+	/*
+	 * Use special azure nconnect only for NFSv3 RPC requests.
+	 */
+	if (clnt->cl_prog != NFS_PROGRAM || clnt->cl_vers != NFS3_VERSION)
+		return rpc_task_get_next_xprt(clnt);
+
+	if (enable_azure_nconnect) {
+		if (azure_nconnect_readscaling) {
+			const struct rpc_procinfo *rpc_proc =
+				task->tk_msg.rpc_proc;
+			if (rpc_proc && rpc_proc->p_proc == NFS3PROC_READ)
+				return rpc_task_get_next_xprt(clnt);
+		}
+		return rpc_task_get_hashed_xprt(clnt, task);
+	} else
+		return rpc_task_get_next_xprt(clnt);
+}
+
 static
 void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
@@ -1063,7 +1296,7 @@  void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 	if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
 		task->tk_xprt = rpc_task_get_first_xprt(clnt);
 	else
-		task->tk_xprt = rpc_task_get_next_xprt(clnt);
+		task->tk_xprt = rpc_task_get_azure_xprt(clnt, task);
 }
 
 static
@@ -1123,8 +1356,8 @@  struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
 
 	task = rpc_new_task(task_setup_data);
 
-	rpc_task_set_client(task, task_setup_data->rpc_client);
 	rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
+	rpc_task_set_client(task, task_setup_data->rpc_client);
 
 	if (task->tk_action == NULL)
 		rpc_call_start(task);