diff mbox

[01/11] SYSCTL: export root and set handling routines

Message ID 20111214104449.3991.61989.stgit@localhost6.localdomain6
State Not Applicable, archived
Delegated to: David Miller
Headers show

Commit Message

Stanislav Kinsbursky Dec. 14, 2011, 11:44 a.m. UTC
These routines are required for making SUNRPC sysctl's per network namespace
context.

Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>

---
 include/linux/sysctl.h |    1 +
 kernel/sysctl.c        |   11 +++++++++++
 2 files changed, 12 insertions(+), 0 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric W. Biederman Dec. 17, 2011, 10:25 p.m. UTC | #1
Stanislav Kinsbursky <skinsbursky@parallels.com> writes:

> These routines are required for making SUNRPC sysctl's per network namespace
> context.

Why does sunrpc require it's own sysctl root?  You should be able to use
the generic per network namespace root and call it good.

What makes register_net_sysctl_table and register_net_sysctl_ro_table
unsuitable for sunrpc.  I skimmed through your patches and I haven't
seen anything obvious.

Eric


> Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
>
> ---
>  include/linux/sysctl.h |    1 +
>  kernel/sysctl.c        |   11 +++++++++++
>  2 files changed, 12 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 703cfa3..be586a9 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -1084,6 +1084,7 @@ struct ctl_path {
>  };
>  
>  void register_sysctl_root(struct ctl_table_root *root);
> +void unregister_sysctl_root(struct ctl_table_root *root);
>  struct ctl_table_header *__register_sysctl_paths(
>  	struct ctl_table_root *root, struct nsproxy *namespaces,
>  	const struct ctl_path *path, struct ctl_table *table);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index ae27196..fb016a9 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1700,6 +1700,15 @@ void register_sysctl_root(struct ctl_table_root *root)
>  	list_add_tail(&root->root_list, &sysctl_table_root.root_list);
>  	spin_unlock(&sysctl_lock);
>  }
> +EXPORT_SYMBOL_GPL(register_sysctl_root);
> +
> +void unregister_sysctl_root(struct ctl_table_root *root)
> +{
> +	spin_lock(&sysctl_lock);
> +	list_del(&root->root_list);
> +	spin_unlock(&sysctl_lock);
> +}
> +EXPORT_SYMBOL_GPL(unregister_sysctl_root);
>  
>  /*
>   * sysctl_perm does NOT grant the superuser all rights automatically, because
> @@ -1925,6 +1934,7 @@ struct ctl_table_header *__register_sysctl_paths(
>  
>  	return header;
>  }
> +EXPORT_SYMBOL_GPL(__register_sysctl_paths);
>  
>  /**
>   * register_sysctl_table_path - register a sysctl table hierarchy
> @@ -2007,6 +2017,7 @@ void setup_sysctl_set(struct ctl_table_set *p,
>  	p->parent = parent ? parent : &sysctl_table_root.default_set;
>  	p->is_seen = is_seen;
>  }
> +EXPORT_SYMBOL_GPL(setup_sysctl_set);
>  
>  #else /* !CONFIG_SYSCTL */
>  struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Dec. 19, 2011, 8:56 a.m. UTC | #2
18.12.2011 02:25, Eric W. Biederman пишет:
> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>
>> These routines are required for making SUNRPC sysctl's per network namespace
>> context.
>
> Why does sunrpc require it's own sysctl root?  You should be able to use
> the generic per network namespace root and call it good.
>
> What makes register_net_sysctl_table and register_net_sysctl_ro_table
> unsuitable for sunrpc.  I skimmed through your patches and I haven't
> seen anything obvious.
>
> Eric
>

Hello, Eric. Sorry for the lack of information.
I was considering two ways how to make these sysctl per net ns:

1) Use register_net_sysctl_table and register_net_sysctl_ro_table as you 
mentioned. This was easy and cheap, but also means, than all user-space 
programs, tuning SUNRPC will be broken (since all sysctl currently located 
in"/proc/sys/sunprc/").

2) Export sysctl root creation routines and make per-net SUNRPC sysctl root. 
This approach allows to make any part of sysctl tree per namespace context and 
thus leave user-space stuff unchanged.

BTW, NFS and LockD also have it's sysctls ("/proc/sys/fs/nfs/").
And also because of them I've decided, that it would be better to export SYSCTL 
root creation routines instead of breaking compatibility for all NFS layers by 
moving all sysctl under /proc/sys/net/ directory.

Do you feel that it was a bad decision?

>
>> Signed-off-by: Stanislav Kinsbursky<skinsbursky@parallels.com>
>>
>> ---
>>   include/linux/sysctl.h |    1 +
>>   kernel/sysctl.c        |   11 +++++++++++
>>   2 files changed, 12 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
>> index 703cfa3..be586a9 100644
>> --- a/include/linux/sysctl.h
>> +++ b/include/linux/sysctl.h
>> @@ -1084,6 +1084,7 @@ struct ctl_path {
>>   };
>>
>>   void register_sysctl_root(struct ctl_table_root *root);
>> +void unregister_sysctl_root(struct ctl_table_root *root);
>>   struct ctl_table_header *__register_sysctl_paths(
>>   	struct ctl_table_root *root, struct nsproxy *namespaces,
>>   	const struct ctl_path *path, struct ctl_table *table);
>> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>> index ae27196..fb016a9 100644
>> --- a/kernel/sysctl.c
>> +++ b/kernel/sysctl.c
>> @@ -1700,6 +1700,15 @@ void register_sysctl_root(struct ctl_table_root *root)
>>   	list_add_tail(&root->root_list,&sysctl_table_root.root_list);
>>   	spin_unlock(&sysctl_lock);
>>   }
>> +EXPORT_SYMBOL_GPL(register_sysctl_root);
>> +
>> +void unregister_sysctl_root(struct ctl_table_root *root)
>> +{
>> +	spin_lock(&sysctl_lock);
>> +	list_del(&root->root_list);
>> +	spin_unlock(&sysctl_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(unregister_sysctl_root);
>>
>>   /*
>>    * sysctl_perm does NOT grant the superuser all rights automatically, because
>> @@ -1925,6 +1934,7 @@ struct ctl_table_header *__register_sysctl_paths(
>>
>>   	return header;
>>   }
>> +EXPORT_SYMBOL_GPL(__register_sysctl_paths);
>>
>>   /**
>>    * register_sysctl_table_path - register a sysctl table hierarchy
>> @@ -2007,6 +2017,7 @@ void setup_sysctl_set(struct ctl_table_set *p,
>>   	p->parent = parent ? parent :&sysctl_table_root.default_set;
>>   	p->is_seen = is_seen;
>>   }
>> +EXPORT_SYMBOL_GPL(setup_sysctl_set);
>>
>>   #else /* !CONFIG_SYSCTL */
>>   struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Dec. 19, 2011, 10:15 a.m. UTC | #3
Stanislav Kinsbursky <skinsbursky@parallels.com> writes:

> 18.12.2011 02:25, Eric W. Biederman пишет:
>> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>>
>>> These routines are required for making SUNRPC sysctl's per network namespace
>>> context.
>>
>> Why does sunrpc require it's own sysctl root?  You should be able to use
>> the generic per network namespace root and call it good.
>>
>> What makes register_net_sysctl_table and register_net_sysctl_ro_table
>> unsuitable for sunrpc.  I skimmed through your patches and I haven't
>> seen anything obvious.
>>
>> Eric
>>
>
> Hello, Eric. Sorry for the lack of information.
> I was considering two ways how to make these sysctl per net ns:
>
> 1) Use register_net_sysctl_table and register_net_sysctl_ro_table as you
> mentioned. This was easy and cheap, but also means, than all user-space
> programs, tuning SUNRPC will be broken (since all sysctl currently located
> in"/proc/sys/sunprc/").

Nope.  That is a misunderstanding.  register_net_sysctl_table works for
anything under /proc/sys.

> 2) Export sysctl root creation routines and make per-net SUNRPC sysctl
> root. This approach allows to make any part of sysctl tree per namespace context
> and thus leave user-space stuff unchanged.
>
> BTW, NFS and LockD also have it's sysctls ("/proc/sys/fs/nfs/").
> And also because of them I've decided, that it would be better to export SYSCTL
> root creation routines instead of breaking compatibility for all NFS layers by
> moving all sysctl under /proc/sys/net/ directory.
>
> Do you feel that it was a bad decision?

I think it was a misinformed decision.

I fully support not breaking userspace by moving where the sysctls files
are.  If something sounds like I am suggesting moving sysctl files there
is a miscommunication somewhere.

The concept of a sysctl root as I had envisioned it and essentially as it
is implemented was a per namespace sysctl tree.  Those sysctl trees are
then unioned together when presented to user space.  There should only
be one root per namespace.

In practice what this means is that register_net_sysctl_table should
work for any sysctl file anywhere under /proc/sys.  I think
register_net_sysctl_table is the right solution for your problem.  The
only possible caveat I can think of is you might hit Al's performance
optimizations and need to create a common empty directory first with
register_sysctl_paths.


....
That said since I am in the process of rewriting things some of this
may change a little bit, but hopefully not in ways that immediately
effect the users of register_sysctl_table.

Don't use register_net_sysctl_ro_table.   I think what the implementors
actually wanted was register_net_sysctl_table(&init_net, ...) and didn't
know it.

Don't put subdirectories in your sysctl tables.  Use a ctl_path to
specify the entire directory where the files should show up.  Generally
the code is easier to read in that form, and the code is simpler to deal
with if we don't have to worry about directories.

Don't play with the sysctl roots.  It is my intention to completely kill
them off and replace them by moving the per net sysctl tree under
/proc/<pid>/sys/.   Leaving behind symlinks in /proc/sys/net and I guess
ultimately in /proc/sys/sunrpc/ and /proc/sys/fs/nfs...  Which actually
seems to better describe your mental model.

Thank you for mentioning /proc/sys/fs/nfs.  That is a case I hadn't
thought about.  In thinking about it I see some deficiencies in my
rewrite that I need to correct before I push that code.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Dec. 19, 2011, 12:22 p.m. UTC | #4
19.12.2011 14:15, Eric W. Biederman пишет:
> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>
>> 18.12.2011 02:25, Eric W. Biederman пишет:
>>> Stanislav Kinsbursky<skinsbursky@parallels.com>   writes:
>>>
>>>> These routines are required for making SUNRPC sysctl's per network namespace
>>>> context.
>>>
>>> Why does sunrpc require it's own sysctl root?  You should be able to use
>>> the generic per network namespace root and call it good.
>>>
>>> What makes register_net_sysctl_table and register_net_sysctl_ro_table
>>> unsuitable for sunrpc.  I skimmed through your patches and I haven't
>>> seen anything obvious.
>>>
>>> Eric
>>>
>>
>> Hello, Eric. Sorry for the lack of information.
>> I was considering two ways how to make these sysctl per net ns:
>>
>> 1) Use register_net_sysctl_table and register_net_sysctl_ro_table as you
>> mentioned. This was easy and cheap, but also means, than all user-space
>> programs, tuning SUNRPC will be broken (since all sysctl currently located
>> in"/proc/sys/sunprc/").
>
> Nope.  That is a misunderstanding.  register_net_sysctl_table works for
> anything under /proc/sys.
>
>> 2) Export sysctl root creation routines and make per-net SUNRPC sysctl
>> root. This approach allows to make any part of sysctl tree per namespace context
>> and thus leave user-space stuff unchanged.
>>
>> BTW, NFS and LockD also have it's sysctls ("/proc/sys/fs/nfs/").
>> And also because of them I've decided, that it would be better to export SYSCTL
>> root creation routines instead of breaking compatibility for all NFS layers by
>> moving all sysctl under /proc/sys/net/ directory.
>>
>> Do you feel that it was a bad decision?
>
> I think it was a misinformed decision.
>
> I fully support not breaking userspace by moving where the sysctls files
> are.  If something sounds like I am suggesting moving sysctl files there
> is a miscommunication somewhere.
>
> The concept of a sysctl root as I had envisioned it and essentially as it
> is implemented was a per namespace sysctl tree.  Those sysctl trees are
> then unioned together when presented to user space.  There should only
> be one root per namespace.
>
> In practice what this means is that register_net_sysctl_table should
> work for any sysctl file anywhere under /proc/sys.  I think
> register_net_sysctl_table is the right solution for your problem.  The
> only possible caveat I can think of is you might hit Al's performance
> optimizations and need to create a common empty directory first with
> register_sysctl_paths.
>
>


Sorry, but I forgot to mention one more important goal I would like to achieve:
I want to manage sysctl's variables in context of mount owner, but not viewer one.
IOW imagine, that we have one two network namespaces: "A" and "B". Both of them 
have it's own net sysctl's root. And we have per-net sysctl "/proc/sys/var".
And for ns "A" variable was set to 0, and for "B" - to 1.
And B's "/proc/sys/var" is accessible from "A" namespace
("/chroot_path/proc/sys/var" for example).
With this configuration I want to read "1" from both namespaces:
owner "B" (/proc/sys/var) and "A" ("/chroot_path/proc/sys/var").
Looks like simple using of register_net_sysctl_table doesn't allow me this, 
because current net ns is used. And to achieve this goal I need my own sysctl 
set for SUNRPC like it was done for network namespaces.


> ....
> That said since I am in the process of rewriting things some of this
> may change a little bit, but hopefully not in ways that immediately
> effect the users of register_sysctl_table.
>
> Don't use register_net_sysctl_ro_table.   I think what the implementors
> actually wanted was register_net_sysctl_table(&init_net, ...) and didn't
> know it.
>
> Don't put subdirectories in your sysctl tables.  Use a ctl_path to
> specify the entire directory where the files should show up.  Generally
> the code is easier to read in that form, and the code is simpler to deal
> with if we don't have to worry about directories.
>
> Don't play with the sysctl roots.  It is my intention to completely kill
> them off and replace them by moving the per net sysctl tree under
> /proc/<pid>/sys/.   Leaving behind symlinks in /proc/sys/net and I guess
> ultimately in /proc/sys/sunrpc/ and /proc/sys/fs/nfs...  Which actually
> seems to better describe your mental model.
>


I'm afraid, that this approach this not allow me to achieve the goal, mentioned 
above, because current->nsproxy->net_ns will be used during lookup.
Or maybe I misunderstanding here?


> Thank you for mentioning /proc/sys/fs/nfs.  That is a case I hadn't
> thought about.  In thinking about it I see some deficiencies in my
> rewrite that I need to correct before I push that code.
>


Was glad to be usefull.


> Eric
Eric W. Biederman Dec. 19, 2011, 4:37 p.m. UTC | #5
Stanislav Kinsbursky <skinsbursky@parallels.com> writes:

>> In practice what this means is that register_net_sysctl_table should
>> work for any sysctl file anywhere under /proc/sys.  I think
>> register_net_sysctl_table is the right solution for your problem.  The
>> only possible caveat I can think of is you might hit Al's performance
>> optimizations and need to create a common empty directory first with
>> register_sysctl_paths.
>
> Sorry, but I forgot to mention one more important goal I would like to achieve:
> I want to manage sysctl's variables in context of mount owner, but not viewer one.
> IOW imagine, that we have one two network namespaces: "A" and "B". Both of them
> have it's own net sysctl's root. And we have per-net sysctl "/proc/sys/var".
> And for ns "A" variable was set to 0, and for "B" - to 1.
> And B's "/proc/sys/var" is accessible from "A" namespace
> ("/chroot_path/proc/sys/var" for example).
> With this configuration I want to read "1" from both namespaces:
> owner "B" (/proc/sys/var) and "A" ("/chroot_path/proc/sys/var").
> Looks like simple using of register_net_sysctl_table doesn't allow me this,
> because current net ns is used. And to achieve this goal I need my own sysctl
> set for SUNRPC like it was done for network namespaces.

Doing that independently of the rest of the sysctls is pretty horrible
and confusing to users.   What I am planning might suit your needs and
if not we need to talk some more about how to get the vfs to do
something reasonable.

>> That said since I am in the process of rewriting things some of this
>> may change a little bit, but hopefully not in ways that immediately
>> effect the users of register_sysctl_table.
>>
>> Don't use register_net_sysctl_ro_table.   I think what the implementors
>> actually wanted was register_net_sysctl_table(&init_net, ...) and didn't
>> know it.
>>
>> Don't put subdirectories in your sysctl tables.  Use a ctl_path to
>> specify the entire directory where the files should show up.  Generally
>> the code is easier to read in that form, and the code is simpler to deal
>> with if we don't have to worry about directories.
>>
>> Don't play with the sysctl roots.  It is my intention to completely kill
>> them off and replace them by moving the per net sysctl tree under
>> /proc/<pid>/sys/.   Leaving behind symlinks in /proc/sys/net and I guess
>> ultimately in /proc/sys/sunrpc/ and /proc/sys/fs/nfs...  Which actually
>> seems to better describe your mental model.
>>
>
>
> I'm afraid, that this approach this not allow me to achieve the goal, mentioned
> above, because current->nsproxy->net_ns will be used during lookup.
> Or maybe I misunderstanding here?

What I hope to do is to stop using current, and to behave like
/proc/net.  Aka a per process view under /proc/<pid>/sys that matches
the namespaces of the specified process.  

The VFS really hates my use of current in the sysctl case, and I intend
to stop.

I need to run and catch my plane.  It doesn't look like I will have
access to this email address for the next two weeks :(  

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Dec. 19, 2011, 5:24 p.m. UTC | #6
19.12.2011 20:37, Eric W. Biederman пишет:
> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>
> Doing that independently of the rest of the sysctls is pretty horrible
> and confusing to users.   What I am planning might suit your needs and
> if not we need to talk some more about how to get the vfs to do
> something reasonable.
>

Ok, Eric. Would be glad to discuss your sysctls plans.
But actually you already know my needs: I would like to make sysctls work in the 
way like sysfs does: i.e. content of files depends on mount maker - not viewer.
Eric W. Biederman Jan. 3, 2012, 3:49 a.m. UTC | #7
Stanislav Kinsbursky <skinsbursky@parallels.com> writes:

> 19.12.2011 20:37, Eric W. Biederman пишет:
>> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>>
>> Doing that independently of the rest of the sysctls is pretty horrible
>> and confusing to users.   What I am planning might suit your needs and
>> if not we need to talk some more about how to get the vfs to do
>> something reasonable.
>>
>
> Ok, Eric. Would be glad to discuss your sysctls plans.
> But actually you already know my needs: I would like to make sysctls work in the
> way like sysfs does: i.e. content of files depends on mount maker -
> not viewer.

What drives the desire to have sysctls depend on the mount maker?
Especially what drives that desire not to have it have a /proc/<pid>/sys
directory that reflects the sysctls for a given process.

Eric

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Jan. 10, 2012, 10:38 a.m. UTC | #8
03.01.2012 07:49, Eric W. Biederman пишет:
> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>
>> 19.12.2011 20:37, Eric W. Biederman пишет:
>>> Stanislav Kinsbursky<skinsbursky@parallels.com>   writes:
>>>
>>> Doing that independently of the rest of the sysctls is pretty horrible
>>> and confusing to users.   What I am planning might suit your needs and
>>> if not we need to talk some more about how to get the vfs to do
>>> something reasonable.
>>>
>>
>> Ok, Eric. Would be glad to discuss your sysctls plans.
>> But actually you already know my needs: I would like to make sysctls work in the
>> way like sysfs does: i.e. content of files depends on mount maker -
>> not viewer.
>
> What drives the desire to have sysctls depend on the mount maker?

Because we can (will, actually) have nested fs root's for containers. IOW, 
container's root will be accessible from it's creator context. And I want to 
tune container's fs from creators context.

> Especially what drives that desire not to have it have a /proc/<pid>/sys
> directory that reflects the sysctls for a given process.
>

This is not so important for me, where to access sysctl's. But I'm worrying 
about backward compatibility. IOW, I'm afraid of changing path
"/proc/sys/sunprc/*" to "/proc/<pid>/sys/sunrpc". This would break a lot of 
user-space programs.
Eric W. Biederman Jan. 10, 2012, 10:39 p.m. UTC | #9
Stanislav Kinsbursky <skinsbursky@parallels.com> writes:

> 03.01.2012 07:49, Eric W. Biederman пишет:
>> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>>
>>> 19.12.2011 20:37, Eric W. Biederman пишет:
>>>> Stanislav Kinsbursky<skinsbursky@parallels.com>   writes:
>>>>
>>>> Doing that independently of the rest of the sysctls is pretty horrible
>>>> and confusing to users.   What I am planning might suit your needs and
>>>> if not we need to talk some more about how to get the vfs to do
>>>> something reasonable.
>>>>
>>>
>>> Ok, Eric. Would be glad to discuss your sysctls plans.
>>> But actually you already know my needs: I would like to make sysctls work in the
>>> way like sysfs does: i.e. content of files depends on mount maker -
>>> not viewer.
>>
>> What drives the desire to have sysctls depend on the mount maker?
>
> Because we can (will, actually) have nested fs root's for containers. IOW,
> container's root will be accessible from it's creator context. And I want to
> tune container's fs from creators context.

Tuning the child context from the parent context is an entirely
reasonable thing to do.  To affect a namespace that is not yours
the requirement is simply that we don't use current to lookup the
sysctl.  So what I am proposing should work for your case.

>> Especially what drives that desire not to have it have a /proc/<pid>/sys
>> directory that reflects the sysctls for a given process.
>>
>
> This is not so important for me, where to access sysctl's. But I'm worrying
> about backward compatibility. IOW, I'm afraid of changing path
> "/proc/sys/sunprc/*" to "/proc/<pid>/sys/sunrpc". This would break a lot of
> user-space programs.

The part that keeps it all working is by adding a symlink from /proc/sys
to /proc/self/sys.  That technique has worked well for /proc/net, and I
don't expect there will be any problems with /proc/sys either.  It is
possible but is very rare for the introduction of a symlink in a path
to cause problems.

Eric

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Jan. 11, 2012, 9:47 a.m. UTC | #10
11.01.2012 02:39, Eric W. Biederman пишет:
> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>
>> 03.01.2012 07:49, Eric W. Biederman пишет:
>>> Stanislav Kinsbursky<skinsbursky@parallels.com>   writes:
>>>
>>>> 19.12.2011 20:37, Eric W. Biederman пишет:
>>>>> Stanislav Kinsbursky<skinsbursky@parallels.com>    writes:
>>>>>
>>>>> Doing that independently of the rest of the sysctls is pretty horrible
>>>>> and confusing to users.   What I am planning might suit your needs and
>>>>> if not we need to talk some more about how to get the vfs to do
>>>>> something reasonable.
>>>>>
>>>>
>>>> Ok, Eric. Would be glad to discuss your sysctls plans.
>>>> But actually you already know my needs: I would like to make sysctls work in the
>>>> way like sysfs does: i.e. content of files depends on mount maker -
>>>> not viewer.
>>>
>>> What drives the desire to have sysctls depend on the mount maker?
>>
>> Because we can (will, actually) have nested fs root's for containers. IOW,
>> container's root will be accessible from it's creator context. And I want to
>> tune container's fs from creators context.
>
> Tuning the child context from the parent context is an entirely
> reasonable thing to do.  To affect a namespace that is not yours
> the requirement is simply that we don't use current to lookup the
> sysctl.  So what I am proposing should work for your case.
>

Could you explain, what are you proposing?
I still don't know any details about it.

>>> Especially what drives that desire not to have it have a /proc/<pid>/sys
>>> directory that reflects the sysctls for a given process.
>>>
>>
>> This is not so important for me, where to access sysctl's. But I'm worrying
>> about backward compatibility. IOW, I'm afraid of changing path
>> "/proc/sys/sunprc/*" to "/proc/<pid>/sys/sunrpc". This would break a lot of
>> user-space programs.
>
> The part that keeps it all working is by adding a symlink from /proc/sys
> to /proc/self/sys.  That technique has worked well for /proc/net, and I
> don't expect there will be any problems with /proc/sys either.  It is
> possible but is very rare for the introduction of a symlink in a path
> to cause problems.
>

Probably I don't understand you, but as I see it now, symlink to "/proc/self/" 
is unacceptable because of the following:
1) will be used current context (any) instead of desired one
1) if CT has other pid namespace - then we just have broken link.

> Eric
>
Eric W. Biederman Jan. 11, 2012, 5:21 p.m. UTC | #11
Stanislav Kinsbursky <skinsbursky@parallels.com> writes:

> 11.01.2012 02:39, Eric W. Biederman пишет:
>> Stanislav Kinsbursky<skinsbursky@parallels.com>  writes:
>>
>>> 03.01.2012 07:49, Eric W. Biederman пишет:
>>>> Stanislav Kinsbursky<skinsbursky@parallels.com>   writes:
>>>>
>>>>> 19.12.2011 20:37, Eric W. Biederman пишет:
>>>>>> Stanislav Kinsbursky<skinsbursky@parallels.com>    writes:
>>>>>>
>>>>>> Doing that independently of the rest of the sysctls is pretty horrible
>>>>>> and confusing to users.   What I am planning might suit your needs and
>>>>>> if not we need to talk some more about how to get the vfs to do
>>>>>> something reasonable.
>>>>>>
>>>>>
>>>>> Ok, Eric. Would be glad to discuss your sysctls plans.
>>>>> But actually you already know my needs: I would like to make sysctls work in the
>>>>> way like sysfs does: i.e. content of files depends on mount maker -
>>>>> not viewer.
>>>>
>>>> What drives the desire to have sysctls depend on the mount maker?
>>>
>>> Because we can (will, actually) have nested fs root's for containers. IOW,
>>> container's root will be accessible from it's creator context. And I want to
>>> tune container's fs from creators context.
>>
>> Tuning the child context from the parent context is an entirely
>> reasonable thing to do.  To affect a namespace that is not yours
>> the requirement is simply that we don't use current to lookup the
>> sysctl.  So what I am proposing should work for your case.
>>
>
> Could you explain, what are you proposing?
> I still don't know any details about it.

I am proposing treating /proc/sys like /proc/net is currently treated.
See below.

>>>> Especially what drives that desire not to have it have a /proc/<pid>/sys
>>>> directory that reflects the sysctls for a given process.
>>>>
>>>
>>> This is not so important for me, where to access sysctl's. But I'm worrying
>>> about backward compatibility. IOW, I'm afraid of changing path
>>> "/proc/sys/sunprc/*" to "/proc/<pid>/sys/sunrpc". This would break a lot of
>>> user-space programs.
>>
>> The part that keeps it all working is by adding a symlink from /proc/sys
>> to /proc/self/sys.  That technique has worked well for /proc/net, and I
>> don't expect there will be any problems with /proc/sys either.  It is
>> possible but is very rare for the introduction of a symlink in a path
>> to cause problems.
>>
>
> Probably I don't understand you, but as I see it now, symlink to "/proc/self/"
> is unacceptable because of the following:
> 1) will be used current context (any) instead of desired one
(Using the current context is the desirable outcome for existing tools).
> 1) if CT has other pid namespace - then we just have broken link.

Assuming the process in question is not in the pid namespace available
to proc then yes you will indeed have a broken link.  But a broken
link is only a problem for new applications that are doing something strange.

I am proposing treating /proc/sys like /proc/net has already been
treated.  Aka move have the version of /proc/sys that relative to a
process be visible at: /proc/<pid>/sys, and with a compat symlink 
from /proc/sys -> /proc/self/sys.

Just like has already been done with /proc/net.

Semantically this should be easy to understand, and about as backwards
compatible as it gets.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Jan. 11, 2012, 6:02 p.m. UTC | #12
11.01.2012 21:21, Eric W. Biederman пишет:
>>>>> Especially what drives that desire not to have it have a /proc/<pid>/sys
>>>>> directory that reflects the sysctls for a given process.
>>>>>
>>>>
>>>> This is not so important for me, where to access sysctl's. But I'm worrying
>>>> about backward compatibility. IOW, I'm afraid of changing path
>>>> "/proc/sys/sunprc/*" to "/proc/<pid>/sys/sunrpc". This would break a lot of
>>>> user-space programs.
>>>
>>> The part that keeps it all working is by adding a symlink from /proc/sys
>>> to /proc/self/sys.  That technique has worked well for /proc/net, and I
>>> don't expect there will be any problems with /proc/sys either.  It is
>>> possible but is very rare for the introduction of a symlink in a path
>>> to cause problems.
>>>
>>
>> Probably I don't understand you, but as I see it now, symlink to "/proc/self/"
>> is unacceptable because of the following:
>> 1) will be used current context (any) instead of desired one
> (Using the current context is the desirable outcome for existing tools).
>> 1) if CT has other pid namespace - then we just have broken link.
>
> Assuming the process in question is not in the pid namespace available
> to proc then yes you will indeed have a broken link.  But a broken
> link is only a problem for new applications that are doing something strange.
>

I believe, that container is assuming to work in  it's own network and pid 
namespaces.
With your approach, if I'm not mistaken, container's /proc/net and /proc/sys 
tunables will be unaccessible from parent environment. Or I'm wrong here?

> I am proposing treating /proc/sys like /proc/net has already been
> treated.  Aka move have the version of /proc/sys that relative to a
> process be visible at: /proc/<pid>/sys, and with a compat symlink
> from /proc/sys ->  /proc/self/sys.
>
> Just like has already been done with /proc/net.
>

1) On one hand it looks logical, that any nested dentries in /proc are tied to 
pid namespace. But on the other hand we have a lot of tunables in /proc/net, 
/proc/sys, etc. which have nothing with processes or whatever similar.
2) currently /proc processes directories (i.e. /proc/1/, etc) depends on mount 
maker context. But /proc/sys and /proc/net doesn't. This looks weird and 
despondently, from my pow. What do you think about it?

And what do you think about "conteinerization" of /proc contents in the way like 
"sysfs" was done?
Implementing /proc "conteinerization" in this way can give us great flexibility.
For example, /proc/net (and /proc/sys/sunrpc) depends on mount owner net 
namespace, /proc/sysvipc depends on mount owner ipc namespace, etc.
And this approach doesn't break backward compatibility as well.

> Semantically this should be easy to understand, and about as backwards
> compatible as it gets.
>
> Eric
Eric W. Biederman Jan. 11, 2012, 7:36 p.m. UTC | #13
Stanislav Kinsbursky <skinsbursky@parallels.com> writes:

> 11.01.2012 21:21, Eric W. Biederman пишет:
>>>>>> Especially what drives that desire not to have it have a /proc/<pid>/sys
>>>>>> directory that reflects the sysctls for a given process.
>>>>>>
>>>>>
>>>>> This is not so important for me, where to access sysctl's. But I'm worrying
>>>>> about backward compatibility. IOW, I'm afraid of changing path
>>>>> "/proc/sys/sunprc/*" to "/proc/<pid>/sys/sunrpc". This would break a lot of
>>>>> user-space programs.
>>>>
>>>> The part that keeps it all working is by adding a symlink from /proc/sys
>>>> to /proc/self/sys.  That technique has worked well for /proc/net, and I
>>>> don't expect there will be any problems with /proc/sys either.  It is
>>>> possible but is very rare for the introduction of a symlink in a path
>>>> to cause problems.
>>>>
>>>
>>> Probably I don't understand you, but as I see it now, symlink to "/proc/self/"
>>> is unacceptable because of the following:
>>> 1) will be used current context (any) instead of desired one
>> (Using the current context is the desirable outcome for existing tools).
>>> 1) if CT has other pid namespace - then we just have broken link.
>>
>> Assuming the process in question is not in the pid namespace available
>> to proc then yes you will indeed have a broken link.  But a broken
>> link is only a problem for new applications that are doing something strange.
>>
>
> I believe, that container is assuming to work in  it's own network and pid
> namespaces.
> With your approach, if I'm not mistaken, container's /proc/net and /proc/sys
> tunables will be unaccessible from parent environment. Or I'm wrong here?

Wrong.

>> I am proposing treating /proc/sys like /proc/net has already been
>> treated.  Aka move have the version of /proc/sys that relative to a
>> process be visible at: /proc/<pid>/sys, and with a compat symlink
>> from /proc/sys ->  /proc/self/sys.
>>
>> Just like has already been done with /proc/net.
>>
>
> 1) On one hand it looks logical, that any nested dentries in /proc are tied to
> pid namespace. But on the other hand we have a lot of tunables in /proc/net,
> /proc/sys, etc. which have nothing with processes or whatever similar.

Please stop and take a look at /proc/net.  If your /proc/net is not a
symlink please look at a modern kernel.

/proc/<pid>/net reflects the network namespace of the task in question.

> 2) currently /proc processes directories (i.e. /proc/1/, etc) depends on mount
> maker context. But /proc/sys and /proc/net doesn't. This looks weird and
> despondently, from my pow. What do you think about it?

Yep.  Sysfs is weird.  Ideally sysfs would display all devices all of
the time but unfortunately that breaks backwards compatibility.

In proc we have the opportunity to display nearly everything all of the
time and I think that opportunity is worth seizing.

Having to mount a filesystem simply because the designers of the
filesystem were not creative enough to figure out how to display
all of the information the filesystem is responsible for displaying
without having namespace conflicts is unfortunate.

> And what do you think about "conteinerization" of /proc contents in the way like
> "sysfs" was done?

I think the way sysfs is done is a pain in the neck to use.  Especially
in the context of commands like "ip netns exec".  With the sysfs model
there is a lot of extra state to manage.

I totally agree that the way sysfs is done is much better than the way
/proc/sys is done today.  Looking at current can be limiting in the
general case.

My current preference is the way /proc/net was done.

> Implementing /proc "conteinerization" in this way can give us great flexibility.
> For example, /proc/net (and /proc/sys/sunrpc) depends on mount owner net
> namespace, /proc/sysvipc depends on mount owner ipc namespace, etc.
> And this approach doesn't break backward compatibility as well.

The thing is /proc/net is already done.

All I see with making things like /proc/net depend on the context of the
process that called mount is a need to call mount much more often.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stanislav Kinsbursky Jan. 12, 2012, 9:17 a.m. UTC | #14
11.01.2012 23:36, Eric W. Biederman пишет:
>
> Please stop and take a look at /proc/net.  If your /proc/net is not a
> symlink please look at a modern kernel.
>
> /proc/<pid>/net reflects the network namespace of the task in question.
>

Ok, I know that.
I know, that if some task with pid N is in other network namespace, then 
/proc/<N>/net contents will differ to /proc/selt/net contents.

>> And what do you think about "conteinerization" of /proc contents in the way like
>> "sysfs" was done?
>
> I think the way sysfs is done is a pain in the neck to use.  Especially
> in the context of commands like "ip netns exec".  With the sysfs model
> there is a lot of extra state to manage.
>
> I totally agree that the way sysfs is done is much better than the way
> /proc/sys is done today.  Looking at current can be limiting in the
> general case.
>
> My current preference is the way /proc/net was done.
>

Ok. But this approach still requires some additional data to manage in user 
space. I.e. it's really easy to manage container's context using it's fs root, 
because container's root is a part of initial configuration. But container's 
processed pids numbers in parent context are unpredictable.

>> Implementing /proc "conteinerization" in this way can give us great flexibility.
>> For example, /proc/net (and /proc/sys/sunrpc) depends on mount owner net
>> namespace, /proc/sysvipc depends on mount owner ipc namespace, etc.
>> And this approach doesn't break backward compatibility as well.
>
> The thing is /proc/net is already done.
>
> All I see with making things like /proc/net depend on the context of the
> process that called mount is a need to call mount much more often.
>

/proc/net is a part or /proc. And /proc mount is called per container. So this 
is just like it is.

I have some solution I mind, which looks quite simple to implement, doesn't 
require significant additional state to manage and suits my needs.
Please, consider this.
It's based on sysfs containerization approach, but simplified a lot.
Sysctl's (comparing to sysfs entries) entries are the same for all namespaces.
This actually means, that we don't need any additional infrastructure for 
managing dentries. All we need to know on read/write operations with sysctl's is 
the namespaces /proc was mounted from.

Thus if we:

1) replace /proc sb->s_fsdata content from pid_namespace to nsproxy and
2) add link to /proc sb to ctl_table and
3) add ns tag (pid, net, else or none) to ctl_table

then we will have all we need to manage sysctl's content in the way we want.
And looks like this approach doesn't break backward compatibility.

What do you think about it?
diff mbox

Patch

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 703cfa3..be586a9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1084,6 +1084,7 @@  struct ctl_path {
 };
 
 void register_sysctl_root(struct ctl_table_root *root);
+void unregister_sysctl_root(struct ctl_table_root *root);
 struct ctl_table_header *__register_sysctl_paths(
 	struct ctl_table_root *root, struct nsproxy *namespaces,
 	const struct ctl_path *path, struct ctl_table *table);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae27196..fb016a9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1700,6 +1700,15 @@  void register_sysctl_root(struct ctl_table_root *root)
 	list_add_tail(&root->root_list, &sysctl_table_root.root_list);
 	spin_unlock(&sysctl_lock);
 }
+EXPORT_SYMBOL_GPL(register_sysctl_root);
+
+void unregister_sysctl_root(struct ctl_table_root *root)
+{
+	spin_lock(&sysctl_lock);
+	list_del(&root->root_list);
+	spin_unlock(&sysctl_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_sysctl_root);
 
 /*
  * sysctl_perm does NOT grant the superuser all rights automatically, because
@@ -1925,6 +1934,7 @@  struct ctl_table_header *__register_sysctl_paths(
 
 	return header;
 }
+EXPORT_SYMBOL_GPL(__register_sysctl_paths);
 
 /**
  * register_sysctl_table_path - register a sysctl table hierarchy
@@ -2007,6 +2017,7 @@  void setup_sysctl_set(struct ctl_table_set *p,
 	p->parent = parent ? parent : &sysctl_table_root.default_set;
 	p->is_seen = is_seen;
 }
+EXPORT_SYMBOL_GPL(setup_sysctl_set);
 
 #else /* !CONFIG_SYSCTL */
 struct ctl_table_header *register_sysctl_table(struct ctl_table * table)