diff mbox

[5/5] c/r: Add AF_UNIX support (v6)

Message ID 1248295301-30930-6-git-send-email-danms@us.ibm.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Dan Smith July 22, 2009, 8:41 p.m. UTC
This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
has been tested with a single and multiple processes, and with data inflight
at the time of checkpoint.  It supports socketpair()s, path-based, and
abstract sockets.

Changes in v6:
  - Moved the socket addresses to the per-type header
  - Eliminated the HASCWD flag
  - Remove use of ckpt_write_err() in restart paths
  - Change the order in which buffers are read so that we can set the
    socket's limit equal to the size of the image's buffers (if appropriate)
    and then restore the original values afterwards.
  - Use the ckpt_validate_errno() helper
  - Add a check to make sure that we didn't restore a (UNIX) socket with
    any skb's in the send buffer
  - Fix up sock_unix_join() to not leave addr uninitialized for socketpair
  - Remove inclusion of checkpoint_hdr.h in the socket files
  - Make sock_unix_write_cwd() use ckpt_write_string() and use the new
    ckpt_read_string() for reading the cwd
  - Use the restored realcred credentials in sock_unix_join()
  - Fix error path of the chdir_and_bind
  - Change the algorithm for reloading the socket buffers to use sendmsg()
    on the socket's peer for better accounting
  - For DGRAM sockets, check the backlog value against the system max
    to avoid letting a restart bypass the overloaded queue length
  - Use sock_bind() instead of sock->ops->bind() to gain the security hook
  - Change "restart" to "restore" in some of the function names

Changes in v5:
  - Change laddr and raddr buffers in socket header to be long enough
    for INET6 addresses
  - Place socket.c and sock.h function definitions inside #ifdef
    CONFIG_CHECKPOINT
  - Add explicit check in sock_unix_makeaddr() to refuse if the
    checkpoint image specifies an addr length of 0
  - Split sock_unix_restart() into a few pieces to facilitate:
  - Changed behavior of the unix restore code so that unlinked LISTEN
    sockets don't do a bind()...unlink()
  - Save the base path of a bound socket's path so that we can chdir()
    to the base before bind() if it is a relative path
  - Call bind() for any socket that is not established but has a
    non-zero-length local address
  - Enforce the current sysctl limit on socket buffer size during restart
    unless the user holds CAP_NET_ADMIN
  - Unlink a path-based socket before calling bind()

Changes in v4:
  - Changed the signdness of rcvlowat, rcvtimeo, sndtimeo, and backlog
    to match their struct sock definitions.  This should avoid issues
    with sign extension.
  - Add a sock_cptrst_verify() function to be run at restore time to
    validate several of the values in the checkpoint image against
    limits, flag masks, etc.
  - Write an error string with ctk_write_err() in the obscure cases
  - Don't write socket buffers for listen sockets
  - Sanity check address lengths before we agree to allocate memory
  - Check the result of inserting the peer object in the objhash on
    restart
  - Check return value of sock_cptrst() on restart
  - Change logic in remote getname() phase of checkpoint to not fail for
    closed (et al) sockets
  - Eliminate the memory copy while reading socket buffers on restart

Changes in v3:
  - Move sock_file_checkpoint() above sock_file_restore()
  - Change __sock_file_*() functions to do_sock_file_*()
  - Adjust some of the struct cr_hdr_socket alignment
  - Improve the sock_copy_buffers() algorithm to avoid locking the source
    queue for the entire operation
  - Fix alignment in the socket header struct(s)
  - Move the per-protocol structure (ckpt_hdr_socket_un) out of the
    common socket header and read/write it separately
  - Fix missing call to sock_cptrst() in restore path
  - Break out the socket joining into another function
  - Fix failure to restore the socket address thus fixing getname()
  - Check the state values on restart
  - Fix case of state being TCP_CLOSE, which allows dgram sockets to be
    properly connected (if appropriate) to their peer and maintain the
    sockaddr for getname() operation
  - Fix restoring a listening socket that has been unlink()'d
  - Fix checkpointing sockets with an in-flight FD-passing SKB.  Fail
    with EBUSY.
  - Fix checkpointing listening sockets with an unaccepted connection.
    Fail with EBUSY.
  - Changed 'un' to 'unix' in function and structure names

Changes in v2:
  - Change GFP_KERNEL to GFP_ATOMIC in sock_copy_buffers() (this seems
    to be rather common in other uses of skb_copy())
  - Move the ckpt_hdr_socket structure definition to linux/socket.h
  - Fix whitespace issue
  - Move sock_file_checkpoint() to net/socket.c for symmetry

Cc: Oren Laaden <orenl@cs.columbia.edu>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: netdev@vger.kernel.org
Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 checkpoint/files.c             |    7 +
 checkpoint/objhash.c           |   27 ++
 include/linux/checkpoint_hdr.h |   13 +
 include/linux/socket.h         |   63 ++++
 include/net/sock.h             |   11 +
 net/Makefile                   |    2 +
 net/checkpoint.c               |  779 ++++++++++++++++++++++++++++++++++++++++
 net/socket.c                   |   85 +++++
 8 files changed, 987 insertions(+), 0 deletions(-)
 create mode 100644 net/checkpoint.c

Comments

Oren Laadan July 28, 2009, 4:54 p.m. UTC | #1
Hi Dan,

Dan Smith wrote:
> This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
> has been tested with a single and multiple processes, and with data inflight
> at the time of checkpoint.  It supports socketpair()s, path-based, and
> abstract sockets.
> 
> Changes in v6:
>   - Moved the socket addresses to the per-type header
>   - Eliminated the HASCWD flag
>   - Remove use of ckpt_write_err() in restart paths
>   - Change the order in which buffers are read so that we can set the
>     socket's limit equal to the size of the image's buffers (if appropriate)
>     and then restore the original values afterwards.
>   - Use the ckpt_validate_errno() helper
>   - Add a check to make sure that we didn't restore a (UNIX) socket with
>     any skb's in the send buffer
>   - Fix up sock_unix_join() to not leave addr uninitialized for socketpair
>   - Remove inclusion of checkpoint_hdr.h in the socket files
>   - Make sock_unix_write_cwd() use ckpt_write_string() and use the new
>     ckpt_read_string() for reading the cwd
>   - Use the restored realcred credentials in sock_unix_join()
>   - Fix error path of the chdir_and_bind
>   - Change the algorithm for reloading the socket buffers to use sendmsg()
>     on the socket's peer for better accounting
>   - For DGRAM sockets, check the backlog value against the system max
>     to avoid letting a restart bypass the overloaded queue length
>   - Use sock_bind() instead of sock->ops->bind() to gain the security hook
>   - Change "restart" to "restore" in some of the function names
> 
> Changes in v5:
>   - Change laddr and raddr buffers in socket header to be long enough
>     for INET6 addresses
>   - Place socket.c and sock.h function definitions inside #ifdef
>     CONFIG_CHECKPOINT
>   - Add explicit check in sock_unix_makeaddr() to refuse if the
>     checkpoint image specifies an addr length of 0
>   - Split sock_unix_restart() into a few pieces to facilitate:
>   - Changed behavior of the unix restore code so that unlinked LISTEN
>     sockets don't do a bind()...unlink()
>   - Save the base path of a bound socket's path so that we can chdir()
>     to the base before bind() if it is a relative path
>   - Call bind() for any socket that is not established but has a
>     non-zero-length local address
>   - Enforce the current sysctl limit on socket buffer size during restart
>     unless the user holds CAP_NET_ADMIN
>   - Unlink a path-based socket before calling bind()

[...]

 > +static int obj_sock_grab(void *ptr)
> +{
> +	sock_hold((struct sock *) ptr);
> +	return 0;
> +}
> +
> +static void obj_sock_drop(void *ptr)
> +{
> +	sock_put((struct sock *) ptr);
> +}
> +
> +static int obj_sock_users(void *ptr)
> +{
> +	return atomic_read(&((struct sock *) ptr)->sk_refcnt);
> +}
> +

obj_sock_users() is only required for objects that are to be
tested for leaks in full container checkpoint. I don't think
it's needed, because the relation sock <-> file is 1-to-1.
(If it were, then you would also need to collect sockets).

>  static struct ckpt_obj_ops ckpt_obj_ops[] = {
>  	/* ignored object */
>  	{
> @@ -367,6 +384,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
>  		.checkpoint = checkpoint_groupinfo,
>  		.restore = restore_groupinfo,
>  	},
> +	/* sock object */
> +	{
> +		.obj_name = "SOCKET",
> +		.obj_type = CKPT_OBJ_SOCK,
> +		.ref_drop = obj_sock_drop,
> +		.ref_grab = obj_sock_grab,
> +		.ref_users = obj_sock_users,
		^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
		      scratch this one

> +		.checkpoint = sock_file_checkpoint,
> +		.restore = sock_file_restore,

These two should be 'checkpoint_bad' and 'restore_bad' respsectively.
Sockets are handled via files, not as independent objects.

(Actually, I will remove checkpoint_bad and restore_bad and modify
checkpoint_obj() and restore_obj() to fail if the respective method
is NULL).

> +	},
>  };
>  
>  
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 8b7ca46..e542ff5 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -88,6 +88,12 @@ enum {
>  
>  	CKPT_HDR_SIGHAND = 601,
>  
> +	CKPT_HDR_FD_SOCKET = 701,
> +	CKPT_HDR_SOCKET,
> +	CKPT_HDR_SOCKET_BUFFERS,
> +	CKPT_HDR_SOCKET_BUFFER,

Nit: I already got confused a few times because of the similar names.
Perhaps change CKPT_HDR_SOCKET_BUFFERS to CKPT_HDR_SOCKET_QUEUE (and
adjust the header name accordingly).

> +	CKPT_HDR_SOCKET_UNIX,
> +
>  	CKPT_HDR_TAIL = 9001,
>  
>  	CKPT_HDR_ERROR = 9999,
> @@ -122,6 +128,7 @@ enum obj_type {
>  	CKPT_OBJ_CRED,
>  	CKPT_OBJ_USER,
>  	CKPT_OBJ_GROUPINFO,
> +	CKPT_OBJ_SOCK,
>  	CKPT_OBJ_MAX
>  };
>  
> @@ -326,6 +333,7 @@ enum file_type {
>  	CKPT_FILE_IGNORE = 0,
>  	CKPT_FILE_GENERIC,
>  	CKPT_FILE_PIPE,
> +	CKPT_FILE_SOCKET,
>  	CKPT_FILE_MAX
>  };
>  
> @@ -349,6 +357,11 @@ struct ckpt_hdr_file_pipe {
>  	__s32 pipe_objref;
>  } __attribute__((aligned(8)));
>  
> +struct ckpt_hdr_file_socket {
> +	struct ckpt_hdr_file common;
> +	__u16 family;
> +} __attribute__((aligned(8)));
> +
>  struct ckpt_hdr_file_pipe_state {
>  	struct ckpt_hdr h;
>  	__s32 pipe_len;
> diff --git a/include/linux/socket.h b/include/linux/socket.h
> index 3b461df..cd675dd 100644
> --- a/include/linux/socket.h
> +++ b/include/linux/socket.h
> @@ -23,6 +23,7 @@ struct __kernel_sockaddr_storage {
>  #include <linux/uio.h>			/* iovec support		*/
>  #include <linux/types.h>		/* pid_t			*/
>  #include <linux/compiler.h>		/* __user			*/
> +#include <linux/checkpoint.h>		/* struct ckpt_hdr              */
>  
>  #ifdef __KERNEL__
>  # ifdef CONFIG_PROC_FS
> @@ -328,5 +329,67 @@ extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka
>  extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
>  
>  #endif
> +
> +#ifdef CONFIG_CHECKPOINT
> +#include <linux/un.h>                   /* sockaddr_un			*/

Unless/until we decide otherwise, let's keep all ckpt_hdr_xxx
definitions in a single place -- checkpoint_hdr.h

> +
> +#define CKPT_UNIX_LINKED 1
> +struct ckpt_hdr_socket_unix {
> +	struct ckpt_hdr h;
> +	__u32 this;
> +	__u32 peer;

For objrefs please use __s32.

> +	__u32 flags;
> +	__u32 laddr_len;
> +	__u32 raddr_len;
> +	struct sockaddr_un laddr;
> +	struct sockaddr_un raddr;
> +} __attribute__ ((aligned(8)));
> +
> +struct ckpt_hdr_socket {
> +	struct ckpt_hdr h;
> +
> +	struct ckpt_socket { /* struct socket */

Unless you intend to use the struct name 'ckpt_socket' elsewhere,
can we get rid of it (leaving only 'struct {') ?

> +		__u64 flags;
> +		__u8 state;
> +	} socket __attribute__ ((aligned(8)));
> +
> +	struct ckpt_sock_common { /* struct sock_common */

Here too.

> +		__u32 bound_dev_if;
> +		__u16 family;
> +		__u8 state;
> +		__u8 reuse;
> +	} sock_common __attribute__ ((aligned(8)));
> +
> +	struct ckpt_sock { /* struct sock */

And here.

> +		__s64 rcvlowat;
> +		__s64 rcvtimeo;
> +		__s64 sndtimeo;
> +		__u64 flags;
> +		__u64 lingertime;
> +
> +		__u32 err;
> +		__u32 err_soft;
> +		__u32 priority;
> +		__s32 rcvbuf;
> +		__s32 sndbuf;
> +		__u16 type;
> +		__s16 backlog;
> +
> +		__u8 protocol;
> +		__u8 state;
> +		__u8 shutdown;
> +		__u8 userlocks;
> +		__u8 no_check;
> +	} sock __attribute__ ((aligned(8)));
> +
> +} __attribute__ ((aligned(8)));
> +
> +struct ckpt_hdr_socket_buffer {
> +	struct ckpt_hdr h;
> +	__u32 skb_count;
> +	__u32 total_bytes;
> +} __attribute__ ((aligned(8)));
> +#endif /* CONFIG_CHECKPOINT */
> +
>  #endif /* not kernel and not glibc */
>  #endif /* _LINUX_SOCKET_H */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index d435dc1..4043cd2 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1608,4 +1608,15 @@ extern int sysctl_optmem_max;
>  extern __u32 sysctl_wmem_default;
>  extern __u32 sysctl_rmem_default;
>  
> +#ifdef CONFIG_CHECKPOINT
> +/* Checkpoint/Restart Functions */
> +struct ckpt_ctx;
> +struct ckpt_hdr_socket;
> +extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
> +extern void *sock_file_restore(struct ckpt_ctx *);
> +extern struct socket *do_sock_file_restore(struct ckpt_ctx *,
> +					   struct ckpt_hdr_socket *);
> +extern int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> +#endif
> +
>  #endif	/* _SOCK_H */
> diff --git a/net/Makefile b/net/Makefile
> index ba324ae..91d12fe 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -66,3 +66,5 @@ ifeq ($(CONFIG_NET),y)
>  obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
>  endif
>  obj-$(CONFIG_WIMAX)		+= wimax/
> +
> +obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> new file mode 100644
> index 0000000..748d3aa
> --- /dev/null
> +++ b/net/checkpoint.c
> @@ -0,0 +1,779 @@
> +/*
> + *  Copyright 2009 IBM Corporation
> + *
> + *  Author: Dan Smith <danms@us.ibm.com>
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License as
> + *  published by the Free Software Foundation, version 2 of the
> + *  License.
> + */
> +
> +#include <linux/socket.h>
> +#include <linux/mount.h>
> +#include <linux/file.h>
> +#include <linux/namei.h>
> +#include <linux/syscalls.h>
> +#include <linux/sched.h>
> +#include <linux/fs_struct.h>
> +
> +#include <net/af_unix.h>
> +#include <net/tcp_states.h>
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +
> +#define UNIX_ADDR_EMPTY(a) (a <= sizeof(short))
> +
> +static inline int sock_unix_need_cwd(struct sockaddr_un *addr,
> +				     unsigned long len)
> +{
> +	return (!UNIX_ADDR_EMPTY(len)) &&
> +		addr->sun_path[0] &&
> +		(addr->sun_path[0] != '/');
> +}
> +
> +static int sock_copy_buffers(struct sk_buff_head *from,
> +			     struct sk_buff_head *to,
> +			     uint32_t *total_bytes)
> +{
> +	int count = 0;
> +	struct sk_buff *skb;
> +
> +	*total_bytes = 0;
> +
> +	skb_queue_walk(from, skb) {
> +		struct sk_buff *tmp;
> +
> +		tmp = dev_alloc_skb(skb->len);
> +		if (!tmp)
> +			return -ENOMEM;
> +
> +		spin_lock(&from->lock);
> +		skb_morph(tmp, skb);
> +		spin_unlock(&from->lock);
> +
> +		skb_queue_tail(to, tmp);
> +		count++;
> +		*total_bytes += tmp->len;
> +	}
> +
> +	return count;
> +}
> +
> +static int __sock_write_buffers(struct ckpt_ctx *ctx,
> +				struct sk_buff_head *queue)
> +{
> +	struct sk_buff *skb;
> +	int ret = 0;
> +
> +	skb_queue_walk(queue, skb) {
> +		if (UNIXCB(skb).fp) {
> +			ckpt_write_err(ctx, "fd-passing is not supported");
> +			return -EBUSY;
> +		}
> +
> +		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
> +					  CKPT_HDR_SOCKET_BUFFER);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	struct sk_buff_head tmpq;
> +	int ret = -ENOMEM;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (!h)
> +		goto out;
> +
> +	skb_queue_head_init(&tmpq);
> +
> +	h->skb_count = sock_copy_buffers(queue, &tmpq, &h->total_bytes);
> +	if (h->skb_count < 0) {

h->skb_count is unsigned...

> +		ret = h->skb_count;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (!ret)
> +		ret = __sock_write_buffers(ctx, &tmpq);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	__skb_queue_purge(&tmpq);
> +
> +	return ret;
> +}
> +
> +static int sock_unix_write_cwd(struct ckpt_ctx *ctx,
> +			       struct sock *sock,
> +			       const char *sockpath)
> +{
> +	struct path path;
> +	char *buf;
> +	char *fqpath;
> +	int offset;
> +	int ret = -ENOENT;
> +
> +	buf = kmalloc(PATH_MAX, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	path.dentry = unix_sk(sock)->dentry;
> +	path.mnt = unix_sk(sock)->mnt;
> +
> +	fqpath = d_path(&path, buf, PATH_MAX);
> +	if (!fqpath)
> +		goto out;

Can you use fill_name() from checkpoint/file.c ?  rename it to
ckpt_fill_name() or something when you export it.

> +
> +	offset = strlen(fqpath) - strlen(sockpath);
> +	if (offset <= 0) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	fqpath[offset] = '\0';
> +
> +	ckpt_debug("writing socket directory: %s\n", fqpath);
> +	ret = ckpt_write_string(ctx, fqpath, strlen(fqpath));
> + out:
> +	kfree(buf);
> +	return ret;
> +}
> +
> +static int sock_unix_getnames(struct ckpt_ctx *ctx,
> +			      struct ckpt_hdr_socket_unix *un,
> +			      struct socket *socket)
> +{
> +	struct sockaddr *loc = (struct sockaddr *)&un->laddr;
> +	struct sockaddr *rem = (struct sockaddr *)&un->raddr;
> +
> +	if (socket->ops->getname(socket, loc, &un->laddr_len, 0)) {
> +		ckpt_write_err(ctx, "Unable to getname of local");
> +		return -EINVAL;
> +	}

This direct call to ->getname skips security checks through
getsockname(). You may want to refactor sys_getsockname() similar
to sys_bind().

> +
> +	if (socket->ops->getname(socket, rem, &un->raddr_len, 1)) {
> +		if ((socket->sk->sk_type != SOCK_DGRAM) &&
> +		    (socket->sk->sk_state == TCP_ESTABLISHED)) {
> +			ckpt_write_err(ctx, "Unable to getname of remote");
> +			return -EINVAL;
> +		}

And here it skips security checks through sys_getppername().

> +		un->raddr_len = 0;
> +	}
> +
> +	return 0;
> +}
> +
> +static int sock_unix_checkpoint(struct ckpt_ctx *ctx,
> +			        struct socket *socket,
> +			        struct ckpt_hdr_socket *h)
> +{
> +	struct unix_sock *sk = unix_sk(socket->sk);
> +	struct unix_sock *pr = unix_sk(sk->peer);
> +	struct ckpt_hdr_socket_unix *un;
> +	int new;
> +	int ret = -ENOMEM;
> +
> +	if ((socket->sk->sk_state == TCP_LISTEN) &&
> +	    !skb_queue_empty(&socket->sk->sk_receive_queue)) {
> +		ckpt_write_err(ctx, "listening socket has unaccepted peers");
> +		return -EBUSY;
> +	}
> +
> +	un = ckpt_hdr_get_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UNIX);
> +	if (!un)
> +		goto out;
> +
> +	ret = sock_unix_getnames(ctx, un, socket);
> +	if (ret)
> +		goto out;
> +
> +	if (sk->dentry && (sk->dentry->d_inode->i_nlink > 0))
> +		un->flags |= CKPT_UNIX_LINKED;
> +
> +	un->this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
> +	if (un->this < 0)
> +		goto out;

un->this is unsigned... but the real fix is to make un->this __s32
as noted above.

> +
> +	if (sk->peer)
> +		un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
> +	else
> +		un->peer = 0;
> +
> +	if (un->peer < 0) {

ditto.

> +		ret = un->peer;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) un);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (sock_unix_need_cwd(&un->laddr, un->laddr_len))
> +		ret = sock_unix_write_cwd(ctx, socket->sk, un->laddr.sun_path);
> + out:
> +	ckpt_hdr_put(ctx, un);
> +
> +	return ret;
> +}
> +
> +static int sock_cptrst_verify(struct ckpt_hdr_socket *h)
> +{
> +	uint8_t userlocks_mask = SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK |
> +		                 SOCK_BINDADDR_LOCK | SOCK_BINDPORT_LOCK;
> +
> +	if (h->sock.shutdown & ~SHUTDOWN_MASK)
> +		return -EINVAL;
> +	if (h->sock.userlocks & ~userlocks_mask)
> +		return -EINVAL;
> +	if (h->sock.sndtimeo < 0)
> +		return -EINVAL;
> +	if (h->sock.rcvtimeo < 0)
> +		return -EINVAL;

Forgot sock.rcvlowat ?

> +	if ((h->sock.userlocks & SOCK_SNDBUF_LOCK) &&
> +	    ((h->sock.sndbuf < SOCK_MIN_SNDBUF) ||
> +	     (h->sock.sndbuf > sysctl_wmem_max)))
> +		return -EINVAL;

At least for afunix, if the user did not explicitly set the sndbuf,
then userlocks won't have SOCK_SNDBUF_LOCK set.

Also, if userlocks in the image doesn't have SOCK_SNDBUF_LOCK, then
the sndbuf value isn't checked ?

> +	if ((h->sock.userlocks & SOCK_RCVBUF_LOCK) &&
> +	    ((h->sock.rcvbuf < SOCK_MIN_RCVBUF) ||
> +	     (h->sock.rcvbuf > sysctl_rmem_max)))
> +		return -EINVAL;

ditto.

> +	if ((h->sock.flags & SOCK_LINGER) &&
> +	    (h->sock.lingertime > MAX_SCHEDULE_TIMEOUT))
> +		return -EINVAL;

Regardless of SOCK_LINGER flag, lingertime should never exceed that
maximum value.

What about verifying sock.flags itself ?
In doing that, some options may assume/require some state --
- SOCK_USE_WRITE_QUEUE only used in ipv4/ipv6/sctp
- SOCK_DESTROY only used in some protocols

Perhaps sanitize sock.flags per protocol ?

> +	if (!ckpt_validate_errno(h->sock.err))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static int sock_cptrst(struct ckpt_ctx *ctx,
> +		       struct sock *sock,
> +		       struct ckpt_hdr_socket *h,
> +		       int op)
> +{
> +	if (sock->sk_socket) {
> +		CKPT_COPY(op, h->socket.flags, sock->sk_socket->flags);
> +		CKPT_COPY(op, h->socket.state, sock->sk_socket->state);
> +	}
> +
> +	CKPT_COPY(op, h->sock_common.reuse, sock->sk_reuse);
> +	CKPT_COPY(op, h->sock_common.bound_dev_if, sock->sk_bound_dev_if);
> +	CKPT_COPY(op, h->sock_common.family, sock->sk_family);
> +
> +	CKPT_COPY(op, h->sock.shutdown, sock->sk_shutdown);
> +	CKPT_COPY(op, h->sock.userlocks, sock->sk_userlocks);
> +	CKPT_COPY(op, h->sock.no_check, sock->sk_no_check);
> +	CKPT_COPY(op, h->sock.protocol, sock->sk_protocol);
> +	CKPT_COPY(op, h->sock.err, sock->sk_err);
> +	CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft);
> +	CKPT_COPY(op, h->sock.priority, sock->sk_priority);
> +	CKPT_COPY(op, h->sock.rcvlowat, sock->sk_rcvlowat);
> +	CKPT_COPY(op, h->sock.backlog, sock->sk_max_ack_backlog);
> +	CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo);
> +	CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo);
> +	CKPT_COPY(op, h->sock.rcvbuf, sock->sk_rcvbuf);
> +	CKPT_COPY(op, h->sock.sndbuf, sock->sk_sndbuf);
> +	CKPT_COPY(op, h->sock.flags, sock->sk_flags);
> +	CKPT_COPY(op, h->sock.lingertime, sock->sk_lingertime);
> +	CKPT_COPY(op, h->sock.type, sock->sk_type);
> +	CKPT_COPY(op, h->sock.state, sock->sk_state);

Many of these direct copy into the socket and sock effectively
bypass security checks that take place in {get,set}sockopt(),
either explicitly (e.g. sk_sndtimeo) or implicitly (e.g.
SOCK_LINGER in sock.flags, reflecting SO_LINGER option).

This applies both to checkpoint (potentially bypassing permission
of the checkpointer to view this data) and restart (bypassing
permissions of user to set these values).

The alternative is to use socksetopt/sockgetopt for those
values that should be subject to security checks.

> +
> +	if ((h->socket.state == SS_CONNECTED) &&
> +	    (h->sock.state != TCP_ESTABLISHED)) {
> +		ckpt_debug("socket/sock in inconsistent state: %i/%i",
> +			   h->socket.state, h->sock.state);
> +		return -EINVAL;

There should also be per-protocol consistency checks. E.g. afunix
cannot be in socket.state == SS_{DIS,}CONNECTING

> +	} else if ((h->sock.state < TCP_ESTABLISHED) ||
> +		   (h->sock.state >= TCP_MAX_STATES)) {
> +		ckpt_debug("sock in invalid state: %i", h->sock.state);
> +		return -EINVAL;
> +	} else if ((h->socket.state < SS_FREE) ||
> +		   (h->socket.state > SS_DISCONNECTING)) {
> +		ckpt_debug("socket in invalid state: %i",
> +			   h->socket.state);
> +		return -EINVAL;
> +	}
> +
> +	if (op == CKPT_CPT)
> +		return sock_cptrst_verify(h);
> +	else
> +		return 0;
> +}
> +
> +int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> +	struct socket *socket = file->private_data;
> +	struct sock *sock = socket->sk;
> +	struct ckpt_hdr_socket *h;
> +	int ret = 0;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	ret = sock_cptrst(ctx, sock, h, CKPT_CPT);
> +	if (ret)
> +		goto out;
> +
> +	if (sock->sk_family == AF_UNIX) {
> +		ret = sock_unix_checkpoint(ctx, socket, h);
> +		if (ret)
> +			goto out;
> +	} else {
> +		ckpt_write_err(ctx, "unsupported socket family %i",
> +			       sock->sk_family);
> +		ret = EINVAL;
		     ^^^^
		     - !!!
Better yet: -ENOSYS ?

> +		goto out;
> +	}
> +
> +	if (sock->sk_state != TCP_LISTEN) {
> +		ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
> +		if (ret)
> +			goto out;
> +
> +		ret = sock_write_buffers(ctx, &sock->sk_write_queue);
> +		if (ret)
> +			goto out;
> +	}
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int sock_read_buffer(struct ckpt_ctx *ctx, struct sock *sock)
> +{
> +	struct ckpt_hdr h;
> +	struct msghdr msg;
> +	struct kvec kvec;
> +	int ret = 0;
> +	int len;
> +
> +	memset(&msg, 0, sizeof(msg));
> +
> +	len = _ckpt_read_hdr_type(ctx, &h, CKPT_HDR_SOCKET_BUFFER);
> +	if (len < 0)
> +		return len;
> +
> +	if (len > SKB_MAX_ALLOC) {
> +		ckpt_debug("Socket buffer too big (%i > %lu)",
> +			   len, SKB_MAX_ALLOC);
> +		return -ENOSPC;
> +	}
> +
> +	kvec.iov_len = len;
> +	kvec.iov_base = kmalloc(len, GFP_KERNEL);
> +	if (!kvec.iov_base)
> +		return -ENOMEM;
> +
> +	ret = _ckpt_read_payload(ctx, &h, kvec.iov_base);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = kernel_sendmsg(sock->sk_socket, &msg, &kvec, 1, len);

Is it possible to avoid the extra copy using splice() instead ?

> +	ckpt_debug("kernel_sendmsg(%i): %i\n", len, ret);
> +	if ((ret > 0) && (ret != len))
> +		ret = -ENOMEM;
> + out:
> +	return ret;
> +}
> +
> +static int sock_read_buffers(struct ckpt_ctx *ctx, struct sock *sock,
> +			     uint32_t *bufsize)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int ret = 0;
> +	int i;
> +	uint32_t total = 0;
> +	uint8_t sock_shutdown;
> +
> +	/* If peer is shutdown, unshutdown it for this process */
> +	sock_shutdown = sock->sk_shutdown;
> +	sock->sk_shutdown &= ~SHUTDOWN_MASK;

SOCK_DEAD in sk->flags may also pose a problem.
(Do we at all need to checkpoint and/or restore SOCK_DEAD ?!)

> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (IS_ERR(h)) {
> +		ret = PTR_ERR(h);
> +		goto out;
> +	}
> +
> +	if (!bufsize) {
> +		if (h->total_bytes != 0) {
> +			ckpt_debug("Expected empty buffer, got %u\n",
> +				   h->total_bytes);
> +			ret = -EINVAL;
> +		}
> +		goto out;
> +	} else if (h->total_bytes > *bufsize) {
> +		if (capable(CAP_NET_ADMIN))
> +			*bufsize = h->total_bytes;

Hmm... this test is quite hidden here - maybe a fat comment
with a reference to why it's here and what it is doing ?

> +		else {
> +			ckpt_debug("Buffer total %u exceeds limit %u\n",
> +			   h->total_bytes, *bufsize);
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +	}
> +
> +	for (i = 0; i < h->skb_count; i++) {
> +		ret = sock_read_buffer(ctx, sock);
> +		ckpt_debug("read buffer %i: %i\n", i, ret);
> +		if (ret < 0)
> +			break;
> +
> +		total += ret;
> +		if (total > h->total_bytes) {

What if 'total' overflows (for CAP_NET_ADMIN) ?   perhaps:
		if (total > h->total_bytes || total < ret) {

> +			ckpt_debug("Buffers exceeded claimed %u", total);
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +	}
> +
> +	if (total == h->total_bytes)
> +		ret = 0;
> +	else {
> +		ckpt_debug("Inconsistent total buffer size %u != %u\n",
> +			   total, h->total_bytes);
> +		ret = -EINVAL;
> +	}
> + out:
> +	sock->sk_shutdown = sock_shutdown;
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static struct unix_address *sock_unix_makeaddr(struct sockaddr_un *sun_addr,
> +					       unsigned len)
> +{
> +	struct unix_address *addr;
> +
> +	if (len > sizeof(struct sockaddr_un))
> +		return ERR_PTR(-EINVAL);
> +
> +	addr = kmalloc(sizeof(*addr) + len, GFP_KERNEL);
> +	if (!addr)
> +		return ERR_PTR(-ENOMEM);
> +
> +	memcpy(addr->name, sun_addr, len);
> +	addr->len = len;
> +	atomic_set(&addr->refcnt, 1);
> +
> +	return addr;
> +}
> +
> +static int sock_unix_join(struct ckpt_ctx *ctx,
> +			  struct sock *a,
> +			  struct sock *b,
> +			  struct ckpt_hdr_socket_unix *un)
> +{
> +	struct unix_address *addr = NULL;
> +
> +	sock_hold(a);
> +	sock_hold(b);
> +
> +	unix_sk(a)->peer = b;
> +	unix_sk(b)->peer = a;
> +
> +	a->sk_peercred.pid = task_tgid_vnr(current);
> +	a->sk_peercred.uid = ctx->realcred->uid;
> +	a->sk_peercred.gid = ctx->realcred->gid;
> +
> +	b->sk_peercred.pid = a->sk_peercred.pid;
> +	b->sk_peercred.uid = a->sk_peercred.uid;
> +	b->sk_peercred.gid = a->sk_peercred.gid;
> +

Does the following bypass security checks for sys_connect() ?

> +	if (!UNIX_ADDR_EMPTY(un->raddr_len))
> +		addr = sock_unix_makeaddr(&un->raddr, un->raddr_len);
> +	else if (!UNIX_ADDR_EMPTY(un->laddr_len))
> +		addr = sock_unix_makeaddr(&un->laddr, un->laddr_len);
> +
> +	if (IS_ERR(addr))
> +		return PTR_ERR(addr);
> +	else if (addr) {
> +		atomic_inc(&addr->refcnt); /* Held by both ends */
> +		unix_sk(a)->addr = unix_sk(b)->addr = addr;
> +	}
> +
> +	return 0;
> +}
> +
> +static int sock_unix_restore_connected(struct ckpt_ctx *ctx,
> +				       struct ckpt_hdr_socket *h,
> +				       struct ckpt_hdr_socket_unix *un,
> +				       struct socket *socket)
> +{
> +	struct sock *this = ckpt_obj_fetch(ctx, un->this, CKPT_OBJ_SOCK);
> +	struct sock *peer = ckpt_obj_fetch(ctx, un->peer, CKPT_OBJ_SOCK);
> +	struct socket *tmp = NULL;
> +	int ret;
> +
> +	if (!IS_ERR(this) && !IS_ERR(peer)) {
> +		/* We're last */
> +		struct socket *old = this->sk_socket;
> +
> +		old->sk = NULL;
> +		sock_release(old);
> +		sock_graft(this, socket);
> +
> +	} else if ((PTR_ERR(this) == -EINVAL) && (PTR_ERR(peer) == -EINVAL)) {
> +		/* We're first */
> +		int family = socket->sk->sk_family;
> +		int type = socket->sk->sk_type;
> +
> +		ret = sock_create(family, type, 0, &tmp);
> +		ckpt_debug("sock_create: %i\n", ret);
> +		if (ret)
> +			goto out;
> +
> +		this = socket->sk;
> +		peer = tmp->sk;
> +
> +		ret = ckpt_obj_insert(ctx, this, un->this, CKPT_OBJ_SOCK);
> +		if (ret < 0)
> +			goto out;
> +
> +		ret = ckpt_obj_insert(ctx, peer, un->peer, CKPT_OBJ_SOCK);
> +		if (ret < 0)
> +			goto out;
> +
> +		ret = sock_unix_join(ctx, this, peer, un);
> +		ckpt_debug("sock_unix_join: %i\n", ret);
> +		if (ret)
> +			goto out;
> +
> +	} else {
> +		ckpt_debug("Order Error\n");
> +		ret = PTR_ERR(this);
> +		goto out;
> +	}
> +
> +	/* Prime the socket's buffer limit with the maximum */
> +	peer->sk_userlocks |= SOCK_SNDBUF_LOCK;
> +	peer->sk_sndbuf = sysctl_wmem_max;

I suppose this meant to be temporary ?

> +
> +	/* Read my buffers and sendmsg() then back to me via my peer */
> +	ret = sock_read_buffers(ctx, peer, &peer->sk_sndbuf);
> +	ckpt_debug("sock_read_buffers: %i\n", ret);
> +	if (ret)
> +		goto out;
> +
> +	/* Read peer's buffers and expect 0 */
> +	ret = sock_read_buffers(ctx, peer, NULL);

It is indeed a good idea to make it generic for all sockets, but I
suspect the way sock_read_buffers() works will only be suitable for
afunix, and perhaps for in-container inet4/6 (by "in-container" I
mean that both sides of the connection are in the checkpoint image).

> + out:
> +	if (tmp && ret)
> +		sock_release(tmp);
> +
> +	return ret;
> +}
> +
> +static int sock_unix_unlink(const char *name)
> +{
> +	struct path spath;
> +	struct path ppath;
> +	int ret;
> +
> +	ret = kern_path(name, 0, &spath);
> +	if (ret)
> +		return ret;
> +
> +	ret = kern_path(name, LOOKUP_PARENT, &ppath);
> +	if (ret)
> +		goto out_s;
> +
> +	if (!spath.dentry) {
> +		ckpt_debug("No dentry found for %s\n", name);
> +		ret = -ENOENT;
> +		goto out_p;
> +	}
> +
> +	if (!ppath.dentry || !ppath.dentry->d_inode) {
> +		ckpt_debug("No inode for parent of %s\n", name);
> +		ret = -ENOENT;
> +		goto out_p;
> +	}
> +
> +	ret = vfs_unlink(ppath.dentry->d_inode, spath.dentry);
> + out_p:
> +	path_put(&ppath);
> + out_s:
> +	path_put(&spath);
> +
> +	return ret;
> +}
> +
> +/* Call bind() for socket, optionally changing (temporarily) to @path first
> + * if non-NULL
> + */
> +static int sock_unix_chdir_and_bind(struct socket *socket,
> +				    const char *path,
> +				    struct sockaddr *addr,
> +				    unsigned long addrlen)
> +{
> +	struct sockaddr_un *un = (struct sockaddr_un *)addr;
> +	int ret;
> +	struct path cur;
> +	struct path dir;
> +
> +	if (path) {
> +		ckpt_debug("switching to cwd %s for unix bind", path);
> +
> +		ret = kern_path(path, 0, &dir);
> +		if (ret)
> +			return ret;
> +
> +		ret = inode_permission(dir.dentry->d_inode,
> +				       MAY_EXEC | MAY_ACCESS);
> +		if (ret)
> +			goto out;
> +
> +		write_lock(&current->fs->lock);
> +		cur = current->fs->pwd;
> +		current->fs->pwd = dir;
> +		write_unlock(&current->fs->lock);
> +	}
> +
> +	ret = sock_unix_unlink(un->sun_path);
> +	ckpt_debug("unlink(%s): %i\n", un->sun_path, ret);
> +	if ((ret == 0) || (ret == -ENOENT))
> +		ret = sock_bind(socket, addr, addrlen);
> +
> +	if (path) {
> +		write_lock(&current->fs->lock);
> +		current->fs->pwd = cur;
> +		write_unlock(&current->fs->lock);
> +	}
> + out:
> +	if (path)
> +		path_put(&dir);
> +
> +	return ret;
> +}
> +
> +static int sock_unix_fakebind(struct socket *socket,
> +			      struct sockaddr_un *addr,
> +			      unsigned long len)
> +{
> +	struct unix_address *uaddr;
> +
> +	uaddr = sock_unix_makeaddr(addr, len);
> +	if (IS_ERR(uaddr))
> +		return PTR_ERR(uaddr);
> +
> +	unix_sk(socket->sk)->addr = uaddr;
> +
> +	return 0;
> +}
> +
> +static int sock_unix_bind(struct ckpt_hdr_socket *h,
> +			  struct ckpt_hdr_socket_unix *un,
> +			  struct socket *socket,
> +			  const char *path)
> +{
> +	struct sockaddr *addr = (struct sockaddr *)&un->laddr;
> +	unsigned long len = un->laddr_len;
> +
> +	if (!un->laddr.sun_path[0])
> +		return sock_bind(socket, addr, len);
> +	else if (!(un->flags & CKPT_UNIX_LINKED))
> +		return sock_unix_fakebind(socket, &un->laddr, len);
> +	else
> +		return sock_unix_chdir_and_bind(socket, path, addr, len);
> +}
> +
> +static int sock_unix_restore(struct ckpt_ctx *ctx,
> +			     struct ckpt_hdr_socket *h,
> +			     struct socket *socket)
> +{
> +	struct ckpt_hdr_socket_unix *un;
> +	int ret = -EINVAL;
> +	char *cwd = NULL;
> +	struct net *net = sock_net(socket->sk);
> +
> +	/* AF_UNIX overloads the backlog setting to define the maximum
> +	 * queue length for DGRAM sockets.  Make sure we don't let the
> +	 * caller exceed that value on restart.
> +	 */
> +	if ((h->sock.type == SOCK_DGRAM) &&
> +	    (h->sock.backlog > net->unx.sysctl_max_dgram_qlen)) {
> +		ckpt_debug("DGRAM backlog of %i exceeds system max of %i\n",
> +			   h->sock.backlog, net->unx.sysctl_max_dgram_qlen);
> +		return -EINVAL;
> +	}
> +
> +	un = ckpt_read_obj_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UNIX);
> +	if (IS_ERR(un))
> +		return PTR_ERR(un);
> +
> +	if (un->peer < 0)
> +		goto out;
> +
> +	if (sock_unix_need_cwd(&un->laddr, un->laddr_len)) {
> +		ret = ckpt_read_string(ctx, &cwd, PATH_MAX);
> +		ckpt_debug("read cwd(%i): %s\n", ret, cwd);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if ((h->sock.state != TCP_ESTABLISHED) &&
> +	    !UNIX_ADDR_EMPTY(un->laddr_len)) {
> +		ret = sock_unix_bind(h, un, socket, cwd);
> +		if (ret)
> +			goto out;
> +	}
> +

Hmm... does the code below work well with dgram sockets who received
data from the peer, and then the peer died (before checkpoint) ?

> +	if ((h->sock.state == TCP_ESTABLISHED) || (h->sock.state == TCP_CLOSE))
> +		ret = sock_unix_restore_connected(ctx, h, un, socket);
> +	else if (h->sock.state == TCP_LISTEN)
> +		ret = socket->ops->listen(socket, h->sock.backlog);
> +	else
> +		ckpt_debug("unsupported UNIX socket state %i\n", h->sock.state);
> + out:
> +	ckpt_hdr_put(ctx, un);
> +	kfree(cwd);
> +	return ret;
> +}
> +
> +struct socket *do_sock_file_restore(struct ckpt_ctx *ctx,
> +				    struct ckpt_hdr_socket *h)
> +{
> +	struct socket *socket;
> +	int ret;
> +
> +	ret = sock_create(h->sock_common.family, h->sock.type, 0, &socket);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	if (h->sock_common.family == AF_UNIX) {
> +		ret = sock_unix_restore(ctx, h, socket);
> +		ckpt_debug("sock_unix_restore: %i\n", ret);
> +	} else {
> +		ckpt_debug("unsupported family %i\n", h->sock_common.family);
> +		ret = -EINVAL;
> +	}
> +
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_cptrst(ctx, socket->sk, h, CKPT_RST);
> + out:
> +	if (ret) {
> +		sock_release(socket);
> +		socket = ERR_PTR(ret);
> +	}
> +
> +	return socket;
> +}
> +
> diff --git a/net/socket.c b/net/socket.c
> index 017f6e6..ff556fc 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -96,6 +96,8 @@
>  #include <net/sock.h>
>  #include <linux/netfilter.h>
>  
> +#include <linux/checkpoint.h>
> +
>  static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
>  static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
>  			 unsigned long nr_segs, loff_t pos);
> @@ -140,6 +142,9 @@ static const struct file_operations socket_file_ops = {
>  	.sendpage =	sock_sendpage,
>  	.splice_write = generic_splice_sendpage,
>  	.splice_read =	sock_splice_read,
> +#ifdef CONFIG_CHECKPOINT
> +	.checkpoint =   sock_file_checkpoint,
> +#endif
>  };
>  
>  /*
> @@ -415,6 +420,86 @@ int sock_map_fd(struct socket *sock, int flags)
>  	return fd;
>  }
>  
> +#ifdef CONFIG_CHECKPOINT
> +int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
> +{
> +	struct ckpt_hdr_file_socket *h;
> +	int ret;
> +	struct file *file = ptr;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->common.f_type = CKPT_FILE_SOCKET;
> +
> +	ret = checkpoint_file_common(ctx, file, &h->common);
> +	if (ret < 0)
> +		goto out;
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = do_sock_file_checkpoint(ctx, file);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +static struct file *sock_alloc_attach_fd(struct socket *socket)

Nit: I think this name is misleading - sock_attach_fd() itself is
already and should have been sock_attach_file() IMHO - so perhaps
s/fd/file here ?

> +{
> +	struct file *file;
> +	int err;
> +
> +	file = get_empty_filp();
> +	if (!file)
> +		return ERR_PTR(ENOMEM);
> +
> +	err = sock_attach_fd(socket, file, 0);
> +	if (err < 0) {
> +		put_filp(file);
> +		file = ERR_PTR(err);
> +	}
> +
> +	return file;
> +}
> +
> +void *sock_file_restore(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_socket *h = NULL;
> +	struct socket *socket = NULL;
> +	struct file *file = NULL;
> +	int err;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (IS_ERR(h))
> +		return h;
> +
> +	socket = do_sock_file_restore(ctx, h);
> +	if (IS_ERR(socket)) {
> +		err = PTR_ERR(socket);
> +		goto err_put;
> +	}
> +
> +	file = sock_alloc_attach_fd(socket);
> +	if (IS_ERR(file)) {
> +		err = PTR_ERR(file);
> +		goto err_release;
> +	}
> +
> +	ckpt_hdr_put(ctx, h);
> +
> +	return file;
> +
> + err_release:
> +	sock_release(socket);
> + err_put:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ERR_PTR(err);
> +}
> +#endif /* CONFIG_CHECKPOINT */
> +
>  static struct socket *sock_from_file(struct file *file, int *err)
>  {
>  	if (file->f_op == &socket_file_ops)

Oren.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Smith July 28, 2009, 8:34 p.m. UTC | #2
OL> obj_sock_users() is only required for objects that are to be
OL> tested for leaks in full container checkpoint. I don't think it's
OL> needed, because the relation sock <-> file is 1-to-1.  (If it
OL> were, then you would also need to collect sockets).

Okay, that also answers the question posed by John in the other
thread.

OL> (Actually, I will remove checkpoint_bad and restore_bad and modify
OL> checkpoint_obj() and restore_obj() to fail if the respective
OL> method is NULL).

Okay, should I expect that to show up in v17-dev soon?

OL> Nit: I already got confused a few times because of the similar
OL> names.  Perhaps change CKPT_HDR_SOCKET_BUFFERS to
OL> CKPT_HDR_SOCKET_QUEUE (and adjust the header name accordingly).

Agreed.  I remember writing that and remarking to myself how
ridiculous of a name it was, but never went back to change it.

OL> Unless/until we decide otherwise, let's keep all ckpt_hdr_xxx
OL> definitions in a single place -- checkpoint_hdr.h

That's how I initially had it, but Serge asked for it to be moved into
the network headers.  Serge?

OL> Unless you intend to use the struct name 'ckpt_socket' elsewhere,
OL> can we get rid of it (leaving only 'struct {') ?

Yep.

OL> Can you use fill_name() from checkpoint/file.c ?

Yeah, looks like it.

OL> This direct call to ->getname skips security checks through
OL> getsockname(). You may want to refactor sys_getsockname() similar
OL> to sys_bind().

Okay.

>> +	if ((h->sock.userlocks & SOCK_SNDBUF_LOCK) &&
>> +	    ((h->sock.sndbuf < SOCK_MIN_SNDBUF) ||
>> +	     (h->sock.sndbuf > sysctl_wmem_max)))
>> +		return -EINVAL;

OL> At least for afunix, if the user did not explicitly set the
OL> sndbuf, then userlocks won't have SOCK_SNDBUF_LOCK set.

OL> Also, if userlocks in the image doesn't have SOCK_SNDBUF_LOCK,
OL> then the sndbuf value isn't checked ?

I think I was thinking that I only needed to verify the buffer value
if the user claimed to have set it (as if it would be ignored
otherwise), but that doesn't seem right.  So, I think the proper
thing to do here is always check it (i.e., remove the first check of
the lock).

OL> What about verifying sock.flags itself ?
OL> In doing that, some options may assume/require some state --
OL> - SOCK_USE_WRITE_QUEUE only used in ipv4/ipv6/sctp
OL> - SOCK_DESTROY only used in some protocols

OL> Perhaps sanitize sock.flags per protocol ?

Hmm, okay.

OL> Many of these direct copy into the socket and sock effectively
OL> bypass security checks that take place in {get,set}sockopt(),
OL> either explicitly (e.g. sk_sndtimeo) or implicitly (e.g.
OL> SOCK_LINGER in sock.flags, reflecting SO_LINGER option).

OL> This applies both to checkpoint (potentially bypassing permission
OL> of the checkpointer to view this data) and restart (bypassing
OL> permissions of user to set these values).

OL> The alternative is to use socksetopt/sockgetopt for those values
OL> that should be subject to security checks.

Yeah, I suppose so.  I've resisted that thus far because it will make
the sync operation so much harder to read, but I suppose it's
unavoidable.

OL> There should also be per-protocol consistency checks. E.g. afunix
OL> cannot be in socket.state == SS_{DIS,}CONNECTING

I suppose so, but I don't see anything in af_unix.c that seems to care :)

OL> Better yet: -ENOSYS ?

Okay.

>> +	ret = kernel_sendmsg(sock->sk_socket, &msg, &kvec, 1, len);

OL> Is it possible to avoid the extra copy using splice() instead ?

It's possible, but it will require some refactoring to get access to
all the right pointers.  I'd propose we push that optimization out
until after we've got these patches integrated into the c/r tree.

OL> SOCK_DEAD in sk->flags may also pose a problem.  (Do we at all
OL> need to checkpoint and/or restore SOCK_DEAD ?!)

Is there any reasonable way we could arrive at a SOCK_DEAD socket via
a valid descriptor?

OL> Hmm... this test is quite hidden here - maybe a fat comment with a
OL> reference to why it's here and what it is doing ?

Sure.

>> +		else {
>> +			ckpt_debug("Buffer total %u exceeds limit %u\n",
>> +			   h->total_bytes, *bufsize);
>> +			ret = -EINVAL;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	for (i = 0; i < h->skb_count; i++) {
>> +		ret = sock_read_buffer(ctx, sock);
>> +		ckpt_debug("read buffer %i: %i\n", i, ret);
>> +		if (ret < 0)
>> +			break;
>> +
>> +		total += ret;
>> +		if (total > h->total_bytes) {

OL> What if 'total' overflows (for CAP_NET_ADMIN) ?   perhaps:
OL> 		if (total > h->total_bytes || total < ret) {

Actually, I've changed this locally to require fewer variables and
special cases.  Let me know when I re-post if you don't think it's a
better way to handle this.

OL> Does the following bypass security checks for sys_connect() ?

I don't think so.  We're basically replicating sys_socketpair() here,
which does not do a security check, presumably because all you're
doing is hooking two sockets together that both belong to you.  That's
not to say that we're as safe as that limited operation, but I don't
think it's totally clear.  Perhaps someone more confident will
comment.

>> +	/* Prime the socket's buffer limit with the maximum */
>> +	peer->sk_userlocks |= SOCK_SNDBUF_LOCK;
>> +	peer->sk_sndbuf = sysctl_wmem_max;

OL> I suppose this meant to be temporary ?

Right, it will be overridden when we synchronize the socket
properties.  I'll strengthen the comment.

OL> It is indeed a good idea to make it generic for all sockets, but I
OL> suspect the way sock_read_buffers() works will only be suitable
OL> for afunix, and perhaps for in-container inet4/6 (by
OL> "in-container" I mean that both sides of the connection are in the
OL> checkpoint image).

Yeah, I meant to put a comment in the commit log about this.  As I
mentioned before, I was hoping to put most of the effort into the UNIX
patch and move forward with that as we iterate on the INET patch.
Thus, this was part of the buffer restore re-swizzle that clearly
won't work for INET.  In the meantime I've been working on the INET
patch and splitting it up while retaining the necessary behavioral
changes made here.  In my next posting these should look better.

OL> Hmm... does the code below work well with dgram sockets who
OL> received data from the peer, and then the peer died (before
OL> checkpoint) ?

I'm not sure that I've replicated that exact scenario in my tests.
However, when we're restoring socket A with outstanding incoming
buffers, we restore them via B.  If B is not in a reasonable state, I
put it back into connected state to appease the sendmsg function and
restore it when complete.  See the "unshutdown" comment.

>> +static struct file *sock_alloc_attach_fd(struct socket *socket)

OL> Nit: I think this name is misleading - sock_attach_fd() itself is
OL> already and should have been sock_attach_file() IMHO - so perhaps
OL> s/fd/file here ?

The name was chosen to reflect the fact that we're allocating a socket
and then calling sock_attach_fd() on it.  If you think it's less
misleading to be inconsistent with the other call, I'll change it, but
I wouldn't really agree... :)

Thanks!
Oren Laadan July 28, 2009, 9:18 p.m. UTC | #3
Dan Smith wrote:
> OL> obj_sock_users() is only required for objects that are to be
> OL> tested for leaks in full container checkpoint. I don't think it's
> OL> needed, because the relation sock <-> file is 1-to-1.  (If it
> OL> were, then you would also need to collect sockets).
> 
> Okay, that also answers the question posed by John in the other
> thread.
> 
> OL> (Actually, I will remove checkpoint_bad and restore_bad and modify
> OL> checkpoint_obj() and restore_obj() to fail if the respective
> OL> method is NULL).
> 
> Okay, should I expect that to show up in v17-dev soon?

Yes.

> 
> OL> Nit: I already got confused a few times because of the similar
> OL> names.  Perhaps change CKPT_HDR_SOCKET_BUFFERS to
> OL> CKPT_HDR_SOCKET_QUEUE (and adjust the header name accordingly).
> 
> Agreed.  I remember writing that and remarking to myself how
> ridiculous of a name it was, but never went back to change it.
> 
> OL> Unless/until we decide otherwise, let's keep all ckpt_hdr_xxx
> OL> definitions in a single place -- checkpoint_hdr.h
> 
> That's how I initially had it, but Serge asked for it to be moved into
> the network headers.  Serge?
> 
> OL> Unless you intend to use the struct name 'ckpt_socket' elsewhere,
> OL> can we get rid of it (leaving only 'struct {') ?
> 
> Yep.
> 
> OL> Can you use fill_name() from checkpoint/file.c ?
> 
> Yeah, looks like it.
> 
> OL> This direct call to ->getname skips security checks through
> OL> getsockname(). You may want to refactor sys_getsockname() similar
> OL> to sys_bind().
> 
> Okay.
> 
>>> +	if ((h->sock.userlocks & SOCK_SNDBUF_LOCK) &&
>>> +	    ((h->sock.sndbuf < SOCK_MIN_SNDBUF) ||
>>> +	     (h->sock.sndbuf > sysctl_wmem_max)))
>>> +		return -EINVAL;
> 
> OL> At least for afunix, if the user did not explicitly set the
> OL> sndbuf, then userlocks won't have SOCK_SNDBUF_LOCK set.
> 
> OL> Also, if userlocks in the image doesn't have SOCK_SNDBUF_LOCK,
> OL> then the sndbuf value isn't checked ?
> 
> I think I was thinking that I only needed to verify the buffer value
> if the user claimed to have set it (as if it would be ignored
> otherwise), but that doesn't seem right.  So, I think the proper
> thing to do here is always check it (i.e., remove the first check of
> the lock).
> 
> OL> What about verifying sock.flags itself ?
> OL> In doing that, some options may assume/require some state --
> OL> - SOCK_USE_WRITE_QUEUE only used in ipv4/ipv6/sctp
> OL> - SOCK_DESTROY only used in some protocols
> 
> OL> Perhaps sanitize sock.flags per protocol ?
> 
> Hmm, okay.
> 
> OL> Many of these direct copy into the socket and sock effectively
> OL> bypass security checks that take place in {get,set}sockopt(),
> OL> either explicitly (e.g. sk_sndtimeo) or implicitly (e.g.
> OL> SOCK_LINGER in sock.flags, reflecting SO_LINGER option).
> 
> OL> This applies both to checkpoint (potentially bypassing permission
> OL> of the checkpointer to view this data) and restart (bypassing
> OL> permissions of user to set these values).
> 
> OL> The alternative is to use socksetopt/sockgetopt for those values
> OL> that should be subject to security checks.
> 
> Yeah, I suppose so.  I've resisted that thus far because it will make
> the sync operation so much harder to read, but I suppose it's
> unavoidable.
> 
> OL> There should also be per-protocol consistency checks. E.g. afunix
> OL> cannot be in socket.state == SS_{DIS,}CONNECTING
> 
> I suppose so, but I don't see anything in af_unix.c that seems to care :)
> 
> OL> Better yet: -ENOSYS ?
> 
> Okay.
> 
>>> +	ret = kernel_sendmsg(sock->sk_socket, &msg, &kvec, 1, len);
> 
> OL> Is it possible to avoid the extra copy using splice() instead ?
> 
> It's possible, but it will require some refactoring to get access to
> all the right pointers.  I'd propose we push that optimization out
> until after we've got these patches integrated into the c/r tree.

Hmmm.. what about splice_direct_to_actor() ?

> 
> OL> SOCK_DEAD in sk->flags may also pose a problem.  (Do we at all
> OL> need to checkpoint and/or restore SOCK_DEAD ?!)
> 
> Is there any reasonable way we could arrive at a SOCK_DEAD socket via
> a valid descriptor?

That's a good point. I think you are right, and there isn't.

Still need to keep it in mind for inet when including those lingering
sockets that don't belong to anyone.

This also means that a peer (of a dgram socket) that was closed will
not be checkpointed, so restoring the rcvbuf of the remaining dgram
socket wouldn't work.

> 
> OL> Hmm... this test is quite hidden here - maybe a fat comment with a
> OL> reference to why it's here and what it is doing ?
> 
> Sure.
> 
>>> +		else {
>>> +			ckpt_debug("Buffer total %u exceeds limit %u\n",
>>> +			   h->total_bytes, *bufsize);
>>> +			ret = -EINVAL;
>>> +			goto out;
>>> +		}
>>> +	}
>>> +
>>> +	for (i = 0; i < h->skb_count; i++) {
>>> +		ret = sock_read_buffer(ctx, sock);
>>> +		ckpt_debug("read buffer %i: %i\n", i, ret);
>>> +		if (ret < 0)
>>> +			break;
>>> +
>>> +		total += ret;
>>> +		if (total > h->total_bytes) {
> 
> OL> What if 'total' overflows (for CAP_NET_ADMIN) ?   perhaps:
> OL> 		if (total > h->total_bytes || total < ret) {
> 
> Actually, I've changed this locally to require fewer variables and
> special cases.  Let me know when I re-post if you don't think it's a
> better way to handle this.
> 
> OL> Does the following bypass security checks for sys_connect() ?
> 
> I don't think so.  We're basically replicating sys_socketpair() here,
> which does not do a security check, presumably because all you're
> doing is hooking two sockets together that both belong to you.  That's
> not to say that we're as safe as that limited operation, but I don't
> think it's totally clear.  Perhaps someone more confident will
> comment.

Yes, please ... Serge ?

To me it sounds plausible. If we adopt it, then a comment in the
code is worthwhile.

> 
>>> +	/* Prime the socket's buffer limit with the maximum */
>>> +	peer->sk_userlocks |= SOCK_SNDBUF_LOCK;
>>> +	peer->sk_sndbuf = sysctl_wmem_max;
> 
> OL> I suppose this meant to be temporary ?
> 
> Right, it will be overridden when we synchronize the socket
> properties.  I'll strengthen the comment.

Hmm.. then what happens when you have a circular dependency ?
For example, three dgram sockets, A, B and C where: A->B, B->C
and C->A  ('->' means connected).

I suspect that sock_unix_restore_connect() will fail, because
neither:

+	if (!IS_ERR(this) && !IS_ERR(peer)) {

nor

+	} else if ((PTR_ERR(this) == -EINVAL) && (PTR_ERR(peer) == -EINVAL)) {

will hold true, therefore:

+	} else {
+		ckpt_debug("Order Error\n");
+		ret = PTR_ERR(this);
+		goto out;
+	}

Either that, or the temporary change cited above will become
permanent, because A won't be restored again.

> 
> OL> It is indeed a good idea to make it generic for all sockets, but I
> OL> suspect the way sock_read_buffers() works will only be suitable
> OL> for afunix, and perhaps for in-container inet4/6 (by
> OL> "in-container" I mean that both sides of the connection are in the
> OL> checkpoint image).
> 
> Yeah, I meant to put a comment in the commit log about this.  As I
> mentioned before, I was hoping to put most of the effort into the UNIX
> patch and move forward with that as we iterate on the INET patch.
> Thus, this was part of the buffer restore re-swizzle that clearly
> won't work for INET.  In the meantime I've been working on the INET
> patch and splitting it up while retaining the necessary behavioral
> changes made here.  In my next posting these should look better.
> 
> OL> Hmm... does the code below work well with dgram sockets who
> OL> received data from the peer, and then the peer died (before
> OL> checkpoint) ?
> 
> I'm not sure that I've replicated that exact scenario in my tests.
> However, when we're restoring socket A with outstanding incoming
> buffers, we restore them via B.  If B is not in a reasonable state, I
> put it back into connected state to appease the sendmsg function and
> restore it when complete.  See the "unshutdown" comment.

The problem isn't that B is not in a reasonable state, but
rather that B is not checkpointed in the first place (because it
is not reachable via any fd), so it isn't restored either.

Oren.

> 
>>> +static struct file *sock_alloc_attach_fd(struct socket *socket)
> 
> OL> Nit: I think this name is misleading - sock_attach_fd() itself is
> OL> already and should have been sock_attach_file() IMHO - so perhaps
> OL> s/fd/file here ?
> 
> The name was chosen to reflect the fact that we're allocating a socket
> and then calling sock_attach_fd() on it.  If you think it's less
> misleading to be inconsistent with the other call, I'll change it, but
> I wouldn't really agree... :)
> Thanks!
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Serge E. Hallyn July 29, 2009, 1:56 a.m. UTC | #4
Quoting Dan Smith (danms@us.ibm.com):
> OL> Unless/until we decide otherwise, let's keep all ckpt_hdr_xxx
> OL> definitions in a single place -- checkpoint_hdr.h
> 
> That's how I initially had it, but Serge asked for it to be moved into
> the network headers.  Serge?

Yes, it seems to me we want to keep those close to the maintainer's
code rather than expect them to go browsing through cr headers when
they change their own code.

Of course just putting ckpt_hdr_xxx into the subsystems' headers
isn't sufficient - we need to have better discussions with the
maintainers about ways to make maintaining their subsystems with
cr pretty much the same as it was without cr.

-serge
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Serge E. Hallyn July 29, 2009, 1:36 p.m. UTC | #5
Quoting Oren Laadan (orenl@librato.com):
> > OL> Does the following bypass security checks for sys_connect() ?

[ on sock_unix_restore()->sock_unix_restore_connected()->sock_unix_join() ]

> > 
> > I don't think so.  We're basically replicating sys_socketpair() here,
> > which does not do a security check, presumably because all you're
> > doing is hooking two sockets together that both belong to you.  That's
> > not to say that we're as safe as that limited operation, but I don't
> > think it's totally clear.  Perhaps someone more confident will
> > comment.
> 
> Yes, please ... Serge ?
> 
> To me it sounds plausible. If we adopt it, then a comment in the
> code is worthwhile.

I'm not sure what Oren means "sounds plausible" or should be adopted.
Using a common helper with sys_connect()?

At the moment you miss out on the security_socket_connect() call.  That
may be not as important for unix sockets, but it does look like selinux +
netlabel can label unix sockets as well.  So I'm not convinced we can
just ignore it, as once we start properly LSM-labeling tasks and
sockets we may need to do that to ensure proper restart under selinux.

The other thing is that some new fancy doohicky might require another
hook in sys_connect, which may or may not be needed for this path.
If coded this way, we may not find out until someone reports some
subtle failure long after the fact.

Still your code is so customized that perhaps an explicit
security_socket_connect() call in your sock_unix_join() may be the
way to go...

-serge
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Smith July 29, 2009, 2:49 p.m. UTC | #6
SH> At the moment you miss out on the security_socket_connect() call. 

Is that any different than the path involved when a process does a
socketpair() call?

SH> Still your code is so customized that perhaps an explicit
SH> security_socket_connect() call in your sock_unix_join() may be the
SH> way to go...

So, when I do the join, I really should run the check on both the
remote and local addresses, right?  The join operation is not really a
connect in the sense of being one-sided...
Serge E. Hallyn July 29, 2009, 2:59 p.m. UTC | #7
Quoting Dan Smith (danms@us.ibm.com):
> SH> At the moment you miss out on the security_socket_connect() call. 
> 
> Is that any different than the path involved when a process does a
> socketpair() call?
> 
> SH> Still your code is so customized that perhaps an explicit
> SH> security_socket_connect() call in your sock_unix_join() may be the
> SH> way to go...
> 
> So, when I do the join, I really should run the check on both the
> remote and local addresses, right?  The join operation is not really a
> connect in the sense of being one-sided...

Ok well if a join is not a connect then ignore me.

I'll set a block of time aside to take a closer look on your next
submit.

thanks,
-serge
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Smith July 29, 2009, 3:03 p.m. UTC | #8
SH> Ok well if a join is not a connect then ignore me.

Well, it's closer to a socketpair() than a connect(), I'd say.

For now, I'll put a comment in there noting that we may need a
security hook to remind us to revisit it.
Oren Laadan July 29, 2009, 3:34 p.m. UTC | #9
Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl@librato.com):
>>> OL> Does the following bypass security checks for sys_connect() ?
> 
> [ on sock_unix_restore()->sock_unix_restore_connected()->sock_unix_join() ]
> 
>>> I don't think so.  We're basically replicating sys_socketpair() here,
>>> which does not do a security check, presumably because all you're
>>> doing is hooking two sockets together that both belong to you.  That's
>>> not to say that we're as safe as that limited operation, but I don't
>>> think it's totally clear.  Perhaps someone more confident will
>>> comment.
>> Yes, please ... Serge ?
>>
>> To me it sounds plausible. If we adopt it, then a comment in the
>> code is worthwhile.
> 
> I'm not sure what Oren means "sounds plausible" or should be adopted.
> Using a common helper with sys_connect()?

I meant that Dan's argument sounds plausible, and if we go that
way, it deserves a comment in the code explaining why the security
call is omitted.

Of course, that was before reading your concern about LSM-labeling
of sockets...

Oren.

> 
> At the moment you miss out on the security_socket_connect() call.  That
> may be not as important for unix sockets, but it does look like selinux +
> netlabel can label unix sockets as well.  So I'm not convinced we can
> just ignore it, as once we start properly LSM-labeling tasks and
> sockets we may need to do that to ensure proper restart under selinux.
> 
> The other thing is that some new fancy doohicky might require another
> hook in sys_connect, which may or may not be needed for this path.
> If coded this way, we may not find out until someone reports some
> subtle failure long after the fact.
> 
> Still your code is so customized that perhaps an explicit
> security_socket_connect() call in your sock_unix_join() may be the
> way to go...
> 
> -serge
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Smith July 29, 2009, 6:37 p.m. UTC | #10
OL> Hmmm.. what about splice_direct_to_actor() ?

The comments (and code) for that function define that we can't splice
between non-regular files.  For now, this would work, but would fail
if you pass a socket as the checkpoint descriptor, as might be the
case in a migration.

OL> Still need to keep it in mind for inet when including those
OL> lingering sockets that don't belong to anyone.

Yep.

OL> This also means that a peer (of a dgram socket) that was closed
OL> will not be checkpointed, so restoring the rcvbuf of the remaining
OL> dgram socket wouldn't work.

Actually, the new algorithm creates the pair when finding the first
socket instead of the second, so I think it will still work.  However,
I'll test it and apply some thought to what will happen to the socket
we created to represent the dead one (I think it will die when the
objhash is freed).

OL> Hmm.. then what happens when you have a circular dependency ?
OL> For example, three dgram sockets, A, B and C where: A->B, B->C
OL> and C->A  ('->' means connected).

Hmm, hadn't thought of that.

OL> I suspect that sock_unix_restore_connect() will fail, because
OL> neither:

OL> +	if (!IS_ERR(this) && !IS_ERR(peer)) {

OL> nor

OL> +	} else if ((PTR_ERR(this) == -EINVAL) && (PTR_ERR(peer) == -EINVAL)) {

OL> will hold true, therefore:

OL> +	} else {
OL> +		ckpt_debug("Order Error\n");
OL> +		ret = PTR_ERR(this);
OL> +		goto out;
OL> +	}

Yeah, but I don't think it will be too hard to add another case to
that condition to handle this.  I'll check into it.
Dan Smith July 30, 2009, 3:49 p.m. UTC | #11
OL> Hmm.. then what happens when you have a circular dependency ?  For
OL> example, three dgram sockets, A, B and C where: A->B, B->C and
OL> C->A ('->' means connected).

So, I've been cooking up changes to the patch and a test for this
case.  However, it seems like it's not valid, unless I'm missing
something.  The man page for connect() says:

  If the socket sockfd is of type SOCK_DGRAM then serv_addr is the
  address to which datagrams are sent by default, and the only address
  from which datagrams are received.

So, even though you can connect() a DGRAM socket and then sendto()
datagrams to a different location, it doesn't appear that the
relationship between A and B is really valid, at least the connection
between A and B is not functional.  In fact, in my testing, if you try
to connect() C back to A, you get "Operation not permitted" because A
is already connected elsewhere.

Thoughts?
John Dykstra July 30, 2009, 10:10 p.m. UTC | #12
On Thu, 2009-07-30 at 08:49 -0700, Dan Smith wrote:
> In fact, in my testing, if you try
> to connect() C back to A, you get "Operation not permitted" because A
> is already connected elsewhere.

I suspect this is a testing error.  Connecting a datagram socket does
not change or query the state of the target--it is a purely local
operation.

  --  John

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Dykstra July 30, 2009, 10:12 p.m. UTC | #13
On Thu, 2009-07-30 at 22:10 +0000, John Dykstra wrote:
> > In fact, in my testing, if you try
> > to connect() C back to A, you get "Operation not permitted" because
> A
> > is already connected elsewhere.
> 
> I suspect this is a testing error.  Connecting a datagram socket does
> not change or query the state of the target--it is a purely local
> operation.

Never mind.  You're talking about AF_UNIX.

  --  John

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Smith July 30, 2009, 10:14 p.m. UTC | #14
JD> Never mind.  You're talking about AF_UNIX.

...right :)
Oren Laadan Aug. 4, 2009, 8:27 a.m. UTC | #15
Dan Smith wrote:
> OL> Hmm.. then what happens when you have a circular dependency ?  For
> OL> example, three dgram sockets, A, B and C where: A->B, B->C and
> OL> C->A ('->' means connected).
> 
> So, I've been cooking up changes to the patch and a test for this
> case.  However, it seems like it's not valid, unless I'm missing
> something.  The man page for connect() says:
> 
>   If the socket sockfd is of type SOCK_DGRAM then serv_addr is the
>   address to which datagrams are sent by default, and the only address
>   from which datagrams are received.
> 
> So, even though you can connect() a DGRAM socket and then sendto()
> datagrams to a different location, it doesn't appear that the
> relationship between A and B is really valid, at least the connection
> between A and B is not functional.  In fact, in my testing, if you try
> to connect() C back to A, you get "Operation not permitted" because A
> is already connected elsewhere.

Weird ... I have a faint memory of seeing that happen (which is
why I brought it up).

Does this mean that a situation of A->B and B->C is valid only as
long as A->B is done first, otherwise A->B will fail because B will
already be connected to C ?

Then, the other problem is to restore correctly you need to mimic
the behavior of sendto() because of the way the skb references the
original socket for the write-buf accounting :(

In turn, this means that during checkpoint you need to record the
_origin_ of each buffer in the queue of afunix dgram sockets :((

Sigh :(((

Oren.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Smith Aug. 4, 2009, 3:16 p.m. UTC | #16
OL> Does this mean that a situation of A->B and B->C is valid only as
OL> long as A->B is done first, otherwise A->B will fail because B
OL> will already be connected to C ?

Correct.

OL> Then, the other problem is to restore correctly you need to mimic
OL> the behavior of sendto() because of the way the skb references the
OL> original socket for the write-buf accounting :(

OL> In turn, this means that during checkpoint you need to record the
OL> _origin_ of each buffer in the queue of afunix dgram sockets :((

Or return EBUSY when there are skb's that are outstanding from before
the re-connect, right?
Oren Laadan Aug. 4, 2009, 5:05 p.m. UTC | #17
Dan Smith wrote:
> OL> Does this mean that a situation of A->B and B->C is valid only as
> OL> long as A->B is done first, otherwise A->B will fail because B
> OL> will already be connected to C ?
> 
> Correct.

Hmm.. this also means that if A->C already, then an attempt to also
do B->C will fail :(

> 
> OL> Then, the other problem is to restore correctly you need to mimic
> OL> the behavior of sendto() because of the way the skb references the
> OL> original socket for the write-buf accounting :(
> 
> OL> In turn, this means that during checkpoint you need to record the
> OL> _origin_ of each buffer in the queue of afunix dgram sockets :((
> 
> Or return EBUSY when there are skb's that are outstanding from before
> the re-connect, right?

Hmm.. I think that when you reconnect a dgram socket, it discards all
unread pending data previous received, whether through connection or
via sendto().

What I meant is that restore (and therefore checkpoint) need to know
how to handle skb's sent with sendto() but without proper connection,
where the target socket is still unconnected.

I'm ok with declaring this case unsupported for now and put it in the
'todo' list, and meanwhile work towards getting the current afunix
patches into the cr-dev tree. (As long as it doesn't it will be buried
there forever...)

Oren.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Smith Aug. 4, 2009, 5:13 p.m. UTC | #18
OL> What I meant is that restore (and therefore checkpoint) need to
OL> know how to handle skb's sent with sendto() but without proper
OL> connection, where the target socket is still unconnected.

I see.  I'll have to think on that a bit.

OL> I'm ok with declaring this case unsupported for now and put it in
OL> the 'todo' list, and meanwhile work towards getting the current
OL> afunix patches into the cr-dev tree. (As long as it doesn't it
OL> will be buried there forever...)

Okay, I think that would be most productive :)
diff mbox

Patch

diff --git a/checkpoint/files.c b/checkpoint/files.c
index bcdc774..81e7b20 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,7 @@ 
 #include <linux/syscalls.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
 
 
 /**************************************************************************
@@ -548,6 +549,12 @@  static struct restore_file_ops restore_file_ops[] = {
 		.file_type = CKPT_FILE_PIPE,
 		.restore = pipe_file_restore,
 	},
+	/* socket */
+	{
+		.file_name = "SOCKET",
+		.file_type = CKPT_FILE_SOCKET,
+		.restore = sock_file_restore,
+	},
 };
 
 static struct file *do_restore_file(struct ckpt_ctx *ctx)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index da43bf4..ad08ed5 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -20,6 +20,7 @@ 
 #include <linux/user_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
 
 struct ckpt_obj;
 struct ckpt_obj_ops;
@@ -244,6 +245,22 @@  static void obj_groupinfo_drop(void *ptr)
 	put_group_info((struct group_info *) ptr);
 }
 
+static int obj_sock_grab(void *ptr)
+{
+	sock_hold((struct sock *) ptr);
+	return 0;
+}
+
+static void obj_sock_drop(void *ptr)
+{
+	sock_put((struct sock *) ptr);
+}
+
+static int obj_sock_users(void *ptr)
+{
+	return atomic_read(&((struct sock *) ptr)->sk_refcnt);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -367,6 +384,16 @@  static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_groupinfo,
 		.restore = restore_groupinfo,
 	},
+	/* sock object */
+	{
+		.obj_name = "SOCKET",
+		.obj_type = CKPT_OBJ_SOCK,
+		.ref_drop = obj_sock_drop,
+		.ref_grab = obj_sock_grab,
+		.ref_users = obj_sock_users,
+		.checkpoint = sock_file_checkpoint,
+		.restore = sock_file_restore,
+	},
 };
 
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 8b7ca46..e542ff5 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -88,6 +88,12 @@  enum {
 
 	CKPT_HDR_SIGHAND = 601,
 
+	CKPT_HDR_FD_SOCKET = 701,
+	CKPT_HDR_SOCKET,
+	CKPT_HDR_SOCKET_BUFFERS,
+	CKPT_HDR_SOCKET_BUFFER,
+	CKPT_HDR_SOCKET_UNIX,
+
 	CKPT_HDR_TAIL = 9001,
 
 	CKPT_HDR_ERROR = 9999,
@@ -122,6 +128,7 @@  enum obj_type {
 	CKPT_OBJ_CRED,
 	CKPT_OBJ_USER,
 	CKPT_OBJ_GROUPINFO,
+	CKPT_OBJ_SOCK,
 	CKPT_OBJ_MAX
 };
 
@@ -326,6 +333,7 @@  enum file_type {
 	CKPT_FILE_IGNORE = 0,
 	CKPT_FILE_GENERIC,
 	CKPT_FILE_PIPE,
+	CKPT_FILE_SOCKET,
 	CKPT_FILE_MAX
 };
 
@@ -349,6 +357,11 @@  struct ckpt_hdr_file_pipe {
 	__s32 pipe_objref;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_file_socket {
+	struct ckpt_hdr_file common;
+	__u16 family;
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_file_pipe_state {
 	struct ckpt_hdr h;
 	__s32 pipe_len;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 3b461df..cd675dd 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -23,6 +23,7 @@  struct __kernel_sockaddr_storage {
 #include <linux/uio.h>			/* iovec support		*/
 #include <linux/types.h>		/* pid_t			*/
 #include <linux/compiler.h>		/* __user			*/
+#include <linux/checkpoint.h>		/* struct ckpt_hdr              */
 
 #ifdef __KERNEL__
 # ifdef CONFIG_PROC_FS
@@ -328,5 +329,67 @@  extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
 #endif
+
+#ifdef CONFIG_CHECKPOINT
+#include <linux/un.h>                   /* sockaddr_un			*/
+
+#define CKPT_UNIX_LINKED 1
+struct ckpt_hdr_socket_unix {
+	struct ckpt_hdr h;
+	__u32 this;
+	__u32 peer;
+	__u32 flags;
+	__u32 laddr_len;
+	__u32 raddr_len;
+	struct sockaddr_un laddr;
+	struct sockaddr_un raddr;
+} __attribute__ ((aligned(8)));
+
+struct ckpt_hdr_socket {
+	struct ckpt_hdr h;
+
+	struct ckpt_socket { /* struct socket */
+		__u64 flags;
+		__u8 state;
+	} socket __attribute__ ((aligned(8)));
+
+	struct ckpt_sock_common { /* struct sock_common */
+		__u32 bound_dev_if;
+		__u16 family;
+		__u8 state;
+		__u8 reuse;
+	} sock_common __attribute__ ((aligned(8)));
+
+	struct ckpt_sock { /* struct sock */
+		__s64 rcvlowat;
+		__s64 rcvtimeo;
+		__s64 sndtimeo;
+		__u64 flags;
+		__u64 lingertime;
+
+		__u32 err;
+		__u32 err_soft;
+		__u32 priority;
+		__s32 rcvbuf;
+		__s32 sndbuf;
+		__u16 type;
+		__s16 backlog;
+
+		__u8 protocol;
+		__u8 state;
+		__u8 shutdown;
+		__u8 userlocks;
+		__u8 no_check;
+	} sock __attribute__ ((aligned(8)));
+
+} __attribute__ ((aligned(8)));
+
+struct ckpt_hdr_socket_buffer {
+	struct ckpt_hdr h;
+	__u32 skb_count;
+	__u32 total_bytes;
+} __attribute__ ((aligned(8)));
+#endif /* CONFIG_CHECKPOINT */
+
 #endif /* not kernel and not glibc */
 #endif /* _LINUX_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index d435dc1..4043cd2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1608,4 +1608,15 @@  extern int sysctl_optmem_max;
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
+#ifdef CONFIG_CHECKPOINT
+/* Checkpoint/Restart Functions */
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
+extern void *sock_file_restore(struct ckpt_ctx *);
+extern struct socket *do_sock_file_restore(struct ckpt_ctx *,
+					   struct ckpt_hdr_socket *);
+extern int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+#endif
+
 #endif	/* _SOCK_H */
diff --git a/net/Makefile b/net/Makefile
index ba324ae..91d12fe 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -66,3 +66,5 @@  ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
 endif
 obj-$(CONFIG_WIMAX)		+= wimax/
+
+obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
diff --git a/net/checkpoint.c b/net/checkpoint.c
new file mode 100644
index 0000000..748d3aa
--- /dev/null
+++ b/net/checkpoint.c
@@ -0,0 +1,779 @@ 
+/*
+ *  Copyright 2009 IBM Corporation
+ *
+ *  Author: Dan Smith <danms@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/socket.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+
+#include <net/af_unix.h>
+#include <net/tcp_states.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+#define UNIX_ADDR_EMPTY(a) (a <= sizeof(short))
+
+static inline int sock_unix_need_cwd(struct sockaddr_un *addr,
+				     unsigned long len)
+{
+	return (!UNIX_ADDR_EMPTY(len)) &&
+		addr->sun_path[0] &&
+		(addr->sun_path[0] != '/');
+}
+
+static int sock_copy_buffers(struct sk_buff_head *from,
+			     struct sk_buff_head *to,
+			     uint32_t *total_bytes)
+{
+	int count = 0;
+	struct sk_buff *skb;
+
+	*total_bytes = 0;
+
+	skb_queue_walk(from, skb) {
+		struct sk_buff *tmp;
+
+		tmp = dev_alloc_skb(skb->len);
+		if (!tmp)
+			return -ENOMEM;
+
+		spin_lock(&from->lock);
+		skb_morph(tmp, skb);
+		spin_unlock(&from->lock);
+
+		skb_queue_tail(to, tmp);
+		count++;
+		*total_bytes += tmp->len;
+	}
+
+	return count;
+}
+
+static int __sock_write_buffers(struct ckpt_ctx *ctx,
+				struct sk_buff_head *queue)
+{
+	struct sk_buff *skb;
+	int ret = 0;
+
+	skb_queue_walk(queue, skb) {
+		if (UNIXCB(skb).fp) {
+			ckpt_write_err(ctx, "fd-passing is not supported");
+			return -EBUSY;
+		}
+
+		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
+					  CKPT_HDR_SOCKET_BUFFER);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	struct sk_buff_head tmpq;
+	int ret = -ENOMEM;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+	if (!h)
+		goto out;
+
+	skb_queue_head_init(&tmpq);
+
+	h->skb_count = sock_copy_buffers(queue, &tmpq, &h->total_bytes);
+	if (h->skb_count < 0) {
+		ret = h->skb_count;
+		goto out;
+	}
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (!ret)
+		ret = __sock_write_buffers(ctx, &tmpq);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+	__skb_queue_purge(&tmpq);
+
+	return ret;
+}
+
+static int sock_unix_write_cwd(struct ckpt_ctx *ctx,
+			       struct sock *sock,
+			       const char *sockpath)
+{
+	struct path path;
+	char *buf;
+	char *fqpath;
+	int offset;
+	int ret = -ENOENT;
+
+	buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	path.dentry = unix_sk(sock)->dentry;
+	path.mnt = unix_sk(sock)->mnt;
+
+	fqpath = d_path(&path, buf, PATH_MAX);
+	if (!fqpath)
+		goto out;
+
+	offset = strlen(fqpath) - strlen(sockpath);
+	if (offset <= 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	fqpath[offset] = '\0';
+
+	ckpt_debug("writing socket directory: %s\n", fqpath);
+	ret = ckpt_write_string(ctx, fqpath, strlen(fqpath));
+ out:
+	kfree(buf);
+	return ret;
+}
+
+static int sock_unix_getnames(struct ckpt_ctx *ctx,
+			      struct ckpt_hdr_socket_unix *un,
+			      struct socket *socket)
+{
+	struct sockaddr *loc = (struct sockaddr *)&un->laddr;
+	struct sockaddr *rem = (struct sockaddr *)&un->raddr;
+
+	if (socket->ops->getname(socket, loc, &un->laddr_len, 0)) {
+		ckpt_write_err(ctx, "Unable to getname of local");
+		return -EINVAL;
+	}
+
+	if (socket->ops->getname(socket, rem, &un->raddr_len, 1)) {
+		if ((socket->sk->sk_type != SOCK_DGRAM) &&
+		    (socket->sk->sk_state == TCP_ESTABLISHED)) {
+			ckpt_write_err(ctx, "Unable to getname of remote");
+			return -EINVAL;
+		}
+		un->raddr_len = 0;
+	}
+
+	return 0;
+}
+
+static int sock_unix_checkpoint(struct ckpt_ctx *ctx,
+			        struct socket *socket,
+			        struct ckpt_hdr_socket *h)
+{
+	struct unix_sock *sk = unix_sk(socket->sk);
+	struct unix_sock *pr = unix_sk(sk->peer);
+	struct ckpt_hdr_socket_unix *un;
+	int new;
+	int ret = -ENOMEM;
+
+	if ((socket->sk->sk_state == TCP_LISTEN) &&
+	    !skb_queue_empty(&socket->sk->sk_receive_queue)) {
+		ckpt_write_err(ctx, "listening socket has unaccepted peers");
+		return -EBUSY;
+	}
+
+	un = ckpt_hdr_get_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UNIX);
+	if (!un)
+		goto out;
+
+	ret = sock_unix_getnames(ctx, un, socket);
+	if (ret)
+		goto out;
+
+	if (sk->dentry && (sk->dentry->d_inode->i_nlink > 0))
+		un->flags |= CKPT_UNIX_LINKED;
+
+	un->this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
+	if (un->this < 0)
+		goto out;
+
+	if (sk->peer)
+		un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
+	else
+		un->peer = 0;
+
+	if (un->peer < 0) {
+		ret = un->peer;
+		goto out;
+	}
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) un);
+	if (ret < 0)
+		goto out;
+
+	if (sock_unix_need_cwd(&un->laddr, un->laddr_len))
+		ret = sock_unix_write_cwd(ctx, socket->sk, un->laddr.sun_path);
+ out:
+	ckpt_hdr_put(ctx, un);
+
+	return ret;
+}
+
+static int sock_cptrst_verify(struct ckpt_hdr_socket *h)
+{
+	uint8_t userlocks_mask = SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK |
+		                 SOCK_BINDADDR_LOCK | SOCK_BINDPORT_LOCK;
+
+	if (h->sock.shutdown & ~SHUTDOWN_MASK)
+		return -EINVAL;
+	if (h->sock.userlocks & ~userlocks_mask)
+		return -EINVAL;
+	if (h->sock.sndtimeo < 0)
+		return -EINVAL;
+	if (h->sock.rcvtimeo < 0)
+		return -EINVAL;
+	if ((h->sock.userlocks & SOCK_SNDBUF_LOCK) &&
+	    ((h->sock.sndbuf < SOCK_MIN_SNDBUF) ||
+	     (h->sock.sndbuf > sysctl_wmem_max)))
+		return -EINVAL;
+	if ((h->sock.userlocks & SOCK_RCVBUF_LOCK) &&
+	    ((h->sock.rcvbuf < SOCK_MIN_RCVBUF) ||
+	     (h->sock.rcvbuf > sysctl_rmem_max)))
+		return -EINVAL;
+	if ((h->sock.flags & SOCK_LINGER) &&
+	    (h->sock.lingertime > MAX_SCHEDULE_TIMEOUT))
+		return -EINVAL;
+	if (!ckpt_validate_errno(h->sock.err))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int sock_cptrst(struct ckpt_ctx *ctx,
+		       struct sock *sock,
+		       struct ckpt_hdr_socket *h,
+		       int op)
+{
+	if (sock->sk_socket) {
+		CKPT_COPY(op, h->socket.flags, sock->sk_socket->flags);
+		CKPT_COPY(op, h->socket.state, sock->sk_socket->state);
+	}
+
+	CKPT_COPY(op, h->sock_common.reuse, sock->sk_reuse);
+	CKPT_COPY(op, h->sock_common.bound_dev_if, sock->sk_bound_dev_if);
+	CKPT_COPY(op, h->sock_common.family, sock->sk_family);
+
+	CKPT_COPY(op, h->sock.shutdown, sock->sk_shutdown);
+	CKPT_COPY(op, h->sock.userlocks, sock->sk_userlocks);
+	CKPT_COPY(op, h->sock.no_check, sock->sk_no_check);
+	CKPT_COPY(op, h->sock.protocol, sock->sk_protocol);
+	CKPT_COPY(op, h->sock.err, sock->sk_err);
+	CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft);
+	CKPT_COPY(op, h->sock.priority, sock->sk_priority);
+	CKPT_COPY(op, h->sock.rcvlowat, sock->sk_rcvlowat);
+	CKPT_COPY(op, h->sock.backlog, sock->sk_max_ack_backlog);
+	CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo);
+	CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo);
+	CKPT_COPY(op, h->sock.rcvbuf, sock->sk_rcvbuf);
+	CKPT_COPY(op, h->sock.sndbuf, sock->sk_sndbuf);
+	CKPT_COPY(op, h->sock.flags, sock->sk_flags);
+	CKPT_COPY(op, h->sock.lingertime, sock->sk_lingertime);
+	CKPT_COPY(op, h->sock.type, sock->sk_type);
+	CKPT_COPY(op, h->sock.state, sock->sk_state);
+
+	if ((h->socket.state == SS_CONNECTED) &&
+	    (h->sock.state != TCP_ESTABLISHED)) {
+		ckpt_debug("socket/sock in inconsistent state: %i/%i",
+			   h->socket.state, h->sock.state);
+		return -EINVAL;
+	} else if ((h->sock.state < TCP_ESTABLISHED) ||
+		   (h->sock.state >= TCP_MAX_STATES)) {
+		ckpt_debug("sock in invalid state: %i", h->sock.state);
+		return -EINVAL;
+	} else if ((h->socket.state < SS_FREE) ||
+		   (h->socket.state > SS_DISCONNECTING)) {
+		ckpt_debug("socket in invalid state: %i",
+			   h->socket.state);
+		return -EINVAL;
+	}
+
+	if (op == CKPT_CPT)
+		return sock_cptrst_verify(h);
+	else
+		return 0;
+}
+
+int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct socket *socket = file->private_data;
+	struct sock *sock = socket->sk;
+	struct ckpt_hdr_socket *h;
+	int ret = 0;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+	if (!h)
+		return -ENOMEM;
+
+	ret = sock_cptrst(ctx, sock, h, CKPT_CPT);
+	if (ret)
+		goto out;
+
+	if (sock->sk_family == AF_UNIX) {
+		ret = sock_unix_checkpoint(ctx, socket, h);
+		if (ret)
+			goto out;
+	} else {
+		ckpt_write_err(ctx, "unsupported socket family %i",
+			       sock->sk_family);
+		ret = EINVAL;
+		goto out;
+	}
+
+	if (sock->sk_state != TCP_LISTEN) {
+		ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
+		if (ret)
+			goto out;
+
+		ret = sock_write_buffers(ctx, &sock->sk_write_queue);
+		if (ret)
+			goto out;
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int sock_read_buffer(struct ckpt_ctx *ctx, struct sock *sock)
+{
+	struct ckpt_hdr h;
+	struct msghdr msg;
+	struct kvec kvec;
+	int ret = 0;
+	int len;
+
+	memset(&msg, 0, sizeof(msg));
+
+	len = _ckpt_read_hdr_type(ctx, &h, CKPT_HDR_SOCKET_BUFFER);
+	if (len < 0)
+		return len;
+
+	if (len > SKB_MAX_ALLOC) {
+		ckpt_debug("Socket buffer too big (%i > %lu)",
+			   len, SKB_MAX_ALLOC);
+		return -ENOSPC;
+	}
+
+	kvec.iov_len = len;
+	kvec.iov_base = kmalloc(len, GFP_KERNEL);
+	if (!kvec.iov_base)
+		return -ENOMEM;
+
+	ret = _ckpt_read_payload(ctx, &h, kvec.iov_base);
+	if (ret < 0)
+		goto out;
+
+	ret = kernel_sendmsg(sock->sk_socket, &msg, &kvec, 1, len);
+	ckpt_debug("kernel_sendmsg(%i): %i\n", len, ret);
+	if ((ret > 0) && (ret != len))
+		ret = -ENOMEM;
+ out:
+	return ret;
+}
+
+static int sock_read_buffers(struct ckpt_ctx *ctx, struct sock *sock,
+			     uint32_t *bufsize)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	int ret = 0;
+	int i;
+	uint32_t total = 0;
+	uint8_t sock_shutdown;
+
+	/* If peer is shutdown, unshutdown it for this process */
+	sock_shutdown = sock->sk_shutdown;
+	sock->sk_shutdown &= ~SHUTDOWN_MASK;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+	if (IS_ERR(h)) {
+		ret = PTR_ERR(h);
+		goto out;
+	}
+
+	if (!bufsize) {
+		if (h->total_bytes != 0) {
+			ckpt_debug("Expected empty buffer, got %u\n",
+				   h->total_bytes);
+			ret = -EINVAL;
+		}
+		goto out;
+	} else if (h->total_bytes > *bufsize) {
+		if (capable(CAP_NET_ADMIN))
+			*bufsize = h->total_bytes;
+		else {
+			ckpt_debug("Buffer total %u exceeds limit %u\n",
+			   h->total_bytes, *bufsize);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < h->skb_count; i++) {
+		ret = sock_read_buffer(ctx, sock);
+		ckpt_debug("read buffer %i: %i\n", i, ret);
+		if (ret < 0)
+			break;
+
+		total += ret;
+		if (total > h->total_bytes) {
+			ckpt_debug("Buffers exceeded claimed %u", total);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (total == h->total_bytes)
+		ret = 0;
+	else {
+		ckpt_debug("Inconsistent total buffer size %u != %u\n",
+			   total, h->total_bytes);
+		ret = -EINVAL;
+	}
+ out:
+	sock->sk_shutdown = sock_shutdown;
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static struct unix_address *sock_unix_makeaddr(struct sockaddr_un *sun_addr,
+					       unsigned len)
+{
+	struct unix_address *addr;
+
+	if (len > sizeof(struct sockaddr_un))
+		return ERR_PTR(-EINVAL);
+
+	addr = kmalloc(sizeof(*addr) + len, GFP_KERNEL);
+	if (!addr)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(addr->name, sun_addr, len);
+	addr->len = len;
+	atomic_set(&addr->refcnt, 1);
+
+	return addr;
+}
+
+static int sock_unix_join(struct ckpt_ctx *ctx,
+			  struct sock *a,
+			  struct sock *b,
+			  struct ckpt_hdr_socket_unix *un)
+{
+	struct unix_address *addr = NULL;
+
+	sock_hold(a);
+	sock_hold(b);
+
+	unix_sk(a)->peer = b;
+	unix_sk(b)->peer = a;
+
+	a->sk_peercred.pid = task_tgid_vnr(current);
+	a->sk_peercred.uid = ctx->realcred->uid;
+	a->sk_peercred.gid = ctx->realcred->gid;
+
+	b->sk_peercred.pid = a->sk_peercred.pid;
+	b->sk_peercred.uid = a->sk_peercred.uid;
+	b->sk_peercred.gid = a->sk_peercred.gid;
+
+	if (!UNIX_ADDR_EMPTY(un->raddr_len))
+		addr = sock_unix_makeaddr(&un->raddr, un->raddr_len);
+	else if (!UNIX_ADDR_EMPTY(un->laddr_len))
+		addr = sock_unix_makeaddr(&un->laddr, un->laddr_len);
+
+	if (IS_ERR(addr))
+		return PTR_ERR(addr);
+	else if (addr) {
+		atomic_inc(&addr->refcnt); /* Held by both ends */
+		unix_sk(a)->addr = unix_sk(b)->addr = addr;
+	}
+
+	return 0;
+}
+
+static int sock_unix_restore_connected(struct ckpt_ctx *ctx,
+				       struct ckpt_hdr_socket *h,
+				       struct ckpt_hdr_socket_unix *un,
+				       struct socket *socket)
+{
+	struct sock *this = ckpt_obj_fetch(ctx, un->this, CKPT_OBJ_SOCK);
+	struct sock *peer = ckpt_obj_fetch(ctx, un->peer, CKPT_OBJ_SOCK);
+	struct socket *tmp = NULL;
+	int ret;
+
+	if (!IS_ERR(this) && !IS_ERR(peer)) {
+		/* We're last */
+		struct socket *old = this->sk_socket;
+
+		old->sk = NULL;
+		sock_release(old);
+		sock_graft(this, socket);
+
+	} else if ((PTR_ERR(this) == -EINVAL) && (PTR_ERR(peer) == -EINVAL)) {
+		/* We're first */
+		int family = socket->sk->sk_family;
+		int type = socket->sk->sk_type;
+
+		ret = sock_create(family, type, 0, &tmp);
+		ckpt_debug("sock_create: %i\n", ret);
+		if (ret)
+			goto out;
+
+		this = socket->sk;
+		peer = tmp->sk;
+
+		ret = ckpt_obj_insert(ctx, this, un->this, CKPT_OBJ_SOCK);
+		if (ret < 0)
+			goto out;
+
+		ret = ckpt_obj_insert(ctx, peer, un->peer, CKPT_OBJ_SOCK);
+		if (ret < 0)
+			goto out;
+
+		ret = sock_unix_join(ctx, this, peer, un);
+		ckpt_debug("sock_unix_join: %i\n", ret);
+		if (ret)
+			goto out;
+
+	} else {
+		ckpt_debug("Order Error\n");
+		ret = PTR_ERR(this);
+		goto out;
+	}
+
+	/* Prime the socket's buffer limit with the maximum */
+	peer->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	peer->sk_sndbuf = sysctl_wmem_max;
+
+	/* Read my buffers and sendmsg() then back to me via my peer */
+	ret = sock_read_buffers(ctx, peer, &peer->sk_sndbuf);
+	ckpt_debug("sock_read_buffers: %i\n", ret);
+	if (ret)
+		goto out;
+
+	/* Read peer's buffers and expect 0 */
+	ret = sock_read_buffers(ctx, peer, NULL);
+ out:
+	if (tmp && ret)
+		sock_release(tmp);
+
+	return ret;
+}
+
+static int sock_unix_unlink(const char *name)
+{
+	struct path spath;
+	struct path ppath;
+	int ret;
+
+	ret = kern_path(name, 0, &spath);
+	if (ret)
+		return ret;
+
+	ret = kern_path(name, LOOKUP_PARENT, &ppath);
+	if (ret)
+		goto out_s;
+
+	if (!spath.dentry) {
+		ckpt_debug("No dentry found for %s\n", name);
+		ret = -ENOENT;
+		goto out_p;
+	}
+
+	if (!ppath.dentry || !ppath.dentry->d_inode) {
+		ckpt_debug("No inode for parent of %s\n", name);
+		ret = -ENOENT;
+		goto out_p;
+	}
+
+	ret = vfs_unlink(ppath.dentry->d_inode, spath.dentry);
+ out_p:
+	path_put(&ppath);
+ out_s:
+	path_put(&spath);
+
+	return ret;
+}
+
+/* Call bind() for socket, optionally changing (temporarily) to @path first
+ * if non-NULL
+ */
+static int sock_unix_chdir_and_bind(struct socket *socket,
+				    const char *path,
+				    struct sockaddr *addr,
+				    unsigned long addrlen)
+{
+	struct sockaddr_un *un = (struct sockaddr_un *)addr;
+	int ret;
+	struct path cur;
+	struct path dir;
+
+	if (path) {
+		ckpt_debug("switching to cwd %s for unix bind", path);
+
+		ret = kern_path(path, 0, &dir);
+		if (ret)
+			return ret;
+
+		ret = inode_permission(dir.dentry->d_inode,
+				       MAY_EXEC | MAY_ACCESS);
+		if (ret)
+			goto out;
+
+		write_lock(&current->fs->lock);
+		cur = current->fs->pwd;
+		current->fs->pwd = dir;
+		write_unlock(&current->fs->lock);
+	}
+
+	ret = sock_unix_unlink(un->sun_path);
+	ckpt_debug("unlink(%s): %i\n", un->sun_path, ret);
+	if ((ret == 0) || (ret == -ENOENT))
+		ret = sock_bind(socket, addr, addrlen);
+
+	if (path) {
+		write_lock(&current->fs->lock);
+		current->fs->pwd = cur;
+		write_unlock(&current->fs->lock);
+	}
+ out:
+	if (path)
+		path_put(&dir);
+
+	return ret;
+}
+
+static int sock_unix_fakebind(struct socket *socket,
+			      struct sockaddr_un *addr,
+			      unsigned long len)
+{
+	struct unix_address *uaddr;
+
+	uaddr = sock_unix_makeaddr(addr, len);
+	if (IS_ERR(uaddr))
+		return PTR_ERR(uaddr);
+
+	unix_sk(socket->sk)->addr = uaddr;
+
+	return 0;
+}
+
+static int sock_unix_bind(struct ckpt_hdr_socket *h,
+			  struct ckpt_hdr_socket_unix *un,
+			  struct socket *socket,
+			  const char *path)
+{
+	struct sockaddr *addr = (struct sockaddr *)&un->laddr;
+	unsigned long len = un->laddr_len;
+
+	if (!un->laddr.sun_path[0])
+		return sock_bind(socket, addr, len);
+	else if (!(un->flags & CKPT_UNIX_LINKED))
+		return sock_unix_fakebind(socket, &un->laddr, len);
+	else
+		return sock_unix_chdir_and_bind(socket, path, addr, len);
+}
+
+static int sock_unix_restore(struct ckpt_ctx *ctx,
+			     struct ckpt_hdr_socket *h,
+			     struct socket *socket)
+{
+	struct ckpt_hdr_socket_unix *un;
+	int ret = -EINVAL;
+	char *cwd = NULL;
+	struct net *net = sock_net(socket->sk);
+
+	/* AF_UNIX overloads the backlog setting to define the maximum
+	 * queue length for DGRAM sockets.  Make sure we don't let the
+	 * caller exceed that value on restart.
+	 */
+	if ((h->sock.type == SOCK_DGRAM) &&
+	    (h->sock.backlog > net->unx.sysctl_max_dgram_qlen)) {
+		ckpt_debug("DGRAM backlog of %i exceeds system max of %i\n",
+			   h->sock.backlog, net->unx.sysctl_max_dgram_qlen);
+		return -EINVAL;
+	}
+
+	un = ckpt_read_obj_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UNIX);
+	if (IS_ERR(un))
+		return PTR_ERR(un);
+
+	if (un->peer < 0)
+		goto out;
+
+	if (sock_unix_need_cwd(&un->laddr, un->laddr_len)) {
+		ret = ckpt_read_string(ctx, &cwd, PATH_MAX);
+		ckpt_debug("read cwd(%i): %s\n", ret, cwd);
+		if (ret)
+			goto out;
+	}
+
+	if ((h->sock.state != TCP_ESTABLISHED) &&
+	    !UNIX_ADDR_EMPTY(un->laddr_len)) {
+		ret = sock_unix_bind(h, un, socket, cwd);
+		if (ret)
+			goto out;
+	}
+
+	if ((h->sock.state == TCP_ESTABLISHED) || (h->sock.state == TCP_CLOSE))
+		ret = sock_unix_restore_connected(ctx, h, un, socket);
+	else if (h->sock.state == TCP_LISTEN)
+		ret = socket->ops->listen(socket, h->sock.backlog);
+	else
+		ckpt_debug("unsupported UNIX socket state %i\n", h->sock.state);
+ out:
+	ckpt_hdr_put(ctx, un);
+	kfree(cwd);
+	return ret;
+}
+
+struct socket *do_sock_file_restore(struct ckpt_ctx *ctx,
+				    struct ckpt_hdr_socket *h)
+{
+	struct socket *socket;
+	int ret;
+
+	ret = sock_create(h->sock_common.family, h->sock.type, 0, &socket);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (h->sock_common.family == AF_UNIX) {
+		ret = sock_unix_restore(ctx, h, socket);
+		ckpt_debug("sock_unix_restore: %i\n", ret);
+	} else {
+		ckpt_debug("unsupported family %i\n", h->sock_common.family);
+		ret = -EINVAL;
+	}
+
+	if (ret)
+		goto out;
+
+	ret = sock_cptrst(ctx, socket->sk, h, CKPT_RST);
+ out:
+	if (ret) {
+		sock_release(socket);
+		socket = ERR_PTR(ret);
+	}
+
+	return socket;
+}
+
diff --git a/net/socket.c b/net/socket.c
index 017f6e6..ff556fc 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -96,6 +96,8 @@ 
 #include <net/sock.h>
 #include <linux/netfilter.h>
 
+#include <linux/checkpoint.h>
+
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos);
@@ -140,6 +142,9 @@  static const struct file_operations socket_file_ops = {
 	.sendpage =	sock_sendpage,
 	.splice_write = generic_splice_sendpage,
 	.splice_read =	sock_splice_read,
+#ifdef CONFIG_CHECKPOINT
+	.checkpoint =   sock_file_checkpoint,
+#endif
 };
 
 /*
@@ -415,6 +420,86 @@  int sock_map_fd(struct socket *sock, int flags)
 	return fd;
 }
 
+#ifdef CONFIG_CHECKPOINT
+int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_hdr_file_socket *h;
+	int ret;
+	struct file *file = ptr;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+
+	h->common.f_type = CKPT_FILE_SOCKET;
+
+	ret = checkpoint_file_common(ctx, file, &h->common);
+	if (ret < 0)
+		goto out;
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (ret < 0)
+		goto out;
+
+	ret = do_sock_file_checkpoint(ctx, file);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static struct file *sock_alloc_attach_fd(struct socket *socket)
+{
+	struct file *file;
+	int err;
+
+	file = get_empty_filp();
+	if (!file)
+		return ERR_PTR(ENOMEM);
+
+	err = sock_attach_fd(socket, file, 0);
+	if (err < 0) {
+		put_filp(file);
+		file = ERR_PTR(err);
+	}
+
+	return file;
+}
+
+void *sock_file_restore(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_socket *h = NULL;
+	struct socket *socket = NULL;
+	struct file *file = NULL;
+	int err;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+	if (IS_ERR(h))
+		return h;
+
+	socket = do_sock_file_restore(ctx, h);
+	if (IS_ERR(socket)) {
+		err = PTR_ERR(socket);
+		goto err_put;
+	}
+
+	file = sock_alloc_attach_fd(socket);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_release;
+	}
+
+	ckpt_hdr_put(ctx, h);
+
+	return file;
+
+ err_release:
+	sock_release(socket);
+ err_put:
+	ckpt_hdr_put(ctx, h);
+
+	return ERR_PTR(err);
+}
+#endif /* CONFIG_CHECKPOINT */
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
 	if (file->f_op == &socket_file_ops)