
SIW: iWARP Protocol headers

Message ID 1286261630-5085-1-git-send-email-bmt@zurich.ibm.com
State Not Applicable, archived
Delegated to: David Miller

Commit Message

Bernard Metzler Oct. 5, 2010, 6:53 a.m. UTC
---
 drivers/infiniband/hw/siw/iwarp.h |  324 +++++++++++++++++++++++++++++++++++++
 1 files changed, 324 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/iwarp.h

Comments

Steve Wise Oct. 5, 2010, 1:53 p.m. UTC | #1
On 10/05/2010 01:53 AM, Bernard Metzler wrote:
> ---
>   drivers/infiniband/hw/siw/iwarp.h |  324 +++++++++++++++++++++++++++++++++++++
>   1 files changed, 324 insertions(+), 0 deletions(-)
>   create mode 100644 drivers/infiniband/hw/siw/iwarp.h
>
> diff --git a/drivers/infiniband/hw/siw/iwarp.h b/drivers/infiniband/hw/siw/iwarp.h
> new file mode 100644
> index 0000000..762c1d3
> --- /dev/null
> +++ b/drivers/infiniband/hw/siw/iwarp.h
> @@ -0,0 +1,324 @@
> +/*
> + * Software iWARP device driver for Linux
> + *
> + * Authors: Bernard Metzler <bmt@zurich.ibm.com>
> + *          Fredy Neeser <nfd@zurich.ibm.com>
> + *
> + * Copyright (c) 2008-2010, IBM Corporation
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * BSD license below:
> + *
> + *   Redistribution and use in source and binary forms, with or
> + *   without modification, are permitted provided that the following
> + *   conditions are met:
> + *
> + *   - Redistributions of source code must retain the above copyright notice,
> + *     this list of conditions and the following disclaimer.
> + *
> + *   - Redistributions in binary form must reproduce the above copyright
> + *     notice, this list of conditions and the following disclaimer in the
> + *     documentation and/or other materials provided with the distribution.
> + *
> + *   - Neither the name of IBM nor the names of its contributors may be
> + *     used to endorse or promote products derived from this software without
> + *     specific prior written permission.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef _IWARP_H
> +#define _IWARP_H
> +
> +#include <rdma/rdma_user_cm.h>	/* RDMA_MAX_PRIVATE_DATA */
> +#include <linux/types.h>
> +#include <asm/byteorder.h>
> +
> +
> +#define RDMAP_VERSION		1
> +#define DDP_VERSION		1
> +#define MPA_REVISION_1		1
> +#define MPA_MAX_PRIVDATA	RDMA_MAX_PRIVATE_DATA
> +#define MPA_KEY_REQ		"MPA ID Req Frame"
> +#define MPA_KEY_REP		"MPA ID Rep Frame"
> +
> +struct mpa_rr_params {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +	__u16	res:5,
> +		r:1,
> +		c:1,
> +		m:1,
> +		rev:8;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +	__u16	m:1,
> +		c:1,
> +		r:1,
> +		res:5,
> +		rev:8;
> +#else
> +#error "Adjust your <asm/byteorder.h> defines"
> +#endif
> +	__u16	pd_len;
> +};
> +
> +/*
> + * MPA request/reply header
> + */
> +struct mpa_rr {
> +	__u8	key[16];
> +	struct mpa_rr_params params;
> +};
> +
> +/*
> + * Don't change the layout/size of this struct!
> + */
> +struct mpa_marker {
> +	__u16	rsvd;
> +	__u16	fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
> +};
> +
> +#define MPA_MARKER_SPACING	512
> +#define MPA_HDR_SIZE		2
> +
> +/*
> + * MPA marker size:
> + * - Standards-compliant marker insertion: Use sizeof(struct mpa_marker)
> + * - "Invisible markers" for testing sender's marker insertion
> + *   without affecting receiver: Use 0
> + */
> +#define MPA_MARKER_SIZE		sizeof(struct mpa_marker)
> +
> +
> +/*
> + * maximum MPA trailer
> + */
> +struct mpa_trailer {
> +	char	pad[4];
> +	__u32	crc;
> +};
> +
> +#define MPA_CRC_SIZE	4
> +
> +
> +/*
> + * Common portion of iWARP headers (MPA, DDP, RDMAP)
> + * for any FPDU
> + */
> +struct iwarp_ctrl {
> +	__u16	mpa_len;
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +	__u16	dv:2,		/* DDP Version */
> +		rsvd:4,		/* DDP reserved, MBZ */
> +		l:1,		/* DDP Last flag */
> +		t:1,		/* DDP Tagged flag */
> +		opcode:4,	/* RDMAP opcode */
> +		rsv:2,		/* RDMAP reserved, MBZ */
> +		rv:2;		/* RDMAP Version, 01 for IETF, 00 for RDMAC */
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +	__u16	t:1,		/* DDP Tagged flag */
> +		l:1,		/* DDP Last flag */
> +		rsvd:4,		/* DDP reserved, MBZ */
> +		dv:2,		/* DDP Version */
> +		rv:2,		/* RDMAP Version, 01 for IETF, 00 for RDMAC */
> +		rsv:2,		/* RDMAP reserved, MBZ */
> +		opcode:4;	/* RDMAP opcode */
> +#else
> +#error "Adjust your <asm/byteorder.h> defines"
> +#endif
> +};
> +
> +
> +struct rdmap_terminate_ctrl {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +	__u32	etype:4,
> +		layer:4,
> +		ecode:8,
> +		rsvd1:5,
> +		r:1,
> +		d:1,
> +		m:1,
> +		rsvd2:8;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +	__u32	layer:4,
> +		etype:4,
> +		ecode:8,
> +		m:1,
> +		d:1,
> +		r:1,
> +		rsvd1:5,
> +		rsvd2:8;
> +#else
> +#error "Adjust your <asm/byteorder.h> defines"
> +#endif
> +};
> +
> +
> +struct iwarp_rdma_write {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			sink_stag;
> +	__u64			sink_to;
> +} __attribute__((__packed__));
> +
> +struct iwarp_rdma_rreq {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			rsvd;
> +	__u32			ddp_qn;
> +	__u32			ddp_msn;
> +	__u32			ddp_mo;
> +	__u32			sink_stag;
> +	__u64			sink_to;
> +	__u32			read_size;
> +	__u32			source_stag;
> +	__u64			source_to;
> +} __attribute__((__packed__));
> +
> +struct iwarp_rdma_rresp {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			sink_stag;
> +	__u64			sink_to;
> +} __attribute__((__packed__));
> +
> +struct iwarp_send {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			rsvd;
> +	__u32			ddp_qn;
> +	__u32			ddp_msn;
> +	__u32			ddp_mo;
> +} __attribute__((__packed__));
> +
> +struct iwarp_send_inv {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			inval_stag;
> +	__u32			ddp_qn;
> +	__u32			ddp_msn;
> +	__u32			ddp_mo;
> +} __attribute__((__packed__));
> +
> +struct iwarp_terminate {
> +	struct iwarp_ctrl	ctrl;
> +	__u32				rsvd;
> +	__u32				ddp_qn;
> +	__u32				ddp_msn;
> +	__u32				ddp_mo;
> +	struct rdmap_terminate_ctrl	term_ctrl;
> +} __attribute__((__packed__));
> +
> +
> +/*
> + * Common portion of iWARP headers (MPA, DDP, RDMAP)
> + * for an FPDU carrying an untagged DDP segment
> + */
> +struct iwarp_ctrl_untagged {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			rsvd;
> +	__u32			ddp_qn;
> +	__u32			ddp_msn;
> +	__u32			ddp_mo;
> +} __attribute__((__packed__));
> +
> +/*
> + * Common portion of iWARP headers (MPA, DDP, RDMAP)
> + * for an FPDU carrying a tagged DDP segment
> + */
> +struct iwarp_ctrl_tagged {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			ddp_stag;
> +	__u64			ddp_to;
> +} __attribute__((__packed__));
> +
>    

All of the above header structures should use __beXX types since the 
fields are all in Network Byte Order.

Also, did you run sparse on the patches (Documentation/sparse.txt)?
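
For illustration, switching one of these headers over to sparse-checkable
endian types might look like the sketch below (not part of the posted patch);
sparse's endian checks are enabled with something like
"make C=1 CF=-D__CHECK_ENDIAN__".

/* Sketch only: same layout as above, but the wire fields are annotated
 * as __beXX so sparse warns about any access that skips the
 * be32_to_cpu()/cpu_to_be32() (or 64-bit) conversions.
 */
struct iwarp_rdma_rresp {
	struct iwarp_ctrl	ctrl;		/* endian-aware bitfields */
	__be32			sink_stag;	/* data sink STag, NBO */
	__be64			sink_to;	/* data sink tagged offset, NBO */
} __attribute__((__packed__));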


> +union iwarp_hdrs {
> +	struct iwarp_ctrl		ctrl;
> +	struct iwarp_ctrl_untagged	c_untagged;
> +	struct iwarp_ctrl_tagged	c_tagged;
> +	struct iwarp_rdma_write		rwrite;
> +	struct iwarp_rdma_rreq		rreq;
> +	struct iwarp_rdma_rresp		rresp;
> +	struct iwarp_terminate		terminate;
> +	struct iwarp_send		send;
> +	struct iwarp_send_inv		send_inv;
> +};
> +
> +
> +#define MPA_MIN_FRAG ((sizeof(union iwarp_hdrs) + MPA_CRC_SIZE))
> +
> +enum ddp_etype {
> +	DDP_ETYPE_CATASTROPHIC	= 0x0,
> +	DDP_ETYPE_TAGGED_BUF	= 0x1,
> +	DDP_ETYPE_UNTAGGED_BUF	= 0x2,
> +	DDP_ETYPE_RSVD		= 0x3
> +};
> +
> +enum ddp_ecode {
> +	DDP_ECODE_CATASTROPHIC		= 0x00,
> +	/* Tagged Buffer Errors */
> +	DDP_ECODE_T_INVALID_STAG	= 0x00,
> +	DDP_ECODE_T_BASE_BOUNDS		= 0x01,
> +	DDP_ECODE_T_STAG_NOT_ASSOC	= 0x02,
> +	DDP_ECODE_T_TO_WRAP		= 0x03,
> +	DDP_ECODE_T_DDP_VERSION		= 0x04,
> +	/* Untagged Buffer Errors */
> +	DDP_ECODE_UT_INVALID_QN		= 0x01,
> +	DDP_ECODE_UT_INVALID_MSN_NOBUF	= 0x02,
> +	DDP_ECODE_UT_INVALID_MSN_RANGE	= 0x03,
> +	DDP_ECODE_UT_INVALID_MO		= 0x04,
> +	DDP_ECODE_UT_MSG_TOOLONG	= 0x05,
> +	DDP_ECODE_UT_DDP_VERSION	= 0x06
> +};
> +
> +
> +enum rdmap_untagged_qn {
> +	RDMAP_UNTAGGED_QN_SEND		= 0,
> +	RDMAP_UNTAGGED_QN_RDMA_READ	= 1,
> +	RDMAP_UNTAGGED_QN_TERMINATE	= 2,
> +	RDMAP_UNTAGGED_QN_COUNT		= 3
> +};
> +
> +enum rdmap_etype {
> +	RDMAP_ETYPE_CATASTROPHIC	= 0x0,
> +	RDMAP_ETYPE_REMOTE_PROTECTION	= 0x1,
> +	RDMAP_ETYPE_REMOTE_OPERATION	= 0x2
> +};
> +
> +enum rdmap_ecode {
> +	RDMAP_ECODE_INVALID_STAG	= 0x00,
> +	RDMAP_ECODE_BASE_BOUNDS		= 0x01,
> +	RDMAP_ECODE_ACCESS_RIGHTS	= 0x02,
> +	RDMAP_ECODE_STAG_NOT_ASSOC	= 0x03,
> +	RDMAP_ECODE_TO_WRAP		= 0x04,
> +	RDMAP_ECODE_RDMAP_VERSION	= 0x05,
> +	RDMAP_ECODE_UNEXPECTED_OPCODE	= 0x06,
> +	RDMAP_ECODE_CATASTROPHIC_STREAM	= 0x07,
> +	RDMAP_ECODE_CATASTROPHIC_GLOBAL	= 0x08,
> +	RDMAP_ECODE_STAG_NOT_INVALIDATE	= 0x09,
> +	RDMAP_ECODE_UNSPECIFIED		= 0xff
> +};
> +
> +enum rdmap_elayer {
> +	RDMAP_ERROR_LAYER_RDMA	= 0x00,
> +	RDMAP_ERROR_LAYER_DDP	= 0x01,
> +	RDMAP_ERROR_LAYER_LLP	= 0x02	/* eg., MPA */
> +};
> +
> +enum rdma_opcode {
> +	RDMAP_RDMA_WRITE	= 0x0,
> +	RDMAP_RDMA_READ_REQ	= 0x1,
> +	RDMAP_RDMA_READ_RESP	= 0x2,
> +	RDMAP_SEND		= 0x3,
> +	RDMAP_SEND_INVAL	= 0x4,
> +	RDMAP_SEND_SE		= 0x5,
> +	RDMAP_SEND_SE_INVAL	= 0x6,
> +	RDMAP_TERMINATE		= 0x7,
> +	RDMAP_NOT_SUPPORTED	= RDMAP_TERMINATE + 1
> +};
> +
> +#endif
>    

Bernard Metzler Oct. 5, 2010, 4:06 p.m. UTC | #2
<snip>
> > + */
> > +struct iwarp_ctrl_tagged {
> > +   struct iwarp_ctrl   ctrl;
> > +   __u32         ddp_stag;
> > +   __u64         ddp_to;
> > +} __attribute__((__packed__));
> > +
> > 
> 
> All of the above header structures should use __beXX types since the 
> fields are all in Network Byte Order.
> 
right. will get fixed.

> Also, did you run sparse on the patches (Documentation/sparse.txt)?
> 
> 
frankly, i was not aware. sorry. will do.

bernard.
Bart Van Assche Oct. 5, 2010, 4:10 p.m. UTC | #3
On Tue, Oct 5, 2010 at 8:53 AM, Bernard Metzler <bmt@zurich.ibm.com> wrote:
> +} __attribute__((__packed__));

Apparently you make extensive use of the packed attribute. Please read
this blog entry, in which it is explained why this is harmful:

http://digitalvampire.org/blog/index.php/2006/07/31/why-you-shouldnt-use-__attribute__packed/

Bart.
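
To illustrate the overhead described in that post (structure and names below
are made up for the example, not taken from the patch):

/* With packed, the compiler may no longer assume that the struct, or
 * any member, is aligned.  On strict-alignment CPUs the load of stag
 * below is therefore compiled into byte-wise accesses, and taking
 * &hdr->stag hands out a plain __u32 pointer that silently loses even
 * that protection.
 */
struct example_hdr {
	__u8	flags;
	__u32	stag;			/* offset 1: genuinely misaligned */
} __attribute__((__packed__));

static inline __u32 example_read_stag(const struct example_hdr *hdr)
{
	return hdr->stag;		/* correct everywhere, but slow */
}
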
Bernard Metzler Oct. 6, 2010, 12:52 p.m. UTC | #4
linux-rdma-owner@vger.kernel.org wrote on 10/05/2010 06:10:45 PM:

> On Tue, Oct 5, 2010 at 8:53 AM, Bernard Metzler <bmt@zurich.ibm.com> wrote:
> > +} __attribute__((__packed__));
>
> Apparently you make extensive use of the packed attribute. Please read
> this blog entry, in which it is explained why this is harmful:
>
> http://digitalvampire.org/blog/index.php/2006/07/31/why-you-shouldnt-use-__attribute__packed/
>
right, thanks, i was not aware of that big overhead.
but... there are reasons why we may need it _packed_:
(1) to get the size of the packet hdr
(2) to read/write the hdr content

(1) is not harmful. (2) could be limited to just one read/write
operation per rx/tx path, and all other access could be done on a
structure which contains the host representation of the hdr content.
both structures could coexist in a union, part of the rx or tx
context. would that satisfy your complaint?

many thanks,
bernard.
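
A rough sketch of that scheme, with the two representations kept side by side
in the rx context (all names below are hypothetical, and it assumes the wire
structs have been converted to __beXX as suggested earlier in the thread):

/* Convert the packed wire image into host byte order exactly once per
 * received FPDU; all further processing uses the host-order copy.
 */
struct siw_rresp_host {
	u32	sink_stag;
	u64	sink_to;
};

struct siw_rx_ctx {
	struct iwarp_rdma_rresp	wire;	/* packed, network byte order */
	struct siw_rresp_host	host;	/* filled once on reception */
};

static void siw_decode_rresp(struct siw_rx_ctx *rx)
{
	rx->host.sink_stag = be32_to_cpu(rx->wire.sink_stag);
	rx->host.sink_to   = be64_to_cpu(rx->wire.sink_to);
}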



Bart Van Assche Oct. 6, 2010, 4:02 p.m. UTC | #5
On Wed, Oct 6, 2010 at 2:52 PM, Bernard Metzler <BMT@zurich.ibm.com> wrote:
>
>
> linux-rdma-owner@vger.kernel.org wrote on 10/05/2010 06:10:45 PM:
>
> > On Tue, Oct 5, 2010 at 8:53 AM, Bernard Metzler <bmt@zurich.ibm.com> wrote:
> > > +} __attribute__((__packed__));
> >
> > Apparently you make extensive use of the packed attribute. Please read
> > this blog entry, in which it is explained why this is harmful:
> >
> > http://digitalvampire.org/blog/index.php/2006/07/31/why-you-shouldnt-use-__attribute__packed/
> >
> right, thanks, i was not aware of that
> big overhead.
> but...there are reasons why we may need it _packed_:
> (1) to get the size of the packet hdr
> (2) to read/write the hdr content
>
> (1) is not harmful. (2) could be limited to
> just one read/write operation per rx/tx path
> and all other access could be done on a
> structure which contains the host representation
> of the hdr content. both structures could coexist
> in a union, part of the rx or tx context.
> would that satisfy your complain?

Please have a look at e.g. the header file <scsi/srp.h>. As you can
see in that header file, __attribute__((__packed__)) is only specified
when absolutely necessary.

Bart.
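
A made-up illustration of that style, where the attribute only appears when
the wire layout would otherwise pick up compiler padding:

/* Hypothetical examples, names invented for this sketch.  The first
 * struct is naturally padding-free, so no attribute is needed; in the
 * second, the __be64 sits at wire offset 4 and would otherwise be
 * padded out to offset 8.
 */
struct wire_naturally_aligned {
	__be32	tag;
	__be32	len;
	__be64	addr;			/* offset 8, naturally aligned */
};

struct wire_needs_packed {
	__be32	opcode;
	__be64	addr;			/* must stay at wire offset 4 */
} __attribute__((__packed__));
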
Jason Gunthorpe Oct. 6, 2010, 5:25 p.m. UTC | #6
On Wed, Oct 06, 2010 at 02:52:46PM +0200, Bernard Metzler wrote:
> 
> 
> linux-rdma-owner@vger.kernel.org wrote on 10/05/2010 06:10:45 PM:
> 
> > On Tue, Oct 5, 2010 at 8:53 AM, Bernard Metzler <bmt@zurich.ibm.com> wrote:
> > > +} __attribute__((__packed__));
> >
> > Apparently you make extensive use of the packed attribute. Please read
> > this blog entry, in which it is explained why this is harmful:
> >
> > http://digitalvampire.org/blog/index.php/2006/07/31/why-you-shouldnt-use-__attribute__packed/
> >
> right, thanks, i was not aware of that
> big overhead.
> but...there are reasons why we may need it _packed_:
> (1) to get the size of the packet hdr
> (2) to read/write the hdr content

It is actually a little more complicated than just this. I assume you
are casting the structures over packet payloads? In this case you
have to guarantee alignment (or use packed everywhere). Does iWARP
have provisions for alignment? If so you can construct your bitfields
using the alignment type, i.e. if iWARP guarantees 4 bytes then the
biggest type you can use is u32 - then you can avoid using packed.

Mind you, I'm not sure how to guarantee alignment when you consider
all the possible sources of unalignment in the stack: TCP, IP, L2 stack?

Otherwise you need a strategy for dealing with unaligned data, for
portability. :(

Jason
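
One standard kernel strategy for that, shown here only as a sketch with a
made-up helper name, is to go through the <asm/unaligned.h> accessors instead
of dereferencing the header fields directly:

#include <asm/unaligned.h>

/* Sketch: pull the RDMA Read Response fields out of a header whose
 * address in the receive buffer carries no alignment guarantee.
 */
static void siw_parse_rresp(const struct iwarp_rdma_rresp *hdr,
			    u32 *sink_stag, u64 *sink_to)
{
	*sink_stag = get_unaligned_be32(&hdr->sink_stag);
	*sink_to   = get_unaligned_be64(&hdr->sink_to);
}
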
David Dillow Oct. 6, 2010, 6:22 p.m. UTC | #7
On Wed, 2010-10-06 at 11:25 -0600, Jason Gunthorpe wrote:
> On Wed, Oct 06, 2010 at 02:52:46PM +0200, Bernard Metzler wrote:
> It is actually a little more complicated than just this. I assume you
> are casting the structures over packet payloads? In this case you
> have to guarentee alignment (or used packed everywhere). Does iwarp
> have provisions for alignment? If so you can construct your bitfields
> using the alignment type, ie if iWarp guarantees 4 bytes then the
> biggest type you can use is u32 - then you can avoid using packed.
> 
> Mind you, I'm not sure how to guarentee alignment when you consider
> all the possible sources of unalignment in the stack: TCP, IP, L2 stack ?

You don't have to. The TCP stack, IIRC, requires the architecture to fix
up misaligned accesses, or at least it used to. It will try to keep
things aligned if possible -- see the comment above NET_IP_ALIGN in
include/linux/skbuff.h.

The structures in Bernard's header have all fields naturally aligned, so
no need for the packed attribute and its baggage.
Jason Gunthorpe Oct. 6, 2010, 6:37 p.m. UTC | #8
On Wed, Oct 06, 2010 at 02:22:55PM -0400, David Dillow wrote:
> On Wed, 2010-10-06 at 11:25 -0600, Jason Gunthorpe wrote:
> > On Wed, Oct 06, 2010 at 02:52:46PM +0200, Bernard Metzler wrote:
> > It is actually a little more complicated than just this. I assume you
> > are casting the structures over packet payloads? In this case you
> > have to guarentee alignment (or used packed everywhere). Does iwarp
> > have provisions for alignment? If so you can construct your bitfields
> > using the alignment type, ie if iWarp guarantees 4 bytes then the
> > biggest type you can use is u32 - then you can avoid using packed.
> > 
> > Mind you, I'm not sure how to guarentee alignment when you consider
> > all the possible sources of unalignment in the stack: TCP, IP, L2 stack ?
> 
> You don't have to. The TCP stack, IIRC, requires the architecture to fix
> up misaligned accesses, or at least it used to. It will try to keep
> things aligned if possible -- see the comment above NET_IP_ALIGN in
> include/linux/skbuff.h.

I was under the impression that this was just to align the IP
header. There is quite a variety of options and additional stuff that
can be inserted at each layer: VLAN tags for Ethernet, IP
option/extension headers, TCP option headers, etc. I seem to remember
that because of all this there are cases where processing unaligned
payloads is unavoidable? Hence my question whether iWARP has
restrictions (particularly on TCP) that might guarantee alignment.

> The structures in Bernard's header have all fields naturally
> aligned, so no need for the packed attribute and its baggage.

No, they aren't:

> +struct iwarp_rdma_write {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			sink_stag;
> +	__u64			sink_to;
> +} __attribute__((__packed__));

For instance, sink_to is unaligned if the maximum alignment of the
payload is only guaranteed to be 4. The proper thing to do in this
case is probably __attribute__((__packed__, __aligned__(4))), if that
works the way I think. :)

Jason
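
Spelled out, that suggestion would look roughly like this (untested sketch):

/* packed drops any padding; aligned(4) still lets the compiler assume
 * 4-byte alignment of the whole header, so the 32-bit fields can be
 * accessed with normal word loads and only the __u64 needs care.
 */
struct iwarp_rdma_write {
	struct iwarp_ctrl	ctrl;
	__u32			sink_stag;
	__u64			sink_to;
} __attribute__((__packed__, __aligned__(4)));
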
David Dillow Oct. 6, 2010, 7:31 p.m. UTC | #9
On Wed, 2010-10-06 at 12:37 -0600, Jason Gunthorpe wrote:
> On Wed, Oct 06, 2010 at 02:22:55PM -0400, David Dillow wrote:
> > On Wed, 2010-10-06 at 11:25 -0600, Jason Gunthorpe wrote:
> > > On Wed, Oct 06, 2010 at 02:52:46PM +0200, Bernard Metzler wrote:
> > > It is actually a little more complicated than just this. I assume you
> > > are casting the structures over packet payloads? In this case you
> > > have to guarentee alignment (or used packed everywhere). Does iwarp
> > > have provisions for alignment? If so you can construct your bitfields
> > > using the alignment type, ie if iWarp guarantees 4 bytes then the
> > > biggest type you can use is u32 - then you can avoid using packed.
> > > 
> > > Mind you, I'm not sure how to guarentee alignment when you consider
> > > all the possible sources of unalignment in the stack: TCP, IP, L2 stack ?
> > 
> > You don't have to. The TCP stack, IIRC, requires the architecture to fix
> > up misaligned accesses, or at least it used to. It will try to keep
> > things aligned if possible -- see the comment above NET_IP_ALIGN in
> > include/linux/skbuff.h.
> 
> I was under the impression that this was just to align the IP
> header.

It is, and it assumes an Ethernet header of 14 bytes. As you note, VLANs
and everything else can mess with that, which is why the TCP stack
requires the architecture to handle unaligned accesses. It's a bonus if
iWARP can guarantee alignment WRT the IP or TCP header, but it is not
required.

> > The structures in Bernard's header have all fields naturally
> > aligned, so no need for the packed attribute and its baggage.
> 
> No, they arent:
> 
> > +struct iwarp_rdma_write {
> > +	struct iwarp_ctrl	ctrl;
> > +	__u32			sink_stag;
> > +	__u64			sink_to;
> > +} __attribute__((__packed__));
> 
> For instance has sink_to unaligned if the maximum aligment of the
> payload is only guarenteed to be 4. The proper thing to do in this
> case is probably __attribute__((__packed__, __aligned__(4))) if that
> works the way I think.. :)

struct iwarp_ctrl consists of two u16s, so sink_stag sits at a 4-byte
offset from the start of struct iwarp_rdma_write, and sink_to at an
8-byte offset -- they are naturally aligned with respect to the
structure. The compiler will not add padding, so there is no need for
the packed attribute.

Note I'm not talking about alignment in memory -- that'll depend on
where the data starts, and as I mentioned, the stack doesn't make any
guarantees about that, relying on the architecture to fixup any
misaligned accesses.
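
If the packed attribute is dropped on that basis, the intended layout can
still be pinned down at compile time, along these lines (sketch only, helper
name invented here):

#include <linux/kernel.h>	/* BUILD_BUG_ON(), offsetof() */

/* Refuses to compile if the unpacked header ever picks up padding. */
static inline void siw_check_hdr_layout(void)
{
	BUILD_BUG_ON(offsetof(struct iwarp_rdma_write, sink_stag) != 4);
	BUILD_BUG_ON(offsetof(struct iwarp_rdma_write, sink_to) != 8);
	BUILD_BUG_ON(sizeof(struct iwarp_rdma_write) != 16);
}
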
Bernard Metzler Oct. 7, 2010, 2:55 p.m. UTC | #10
David Dillow <dave@thedillows.org> wrote on 10/06/2010 09:31:44 PM:

> On Wed, 2010-10-06 at 12:37 -0600, Jason Gunthorpe wrote:
> > On Wed, Oct 06, 2010 at 02:22:55PM -0400, David Dillow wrote:
> > > On Wed, 2010-10-06 at 11:25 -0600, Jason Gunthorpe wrote:
> > > > On Wed, Oct 06, 2010 at 02:52:46PM +0200, Bernard Metzler wrote:
> > > > It is actually a little more complicated than just this. I assume you
> > > > are casting the structures over packet payloads? In this case you
> > > > have to guarentee alignment (or used packed everywhere). Does iwarp
> > > > have provisions for alignment? If so you can construct your bitfields
> > > > using the alignment type, ie if iWarp guarantees 4 bytes then the
> > > > biggest type you can use is u32 - then you can avoid using packed.
> > > >
> > > > Mind you, I'm not sure how to guarentee alignment when you consider
> > > > all the possible sources of unalignment in the stack: TCP, IP, L2 stack ?
> > >
> > > You don't have to. The TCP stack, IIRC, requires the architecture to fix
> > > up misaligned accesses, or at least it used to. It will try to keep
> > > things aligned if possible -- see the comment above NET_IP_ALIGN in
> > > include/linux/skbuff.h.
> >
> > I was under the impression that this was just to align the IP
> > header.
>
> It is, and it assumes an ethernet header of 14 bytes. As you note, VLANs
> and everything else can mess with that, which is why the TCP stack
> requires the architecture to handle unaligned accesses. It's a bonus if
> iWarp can guarantee an alignment WRT the IP or TCP header, but not
> required.
>
> > > The structures in Bernard's header have all fields naturally
> > > aligned, so no need for the packed attribute and its baggage.
> >
> > No, they arent:
> >
> > > +struct iwarp_rdma_write {
> > > +   struct iwarp_ctrl   ctrl;
> > > +   __u32         sink_stag;
> > > +   __u64         sink_to;
> > > +} __attribute__((__packed__));
> >
> > For instance has sink_to unaligned if the maximum aligment of the
> > payload is only guarenteed to be 4. The proper thing to do in this
> > case is probably __attribute__((__packed__, __aligned__(4))) if that
> > works the way I think.. :)
>
> struct iwarp_ctrl consists of 2 u16s, so sink_stag is aligned to 4 bytes
> from the start of the struct iwarp_rdma_write, and sink_to is at 8 bytes
> -- so they are naturally aligned with respect to the structure. The
> compiler will not add padding, so no need for the packed attribute.
>
> Note I'm not talking about alignment in memory -- that'll depend on
> where the data starts, and as I mentioned, the stack doesn't make any
> guarantees about that, relying on the architecture to fixup any
> misaligned accesses.

agreed. all n-byte fields start at a multiple-of-n byte offset
relative to the start of the structure.

many thanks,
bernard.


Patch

diff --git a/drivers/infiniband/hw/siw/iwarp.h b/drivers/infiniband/hw/siw/iwarp.h
new file mode 100644
index 0000000..762c1d3
--- /dev/null
+++ b/drivers/infiniband/hw/siw/iwarp.h
@@ -0,0 +1,324 @@ 
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *          Fredy Neeser <nfd@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IWARP_H
+#define _IWARP_H
+
+#include <rdma/rdma_user_cm.h>	/* RDMA_MAX_PRIVATE_DATA */
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+
+#define RDMAP_VERSION		1
+#define DDP_VERSION		1
+#define MPA_REVISION_1		1
+#define MPA_MAX_PRIVDATA	RDMA_MAX_PRIVATE_DATA
+#define MPA_KEY_REQ		"MPA ID Req Frame"
+#define MPA_KEY_REP		"MPA ID Rep Frame"
+
+struct mpa_rr_params {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u16	res:5,
+		r:1,
+		c:1,
+		m:1,
+		rev:8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u16	m:1,
+		c:1,
+		r:1,
+		res:5,
+		rev:8;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+	__u16	pd_len;
+};
+
+/*
+ * MPA request/reply header
+ */
+struct mpa_rr {
+	__u8	key[16];
+	struct mpa_rr_params params;
+};
+
+/*
+ * Don't change the layout/size of this struct!
+ */
+struct mpa_marker {
+	__u16	rsvd;
+	__u16	fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
+};
+
+#define MPA_MARKER_SPACING	512
+#define MPA_HDR_SIZE		2
+
+/*
+ * MPA marker size:
+ * - Standards-compliant marker insertion: Use sizeof(struct mpa_marker)
+ * - "Invisible markers" for testing sender's marker insertion
+ *   without affecting receiver: Use 0
+ */
+#define MPA_MARKER_SIZE		sizeof(struct mpa_marker)
+
+
+/*
+ * maximum MPA trailer
+ */
+struct mpa_trailer {
+	char	pad[4];
+	__u32	crc;
+};
+
+#define MPA_CRC_SIZE	4
+
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for any FPDU
+ */
+struct iwarp_ctrl {
+	__u16	mpa_len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u16	dv:2,		/* DDP Version */
+		rsvd:4,		/* DDP reserved, MBZ */
+		l:1,		/* DDP Last flag */
+		t:1,		/* DDP Tagged flag */
+		opcode:4,	/* RDMAP opcode */
+		rsv:2,		/* RDMAP reserved, MBZ */
+		rv:2;		/* RDMAP Version, 01 for IETF, 00 for RDMAC */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u16	t:1,		/* DDP Tagged flag */
+		l:1,		/* DDP Last flag */
+		rsvd:4,		/* DDP reserved, MBZ */
+		dv:2,		/* DDP Version */
+		rv:2,		/* RDMAP Version, 01 for IETF, 00 for RDMAC */
+		rsv:2,		/* RDMAP reserved, MBZ */
+		opcode:4;	/* RDMAP opcode */
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+};
+
+
+struct rdmap_terminate_ctrl {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u32	etype:4,
+		layer:4,
+		ecode:8,
+		rsvd1:5,
+		r:1,
+		d:1,
+		m:1,
+		rsvd2:8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u32	layer:4,
+		etype:4,
+		ecode:8,
+		m:1,
+		d:1,
+		r:1,
+		rsvd1:5,
+		rsvd2:8;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+};
+
+
+struct iwarp_rdma_write {
+	struct iwarp_ctrl	ctrl;
+	__u32			sink_stag;
+	__u64			sink_to;
+} __attribute__((__packed__));
+
+struct iwarp_rdma_rreq {
+	struct iwarp_ctrl	ctrl;
+	__u32			rsvd;
+	__u32			ddp_qn;
+	__u32			ddp_msn;
+	__u32			ddp_mo;
+	__u32			sink_stag;
+	__u64			sink_to;
+	__u32			read_size;
+	__u32			source_stag;
+	__u64			source_to;
+} __attribute__((__packed__));
+
+struct iwarp_rdma_rresp {
+	struct iwarp_ctrl	ctrl;
+	__u32			sink_stag;
+	__u64			sink_to;
+} __attribute__((__packed__));
+
+struct iwarp_send {
+	struct iwarp_ctrl	ctrl;
+	__u32			rsvd;
+	__u32			ddp_qn;
+	__u32			ddp_msn;
+	__u32			ddp_mo;
+} __attribute__((__packed__));
+
+struct iwarp_send_inv {
+	struct iwarp_ctrl	ctrl;
+	__u32			inval_stag;
+	__u32			ddp_qn;
+	__u32			ddp_msn;
+	__u32			ddp_mo;
+} __attribute__((__packed__));
+
+struct iwarp_terminate {
+	struct iwarp_ctrl	ctrl;
+	__u32				rsvd;
+	__u32				ddp_qn;
+	__u32				ddp_msn;
+	__u32				ddp_mo;
+	struct rdmap_terminate_ctrl	term_ctrl;
+} __attribute__((__packed__));
+
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying an untagged DDP segment
+ */
+struct iwarp_ctrl_untagged {
+	struct iwarp_ctrl	ctrl;
+	__u32			rsvd;
+	__u32			ddp_qn;
+	__u32			ddp_msn;
+	__u32			ddp_mo;
+} __attribute__((__packed__));
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying a tagged DDP segment
+ */
+struct iwarp_ctrl_tagged {
+	struct iwarp_ctrl	ctrl;
+	__u32			ddp_stag;
+	__u64			ddp_to;
+} __attribute__((__packed__));
+
+union iwarp_hdrs {
+	struct iwarp_ctrl		ctrl;
+	struct iwarp_ctrl_untagged	c_untagged;
+	struct iwarp_ctrl_tagged	c_tagged;
+	struct iwarp_rdma_write		rwrite;
+	struct iwarp_rdma_rreq		rreq;
+	struct iwarp_rdma_rresp		rresp;
+	struct iwarp_terminate		terminate;
+	struct iwarp_send		send;
+	struct iwarp_send_inv		send_inv;
+};
+
+
+#define MPA_MIN_FRAG ((sizeof(union iwarp_hdrs) + MPA_CRC_SIZE))
+
+enum ddp_etype {
+	DDP_ETYPE_CATASTROPHIC	= 0x0,
+	DDP_ETYPE_TAGGED_BUF	= 0x1,
+	DDP_ETYPE_UNTAGGED_BUF	= 0x2,
+	DDP_ETYPE_RSVD		= 0x3
+};
+
+enum ddp_ecode {
+	DDP_ECODE_CATASTROPHIC		= 0x00,
+	/* Tagged Buffer Errors */
+	DDP_ECODE_T_INVALID_STAG	= 0x00,
+	DDP_ECODE_T_BASE_BOUNDS		= 0x01,
+	DDP_ECODE_T_STAG_NOT_ASSOC	= 0x02,
+	DDP_ECODE_T_TO_WRAP		= 0x03,
+	DDP_ECODE_T_DDP_VERSION		= 0x04,
+	/* Untagged Buffer Errors */
+	DDP_ECODE_UT_INVALID_QN		= 0x01,
+	DDP_ECODE_UT_INVALID_MSN_NOBUF	= 0x02,
+	DDP_ECODE_UT_INVALID_MSN_RANGE	= 0x03,
+	DDP_ECODE_UT_INVALID_MO		= 0x04,
+	DDP_ECODE_UT_MSG_TOOLONG	= 0x05,
+	DDP_ECODE_UT_DDP_VERSION	= 0x06
+};
+
+
+enum rdmap_untagged_qn {
+	RDMAP_UNTAGGED_QN_SEND		= 0,
+	RDMAP_UNTAGGED_QN_RDMA_READ	= 1,
+	RDMAP_UNTAGGED_QN_TERMINATE	= 2,
+	RDMAP_UNTAGGED_QN_COUNT		= 3
+};
+
+enum rdmap_etype {
+	RDMAP_ETYPE_CATASTROPHIC	= 0x0,
+	RDMAP_ETYPE_REMOTE_PROTECTION	= 0x1,
+	RDMAP_ETYPE_REMOTE_OPERATION	= 0x2
+};
+
+enum rdmap_ecode {
+	RDMAP_ECODE_INVALID_STAG	= 0x00,
+	RDMAP_ECODE_BASE_BOUNDS		= 0x01,
+	RDMAP_ECODE_ACCESS_RIGHTS	= 0x02,
+	RDMAP_ECODE_STAG_NOT_ASSOC	= 0x03,
+	RDMAP_ECODE_TO_WRAP		= 0x04,
+	RDMAP_ECODE_RDMAP_VERSION	= 0x05,
+	RDMAP_ECODE_UNEXPECTED_OPCODE	= 0x06,
+	RDMAP_ECODE_CATASTROPHIC_STREAM	= 0x07,
+	RDMAP_ECODE_CATASTROPHIC_GLOBAL	= 0x08,
+	RDMAP_ECODE_STAG_NOT_INVALIDATE	= 0x09,
+	RDMAP_ECODE_UNSPECIFIED		= 0xff
+};
+
+enum rdmap_elayer {
+	RDMAP_ERROR_LAYER_RDMA	= 0x00,
+	RDMAP_ERROR_LAYER_DDP	= 0x01,
+	RDMAP_ERROR_LAYER_LLP	= 0x02	/* eg., MPA */
+};
+
+enum rdma_opcode {
+	RDMAP_RDMA_WRITE	= 0x0,
+	RDMAP_RDMA_READ_REQ	= 0x1,
+	RDMAP_RDMA_READ_RESP	= 0x2,
+	RDMAP_SEND		= 0x3,
+	RDMAP_SEND_INVAL	= 0x4,
+	RDMAP_SEND_SE		= 0x5,
+	RDMAP_SEND_SE_INVAL	= 0x6,
+	RDMAP_TERMINATE		= 0x7,
+	RDMAP_NOT_SUPPORTED	= RDMAP_TERMINATE + 1
+};
+
+#endif