diff mbox

[01/14] aoe: for performance support larger packet payloads

Message ID 7c895879cfd1e15dd76c2469b417b48d0462d7df.1345743801.git.ecashin@coraid.com
State Not Applicable, archived
Delegated to: David Miller
Headers show

Commit Message

Ed Cashin Aug. 18, 2012, 1:18 a.m. UTC
This patch adds the ability to work with large packets composed of a
number of segments, using the scatter gather feature of the block
layer (biovecs) and the network layer (skb frag array).  The
motivation is the performance gained by using a packet data payload
greater than a page size and by using the network card's scatter
gather feature.

Users of the out-of-tree aoe driver already had these changes, but
since early 2011, they have complained of increased memory utilization
and higher CPU utilization during heavy writes.[1] The commit below
appears related, as it disables scatter gather on non-IP protocols
inside the harmonize_features function, even when the NIC supports sg.

  commit f01a5236bd4b140198fbcc550f085e8361fd73fa
  Author: Jesse Gross <jesse@nicira.com>
  Date:   Sun Jan 9 06:23:31 2011 +0000

      net offloading: Generalize netif_get_vlan_features().

With that regression in place, transmits always linearize sg AoE
packets, but in-kernel users did not have this patch.  Before 2.6.38,
though, these changes were working to allow sg to increase
performance.

1. http://www.spinics.net/lists/linux-mm/msg15184.html

Signed-off-by: Ed Cashin <ecashin@coraid.com>
---
 drivers/block/aoe/aoe.h    |    2 +
 drivers/block/aoe/aoeblk.c |    3 +
 drivers/block/aoe/aoecmd.c |  138 ++++++++++++++++++++++++++++++-------------
 drivers/block/aoe/aoedev.c |    1 +
 drivers/block/aoe/aoenet.c |   13 +++-
 5 files changed, 111 insertions(+), 46 deletions(-)

Comments

Ed Cashin Aug. 23, 2012, 8:46 p.m. UTC | #1
I should have Cc'ed netdev for this patch.  The series was only sent to linux-kernel, sorry.

On Aug 17, 2012, at 9:18 PM, Ed Cashin wrote:

> This patch adds the ability to work with large packets composed of a
> number of segments, using the scatter gather feature of the block
> layer (biovecs) and the network layer (skb frag array).  The
> motivation is the performance gained by using a packet data payload
> greater than a page size and by using the network card's scatter
> gather feature.
> 
> Users of the out-of-tree aoe driver already had these changes, but
> since early 2011, they have complained of increased memory utilization
> and higher CPU utilization during heavy writes.[1] The commit below
> appears related, as it disables scatter gather on non-IP protocols
> inside the harmonize_features function, even when the NIC supports sg.
> 
>  commit f01a5236bd4b140198fbcc550f085e8361fd73fa
>  Author: Jesse Gross <jesse@nicira.com>
>  Date:   Sun Jan 9 06:23:31 2011 +0000
> 
>      net offloading: Generalize netif_get_vlan_features().
> 
> With that regression in place, transmits always linearize sg AoE
> packets, but in-kernel users did not have this patch.  Before 2.6.38,
> though, these changes were working to allow sg to increase
> performance.
> 
> 1. http://www.spinics.net/lists/linux-mm/msg15184.html
> 
> Signed-off-by: Ed Cashin <ecashin@coraid.com>
> ---
> drivers/block/aoe/aoe.h    |    2 +
> drivers/block/aoe/aoeblk.c |    3 +
> drivers/block/aoe/aoecmd.c |  138 ++++++++++++++++++++++++++++++-------------
> drivers/block/aoe/aoedev.c |    1 +
> drivers/block/aoe/aoenet.c |   13 +++-
> 5 files changed, 111 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
> index db195ab..8ca8c8a 100644
> --- a/drivers/block/aoe/aoe.h
> +++ b/drivers/block/aoe/aoe.h
> @@ -119,6 +119,8 @@ struct frame {
> 	ulong bcnt;
> 	sector_t lba;
> 	struct sk_buff *skb;
> +	struct bio_vec *bv;
> +	ulong bv_off;
> };
> 
> struct aoeif {
> diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
> index 321de7b..1471f81 100644
> --- a/drivers/block/aoe/aoeblk.c
> +++ b/drivers/block/aoe/aoeblk.c
> @@ -279,6 +279,9 @@ aoeblk_gdalloc(void *vp)
> 	if (bdi_init(&d->blkq->backing_dev_info))
> 		goto err_blkq;
> 	spin_lock_irqsave(&d->lock, flags);
> +	blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
> +	d->blkq->backing_dev_info.ra_pages = BLK_DEF_MAX_SECTORS * 1024;
> +	d->blkq->backing_dev_info.ra_pages /= PAGE_CACHE_SIZE;
> 	gd->major = AOE_MAJOR;
> 	gd->first_minor = d->sysminor * AOE_PARTITIONS;
> 	gd->fops = &aoe_bdops;
> diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
> index de0435e..f10ab49 100644
> --- a/drivers/block/aoe/aoecmd.c
> +++ b/drivers/block/aoe/aoecmd.c
> @@ -164,7 +164,8 @@ freeframe(struct aoedev *d)
> 						rf = f;
> 					continue;
> 				}
> -gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
> +gotone:				skb->truesize -= skb->data_len;
> +				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
> 				skb_trim(skb, 0);
> 				d->tgt = t;
> 				ifrotate(*t);
> @@ -200,6 +201,24 @@ gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
> 	return NULL;
> }
> 
> +static void
> +skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
> +{
> +	int frag = 0;
> +	ulong fcnt;
> +loop:
> +	fcnt = bv->bv_len - (off - bv->bv_offset);
> +	if (fcnt > cnt)
> +		fcnt = cnt;
> +	skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
> +	cnt -= fcnt;
> +	if (cnt <= 0)
> +		return;
> +	bv++;
> +	off = bv->bv_offset;
> +	goto loop;
> +}
> +
> static int
> aoecmd_ata_rw(struct aoedev *d)
> {
> @@ -210,7 +229,7 @@ aoecmd_ata_rw(struct aoedev *d)
> 	struct bio_vec *bv;
> 	struct aoetgt *t;
> 	struct sk_buff *skb;
> -	ulong bcnt;
> +	ulong bcnt, fbcnt;
> 	char writebit, extbit;
> 
> 	writebit = 0x10;
> @@ -225,8 +244,28 @@ aoecmd_ata_rw(struct aoedev *d)
> 	bcnt = t->ifp->maxbcnt;
> 	if (bcnt == 0)
> 		bcnt = DEFAULTBCNT;
> -	if (bcnt > buf->bv_resid)
> -		bcnt = buf->bv_resid;
> +	if (bcnt > buf->resid)
> +		bcnt = buf->resid;
> +	fbcnt = bcnt;
> +	f->bv = buf->bv;
> +	f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
> +	do {
> +		if (fbcnt < buf->bv_resid) {
> +			buf->bv_resid -= fbcnt;
> +			buf->resid -= fbcnt;
> +			break;
> +		}
> +		fbcnt -= buf->bv_resid;
> +		buf->resid -= buf->bv_resid;
> +		if (buf->resid == 0) {
> +			d->inprocess = NULL;
> +			break;
> +		}
> +		buf->bv++;
> +		buf->bv_resid = buf->bv->bv_len;
> +		WARN_ON(buf->bv_resid == 0);
> +	} while (fbcnt);
> +
> 	/* initialize the headers & frame */
> 	skb = f->skb;
> 	h = (struct aoe_hdr *) skb_mac_header(skb);
> @@ -237,7 +276,6 @@ aoecmd_ata_rw(struct aoedev *d)
> 	t->nout++;
> 	f->waited = 0;
> 	f->buf = buf;
> -	f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
> 	f->bcnt = bcnt;
> 	f->lba = buf->sector;
> 
> @@ -252,10 +290,11 @@ aoecmd_ata_rw(struct aoedev *d)
> 		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
> 	}
> 	if (bio_data_dir(buf->bio) == WRITE) {
> -		skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
> +		skb_fillup(skb, f->bv, f->bv_off, bcnt);
> 		ah->aflags |= AOEAFL_WRITE;
> 		skb->len += bcnt;
> 		skb->data_len = bcnt;
> +		skb->truesize += bcnt;
> 		t->wpkts++;
> 	} else {
> 		t->rpkts++;
> @@ -266,18 +305,7 @@ aoecmd_ata_rw(struct aoedev *d)
> 
> 	/* mark all tracking fields and load out */
> 	buf->nframesout += 1;
> -	buf->bv_off += bcnt;
> -	buf->bv_resid -= bcnt;
> -	buf->resid -= bcnt;
> 	buf->sector += bcnt >> 9;
> -	if (buf->resid == 0) {
> -		d->inprocess = NULL;
> -	} else if (buf->bv_resid == 0) {
> -		buf->bv = ++bv;
> -		buf->bv_resid = bv->bv_len;
> -		WARN_ON(buf->bv_resid == 0);
> -		buf->bv_off = bv->bv_offset;
> -	}
> 
> 	skb->dev = t->ifp->nd;
> 	skb = skb_clone(skb, GFP_ATOMIC);
> @@ -364,14 +392,12 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
> 		put_lba(ah, f->lba);
> 
> 		n = f->bcnt;
> -		if (n > DEFAULTBCNT)
> -			n = DEFAULTBCNT;
> 		ah->scnt = n >> 9;
> 		if (ah->aflags & AOEAFL_WRITE) {
> -			skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
> -				offset_in_page(f->bufaddr), n);
> +			skb_fillup(skb, f->bv, f->bv_off, n);
> 			skb->len = sizeof *h + sizeof *ah + n;
> 			skb->data_len = n;
> +			skb->truesize += n;
> 		}
> 	}
> 	skb->dev = t->ifp->nd;
> @@ -530,20 +556,6 @@ rexmit_timer(ulong vp)
> 				ejectif(t, ifp);
> 				ifp = NULL;
> 			}
> -
> -			if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
> -			&& ifp && ++ifp->lostjumbo > (t->nframes << 1)
> -			&& ifp->maxbcnt != DEFAULTBCNT) {
> -				printk(KERN_INFO
> -					"aoe: e%ld.%d: "
> -					"too many lost jumbo on "
> -					"%s:%pm - "
> -					"falling back to %d frames.\n",
> -					d->aoemajor, d->aoeminor,
> -					ifp->nd->name, t->addr,
> -					DEFAULTBCNT);
> -				ifp->maxbcnt = 0;
> -			}
> 			resend(d, t, f);
> 		}
> 
> @@ -736,6 +748,45 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
> 	part_stat_unlock();
> }
> 
> +static void
> +bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, ulong cnt)
> +{
> +	ulong fcnt;
> +	char *p;
> +	int soff = 0;
> +loop:
> +	fcnt = bv->bv_len - (off - bv->bv_offset);
> +	if (fcnt > cnt)
> +		fcnt = cnt;
> +	p = page_address(bv->bv_page) + off;
> +	skb_copy_bits(skb, soff, p, fcnt);
> +	soff += fcnt;
> +	cnt -= fcnt;
> +	if (cnt <= 0)
> +		return;
> +	bv++;
> +	off = bv->bv_offset;
> +	goto loop;
> +}
> +
> +static void
> +fadvance(struct frame *f, ulong cnt)
> +{
> +	ulong fcnt;
> +
> +	f->lba += cnt >> 9;
> +loop:
> +	fcnt = f->bv->bv_len - (f->bv_off - f->bv->bv_offset);
> +	if (fcnt > cnt) {
> +		f->bv_off += cnt;
> +		return;
> +	}
> +	cnt -= fcnt;
> +	f->bv++;
> +	f->bv_off = f->bv->bv_offset;
> +	goto loop;
> +}
> +
> void
> aoecmd_ata_rsp(struct sk_buff *skb)
> {
> @@ -753,6 +804,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
> 	u16 aoemajor;
> 
> 	hin = (struct aoe_hdr *) skb_mac_header(skb);
> +	skb_pull(skb, sizeof(*hin));
> 	aoemajor = get_unaligned_be16(&hin->major);
> 	d = aoedev_by_aoeaddr(aoemajor, hin->minor);
> 	if (d == NULL) {
> @@ -790,7 +842,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
> 
> 	calc_rttavg(d, tsince(f->tag));
> 
> -	ahin = (struct aoe_atahdr *) (hin+1);
> +	ahin = (struct aoe_atahdr *) skb->data;
> +	skb_pull(skb, sizeof(*ahin));
> 	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
> 	ahout = (struct aoe_atahdr *) (hout+1);
> 	buf = f->buf;
> @@ -809,7 +862,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
> 		switch (ahout->cmdstat) {
> 		case ATA_CMD_PIO_READ:
> 		case ATA_CMD_PIO_READ_EXT:
> -			if (skb->len - sizeof *hin - sizeof *ahin < n) {
> +			if (skb->len < n) {
> 				printk(KERN_ERR
> 					"aoe: %s.  skb->len=%d need=%ld\n",
> 					"runt data size in read", skb->len, n);
> @@ -817,7 +870,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
> 				spin_unlock_irqrestore(&d->lock, flags);
> 				return;
> 			}
> -			memcpy(f->bufaddr, ahin+1, n);
> +			bvcpy(f->bv, f->bv_off, skb, n);
> 		case ATA_CMD_PIO_WRITE:
> 		case ATA_CMD_PIO_WRITE_EXT:
> 			ifp = getif(t, skb->dev);
> @@ -827,21 +880,22 @@ aoecmd_ata_rsp(struct sk_buff *skb)
> 					ifp->lostjumbo = 0;
> 			}
> 			if (f->bcnt -= n) {
> -				f->lba += n >> 9;
> -				f->bufaddr += n;
> +				fadvance(f, n);
> 				resend(d, t, f);
> 				goto xmit;
> 			}
> 			break;
> 		case ATA_CMD_ID_ATA:
> -			if (skb->len - sizeof *hin - sizeof *ahin < 512) {
> +			if (skb->len < 512) {
> 				printk(KERN_INFO
> 					"aoe: runt data size in ataid.  skb->len=%d\n",
> 					skb->len);
> 				spin_unlock_irqrestore(&d->lock, flags);
> 				return;
> 			}
> -			ataid_complete(d, t, (char *) (ahin+1));
> +			if (skb_linearize(skb))
> +				break;
> +			ataid_complete(d, t, skb->data);
> 			break;
> 		default:
> 			printk(KERN_INFO
> diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
> index 6b5110a..b2d1fd3 100644
> --- a/drivers/block/aoe/aoedev.c
> +++ b/drivers/block/aoe/aoedev.c
> @@ -182,6 +182,7 @@ skbfree(struct sk_buff *skb)
> 			"cannot free skb -- memory leaked.");
> 		return;
> 	}
> +	skb->truesize -= skb->data_len;
> 	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
> 	skb_trim(skb, 0);
> 	dev_kfree_skb(skb);
> diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
> index 4d3bc0d..0787807 100644
> --- a/drivers/block/aoe/aoenet.c
> +++ b/drivers/block/aoe/aoenet.c
> @@ -102,7 +102,9 @@ static int
> aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
> {
> 	struct aoe_hdr *h;
> +	struct aoe_atahdr *ah;
> 	u32 n;
> +	int sn;
> 
> 	if (dev_net(ifp) != &init_net)
> 		goto exit;
> @@ -110,13 +112,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
> 	skb = skb_share_check(skb, GFP_ATOMIC);
> 	if (skb == NULL)
> 		return 0;
> -	if (skb_linearize(skb))
> -		goto exit;
> 	if (!is_aoe_netif(ifp))
> 		goto exit;
> 	skb_push(skb, ETH_HLEN);	/* (1) */
> -
> -	h = (struct aoe_hdr *) skb_mac_header(skb);
> +	sn = sizeof(*h) + sizeof(*ah);
> +	if (skb->len >= sn) {
> +		sn -= skb_headlen(skb);
> +		if (sn > 0 && !__pskb_pull_tail(skb, sn))
> +			goto exit;
> +	}
> +	h = (struct aoe_hdr *) skb->data;
> 	n = get_unaligned_be32(&h->tag);
> 	if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
> 		goto exit;
> -- 
> 1.7.2.5
>
diff mbox

Patch

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index db195ab..8ca8c8a 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -119,6 +119,8 @@  struct frame {
 	ulong bcnt;
 	sector_t lba;
 	struct sk_buff *skb;
+	struct bio_vec *bv;
+	ulong bv_off;
 };
 
 struct aoeif {
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 321de7b..1471f81 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -279,6 +279,9 @@  aoeblk_gdalloc(void *vp)
 	if (bdi_init(&d->blkq->backing_dev_info))
 		goto err_blkq;
 	spin_lock_irqsave(&d->lock, flags);
+	blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
+	d->blkq->backing_dev_info.ra_pages = BLK_DEF_MAX_SECTORS * 1024;
+	d->blkq->backing_dev_info.ra_pages /= PAGE_CACHE_SIZE;
 	gd->major = AOE_MAJOR;
 	gd->first_minor = d->sysminor * AOE_PARTITIONS;
 	gd->fops = &aoe_bdops;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index de0435e..f10ab49 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -164,7 +164,8 @@  freeframe(struct aoedev *d)
 						rf = f;
 					continue;
 				}
-gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+gotone:				skb->truesize -= skb->data_len;
+				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
 				skb_trim(skb, 0);
 				d->tgt = t;
 				ifrotate(*t);
@@ -200,6 +201,24 @@  gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
 	return NULL;
 }
 
+static void
+skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
+{
+	int frag = 0;
+	ulong fcnt;
+loop:
+	fcnt = bv->bv_len - (off - bv->bv_offset);
+	if (fcnt > cnt)
+		fcnt = cnt;
+	skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
+	cnt -= fcnt;
+	if (cnt <= 0)
+		return;
+	bv++;
+	off = bv->bv_offset;
+	goto loop;
+}
+
 static int
 aoecmd_ata_rw(struct aoedev *d)
 {
@@ -210,7 +229,7 @@  aoecmd_ata_rw(struct aoedev *d)
 	struct bio_vec *bv;
 	struct aoetgt *t;
 	struct sk_buff *skb;
-	ulong bcnt;
+	ulong bcnt, fbcnt;
 	char writebit, extbit;
 
 	writebit = 0x10;
@@ -225,8 +244,28 @@  aoecmd_ata_rw(struct aoedev *d)
 	bcnt = t->ifp->maxbcnt;
 	if (bcnt == 0)
 		bcnt = DEFAULTBCNT;
-	if (bcnt > buf->bv_resid)
-		bcnt = buf->bv_resid;
+	if (bcnt > buf->resid)
+		bcnt = buf->resid;
+	fbcnt = bcnt;
+	f->bv = buf->bv;
+	f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
+	do {
+		if (fbcnt < buf->bv_resid) {
+			buf->bv_resid -= fbcnt;
+			buf->resid -= fbcnt;
+			break;
+		}
+		fbcnt -= buf->bv_resid;
+		buf->resid -= buf->bv_resid;
+		if (buf->resid == 0) {
+			d->inprocess = NULL;
+			break;
+		}
+		buf->bv++;
+		buf->bv_resid = buf->bv->bv_len;
+		WARN_ON(buf->bv_resid == 0);
+	} while (fbcnt);
+
 	/* initialize the headers & frame */
 	skb = f->skb;
 	h = (struct aoe_hdr *) skb_mac_header(skb);
@@ -237,7 +276,6 @@  aoecmd_ata_rw(struct aoedev *d)
 	t->nout++;
 	f->waited = 0;
 	f->buf = buf;
-	f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
 	f->bcnt = bcnt;
 	f->lba = buf->sector;
 
@@ -252,10 +290,11 @@  aoecmd_ata_rw(struct aoedev *d)
 		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
 	}
 	if (bio_data_dir(buf->bio) == WRITE) {
-		skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
+		skb_fillup(skb, f->bv, f->bv_off, bcnt);
 		ah->aflags |= AOEAFL_WRITE;
 		skb->len += bcnt;
 		skb->data_len = bcnt;
+		skb->truesize += bcnt;
 		t->wpkts++;
 	} else {
 		t->rpkts++;
@@ -266,18 +305,7 @@  aoecmd_ata_rw(struct aoedev *d)
 
 	/* mark all tracking fields and load out */
 	buf->nframesout += 1;
-	buf->bv_off += bcnt;
-	buf->bv_resid -= bcnt;
-	buf->resid -= bcnt;
 	buf->sector += bcnt >> 9;
-	if (buf->resid == 0) {
-		d->inprocess = NULL;
-	} else if (buf->bv_resid == 0) {
-		buf->bv = ++bv;
-		buf->bv_resid = bv->bv_len;
-		WARN_ON(buf->bv_resid == 0);
-		buf->bv_off = bv->bv_offset;
-	}
 
 	skb->dev = t->ifp->nd;
 	skb = skb_clone(skb, GFP_ATOMIC);
@@ -364,14 +392,12 @@  resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
 		put_lba(ah, f->lba);
 
 		n = f->bcnt;
-		if (n > DEFAULTBCNT)
-			n = DEFAULTBCNT;
 		ah->scnt = n >> 9;
 		if (ah->aflags & AOEAFL_WRITE) {
-			skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
-				offset_in_page(f->bufaddr), n);
+			skb_fillup(skb, f->bv, f->bv_off, n);
 			skb->len = sizeof *h + sizeof *ah + n;
 			skb->data_len = n;
+			skb->truesize += n;
 		}
 	}
 	skb->dev = t->ifp->nd;
@@ -530,20 +556,6 @@  rexmit_timer(ulong vp)
 				ejectif(t, ifp);
 				ifp = NULL;
 			}
-
-			if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
-			&& ifp && ++ifp->lostjumbo > (t->nframes << 1)
-			&& ifp->maxbcnt != DEFAULTBCNT) {
-				printk(KERN_INFO
-					"aoe: e%ld.%d: "
-					"too many lost jumbo on "
-					"%s:%pm - "
-					"falling back to %d frames.\n",
-					d->aoemajor, d->aoeminor,
-					ifp->nd->name, t->addr,
-					DEFAULTBCNT);
-				ifp->maxbcnt = 0;
-			}
 			resend(d, t, f);
 		}
 
@@ -736,6 +748,45 @@  diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
 	part_stat_unlock();
 }
 
+static void
+bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, ulong cnt)
+{
+	ulong fcnt;
+	char *p;
+	int soff = 0;
+loop:
+	fcnt = bv->bv_len - (off - bv->bv_offset);
+	if (fcnt > cnt)
+		fcnt = cnt;
+	p = page_address(bv->bv_page) + off;
+	skb_copy_bits(skb, soff, p, fcnt);
+	soff += fcnt;
+	cnt -= fcnt;
+	if (cnt <= 0)
+		return;
+	bv++;
+	off = bv->bv_offset;
+	goto loop;
+}
+
+static void
+fadvance(struct frame *f, ulong cnt)
+{
+	ulong fcnt;
+
+	f->lba += cnt >> 9;
+loop:
+	fcnt = f->bv->bv_len - (f->bv_off - f->bv->bv_offset);
+	if (fcnt > cnt) {
+		f->bv_off += cnt;
+		return;
+	}
+	cnt -= fcnt;
+	f->bv++;
+	f->bv_off = f->bv->bv_offset;
+	goto loop;
+}
+
 void
 aoecmd_ata_rsp(struct sk_buff *skb)
 {
@@ -753,6 +804,7 @@  aoecmd_ata_rsp(struct sk_buff *skb)
 	u16 aoemajor;
 
 	hin = (struct aoe_hdr *) skb_mac_header(skb);
+	skb_pull(skb, sizeof(*hin));
 	aoemajor = get_unaligned_be16(&hin->major);
 	d = aoedev_by_aoeaddr(aoemajor, hin->minor);
 	if (d == NULL) {
@@ -790,7 +842,8 @@  aoecmd_ata_rsp(struct sk_buff *skb)
 
 	calc_rttavg(d, tsince(f->tag));
 
-	ahin = (struct aoe_atahdr *) (hin+1);
+	ahin = (struct aoe_atahdr *) skb->data;
+	skb_pull(skb, sizeof(*ahin));
 	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
 	ahout = (struct aoe_atahdr *) (hout+1);
 	buf = f->buf;
@@ -809,7 +862,7 @@  aoecmd_ata_rsp(struct sk_buff *skb)
 		switch (ahout->cmdstat) {
 		case ATA_CMD_PIO_READ:
 		case ATA_CMD_PIO_READ_EXT:
-			if (skb->len - sizeof *hin - sizeof *ahin < n) {
+			if (skb->len < n) {
 				printk(KERN_ERR
 					"aoe: %s.  skb->len=%d need=%ld\n",
 					"runt data size in read", skb->len, n);
@@ -817,7 +870,7 @@  aoecmd_ata_rsp(struct sk_buff *skb)
 				spin_unlock_irqrestore(&d->lock, flags);
 				return;
 			}
-			memcpy(f->bufaddr, ahin+1, n);
+			bvcpy(f->bv, f->bv_off, skb, n);
 		case ATA_CMD_PIO_WRITE:
 		case ATA_CMD_PIO_WRITE_EXT:
 			ifp = getif(t, skb->dev);
@@ -827,21 +880,22 @@  aoecmd_ata_rsp(struct sk_buff *skb)
 					ifp->lostjumbo = 0;
 			}
 			if (f->bcnt -= n) {
-				f->lba += n >> 9;
-				f->bufaddr += n;
+				fadvance(f, n);
 				resend(d, t, f);
 				goto xmit;
 			}
 			break;
 		case ATA_CMD_ID_ATA:
-			if (skb->len - sizeof *hin - sizeof *ahin < 512) {
+			if (skb->len < 512) {
 				printk(KERN_INFO
 					"aoe: runt data size in ataid.  skb->len=%d\n",
 					skb->len);
 				spin_unlock_irqrestore(&d->lock, flags);
 				return;
 			}
-			ataid_complete(d, t, (char *) (ahin+1));
+			if (skb_linearize(skb))
+				break;
+			ataid_complete(d, t, skb->data);
 			break;
 		default:
 			printk(KERN_INFO
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 6b5110a..b2d1fd3 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -182,6 +182,7 @@  skbfree(struct sk_buff *skb)
 			"cannot free skb -- memory leaked.");
 		return;
 	}
+	skb->truesize -= skb->data_len;
 	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
 	skb_trim(skb, 0);
 	dev_kfree_skb(skb);
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index 4d3bc0d..0787807 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -102,7 +102,9 @@  static int
 aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct aoe_hdr *h;
+	struct aoe_atahdr *ah;
 	u32 n;
+	int sn;
 
 	if (dev_net(ifp) != &init_net)
 		goto exit;
@@ -110,13 +112,16 @@  aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return 0;
-	if (skb_linearize(skb))
-		goto exit;
 	if (!is_aoe_netif(ifp))
 		goto exit;
 	skb_push(skb, ETH_HLEN);	/* (1) */
-
-	h = (struct aoe_hdr *) skb_mac_header(skb);
+	sn = sizeof(*h) + sizeof(*ah);
+	if (skb->len >= sn) {
+		sn -= skb_headlen(skb);
+		if (sn > 0 && !__pskb_pull_tail(skb, sn))
+			goto exit;
+	}
+	h = (struct aoe_hdr *) skb->data;
 	n = get_unaligned_be32(&h->tag);
 	if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
 		goto exit;