diff mbox

[net-next,v4] hyperv: Add support for virtual Receive Side Scaling (vRSS)

Message ID 1396898623-31667-1-git-send-email-haiyangz@microsoft.com
State Deferred, archived
Delegated to: David Miller
Headers show

Commit Message

Haiyang Zhang April 7, 2014, 7:23 p.m. UTC
This feature allows multiple channels to be used by each virtual NIC.
It is available on Hyper-V host 2012 R2.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: K. Y. Srinivasan <kys@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h   |  110 +++++++++++++++++++++-
 drivers/net/hyperv/netvsc.c       |  136 +++++++++++++++++++++-----
 drivers/net/hyperv/netvsc_drv.c   |  103 +++++++++++++++++++-
 drivers/net/hyperv/rndis_filter.c |  189 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 504 insertions(+), 34 deletions(-)

Comments

David Miller April 7, 2014, 6:42 p.m. UTC | #1
The net-next tree is not open yet, I will announce when it is and you can submit
net-next targetted patches.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sharat Masetty April 7, 2014, 10:12 p.m. UTC | #2
Hi Zhang,

How is this beneficial when compared to RPS(receive packet steering)?
Can you please provide more details on what this patch does?

Thanks
Sharat


On Mon, Apr 7, 2014 at 12:42 PM, David Miller <davem@davemloft.net> wrote:
>
> The net-next tree is not open yet, I will announce when it is and you can submit
> net-next targetted patches.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haiyang Zhang April 8, 2014, 3:43 p.m. UTC | #3
> -----Original Message-----

> From: Sharat Masetty [mailto:sharat04@gmail.com]

> Sent: Monday, April 7, 2014 6:13 PM

> To: David Miller

> Cc: Haiyang Zhang; Linux Netdev List; KY Srinivasan; olaf@aepfle.de;

> jasowang@redhat.com; linux-kernel@vger.kernel.org; driverdev-

> devel@linuxdriverproject.org

> Subject: Re: [PATCH net-next,v4] hyperv: Add support for virtual Receive Side

> Scaling (vRSS)

> 

> Hi Zhang,

> 

> How is this beneficial when compared to RPS(receive packet steering)?

> Can you please provide more details on what this patch does?


This patch supports the virtual RSS feature provided by Hyper-V hosts. Currently,
the interrupts are received on only one CPU per vNIC. With this feature, we can 
receive interrupts from multiple CPUs and VMBus channels for one vNIC. Outgoing 
packets are put into multiple channels as well. So the network throughput can 
scale up with the number of CPUs for a VM.

RPS is basically a software implementation of RSS. This is an online doc about 
these features:
https://www.kernel.org/doc/Documentation/networking/scaling.txt

Thanks,
- Haiyang
Haiyang Zhang April 17, 2014, 5:25 p.m. UTC | #4
> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Monday, April 7, 2014 2:43 PM
> To: Haiyang Zhang
> Cc: netdev@vger.kernel.org; KY Srinivasan; olaf@aepfle.de;
> jasowang@redhat.com; linux-kernel@vger.kernel.org; driverdev-
> devel@linuxdriverproject.org
> Subject: Re: [PATCH net-next,v4] hyperv: Add support for virtual Receive Side
> Scaling (vRSS)
> 
> 
> The net-next tree is not open yet, I will announce when it is and you can submit
> net-next targetted patches.

Dave,

Is the net-next tree open now? 
(I haven't seen your announcement, but I saw some patches for net-next are under review.)

Thanks,
- Haiyang
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 17, 2014, 6:17 p.m. UTC | #5
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Thu, 17 Apr 2014 17:25:31 +0000

> Is the net-next tree open now? 

No, it is not.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 13010b4..dcafa82 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -28,6 +28,96 @@ 
 #include <linux/hyperv.h>
 #include <linux/rndis.h>
 
+/* RSS related */
+#define OID_GEN_RECEIVE_SCALE_CAPABILITIES 0x00010203  /* query only */
+#define OID_GEN_RECEIVE_SCALE_PARAMETERS 0x00010204  /* query and set */
+
+#define NDIS_OBJECT_TYPE_RSS_CAPABILITIES 0x88
+#define NDIS_OBJECT_TYPE_RSS_PARAMETERS 0x89
+
+#define NDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2 2
+#define NDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2 2
+
+struct ndis_obj_header {
+	u8 type;
+	u8 rev;
+	u16 size;
+} __packed;
+
+/* ndis_recv_scale_cap/cap_flag */
+#define NDIS_RSS_CAPS_MESSAGE_SIGNALED_INTERRUPTS 0x01000000
+#define NDIS_RSS_CAPS_CLASSIFICATION_AT_ISR       0x02000000
+#define NDIS_RSS_CAPS_CLASSIFICATION_AT_DPC       0x04000000
+#define NDIS_RSS_CAPS_USING_MSI_X                 0x08000000
+#define NDIS_RSS_CAPS_RSS_AVAILABLE_ON_PORTS      0x10000000
+#define NDIS_RSS_CAPS_SUPPORTS_MSI_X              0x20000000
+#define NDIS_RSS_CAPS_HASH_TYPE_TCP_IPV4          0x00000100
+#define NDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6          0x00000200
+#define NDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6_EX       0x00000400
+
+struct ndis_recv_scale_cap { /* NDIS_RECEIVE_SCALE_CAPABILITIES */
+	struct ndis_obj_header hdr;
+	u32 cap_flag;
+	u32 num_int_msg;
+	u32 num_recv_que;
+	u16 num_indirect_tabent;
+} __packed;
+
+
+/* ndis_recv_scale_param flags */
+#define NDIS_RSS_PARAM_FLAG_BASE_CPU_UNCHANGED     0x0001
+#define NDIS_RSS_PARAM_FLAG_HASH_INFO_UNCHANGED    0x0002
+#define NDIS_RSS_PARAM_FLAG_ITABLE_UNCHANGED       0x0004
+#define NDIS_RSS_PARAM_FLAG_HASH_KEY_UNCHANGED     0x0008
+#define NDIS_RSS_PARAM_FLAG_DISABLE_RSS            0x0010
+
+/* Hash info bits */
+#define NDIS_HASH_FUNC_TOEPLITZ 0x00000001
+#define NDIS_HASH_IPV4          0x00000100
+#define NDIS_HASH_TCP_IPV4      0x00000200
+#define NDIS_HASH_IPV6          0x00000400
+#define NDIS_HASH_IPV6_EX       0x00000800
+#define NDIS_HASH_TCP_IPV6      0x00001000
+#define NDIS_HASH_TCP_IPV6_EX   0x00002000
+
+#define NDIS_RSS_INDIRECTION_TABLE_MAX_SIZE_REVISION_2 (128 * 4)
+#define NDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2   40
+
+#define ITAB_NUM 128
+#define HASH_KEYLEN NDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2
+extern u8 netvsc_hash_key[];
+
+struct ndis_recv_scale_param { /* NDIS_RECEIVE_SCALE_PARAMETERS */
+	struct ndis_obj_header hdr;
+
+	/* Qualifies the rest of the information */
+	u16 flag;
+
+	/* The base CPU number to do receive processing. not used */
+	u16 base_cpu_number;
+
+	/* This describes the hash function and type being enabled */
+	u32 hashinfo;
+
+	/* The size of indirection table array */
+	u16 indirect_tabsize;
+
+	/* The offset of the indirection table from the beginning of this
+	 * structure
+	 */
+	u32 indirect_taboffset;
+
+	/* The size of the hash secret key */
+	u16 hashkey_size;
+
+	/* The offset of the secret key from the beginning of this structure */
+	u32 kashkey_offset;
+
+	u32 processor_masks_offset;
+	u32 num_processor_masks;
+	u32 processor_masks_entry_size;
+};
+
 /* Fwd declaration */
 struct hv_netvsc_packet;
 struct ndis_tcp_ip_checksum_info;
@@ -39,6 +129,8 @@  struct xferpage_packet {
 
 	/* # of netvsc packets this xfer packet contains */
 	u32 count;
+
+	struct vmbus_channel *channel;
 };
 
 /*
@@ -54,6 +146,9 @@  struct hv_netvsc_packet {
 	bool is_data_pkt;
 	u16 vlan_tci;
 
+	u16 q_idx;
+	struct vmbus_channel *channel;
+
 	/*
 	 * Valid only for receives when we break a xfer page packet
 	 * into multiple netvsc packets
@@ -120,6 +215,7 @@  void netvsc_linkstatus_callback(struct hv_device *device_obj,
 int netvsc_recv_callback(struct hv_device *device_obj,
 			struct hv_netvsc_packet *packet,
 			struct ndis_tcp_ip_checksum_info *csum_info);
+void netvsc_channel_cb(void *context);
 int rndis_filter_open(struct hv_device *dev);
 int rndis_filter_close(struct hv_device *dev);
 int rndis_filter_device_add(struct hv_device *dev,
@@ -522,6 +618,8 @@  struct nvsp_message {
 
 #define NETVSC_PACKET_SIZE                      2048
 
+#define VRSS_SEND_TAB_SIZE 16
+
 /* Per netvsc channel-specific */
 struct netvsc_device {
 	struct hv_device *dev;
@@ -555,10 +653,20 @@  struct netvsc_device {
 
 	struct net_device *ndev;
 
+	struct vmbus_channel *chn_table[NR_CPUS];
+	u32 send_table[VRSS_SEND_TAB_SIZE];
+	u32 num_chn;
+	atomic_t queue_sends[NR_CPUS];
+
 	/* Holds rndis device info */
 	void *extension;
-	/* The recive buffer for this device */
+
+	int ring_size;
+
+	/* The primary channel callback buffer */
 	unsigned char cb_buffer[NETVSC_PACKET_SIZE];
+	/* The sub channel callback buffer */
+	unsigned char *sub_cb_buf;
 };
 
 /* NdisInitialize message */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index daddea2..ec40e2c 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -422,6 +422,9 @@  int netvsc_device_remove(struct hv_device *device)
 		kfree(netvsc_packet);
 	}
 
+	if (net_device->sub_cb_buf)
+		vfree(net_device->sub_cb_buf);
+
 	kfree(net_device);
 	return 0;
 }
@@ -461,7 +464,9 @@  static void netvsc_send_completion(struct netvsc_device *net_device,
 	    (nvsp_packet->hdr.msg_type ==
 	     NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE) ||
 	    (nvsp_packet->hdr.msg_type ==
-	     NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE)) {
+	     NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE) ||
+	    (nvsp_packet->hdr.msg_type ==
+	     NVSP_MSG5_TYPE_SUBCHANNEL)) {
 		/* Copy the response back */
 		memcpy(&net_device->channel_init_pkt, nvsp_packet,
 		       sizeof(struct nvsp_message));
@@ -469,28 +474,37 @@  static void netvsc_send_completion(struct netvsc_device *net_device,
 	} else if (nvsp_packet->hdr.msg_type ==
 		   NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE) {
 		int num_outstanding_sends;
+		u16 q_idx = 0;
+		struct vmbus_channel *channel = device->channel;
+		int queue_sends;
 
 		/* Get the send context */
 		nvsc_packet = (struct hv_netvsc_packet *)(unsigned long)
 			packet->trans_id;
 
 		/* Notify the layer above us */
-		if (nvsc_packet)
+		if (nvsc_packet) {
+			q_idx = nvsc_packet->q_idx;
+			channel = nvsc_packet->channel;
 			nvsc_packet->completion.send.send_completion(
 				nvsc_packet->completion.send.
 				send_completion_ctx);
+		}
 
 		num_outstanding_sends =
 			atomic_dec_return(&net_device->num_outstanding_sends);
+		queue_sends = atomic_dec_return(&net_device->
+						queue_sends[q_idx]);
 
 		if (net_device->destroy && num_outstanding_sends == 0)
 			wake_up(&net_device->wait_drain);
 
-		if (netif_queue_stopped(ndev) && !net_device->start_remove &&
-			(hv_ringbuf_avail_percent(&device->channel->outbound)
-			> RING_AVAIL_PERCENT_HIWATER ||
-			num_outstanding_sends < 1))
-				netif_wake_queue(ndev);
+		if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
+		    !net_device->start_remove &&
+		    (hv_ringbuf_avail_percent(&channel->outbound) >
+		     RING_AVAIL_PERCENT_HIWATER || queue_sends < 1))
+				netif_tx_wake_queue(netdev_get_tx_queue(
+						    ndev, q_idx));
 	} else {
 		netdev_err(ndev, "Unknown send completion packet type- "
 			   "%d received!!\n", nvsp_packet->hdr.msg_type);
@@ -505,6 +519,7 @@  int netvsc_send(struct hv_device *device,
 	int ret = 0;
 	struct nvsp_message sendMessage;
 	struct net_device *ndev;
+	struct vmbus_channel *out_channel = NULL;
 	u64 req_id;
 
 	net_device = get_outbound_net_device(device);
@@ -531,15 +546,20 @@  int netvsc_send(struct hv_device *device,
 	else
 		req_id = 0;
 
+	out_channel = net_device->chn_table[packet->q_idx];
+	if (out_channel == NULL)
+		out_channel = device->channel;
+	packet->channel = out_channel;
+
 	if (packet->page_buf_cnt) {
-		ret = vmbus_sendpacket_pagebuffer(device->channel,
+		ret = vmbus_sendpacket_pagebuffer(out_channel,
 						  packet->page_buf,
 						  packet->page_buf_cnt,
 						  &sendMessage,
 						  sizeof(struct nvsp_message),
 						  req_id);
 	} else {
-		ret = vmbus_sendpacket(device->channel, &sendMessage,
+		ret = vmbus_sendpacket(out_channel, &sendMessage,
 				sizeof(struct nvsp_message),
 				req_id,
 				VM_PKT_DATA_INBAND,
@@ -548,17 +568,24 @@  int netvsc_send(struct hv_device *device,
 
 	if (ret == 0) {
 		atomic_inc(&net_device->num_outstanding_sends);
-		if (hv_ringbuf_avail_percent(&device->channel->outbound) <
+		atomic_inc(&net_device->queue_sends[packet->q_idx]);
+
+		if (hv_ringbuf_avail_percent(&out_channel->outbound) <
 			RING_AVAIL_PERCENT_LOWATER) {
-			netif_stop_queue(ndev);
+			netif_tx_stop_queue(netdev_get_tx_queue(
+					    ndev, packet->q_idx));
+
 			if (atomic_read(&net_device->
-				num_outstanding_sends) < 1)
-				netif_wake_queue(ndev);
+				queue_sends[packet->q_idx]) < 1)
+				netif_tx_wake_queue(netdev_get_tx_queue(
+						    ndev, packet->q_idx));
 		}
 	} else if (ret == -EAGAIN) {
-		netif_stop_queue(ndev);
-		if (atomic_read(&net_device->num_outstanding_sends) < 1) {
-			netif_wake_queue(ndev);
+		netif_tx_stop_queue(netdev_get_tx_queue(
+				    ndev, packet->q_idx));
+		if (atomic_read(&net_device->queue_sends[packet->q_idx]) < 1) {
+			netif_tx_wake_queue(netdev_get_tx_queue(
+					    ndev, packet->q_idx));
 			ret = -ENOSPC;
 		}
 	} else {
@@ -570,6 +597,7 @@  int netvsc_send(struct hv_device *device,
 }
 
 static void netvsc_send_recv_completion(struct hv_device *device,
+					struct vmbus_channel *channel,
 					struct netvsc_device *net_device,
 					u64 transaction_id, u32 status)
 {
@@ -587,7 +615,7 @@  static void netvsc_send_recv_completion(struct hv_device *device,
 
 retry_send_cmplt:
 	/* Send the completion */
-	ret = vmbus_sendpacket(device->channel, &recvcompMessage,
+	ret = vmbus_sendpacket(channel, &recvcompMessage,
 			       sizeof(struct nvsp_message), transaction_id,
 			       VM_PKT_COMP, 0);
 	if (ret == 0) {
@@ -618,6 +646,7 @@  static void netvsc_receive_completion(void *context)
 {
 	struct hv_netvsc_packet *packet = context;
 	struct hv_device *device = packet->device;
+	struct vmbus_channel *channel;
 	struct netvsc_device *net_device;
 	u64 transaction_id = 0;
 	bool fsend_receive_comp = false;
@@ -649,6 +678,7 @@  static void netvsc_receive_completion(void *context)
 	 */
 	if (packet->xfer_page_pkt->count == 0) {
 		fsend_receive_comp = true;
+		channel = packet->xfer_page_pkt->channel;
 		transaction_id = packet->completion.recv.recv_completion_tid;
 		status = packet->xfer_page_pkt->status;
 		list_add_tail(&packet->xfer_page_pkt->list_ent,
@@ -662,12 +692,13 @@  static void netvsc_receive_completion(void *context)
 
 	/* Send a receive completion for the xfer page packet */
 	if (fsend_receive_comp)
-		netvsc_send_recv_completion(device, net_device, transaction_id,
-					status);
+		netvsc_send_recv_completion(device, channel, net_device,
+					    transaction_id, status);
 
 }
 
 static void netvsc_receive(struct netvsc_device *net_device,
+			struct vmbus_channel *channel,
 			struct hv_device *device,
 			struct vmpacket_descriptor *packet)
 {
@@ -748,7 +779,7 @@  static void netvsc_receive(struct netvsc_device *net_device,
 		spin_unlock_irqrestore(&net_device->recv_pkt_list_lock,
 				       flags);
 
-		netvsc_send_recv_completion(device, net_device,
+		netvsc_send_recv_completion(device, channel, net_device,
 					    vmxferpage_packet->d.trans_id,
 					    NVSP_STAT_FAIL);
 
@@ -759,6 +790,7 @@  static void netvsc_receive(struct netvsc_device *net_device,
 	xferpage_packet = (struct xferpage_packet *)listHead.next;
 	list_del(&xferpage_packet->list_ent);
 	xferpage_packet->status = NVSP_STAT_SUCCESS;
+	xferpage_packet->channel = channel;
 
 	/* This is how much we can satisfy */
 	xferpage_packet->count = count - 1;
@@ -800,10 +832,45 @@  static void netvsc_receive(struct netvsc_device *net_device,
 
 }
 
-static void netvsc_channel_cb(void *context)
+
+static void netvsc_send_table(struct hv_device *hdev,
+			      struct vmpacket_descriptor *vmpkt)
+{
+	struct netvsc_device *nvscdev;
+	struct net_device *ndev;
+	struct nvsp_message *nvmsg;
+	int i;
+	u32 count, *tab;
+
+	nvscdev = get_outbound_net_device(hdev);
+	if (!nvscdev)
+		return;
+	ndev = nvscdev->ndev;
+
+	nvmsg = (struct nvsp_message *)((unsigned long)vmpkt +
+					(vmpkt->offset8 << 3));
+
+	if (nvmsg->hdr.msg_type != NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE)
+		return;
+
+	count = nvmsg->msg.v5_msg.send_table.count;
+	if (count != VRSS_SEND_TAB_SIZE) {
+		netdev_err(ndev, "Received wrong send-table size:%u\n", count);
+		return;
+	}
+
+	tab = (u32 *)((unsigned long)&nvmsg->msg.v5_msg.send_table +
+		      nvmsg->msg.v5_msg.send_table.offset);
+
+	for (i = 0; i < count; i++)
+		nvscdev->send_table[i] = tab[i];
+}
+
+void netvsc_channel_cb(void *context)
 {
 	int ret;
-	struct hv_device *device = context;
+	struct vmbus_channel *channel = (struct vmbus_channel *)context;
+	struct hv_device *device;
 	struct netvsc_device *net_device;
 	u32 bytes_recvd;
 	u64 request_id;
@@ -812,14 +879,19 @@  static void netvsc_channel_cb(void *context)
 	int bufferlen = NETVSC_PACKET_SIZE;
 	struct net_device *ndev;
 
+	if (channel->primary_channel != NULL)
+		device = channel->primary_channel->device_obj;
+	else
+		device = channel->device_obj;
+
 	net_device = get_inbound_net_device(device);
 	if (!net_device)
 		return;
 	ndev = net_device->ndev;
-	buffer = net_device->cb_buffer;
+	buffer = get_per_channel_state(channel);
 
 	do {
-		ret = vmbus_recvpacket_raw(device->channel, buffer, bufferlen,
+		ret = vmbus_recvpacket_raw(channel, buffer, bufferlen,
 					   &bytes_recvd, &request_id);
 		if (ret == 0) {
 			if (bytes_recvd > 0) {
@@ -831,8 +903,12 @@  static void netvsc_channel_cb(void *context)
 					break;
 
 				case VM_PKT_DATA_USING_XFER_PAGES:
-					netvsc_receive(net_device,
-							device, desc);
+					netvsc_receive(net_device, channel,
+						       device, desc);
+					break;
+
+				case VM_PKT_DATA_INBAND:
+					netvsc_send_table(device, desc);
 					break;
 
 				default:
@@ -893,6 +969,8 @@  int netvsc_device_add(struct hv_device *device, void *additional_info)
 		goto cleanup;
 	}
 
+	net_device->ring_size = ring_size;
+
 	/*
 	 * Coming into this function, struct net_device * is
 	 * registered as the driver private data.
@@ -917,10 +995,12 @@  int netvsc_device_add(struct hv_device *device, void *additional_info)
 	}
 	init_completion(&net_device->channel_init_wait);
 
+	set_per_channel_state(device->channel, net_device->cb_buffer);
+
 	/* Open the channel */
 	ret = vmbus_open(device->channel, ring_size * PAGE_SIZE,
 			 ring_size * PAGE_SIZE, NULL, 0,
-			 netvsc_channel_cb, device);
+			 netvsc_channel_cb, device->channel);
 
 	if (ret != 0) {
 		netdev_err(ndev, "unable to open channel: %d\n", ret);
@@ -930,6 +1010,8 @@  int netvsc_device_add(struct hv_device *device, void *additional_info)
 	/* Channel is opened */
 	pr_info("hv_netvsc channel opened successfully\n");
 
+	net_device->chn_table[0] = device->channel;
+
 	/* Connect with the NetVsp */
 	ret = netvsc_connect_vsp(device);
 	if (ret != 0) {
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 4e4cf9e..35e58ba 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -101,7 +101,7 @@  static int netvsc_open(struct net_device *net)
 		return ret;
 	}
 
-	netif_start_queue(net);
+	netif_tx_start_all_queues(net);
 
 	nvdev = hv_get_drvdata(device_obj);
 	rdev = nvdev->extension;
@@ -149,6 +149,88 @@  static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size,
 	return ppi;
 }
 
+union sub_key {
+	u64 k;
+	struct {
+		u8 pad[3];
+		u8 kb;
+		u32 ka;
+	};
+};
+
+/* Toeplitz hash function
+ * data: network byte order
+ * return: host byte order
+ */
+static u32 comp_hash(u8 *key, int klen, u8 *data, int dlen)
+{
+	union sub_key subk;
+	int k_next = 4;
+	u8 dt;
+	int i, j;
+	u32 ret = 0;
+
+	subk.k = 0;
+	subk.ka = ntohl(*(u32 *)key);
+
+	for (i = 0; i < dlen; i++) {
+		subk.kb = key[k_next];
+		k_next = (k_next + 1) % klen;
+		dt = data[i];
+		for (j = 0; j < 8; j++) {
+			if (dt & 0x80)
+				ret ^= subk.ka;
+			dt <<= 1;
+			subk.k <<= 1;
+		}
+	}
+
+	return ret;
+}
+
+static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb)
+{
+	struct iphdr *iphdr;
+	int data_len;
+	bool ret = false;
+
+	if (eth_hdr(skb)->h_proto != htons(ETH_P_IP))
+		return false;
+
+	iphdr = ip_hdr(skb);
+
+	if (iphdr->version == 4) {
+		if (iphdr->protocol == IPPROTO_TCP)
+			data_len = 12;
+		else
+			data_len = 8;
+		*hash = comp_hash(netvsc_hash_key, HASH_KEYLEN,
+				  (u8 *)&iphdr->saddr, data_len);
+		ret = true;
+	}
+
+	return ret;
+}
+
+static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
+			void *accel_priv, select_queue_fallback_t fallback)
+{
+	struct net_device_context *net_device_ctx = netdev_priv(ndev);
+	struct hv_device *hdev =  net_device_ctx->device_ctx;
+	struct netvsc_device *nvsc_dev = hv_get_drvdata(hdev);
+	u32 hash;
+	u16 q_idx = 0;
+
+	if (nvsc_dev == NULL || ndev->real_num_tx_queues <= 1)
+		return 0;
+
+	if (netvsc_set_hash(&hash, skb))
+		q_idx = nvsc_dev->send_table[hash % VRSS_SEND_TAB_SIZE] %
+			ndev->real_num_tx_queues;
+
+	return q_idx;
+}
+
 static void netvsc_xmit_completion(void *context)
 {
 	struct hv_netvsc_packet *packet = (struct hv_netvsc_packet *)context;
@@ -331,6 +413,8 @@  static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
 
 	packet->vlan_tci = skb->vlan_tci;
 
+	packet->q_idx = skb_get_queue_mapping(skb);
+
 	packet->is_data_pkt = true;
 	packet->total_data_buflen = skb->len;
 
@@ -528,6 +612,10 @@  int netvsc_recv_callback(struct hv_device *device_obj,
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
 				       packet->vlan_tci);
 
+	skb_record_rx_queue(skb, packet->xfer_page_pkt->channel->
+			    offermsg.offer.sub_channel_index %
+			    net->real_num_rx_queues);
+
 	net->stats.rx_packets++;
 	net->stats.rx_bytes += packet->total_data_buflen;
 
@@ -576,7 +664,7 @@  static int netvsc_change_mtu(struct net_device *ndev, int mtu)
 	hv_set_drvdata(hdev, ndev);
 	device_info.ring_size = ring_size;
 	rndis_filter_device_add(hdev, &device_info);
-	netif_wake_queue(ndev);
+	netif_tx_wake_all_queues(ndev);
 
 	return 0;
 }
@@ -622,6 +710,7 @@  static const struct net_device_ops device_ops = {
 	.ndo_change_mtu =		netvsc_change_mtu,
 	.ndo_validate_addr =		eth_validate_addr,
 	.ndo_set_mac_address =		netvsc_set_mac_addr,
+	.ndo_select_queue =		netvsc_select_queue,
 };
 
 /*
@@ -668,9 +757,11 @@  static int netvsc_probe(struct hv_device *dev,
 	struct net_device *net = NULL;
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device_info device_info;
+	struct netvsc_device *nvdev;
 	int ret;
 
-	net = alloc_etherdev(sizeof(struct net_device_context));
+	net = alloc_etherdev_mq(sizeof(struct net_device_context),
+				num_online_cpus());
 	if (!net)
 		return -ENOMEM;
 
@@ -703,6 +794,12 @@  static int netvsc_probe(struct hv_device *dev,
 	}
 	memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);
 
+	nvdev = hv_get_drvdata(dev);
+	netif_set_real_num_tx_queues(net, nvdev->num_chn);
+	netif_set_real_num_rx_queues(net, nvdev->num_chn);
+	dev_info(&dev->device, "real num tx,rx queues:%u, %u\n",
+		 net->real_num_tx_queues, net->real_num_rx_queues);
+
 	ret = register_netdev(net);
 	if (ret != 0) {
 		pr_err("Unable to register netdev.\n");
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 4a37e3d..2054511 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -31,7 +31,7 @@ 
 #include "hyperv_net.h"
 
 
-#define RNDIS_EXT_LEN 100
+#define RNDIS_EXT_LEN PAGE_SIZE
 struct rndis_request {
 	struct list_head list_ent;
 	struct completion  wait_event;
@@ -94,6 +94,8 @@  static struct rndis_request *get_rndis_request(struct rndis_device *dev,
 	rndis_msg->ndis_msg_type = msg_type;
 	rndis_msg->msg_len = msg_len;
 
+	request->pkt.q_idx = 0;
+
 	/*
 	 * Set the request id. This field is always after the rndis header for
 	 * request/response packet types so we just used the SetRequest as a
@@ -509,6 +511,19 @@  static int rndis_filter_query_device(struct rndis_device *dev, u32 oid,
 	query->info_buflen = 0;
 	query->dev_vc_handle = 0;
 
+	if (oid == OID_GEN_RECEIVE_SCALE_CAPABILITIES) {
+		struct ndis_recv_scale_cap *cap;
+
+		request->request_msg.msg_len +=
+			sizeof(struct ndis_recv_scale_cap);
+		query->info_buflen = sizeof(struct ndis_recv_scale_cap);
+		cap = (struct ndis_recv_scale_cap *)((unsigned long)query +
+						     query->info_buf_offset);
+		cap->hdr.type = NDIS_OBJECT_TYPE_RSS_CAPABILITIES;
+		cap->hdr.rev = NDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2;
+		cap->hdr.size = sizeof(struct ndis_recv_scale_cap);
+	}
+
 	ret = rndis_filter_send_request(dev, request);
 	if (ret != 0)
 		goto cleanup;
@@ -685,6 +700,89 @@  cleanup:
 	return ret;
 }
 
+u8 netvsc_hash_key[HASH_KEYLEN] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
+};
+
+int rndis_filter_set_rss_param(struct rndis_device *rdev, int num_queue)
+{
+	struct net_device *ndev = rdev->net_dev->ndev;
+	struct rndis_request *request;
+	struct rndis_set_request *set;
+	struct rndis_set_complete *set_complete;
+	u32 extlen = sizeof(struct ndis_recv_scale_param) +
+		     4*ITAB_NUM + HASH_KEYLEN;
+	struct ndis_recv_scale_param *rssp;
+	u32 *itab;
+	u8 *keyp;
+	int i, t, ret;
+
+	request = get_rndis_request(
+			rdev, RNDIS_MSG_SET,
+			RNDIS_MESSAGE_SIZE(struct rndis_set_request) + extlen);
+	if (!request)
+		return -ENOMEM;
+
+	set = &request->request_msg.msg.set_req;
+	set->oid = OID_GEN_RECEIVE_SCALE_PARAMETERS;
+	set->info_buflen = extlen;
+	set->info_buf_offset = sizeof(struct rndis_set_request);
+	set->dev_vc_handle = 0;
+
+	rssp = (struct ndis_recv_scale_param *)(set + 1);
+	rssp->hdr.type = NDIS_OBJECT_TYPE_RSS_PARAMETERS;
+	rssp->hdr.rev = NDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2;
+	rssp->hdr.size = sizeof(struct ndis_recv_scale_param);
+	rssp->flag = 0;
+	rssp->hashinfo = NDIS_HASH_FUNC_TOEPLITZ | NDIS_HASH_IPV4 |
+			 NDIS_HASH_TCP_IPV4;
+	rssp->indirect_tabsize = 4*ITAB_NUM;
+	rssp->indirect_taboffset = sizeof(struct ndis_recv_scale_param);
+	rssp->hashkey_size = HASH_KEYLEN;
+	rssp->kashkey_offset = rssp->indirect_taboffset +
+			       rssp->indirect_tabsize;
+
+	/* Set indirection table entries */
+	itab = (u32 *)(rssp + 1);
+	for (i = 0; i < ITAB_NUM; i++)
+		itab[i] = i % num_queue;
+
+	/* Set hask key values */
+	keyp = (u8 *)((unsigned long)rssp + rssp->kashkey_offset);
+	for (i = 0; i < HASH_KEYLEN; i++)
+		keyp[i] = netvsc_hash_key[i];
+
+
+	ret = rndis_filter_send_request(rdev, request);
+	if (ret != 0)
+		goto cleanup;
+
+	t = wait_for_completion_timeout(&request->wait_event, 5*HZ);
+	if (t == 0) {
+		netdev_err(ndev, "timeout before we got a set response...\n");
+		/* can't put_rndis_request, since we may still receive a
+		 * send-completion.
+		 */
+		return -ETIMEDOUT;
+	} else {
+		set_complete = &request->response_msg.msg.set_complete;
+		if (set_complete->status != RNDIS_STATUS_SUCCESS) {
+			netdev_err(ndev, "Fail to set RSS parameters:0x%x\n",
+				   set_complete->status);
+			ret = -EINVAL;
+		}
+	}
+
+cleanup:
+	put_rndis_request(rdev, request);
+	return ret;
+}
+
+
 static int rndis_filter_query_device_link_status(struct rndis_device *dev)
 {
 	u32 size = sizeof(u32);
@@ -876,6 +974,28 @@  static int rndis_filter_close_device(struct rndis_device *dev)
 	return ret;
 }
 
+static void netvsc_sc_open(struct vmbus_channel *new_sc)
+{
+	struct netvsc_device *nvscdev;
+	u16 chn_index = new_sc->offermsg.offer.sub_channel_index;
+	int ret;
+
+	nvscdev = hv_get_drvdata(new_sc->primary_channel->device_obj);
+
+	if (chn_index >= nvscdev->num_chn)
+		return;
+
+	set_per_channel_state(new_sc, nvscdev->sub_cb_buf + (chn_index - 1) *
+			      NETVSC_PACKET_SIZE);
+
+	ret = vmbus_open(new_sc, nvscdev->ring_size * PAGE_SIZE,
+			 nvscdev->ring_size * PAGE_SIZE, NULL, 0,
+			 netvsc_channel_cb, new_sc);
+
+	if (ret == 0)
+		nvscdev->chn_table[chn_index] = new_sc;
+}
+
 int rndis_filter_device_add(struct hv_device *dev,
 				  void *additional_info)
 {
@@ -884,6 +1004,10 @@  int rndis_filter_device_add(struct hv_device *dev,
 	struct rndis_device *rndis_device;
 	struct netvsc_device_info *device_info = additional_info;
 	struct ndis_offload_params offloads;
+	struct nvsp_message *init_packet;
+	int t;
+	struct ndis_recv_scale_cap rsscap;
+	u32 rsscap_size = sizeof(struct ndis_recv_scale_cap);
 
 	rndis_device = get_rndis_device();
 	if (!rndis_device)
@@ -903,6 +1027,7 @@  int rndis_filter_device_add(struct hv_device *dev,
 
 	/* Initialize the rndis device */
 	net_device = hv_get_drvdata(dev);
+	net_device->num_chn = 1;
 
 	net_device->extension = rndis_device;
 	rndis_device->net_dev = net_device;
@@ -942,7 +1067,6 @@  int rndis_filter_device_add(struct hv_device *dev,
 	if (ret)
 		goto err_dev_remv;
 
-
 	rndis_filter_query_device_link_status(rndis_device);
 
 	device_info->link_state = rndis_device->link_state;
@@ -951,7 +1075,66 @@  int rndis_filter_device_add(struct hv_device *dev,
 		 rndis_device->hw_mac_adr,
 		 device_info->link_state ? "down" : "up");
 
-	return ret;
+	if (net_device->nvsp_version < NVSP_PROTOCOL_VERSION_5)
+		return 0;
+
+	/* vRSS setup */
+	memset(&rsscap, 0, rsscap_size);
+	ret = rndis_filter_query_device(rndis_device,
+					OID_GEN_RECEIVE_SCALE_CAPABILITIES,
+					&rsscap, &rsscap_size);
+	if (ret || rsscap.num_recv_que < 2)
+		goto out;
+
+	net_device->num_chn = (num_online_cpus() < rsscap.num_recv_que) ?
+			       num_online_cpus() : rsscap.num_recv_que;
+	if (net_device->num_chn == 1)
+		goto out;
+
+	net_device->sub_cb_buf = vzalloc((net_device->num_chn - 1) *
+					 NETVSC_PACKET_SIZE);
+	if (!net_device->sub_cb_buf) {
+		net_device->num_chn = 1;
+		dev_info(&dev->device, "No memory for subchannels.\n");
+		goto out;
+	}
+
+	vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open);
+
+	init_packet = &net_device->channel_init_pkt;
+	memset(init_packet, 0, sizeof(struct nvsp_message));
+	init_packet->hdr.msg_type = NVSP_MSG5_TYPE_SUBCHANNEL;
+	init_packet->msg.v5_msg.subchn_req.op = NVSP_SUBCHANNEL_ALLOCATE;
+	init_packet->msg.v5_msg.subchn_req.num_subchannels =
+						net_device->num_chn - 1;
+	ret = vmbus_sendpacket(dev->channel, init_packet,
+			       sizeof(struct nvsp_message),
+			       (unsigned long)init_packet,
+			       VM_PKT_DATA_INBAND,
+			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+	if (ret)
+		goto out;
+	t = wait_for_completion_timeout(&net_device->channel_init_wait, 5*HZ);
+	if (t == 0) {
+		ret = -ETIMEDOUT;
+		goto out;
+	}
+	if (init_packet->msg.v5_msg.subchn_comp.status !=
+	    NVSP_STAT_SUCCESS) {
+		ret = -ENODEV;
+		goto out;
+	}
+	net_device->num_chn = 1 +
+		init_packet->msg.v5_msg.subchn_comp.num_subchannels;
+
+	vmbus_are_subchannels_present(dev->channel);
+
+	ret = rndis_filter_set_rss_param(rndis_device, net_device->num_chn);
+
+out:
+	if (ret)
+		net_device->num_chn = 1;
+	return 0; /* return 0 because primary channel can be used alone */
 
 err_dev_remv:
 	rndis_filter_device_remove(dev);