Patchwork [1/2,RESEND] ehea: error handling improvement

login
register
mail settings
Submitter Thomas Klein
Date April 21, 2010, 9:10 a.m.
Message ID <201004211110.55986.tklein@de.ibm.com>
Download mbox | patch
Permalink /patch/50642/
State Accepted
Delegated to: David Miller
Headers show

Comments

Thomas Klein - April 21, 2010, 9:10 a.m.
Reset a port's resources only if they're actually in an error state

Signed-off-by: Thomas Klein <tklein@de.ibm.com>
---

Patch created against net-2.6

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller - April 22, 2010, 5:36 a.m.
From: Thomas Klein <tklein@de.ibm.com>
Date: Wed, 21 Apr 2010 11:10:55 +0200

> Reset a port's resources only if they're actually in an error state
> 
> Signed-off-by: Thomas Klein <tklein@de.ibm.com>
> ---
> 
> Patch created against net-2.6

I thought you were sorry for wasting my time and that you were going
to follow the directions I gave you last time, and I quote:

--------------------
3) These are not appropriate for net-2.6 as we are deep in
   the -rcX series at this point and only the most diabolical
   bug fixes are appropriate.  Therefore, please generate these
   against net-next-2.6, thanks.
--------------------

And here you are generating your patches against net-2.6.  Heck, you
even feel it's worth mentioning explicitly.

Lucky for you the patches happen to apply cleanly to net-next-2.6 so
I've put them there.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Klein - April 23, 2010, 8:22 a.m.
On 04/22/2010 07:36 AM, David Miller wrote:
> From: Thomas Klein<tklein@de.ibm.com>
> Date: Wed, 21 Apr 2010 11:10:55 +0200
>
>> Reset a port's resources only if they're actually in an error state
>>
>> Signed-off-by: Thomas Klein<tklein@de.ibm.com>
>> ---
>>
>> Patch created against net-2.6
>
> I thought you were sorry for wasting my time and that you were going
> to follow the directions I gave you last time, and I quote:
>
> --------------------
> 3) These are not appropriate for net-2.6 as we are deep in
>     the -rcX series at this point and only the most diabolical
>     bug fixes are appropriate.  Therefore, please generate these
>     against net-next-2.6, thanks.
> --------------------
>
> And here you are generating your patches against net-2.6.  Heck, you
> even feel it's worth mentioning explicitly.

Guilty! Allows no excuse. Screwed it. Deeply sorry.

>
> Lucky for you the patches happen to apply cleanly to net-next-2.6 so
> I've put them there.

Thanks!
Thomas

> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff -Nurp net-2.6.orig/drivers/net/ehea/ehea_main.c net-2.6/drivers/net/ehea/ehea_main.c
--- net-2.6.orig/drivers/net/ehea/ehea_main.c	2010-04-21 10:23:21.000000000 +0200
+++ net-2.6/drivers/net/ehea/ehea_main.c	2010-04-21 10:41:21.000000000 +0200
@@ -791,11 +791,17 @@  static struct ehea_cqe *ehea_proc_cqes(s
 		cqe_counter++;
 		rmb();
 		if (cqe->status & EHEA_CQE_STAT_ERR_MASK) {
-			ehea_error("Send Completion Error: Resetting port");
+			ehea_error("Bad send completion status=0x%04X",
+				   cqe->status);
+
 			if (netif_msg_tx_err(pr->port))
 				ehea_dump(cqe, sizeof(*cqe), "Send CQE");
-			ehea_schedule_port_reset(pr->port);
-			break;
+
+			if (cqe->status & EHEA_CQE_STAT_RESET_MASK) {
+				ehea_error("Resetting port");
+				ehea_schedule_port_reset(pr->port);
+				break;
+			}
 		}
 
 		if (netif_msg_tx_done(pr->port))
@@ -901,6 +907,8 @@  static irqreturn_t ehea_qp_aff_irq_handl
 	struct ehea_eqe *eqe;
 	struct ehea_qp *qp;
 	u32 qp_token;
+	u64 resource_type, aer, aerr;
+	int reset_port = 0;
 
 	eqe = ehea_poll_eq(port->qp_eq);
 
@@ -910,11 +918,24 @@  static irqreturn_t ehea_qp_aff_irq_handl
 			   eqe->entry, qp_token);
 
 		qp = port->port_res[qp_token].qp;
-		ehea_error_data(port->adapter, qp->fw_handle);
+
+		resource_type = ehea_error_data(port->adapter, qp->fw_handle,
+						&aer, &aerr);
+
+		if (resource_type == EHEA_AER_RESTYPE_QP) {
+			if ((aer & EHEA_AER_RESET_MASK) ||
+			    (aerr & EHEA_AERR_RESET_MASK))
+				 reset_port = 1;
+		} else
+			reset_port = 1;   /* Reset in case of CQ or EQ error */
+
 		eqe = ehea_poll_eq(port->qp_eq);
 	}
 
-	ehea_schedule_port_reset(port);
+	if (reset_port) {
+		ehea_error("Resetting port");
+		ehea_schedule_port_reset(port);
+	}
 
 	return IRQ_HANDLED;
 }
diff -Nurp net-2.6.orig/drivers/net/ehea/ehea_qmr.c net-2.6/drivers/net/ehea/ehea_qmr.c
--- net-2.6.orig/drivers/net/ehea/ehea_qmr.c	2010-04-21 10:23:21.000000000 +0200
+++ net-2.6/drivers/net/ehea/ehea_qmr.c	2010-04-21 10:41:21.000000000 +0200
@@ -229,14 +229,14 @@  u64 ehea_destroy_cq_res(struct ehea_cq *
 
 int ehea_destroy_cq(struct ehea_cq *cq)
 {
-	u64 hret;
+	u64 hret, aer, aerr;
 	if (!cq)
 		return 0;
 
 	hcp_epas_dtor(&cq->epas);
 	hret = ehea_destroy_cq_res(cq, NORMAL_FREE);
 	if (hret == H_R_STATE) {
-		ehea_error_data(cq->adapter, cq->fw_handle);
+		ehea_error_data(cq->adapter, cq->fw_handle, &aer, &aerr);
 		hret = ehea_destroy_cq_res(cq, FORCE_FREE);
 	}
 
@@ -357,7 +357,7 @@  u64 ehea_destroy_eq_res(struct ehea_eq *
 
 int ehea_destroy_eq(struct ehea_eq *eq)
 {
-	u64 hret;
+	u64 hret, aer, aerr;
 	if (!eq)
 		return 0;
 
@@ -365,7 +365,7 @@  int ehea_destroy_eq(struct ehea_eq *eq)
 
 	hret = ehea_destroy_eq_res(eq, NORMAL_FREE);
 	if (hret == H_R_STATE) {
-		ehea_error_data(eq->adapter, eq->fw_handle);
+		ehea_error_data(eq->adapter, eq->fw_handle, &aer, &aerr);
 		hret = ehea_destroy_eq_res(eq, FORCE_FREE);
 	}
 
@@ -540,7 +540,7 @@  u64 ehea_destroy_qp_res(struct ehea_qp *
 
 int ehea_destroy_qp(struct ehea_qp *qp)
 {
-	u64 hret;
+	u64 hret, aer, aerr;
 	if (!qp)
 		return 0;
 
@@ -548,7 +548,7 @@  int ehea_destroy_qp(struct ehea_qp *qp)
 
 	hret = ehea_destroy_qp_res(qp, NORMAL_FREE);
 	if (hret == H_R_STATE) {
-		ehea_error_data(qp->adapter, qp->fw_handle);
+		ehea_error_data(qp->adapter, qp->fw_handle, &aer, &aerr);
 		hret = ehea_destroy_qp_res(qp, FORCE_FREE);
 	}
 
@@ -986,42 +986,45 @@  void print_error_data(u64 *data)
 	if (length > EHEA_PAGESIZE)
 		length = EHEA_PAGESIZE;
 
-	if (type == 0x8) /* Queue Pair */
+	if (type == EHEA_AER_RESTYPE_QP)
 		ehea_error("QP (resource=%llX) state: AER=0x%llX, AERR=0x%llX, "
 			   "port=%llX", resource, data[6], data[12], data[22]);
-
-	if (type == 0x4) /* Completion Queue */
+	else if (type == EHEA_AER_RESTYPE_CQ)
 		ehea_error("CQ (resource=%llX) state: AER=0x%llX", resource,
 			   data[6]);
-
-	if (type == 0x3) /* Event Queue */
+	else if (type == EHEA_AER_RESTYPE_EQ)
 		ehea_error("EQ (resource=%llX) state: AER=0x%llX", resource,
 			   data[6]);
 
 	ehea_dump(data, length, "error data");
 }
 
-void ehea_error_data(struct ehea_adapter *adapter, u64 res_handle)
+u64 ehea_error_data(struct ehea_adapter *adapter, u64 res_handle,
+		    u64 *aer, u64 *aerr)
 {
 	unsigned long ret;
 	u64 *rblock;
+	u64 type = 0;
 
 	rblock = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!rblock) {
 		ehea_error("Cannot allocate rblock memory.");
-		return;
+		goto out;
 	}
 
-	ret = ehea_h_error_data(adapter->handle,
-				res_handle,
-				rblock);
+	ret = ehea_h_error_data(adapter->handle, res_handle, rblock);
 
-	if (ret == H_R_STATE)
-		ehea_error("No error data is available: %llX.", res_handle);
-	else if (ret == H_SUCCESS)
+	if (ret == H_SUCCESS) {
+		type = EHEA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
+		*aer = rblock[6];
+		*aerr = rblock[12];
 		print_error_data(rblock);
-	else
+	} else if (ret == H_R_STATE) {
+		ehea_error("No error data available: %llX.", res_handle);
+	} else
 		ehea_error("Error data could not be fetched: %llX", res_handle);
 
 	free_page((unsigned long)rblock);
+out:
+	return type;
 }
diff -Nurp net-2.6.orig/drivers/net/ehea/ehea_qmr.h net-2.6/drivers/net/ehea/ehea_qmr.h
--- net-2.6.orig/drivers/net/ehea/ehea_qmr.h	2010-04-21 10:23:21.000000000 +0200
+++ net-2.6/drivers/net/ehea/ehea_qmr.h	2010-04-21 10:41:21.000000000 +0200
@@ -154,6 +154,9 @@  struct ehea_rwqe {
 #define EHEA_CQE_STAT_ERR_IP       0x2000
 #define EHEA_CQE_STAT_ERR_CRC      0x1000
 
+/* Defines which bad send CQE statuses lead to a port reset */
+#define EHEA_CQE_STAT_RESET_MASK   0x0002
+
 struct ehea_cqe {
 	u64 wr_id;		/* work request ID from WQE */
 	u8 type;
@@ -187,6 +190,14 @@  struct ehea_cqe {
 #define EHEA_EQE_SM_MECH_NUMBER  EHEA_BMASK_IBM(48, 55)
 #define EHEA_EQE_SM_PORT_NUMBER  EHEA_BMASK_IBM(56, 63)
 
+#define EHEA_AER_RESTYPE_QP  0x8
+#define EHEA_AER_RESTYPE_CQ  0x4
+#define EHEA_AER_RESTYPE_EQ  0x3
+
+/* Defines which affiliated errors lead to a port reset */
+#define EHEA_AER_RESET_MASK   0xFFFFFFFFFEFFFFFFULL
+#define EHEA_AERR_RESET_MASK  0xFFFFFFFFFFFFFFFFULL
+
 struct ehea_eqe {
 	u64 entry;
 };
@@ -379,7 +390,8 @@  int ehea_gen_smr(struct ehea_adapter *ad
 
 int ehea_rem_mr(struct ehea_mr *mr);
 
-void ehea_error_data(struct ehea_adapter *adapter, u64 res_handle);
+u64 ehea_error_data(struct ehea_adapter *adapter, u64 res_handle,
+		    u64 *aer, u64 *aerr);
 
 int ehea_add_sect_bmap(unsigned long pfn, unsigned long nr_pages);
 int ehea_rem_sect_bmap(unsigned long pfn, unsigned long nr_pages);