diff mbox series

[v2,09/20] nvme: add support for the asynchronous event request command

Message ID 20191015103900.313928-10-its@irrelevant.dk
State New
Headers show
Series nvme: support NVMe v1.3d, SGLs and multiple namespaces | expand

Commit Message

Klaus Jensen Oct. 15, 2019, 10:38 a.m. UTC
Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
Section 5.2 ("Asynchronous Event Request command").

Mostly imported from Keith's qemu-nvme tree. Modified to not enqueue
events if something of the same type is already queued (but not cleared
by the host).

Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
---
 hw/block/nvme.c       | 180 ++++++++++++++++++++++++++++++++++++++++--
 hw/block/nvme.h       |  13 ++-
 hw/block/trace-events |   8 ++
 include/block/nvme.h  |   4 +-
 4 files changed, 196 insertions(+), 9 deletions(-)

Comments

Beata Michalska Nov. 12, 2019, 3:04 p.m. UTC | #1
Hi Klaus,

On Tue, 15 Oct 2019 at 11:49, Klaus Jensen <its@irrelevant.dk> wrote:
>
> Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> Section 5.2 ("Asynchronous Event Request command").
>
> Mostly imported from Keith's qemu-nvme tree. Modified to not enqueue
> events if something of the same type is already queued (but not cleared
> by the host).
>
> Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> ---
>  hw/block/nvme.c       | 180 ++++++++++++++++++++++++++++++++++++++++--
>  hw/block/nvme.h       |  13 ++-
>  hw/block/trace-events |   8 ++
>  include/block/nvme.h  |   4 +-
>  4 files changed, 196 insertions(+), 9 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 4412a3bea3bc..5cdee37582f9 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -334,6 +334,46 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>      timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>  }
>
> +static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
> +    uint8_t event_info, uint8_t log_page)
> +{
> +    NvmeAsyncEvent *event;
> +
> +    trace_nvme_enqueue_event(event_type, event_info, log_page);
> +
> +    /*
> +     * Do not enqueue the event if something of this type is already queued.
> +     * This bounds the size of the event queue and makes sure it does not grow
> +     * indefinitely when events are not processed by the host (i.e. does not
> +     * issue any AERs).
> +     */
> +    if (n->aer_mask_queued & (1 << event_type)) {
> +        trace_nvme_enqueue_event_masked(event_type);
> +        return;
> +    }
> +    n->aer_mask_queued |= (1 << event_type);
> +
> +    event = g_new(NvmeAsyncEvent, 1);
> +    event->result = (NvmeAerResult) {
> +        .event_type = event_type,
> +        .event_info = event_info,
> +        .log_page   = log_page,
> +    };
> +
> +    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
> +
> +    timer_mod(n->aer_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +}
> +
> +static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
> +{
> +    n->aer_mask &= ~(1 << event_type);
> +    if (!QTAILQ_EMPTY(&n->aer_queue)) {
> +        timer_mod(n->aer_timer,
> +            qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +    }
> +}
> +
>  static void nvme_rw_cb(void *opaque, int ret)
>  {
>      NvmeRequest *req = opaque;
> @@ -578,7 +618,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
>      return NVME_SUCCESS;
>  }
>
> -static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd,
> +static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
>      uint32_t buf_len, uint64_t off, NvmeRequest *req)
>  {
>      uint32_t trans_len;
> @@ -591,12 +631,16 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd,
>
>      trans_len = MIN(sizeof(*n->elpes) * (n->params.elpe + 1) - off, buf_len);
>
> +    if (!rae) {
> +        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
> +    }
> +
>      return nvme_dma_read_prp(n, (uint8_t *) n->elpes + off, trans_len, prp1,
>          prp2);
>  }
>
> -static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> -    uint64_t off, NvmeRequest *req)
> +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> +    uint32_t buf_len, uint64_t off, NvmeRequest *req)
>  {
>      uint64_t prp1 = le64_to_cpu(cmd->prp1);
>      uint64_t prp2 = le64_to_cpu(cmd->prp2);
> @@ -646,6 +690,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
>      smart.power_on_hours[0] = cpu_to_le64(
>          (((current_ms - n->starttime_ms) / 1000) / 60) / 60);
>
> +    if (!rae) {
> +        nvme_clear_events(n, NVME_AER_TYPE_SMART);
> +    }
> +
>      return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
>          prp2);
>  }
> @@ -698,9 +746,9 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>
>      switch (lid) {
>      case NVME_LOG_ERROR_INFO:
> -        return nvme_error_info(n, cmd, len, off, req);
> +        return nvme_error_info(n, cmd, rae, len, off, req);
>      case NVME_LOG_SMART_INFO:
> -        return nvme_smart_info(n, cmd, len, off, req);
> +        return nvme_smart_info(n, cmd, rae, len, off, req);
>      case NVME_LOG_FW_SLOT_INFO:
>          return nvme_fw_log_info(n, cmd, len, off, req);
>      default:
> @@ -958,6 +1006,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>          break;
>      case NVME_TIMESTAMP:
>          return nvme_get_feature_timestamp(n, cmd);
> +    case NVME_ASYNCHRONOUS_EVENT_CONF:
> +        result = cpu_to_le32(n->features.async_config);
> +        break;
>      default:
>          trace_nvme_err_invalid_getfeat(dw10);
>          return NVME_INVALID_FIELD | NVME_DNR;
> @@ -993,6 +1044,12 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      switch (dw10) {
>      case NVME_TEMPERATURE_THRESHOLD:
>          n->features.temp_thresh = dw11;
> +
> +        if (n->features.temp_thresh <= n->temperature) {
> +            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
> +                NVME_AER_INFO_SMART_TEMP_THRESH, NVME_LOG_SMART_INFO);
> +        }
> +
>          break;
>
>      case NVME_VOLATILE_WRITE_CACHE:
> @@ -1008,6 +1065,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>          break;
>      case NVME_TIMESTAMP:
>          return nvme_set_feature_timestamp(n, cmd);
> +    case NVME_ASYNCHRONOUS_EVENT_CONF:
> +        n->features.async_config = dw11;
> +        break;
>      default:
>          trace_nvme_err_invalid_setfeat(dw10);
>          return NVME_INVALID_FIELD | NVME_DNR;
> @@ -1016,6 +1076,22 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      return NVME_SUCCESS;
>  }
>
> +static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> +    trace_nvme_aer(req->cid);
> +
> +    if (n->outstanding_aers > n->params.aerl) {
> +        trace_nvme_aer_aerl_exceeded();
> +        return NVME_AER_LIMIT_EXCEEDED;
> +    }
> +
> +    n->aer_reqs[n->outstanding_aers] = req;
> +    timer_mod(n->aer_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +    n->outstanding_aers++;
> +
> +    return NVME_NO_COMPLETE;
> +}
> +
>  static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>  {
>      switch (cmd->opcode) {
> @@ -1037,12 +1113,60 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>          return nvme_set_feature(n, cmd, req);
>      case NVME_ADM_CMD_GET_FEATURES:
>          return nvme_get_feature(n, cmd, req);
> +    case NVME_ADM_CMD_ASYNC_EV_REQ:
> +        return nvme_aer(n, cmd, req);
>      default:
>          trace_nvme_err_invalid_admin_opc(cmd->opcode);
>          return NVME_INVALID_OPCODE | NVME_DNR;
>      }
>  }
>
> +static void nvme_process_aers(void *opaque)
> +{
> +    NvmeCtrl *n = opaque;
> +    NvmeAsyncEvent *event, *next;
> +
> +    trace_nvme_process_aers();
> +
> +    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
> +        NvmeRequest *req;
> +        NvmeAerResult *result;
> +
> +        /* can't post cqe if there is nothing to complete */
> +        if (!n->outstanding_aers) {
> +            trace_nvme_no_outstanding_aers();
> +            break;
> +        }
> +
> +        /* ignore if masked (cqe posted, but event not cleared) */
> +        if (n->aer_mask & (1 << event->result.event_type)) {
> +            trace_nvme_aer_masked(event->result.event_type, n->aer_mask);
> +            continue;
> +        }
> +
> +        QTAILQ_REMOVE(&n->aer_queue, event, entry);
> +
> +        n->aer_mask |= 1 << event->result.event_type;
> +        n->aer_mask_queued &= ~(1 << event->result.event_type);
> +        n->outstanding_aers--;
> +
> +        req = n->aer_reqs[n->outstanding_aers];
> +
> +        result = (NvmeAerResult *) &req->cqe.result;
> +        result->event_type = event->result.event_type;
> +        result->event_info = event->result.event_info;
> +        result->log_page = event->result.log_page;
> +        g_free(event);
> +
> +        req->status = NVME_SUCCESS;
> +
> +        trace_nvme_aer_post_cqe(result->event_type, result->event_info,
> +            result->log_page);
> +
> +        nvme_enqueue_req_completion(&n->admin_cq, req);
> +    }
> +}
> +
>  static void nvme_process_sq(void *opaque)
>  {
>      NvmeSQueue *sq = opaque;
> @@ -1092,6 +1216,20 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
>          }
>      }
>
> +    if (n->aer_timer) {
> +        timer_del(n->aer_timer);
> +        timer_free(n->aer_timer);
> +        n->aer_timer = NULL;
> +    }
> +
> +    while (!QTAILQ_EMPTY(&n->aer_queue)) {
> +        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
> +        QTAILQ_REMOVE(&n->aer_queue, event, entry);
> +        g_free(event);
> +    }
> +
> +    n->outstanding_aers = 0;
> +
>      blk_flush(n->conf.blk);
>      n->bar.cc = 0;
>  }
> @@ -1188,6 +1326,9 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>
>      nvme_set_timestamp(n, 0ULL);
>
> +    n->aer_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_aers, n);
> +    QTAILQ_INIT(&n->aer_queue);
> +

Is the timer really needed here ? The CEQ can be posted either when requested
by host through AER, if there are any pending events, or once the
event is triggered
and there are active AER's.

>      return 0;
>  }
>
> @@ -1380,6 +1521,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>                             "completion queue doorbell write"
>                             " for nonexistent queue,"
>                             " sqid=%"PRIu32", ignoring", qid);
> +
> +            if (n->outstanding_aers) {
> +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> +                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> +                    NVME_LOG_ERROR_INFO);
> +            }
> +
This one (as well as cases below) might not be entirely right
according to the spec. If given event is enabled for asynchronous
reporting the controller should retain that even. In this case, the event
will be ignored as there is no pending request.

>              return;
>          }
>
> @@ -1390,6 +1538,12 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>                             " beyond queue size, sqid=%"PRIu32","
>                             " new_head=%"PRIu16", ignoring",
>                             qid, new_head);
> +
> +            if (n->outstanding_aers) {
> +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> +                    NVME_AER_INFO_ERR_INVALID_DB_VALUE, NVME_LOG_ERROR_INFO);
> +            }
> +
>              return;
>          }
>
> @@ -1418,6 +1572,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>                             "submission queue doorbell write"
>                             " for nonexistent queue,"
>                             " sqid=%"PRIu32", ignoring", qid);
> +
> +            if (n->outstanding_aers) {
> +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> +                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> +                    NVME_LOG_ERROR_INFO);
> +            }
> +
>              return;
>          }
>
> @@ -1428,6 +1589,12 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>                             " beyond queue size, sqid=%"PRIu32","
>                             " new_tail=%"PRIu16", ignoring",
>                             qid, new_tail);
> +
> +            if (n->outstanding_aers) {
> +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> +                    NVME_AER_INFO_ERR_INVALID_DB_VALUE, NVME_LOG_ERROR_INFO);
> +            }
> +
>              return;
>          }
>
> @@ -1524,6 +1691,7 @@ static void nvme_init_state(NvmeCtrl *n)
>      n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
>      n->temperature = NVME_TEMPERATURE;
>      n->features.temp_thresh = 0x14d;
> +    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
>  }
>
>  static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> @@ -1591,6 +1759,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>      id->ver = cpu_to_le32(0x00010201);
>      id->oacs = cpu_to_le16(0);
>      id->acl = 3;
> +    id->aerl = n->params.aerl;

What about the configuration for the asynchronous events ?

BR

Beata


>      id->frmw = 7 << 1;
>      id->lpa = 1 << 2;
>      id->elpe = n->params.elpe;
> @@ -1678,6 +1847,7 @@ static void nvme_exit(PCIDevice *pci_dev)
>      g_free(n->cq);
>      g_free(n->sq);
>      g_free(n->elpes);
> +    g_free(n->aer_reqs);
>
>      if (n->params.cmb_size_mb) {
>          g_free(n->cmbuf);
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index fcfd504e3f9f..3fc36f577b46 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -7,17 +7,19 @@
>      DEFINE_PROP_STRING("serial", _state, _props.serial), \
>      DEFINE_PROP_UINT32("cmb_size_mb", _state, _props.cmb_size_mb, 0), \
>      DEFINE_PROP_UINT32("num_queues", _state, _props.num_queues, 64), \
> -    DEFINE_PROP_UINT8("elpe", _state, _props.elpe, 24)
> +    DEFINE_PROP_UINT8("elpe", _state, _props.elpe, 24), \
> +    DEFINE_PROP_UINT8("aerl", _state, _props.aerl, 3)
>
>  typedef struct NvmeParams {
>      char     *serial;
>      uint32_t num_queues;
>      uint32_t cmb_size_mb;
>      uint8_t  elpe;
> +    uint8_t  aerl;
>  } NvmeParams;
>
>  typedef struct NvmeAsyncEvent {
> -    QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry;
> +    QTAILQ_ENTRY(NvmeAsyncEvent) entry;
>      NvmeAerResult result;
>  } NvmeAsyncEvent;
>
> @@ -89,6 +91,7 @@ typedef struct NvmeCtrl {
>      uint32_t    num_namespaces;
>      uint32_t    max_q_ents;
>      uint64_t    ns_size;
> +    uint8_t     outstanding_aers;
>      uint32_t    cmbsz;
>      uint32_t    cmbloc;
>      uint8_t     *cmbuf;
> @@ -98,6 +101,12 @@ typedef struct NvmeCtrl {
>      uint64_t    starttime_ms;
>      uint16_t    temperature;
>
> +    QEMUTimer   *aer_timer;
> +    uint8_t     aer_mask;
> +    uint8_t     aer_mask_queued;
> +    NvmeRequest **aer_reqs;
> +    QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
> +
>      NvmeNamespace   *namespaces;
>      NvmeSQueue      **sq;
>      NvmeCQueue      **cq;
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index a2c7919c861e..6ddb13d34061 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -47,6 +47,14 @@ nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_coun
>  nvme_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64""
>  nvme_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64""
>  nvme_get_log(uint16_t cid, uint16_t lid, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx16" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64""
> +nvme_process_aers(void) "processing aers"
> +nvme_aer(uint16_t cid) "cid %"PRIu16""
> +nvme_aer_aerl_exceeded(void) "aerl exceeded"
> +nvme_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 0x%"PRIx8""
> +nvme_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
> +nvme_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
> +nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
> +nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
>  nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
>  nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
>  nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index a088ec7d4e38..7f1832d531f0 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -386,8 +386,8 @@ enum NvmeAsyncEventRequest {
>      NVME_AER_TYPE_SMART                     = 1,
>      NVME_AER_TYPE_IO_SPECIFIC               = 6,
>      NVME_AER_TYPE_VENDOR_SPECIFIC           = 7,
> -    NVME_AER_INFO_ERR_INVALID_SQ            = 0,
> -    NVME_AER_INFO_ERR_INVALID_DB            = 1,
> +    NVME_AER_INFO_ERR_INVALID_DB_REGISTER   = 0,
> +    NVME_AER_INFO_ERR_INVALID_DB_VALUE      = 1,
>      NVME_AER_INFO_ERR_DIAG_FAIL             = 2,
>      NVME_AER_INFO_ERR_PERS_INTERNAL_ERR     = 3,
>      NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR    = 4,
> --
> 2.23.0
>
>
Klaus Jensen Nov. 19, 2019, 7:51 p.m. UTC | #2
On Tue, Nov 12, 2019 at 03:04:59PM +0000, Beata Michalska wrote:
> Hi Klaus,
> 
> On Tue, 15 Oct 2019 at 11:49, Klaus Jensen <its@irrelevant.dk> wrote:
> > @@ -1188,6 +1326,9 @@ static int nvme_start_ctrl(NvmeCtrl *n)
> >
> >      nvme_set_timestamp(n, 0ULL);
> >
> > +    n->aer_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_aers, n);
> > +    QTAILQ_INIT(&n->aer_queue);
> > +
> 
> Is the timer really needed here ? The CEQ can be posted either when requested
> by host through AER, if there are any pending events, or once the
> event is triggered
> and there are active AER's.
> 

I guess you are right. I mostly cribbed this from Keith's tree, but I
see no reason to keep the timer.

Keith, do you have any comments on this?

> > @@ -1380,6 +1521,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >                             "completion queue doorbell write"
> >                             " for nonexistent queue,"
> >                             " sqid=%"PRIu32", ignoring", qid);
> > +
> > +            if (n->outstanding_aers) {
> > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > +                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> > +                    NVME_LOG_ERROR_INFO);
> > +            }
> > +
> This one (as well as cases below) might not be entirely right
> according to the spec. If given event is enabled for asynchronous
> reporting the controller should retain that even. In this case, the event
> will be ignored as there is no pending request.
> 

I understand these notifications to be special cases (i.e. they cannot
be enabled/disabled through the Asynchronous Event Configuration
feature). See Section 4.1 of NVM Express 1.2.1. The spec specifically
says that "... and an Asynchronous Event Request command is outstanding,
...).

> > @@ -1591,6 +1759,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
> >      id->ver = cpu_to_le32(0x00010201);
> >      id->oacs = cpu_to_le16(0);
> >      id->acl = 3;
> > +    id->aerl = n->params.aerl;
> 
> What about the configuration for the asynchronous events ?
> 

It will default to an AEC vector of 0 (everything disabled).


K
Beata Michalska Nov. 25, 2019, 12:44 p.m. UTC | #3
On Tue, 19 Nov 2019 at 19:51, Klaus Birkelund <its@irrelevant.dk> wrote:
>
> On Tue, Nov 12, 2019 at 03:04:59PM +0000, Beata Michalska wrote:
> > Hi Klaus,
> >
> > On Tue, 15 Oct 2019 at 11:49, Klaus Jensen <its@irrelevant.dk> wrote:
> > > @@ -1188,6 +1326,9 @@ static int nvme_start_ctrl(NvmeCtrl *n)
> > >
> > >      nvme_set_timestamp(n, 0ULL);
> > >
> > > +    n->aer_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_aers, n);
> > > +    QTAILQ_INIT(&n->aer_queue);
> > > +
> >
> > Is the timer really needed here ? The CEQ can be posted either when requested
> > by host through AER, if there are any pending events, or once the
> > event is triggered
> > and there are active AER's.
> >
>
> I guess you are right. I mostly cribbed this from Keith's tree, but I
> see no reason to keep the timer.
>
> Keith, do you have any comments on this?
>
> > > @@ -1380,6 +1521,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> > >                             "completion queue doorbell write"
> > >                             " for nonexistent queue,"
> > >                             " sqid=%"PRIu32", ignoring", qid);
> > > +
> > > +            if (n->outstanding_aers) {
> > > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > > +                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> > > +                    NVME_LOG_ERROR_INFO);
> > > +            }
> > > +
> > This one (as well as cases below) might not be entirely right
> > according to the spec. If given event is enabled for asynchronous
> > reporting the controller should retain that even. In this case, the event
> > will be ignored as there is no pending request.
> >
>
> I understand these notifications to be special cases (i.e. they cannot
> be enabled/disabled through the Asynchronous Event Configuration
> feature). See Section 4.1 of NVM Express 1.2.1. The spec specifically
> says that "... and an Asynchronous Event Request command is outstanding,
> ...).
>

 OK, I have missed that one.
Thanks for the reference.

BR
Beata
> > > @@ -1591,6 +1759,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
> > >      id->ver = cpu_to_le32(0x00010201);
> > >      id->oacs = cpu_to_le16(0);
> > >      id->acl = 3;
> > > +    id->aerl = n->params.aerl;
> >
> > What about the configuration for the asynchronous events ?
> >
>
> It will default to an AEC vector of 0 (everything disabled).
>
>
> K
diff mbox series

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4412a3bea3bc..5cdee37582f9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -334,6 +334,46 @@  static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
     timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
 }
 
+static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
+    uint8_t event_info, uint8_t log_page)
+{
+    NvmeAsyncEvent *event;
+
+    trace_nvme_enqueue_event(event_type, event_info, log_page);
+
+    /*
+     * Do not enqueue the event if something of this type is already queued.
+     * This bounds the size of the event queue and makes sure it does not grow
+     * indefinitely when events are not processed by the host (i.e. does not
+     * issue any AERs).
+     */
+    if (n->aer_mask_queued & (1 << event_type)) {
+        trace_nvme_enqueue_event_masked(event_type);
+        return;
+    }
+    n->aer_mask_queued |= (1 << event_type);
+
+    event = g_new(NvmeAsyncEvent, 1);
+    event->result = (NvmeAerResult) {
+        .event_type = event_type,
+        .event_info = event_info,
+        .log_page   = log_page,
+    };
+
+    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
+
+    timer_mod(n->aer_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+}
+
+static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
+{
+    n->aer_mask &= ~(1 << event_type);
+    if (!QTAILQ_EMPTY(&n->aer_queue)) {
+        timer_mod(n->aer_timer,
+            qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+    }
+}
+
 static void nvme_rw_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -578,7 +618,7 @@  static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd,
+static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
     uint32_t buf_len, uint64_t off, NvmeRequest *req)
 {
     uint32_t trans_len;
@@ -591,12 +631,16 @@  static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd,
 
     trans_len = MIN(sizeof(*n->elpes) * (n->params.elpe + 1) - off, buf_len);
 
+    if (!rae) {
+        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
+    }
+
     return nvme_dma_read_prp(n, (uint8_t *) n->elpes + off, trans_len, prp1,
         prp2);
 }
 
-static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
-    uint64_t off, NvmeRequest *req)
+static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
+    uint32_t buf_len, uint64_t off, NvmeRequest *req)
 {
     uint64_t prp1 = le64_to_cpu(cmd->prp1);
     uint64_t prp2 = le64_to_cpu(cmd->prp2);
@@ -646,6 +690,10 @@  static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
     smart.power_on_hours[0] = cpu_to_le64(
         (((current_ms - n->starttime_ms) / 1000) / 60) / 60);
 
+    if (!rae) {
+        nvme_clear_events(n, NVME_AER_TYPE_SMART);
+    }
+
     return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
         prp2);
 }
@@ -698,9 +746,9 @@  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 
     switch (lid) {
     case NVME_LOG_ERROR_INFO:
-        return nvme_error_info(n, cmd, len, off, req);
+        return nvme_error_info(n, cmd, rae, len, off, req);
     case NVME_LOG_SMART_INFO:
-        return nvme_smart_info(n, cmd, len, off, req);
+        return nvme_smart_info(n, cmd, rae, len, off, req);
     case NVME_LOG_FW_SLOT_INFO:
         return nvme_fw_log_info(n, cmd, len, off, req);
     default:
@@ -958,6 +1006,9 @@  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         break;
     case NVME_TIMESTAMP:
         return nvme_get_feature_timestamp(n, cmd);
+    case NVME_ASYNCHRONOUS_EVENT_CONF:
+        result = cpu_to_le32(n->features.async_config);
+        break;
     default:
         trace_nvme_err_invalid_getfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -993,6 +1044,12 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (dw10) {
     case NVME_TEMPERATURE_THRESHOLD:
         n->features.temp_thresh = dw11;
+
+        if (n->features.temp_thresh <= n->temperature) {
+            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
+                NVME_AER_INFO_SMART_TEMP_THRESH, NVME_LOG_SMART_INFO);
+        }
+
         break;
 
     case NVME_VOLATILE_WRITE_CACHE:
@@ -1008,6 +1065,9 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         break;
     case NVME_TIMESTAMP:
         return nvme_set_feature_timestamp(n, cmd);
+    case NVME_ASYNCHRONOUS_EVENT_CONF:
+        n->features.async_config = dw11;
+        break;
     default:
         trace_nvme_err_invalid_setfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -1016,6 +1076,22 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+    trace_nvme_aer(req->cid);
+
+    if (n->outstanding_aers > n->params.aerl) {
+        trace_nvme_aer_aerl_exceeded();
+        return NVME_AER_LIMIT_EXCEEDED;
+    }
+
+    n->aer_reqs[n->outstanding_aers] = req;
+    timer_mod(n->aer_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+    n->outstanding_aers++;
+
+    return NVME_NO_COMPLETE;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
     switch (cmd->opcode) {
@@ -1037,12 +1113,60 @@  static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         return nvme_set_feature(n, cmd, req);
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
+    case NVME_ADM_CMD_ASYNC_EV_REQ:
+        return nvme_aer(n, cmd, req);
     default:
         trace_nvme_err_invalid_admin_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
 
+static void nvme_process_aers(void *opaque)
+{
+    NvmeCtrl *n = opaque;
+    NvmeAsyncEvent *event, *next;
+
+    trace_nvme_process_aers();
+
+    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
+        NvmeRequest *req;
+        NvmeAerResult *result;
+
+        /* can't post cqe if there is nothing to complete */
+        if (!n->outstanding_aers) {
+            trace_nvme_no_outstanding_aers();
+            break;
+        }
+
+        /* ignore if masked (cqe posted, but event not cleared) */
+        if (n->aer_mask & (1 << event->result.event_type)) {
+            trace_nvme_aer_masked(event->result.event_type, n->aer_mask);
+            continue;
+        }
+
+        QTAILQ_REMOVE(&n->aer_queue, event, entry);
+
+        n->aer_mask |= 1 << event->result.event_type;
+        n->aer_mask_queued &= ~(1 << event->result.event_type);
+        n->outstanding_aers--;
+
+        req = n->aer_reqs[n->outstanding_aers];
+
+        result = (NvmeAerResult *) &req->cqe.result;
+        result->event_type = event->result.event_type;
+        result->event_info = event->result.event_info;
+        result->log_page = event->result.log_page;
+        g_free(event);
+
+        req->status = NVME_SUCCESS;
+
+        trace_nvme_aer_post_cqe(result->event_type, result->event_info,
+            result->log_page);
+
+        nvme_enqueue_req_completion(&n->admin_cq, req);
+    }
+}
+
 static void nvme_process_sq(void *opaque)
 {
     NvmeSQueue *sq = opaque;
@@ -1092,6 +1216,20 @@  static void nvme_clear_ctrl(NvmeCtrl *n)
         }
     }
 
+    if (n->aer_timer) {
+        timer_del(n->aer_timer);
+        timer_free(n->aer_timer);
+        n->aer_timer = NULL;
+    }
+
+    while (!QTAILQ_EMPTY(&n->aer_queue)) {
+        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
+        QTAILQ_REMOVE(&n->aer_queue, event, entry);
+        g_free(event);
+    }
+
+    n->outstanding_aers = 0;
+
     blk_flush(n->conf.blk);
     n->bar.cc = 0;
 }
@@ -1188,6 +1326,9 @@  static int nvme_start_ctrl(NvmeCtrl *n)
 
     nvme_set_timestamp(n, 0ULL);
 
+    n->aer_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_aers, n);
+    QTAILQ_INIT(&n->aer_queue);
+
     return 0;
 }
 
@@ -1380,6 +1521,13 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
                            "completion queue doorbell write"
                            " for nonexistent queue,"
                            " sqid=%"PRIu32", ignoring", qid);
+
+            if (n->outstanding_aers) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
+                    NVME_LOG_ERROR_INFO);
+            }
+
             return;
         }
 
@@ -1390,6 +1538,12 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
                            " beyond queue size, sqid=%"PRIu32","
                            " new_head=%"PRIu16", ignoring",
                            qid, new_head);
+
+            if (n->outstanding_aers) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                    NVME_AER_INFO_ERR_INVALID_DB_VALUE, NVME_LOG_ERROR_INFO);
+            }
+
             return;
         }
 
@@ -1418,6 +1572,13 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
                            "submission queue doorbell write"
                            " for nonexistent queue,"
                            " sqid=%"PRIu32", ignoring", qid);
+
+            if (n->outstanding_aers) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
+                    NVME_LOG_ERROR_INFO);
+            }
+
             return;
         }
 
@@ -1428,6 +1589,12 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
                            " beyond queue size, sqid=%"PRIu32","
                            " new_tail=%"PRIu16", ignoring",
                            qid, new_tail);
+
+            if (n->outstanding_aers) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                    NVME_AER_INFO_ERR_INVALID_DB_VALUE, NVME_LOG_ERROR_INFO);
+            }
+
             return;
         }
 
@@ -1524,6 +1691,7 @@  static void nvme_init_state(NvmeCtrl *n)
     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
     n->temperature = NVME_TEMPERATURE;
     n->features.temp_thresh = 0x14d;
+    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 }
 
 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -1591,6 +1759,7 @@  static void nvme_init_ctrl(NvmeCtrl *n)
     id->ver = cpu_to_le32(0x00010201);
     id->oacs = cpu_to_le16(0);
     id->acl = 3;
+    id->aerl = n->params.aerl;
     id->frmw = 7 << 1;
     id->lpa = 1 << 2;
     id->elpe = n->params.elpe;
@@ -1678,6 +1847,7 @@  static void nvme_exit(PCIDevice *pci_dev)
     g_free(n->cq);
     g_free(n->sq);
     g_free(n->elpes);
+    g_free(n->aer_reqs);
 
     if (n->params.cmb_size_mb) {
         g_free(n->cmbuf);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index fcfd504e3f9f..3fc36f577b46 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -7,17 +7,19 @@ 
     DEFINE_PROP_STRING("serial", _state, _props.serial), \
     DEFINE_PROP_UINT32("cmb_size_mb", _state, _props.cmb_size_mb, 0), \
     DEFINE_PROP_UINT32("num_queues", _state, _props.num_queues, 64), \
-    DEFINE_PROP_UINT8("elpe", _state, _props.elpe, 24)
+    DEFINE_PROP_UINT8("elpe", _state, _props.elpe, 24), \
+    DEFINE_PROP_UINT8("aerl", _state, _props.aerl, 3)
 
 typedef struct NvmeParams {
     char     *serial;
     uint32_t num_queues;
     uint32_t cmb_size_mb;
     uint8_t  elpe;
+    uint8_t  aerl;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
-    QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry;
+    QTAILQ_ENTRY(NvmeAsyncEvent) entry;
     NvmeAerResult result;
 } NvmeAsyncEvent;
 
@@ -89,6 +91,7 @@  typedef struct NvmeCtrl {
     uint32_t    num_namespaces;
     uint32_t    max_q_ents;
     uint64_t    ns_size;
+    uint8_t     outstanding_aers;
     uint32_t    cmbsz;
     uint32_t    cmbloc;
     uint8_t     *cmbuf;
@@ -98,6 +101,12 @@  typedef struct NvmeCtrl {
     uint64_t    starttime_ms;
     uint16_t    temperature;
 
+    QEMUTimer   *aer_timer;
+    uint8_t     aer_mask;
+    uint8_t     aer_mask_queued;
+    NvmeRequest **aer_reqs;
+    QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
+
     NvmeNamespace   *namespaces;
     NvmeSQueue      **sq;
     NvmeCQueue      **cq;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index a2c7919c861e..6ddb13d34061 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -47,6 +47,14 @@  nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_coun
 nvme_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64""
 nvme_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64""
 nvme_get_log(uint16_t cid, uint16_t lid, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx16" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64""
+nvme_process_aers(void) "processing aers"
+nvme_aer(uint16_t cid) "cid %"PRIu16""
+nvme_aer_aerl_exceeded(void) "aerl exceeded"
+nvme_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 0x%"PRIx8""
+nvme_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
+nvme_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
+nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
+nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
 nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
 nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
 nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index a088ec7d4e38..7f1832d531f0 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -386,8 +386,8 @@  enum NvmeAsyncEventRequest {
     NVME_AER_TYPE_SMART                     = 1,
     NVME_AER_TYPE_IO_SPECIFIC               = 6,
     NVME_AER_TYPE_VENDOR_SPECIFIC           = 7,
-    NVME_AER_INFO_ERR_INVALID_SQ            = 0,
-    NVME_AER_INFO_ERR_INVALID_DB            = 1,
+    NVME_AER_INFO_ERR_INVALID_DB_REGISTER   = 0,
+    NVME_AER_INFO_ERR_INVALID_DB_VALUE      = 1,
     NVME_AER_INFO_ERR_DIAG_FAIL             = 2,
     NVME_AER_INFO_ERR_PERS_INTERNAL_ERR     = 3,
     NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR    = 4,