Patchwork [v2,4/8] rdma: unpin support

login
register
mail settings
Submitter mrhines@linux.vnet.ibm.com
Date June 28, 2013, 7:59 p.m.
Message ID <1372449603-20431-5-git-send-email-mrhines@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/255661/
State New
Headers show

Comments

mrhines@linux.vnet.ibm.com - June 28, 2013, 7:59 p.m.
From: "Michael R. Hines" <mrhines@us.ibm.com>

As requested, the protocol now includes memory unpinning support.
This has been implemented in a non-optimized manner, in such a way
that one could devise an LRU or other workload-specific information
on top of the basic mechanism to influence the way unpinning happens
during runtime.

The feature is not yet user-facing, and thus can only be enabled
at compile-time.

Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
---
 migration-rdma.c |  143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
Paolo Bonzini - July 1, 2013, 12:04 p.m.
Il 28/06/2013 21:59, mrhines@linux.vnet.ibm.com ha scritto:
> +/*
> + * Perform a non-optimized memory unregistration after every transfer
> + * for demonsration purposes, only if pin-all is not requested.
> + *
> + * Potential optimizations:
> + * 1. Start a new thread to run this function continuously
> +        - for bit clearing
> +        - and for receipt of unregister messages
> + * 2. Use an LRU.
> + * 3. Use workload hints.
> + */
> +#ifdef RDMA_UNREGISTRATION_EXAMPLE
> +static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
> +{
> +    while (rdma->unregistrations[rdma->unregister_current]) {
> +        int ret;
> +        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
> +        uint64_t chunk =
> +            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
> +        uint64_t index =
> +            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
> +        RDMALocalBlock *block =
> +            &(rdma->local_ram_blocks.block[index]);
> +        RDMARegister reg = { .current_index = index };
> +        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
> +                                 };
> +        RDMAControlHeader head = { .len = sizeof(RDMARegister),
> +                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
> +                                   .repeat = 1,
> +                                 };
> +
> +        DDPRINTF("Processing unregister for chunk: %" PRIu64 " at position %d\n",
> +                    chunk, rdma->unregister_current);
> +
> +        rdma->unregistrations[rdma->unregister_current] = 0;
> +        rdma->unregister_current++;
> +
> +        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
> +            rdma->unregister_current = 0;
> +        }
> +
> +        DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);
> +
> +        clear_bit(chunk, block->unregister_bitmap);

The chunk is still registered at this point, shouldn't it be after
the ibv_dereg_mr or something like that?

> +        if (test_bit(chunk, block->transit_bitmap)) {
> +            DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
> +            continue;
> +        }

This was not clear from your answer: who exactly will unregister this
chunk?  Why not call the 15 lines below this one also at this point:

+    if (wr_id == RDMA_WRID_RDMA_WRITE) {
+        uint64_t chunk =
+            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
+        uint64_t index =
+            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
+        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
+
+        DDDPRINTF("completions %s (%" PRId64 ") left %d, "
+                 "block %" PRIu64 ", chunk: %" PRIu64 "\n",
+                 print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk);
+
+        clear_bit(chunk, block->transit_bitmap);
+
+        if (rdma->nb_sent > 0) {
+            rdma->nb_sent--;
+        }

?

> +
> +        ret = ibv_dereg_mr(block->pmr[chunk]);
> +        block->pmr[chunk] = NULL;
> +        block->remote_keys[chunk] = 0;
> +
> +        if (ret != 0) {
> +            perror("unregistration chunk failed");
> +            return -ret;
> +        }
> +        rdma->total_registrations--;
> +
> +        reg.key.chunk = chunk;
> +        register_to_network(&reg);
> +        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
> +                                &resp, NULL, NULL);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Set bit for unregistration in the next iteration.
> + * We cannot transmit right here, but will unpin later.
> + */
> +static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
> +                                        uint64_t chunk, uint64_t wr_id)
> +{
> +    if (rdma->unregistrations[rdma->unregister_next] != 0) {
> +        fprintf(stderr, "rdma migration: queue is full!\n");
> +    } else {
> +        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
> +
> +        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
> +            DDPRINTF("Appending unregister chunk %" PRIu64
> +                    " at position %d\n", chunk, rdma->unregister_next);
> +
> +            rdma->unregistrations[rdma->unregister_next++] = wr_id;
> +
> +            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
> +                rdma->unregister_next = 0;
> +            }
> +        } else {
> +            DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
> +                    chunk);
> +        }
> +    }
> +}
> +#endif
>  static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
>                                     uint8_t *data, RDMAControlHeader *resp,
>                                     int *resp_idx,
> @@ -1006,6 +1132,17 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out)
>          if (rdma->nb_sent > 0) {
>              rdma->nb_sent--;
>          }
> +        if (!rdma->pin_all) {
> +            /*
> +             * FYI: If one wanted to signal a specific chunk to be unregistered
> +             * using LRU or workload-specific information, this is the function
> +             * you would call to do so. That chunk would then get asynchronously
> +             * unregistered later.
> +             */
> +#ifdef RDMA_UNREGISTRATION_EXAMPLE
> +            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
> +#endif
> +        }
>      } else {
>          DDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
>              print_wrid(wr_id), wr_id, rdma->nb_sent);
> @@ -1423,6 +1560,12 @@ retry:
>      chunk_start = ram_chunk_start(block, chunk);
>      chunk_end = ram_chunk_end(block, chunk);
>  
> +    if (!rdma->pin_all) {
> +#ifdef RDMA_UNREGISTRATION_EXAMPLE
> +        qemu_rdma_unregister_waiting(rdma);
> +#endif
> +    }
> +
>      while (test_bit(chunk, block->transit_bitmap)) {
>          (void)count;
>          DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
>
mrhines@linux.vnet.ibm.com - July 1, 2013, 2:23 p.m.
On 07/01/2013 08:04 AM, Paolo Bonzini wrote:
> Il 28/06/2013 21:59, mrhines@linux.vnet.ibm.com ha scritto:
>> +/*
>> + * Perform a non-optimized memory unregistration after every transfer
>> + * for demonsration purposes, only if pin-all is not requested.
>> + *
>> + * Potential optimizations:
>> + * 1. Start a new thread to run this function continuously
>> +        - for bit clearing
>> +        - and for receipt of unregister messages
>> + * 2. Use an LRU.
>> + * 3. Use workload hints.
>> + */
>> +#ifdef RDMA_UNREGISTRATION_EXAMPLE
>> +static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
>> +{
>> +    while (rdma->unregistrations[rdma->unregister_current]) {
>> +        int ret;
>> +        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
>> +        uint64_t chunk =
>> +            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
>> +        uint64_t index =
>> +            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
>> +        RDMALocalBlock *block =
>> +            &(rdma->local_ram_blocks.block[index]);
>> +        RDMARegister reg = { .current_index = index };
>> +        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
>> +                                 };
>> +        RDMAControlHeader head = { .len = sizeof(RDMARegister),
>> +                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
>> +                                   .repeat = 1,
>> +                                 };
>> +
>> +        DDPRINTF("Processing unregister for chunk: %" PRIu64 " at position %d\n",
>> +                    chunk, rdma->unregister_current);
>> +
>> +        rdma->unregistrations[rdma->unregister_current] = 0;
>> +        rdma->unregister_current++;
>> +
>> +        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
>> +            rdma->unregister_current = 0;
>> +        }
>> +
>> +        DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);
>> +
>> +        clear_bit(chunk, block->unregister_bitmap);
> The chunk is still registered at this point, shouldn't it be after
> the ibv_dereg_mr or something like that?

No, this example is completely speculative (and single-threaded).
If the unpin can be done safely (i.e. memory is not being actively
transmitted), only then do we proceed with unregistering it.
Otherwise, we just skip the entire region. After the "in transmit"
memory completes later, we will *again* attempt to unregister
that memory in a later iteration of this function.

>> +        if (test_bit(chunk, block->transit_bitmap)) {
>> +            DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
>> +            continue;
>> +        }
> This was not clear from your answer: who exactly will unregister this
> chunk?  Why not call the 15 lines below this one also at this point:

hehe: This is because of the infiniband programming model =)

I know it's not clear at first. I'll explain:

*Everything* in infiniband is asynchronous, so you have to think about
infiniband code just like parallel code even when it's not explicitly 
parallel.

All messages in infiniband must have what is called a "receive work request"
posted on the receiver side of the connection *before* the sender attempts
to actually send anything. This requires infiniband code to keep a very precise
count of how many work requests have been posted, otherwise the device
will throw an error and the whole connection shuts down.

The block of code below is part of a single-shot blocking call which is
expecting a specific series of asynchronous responses from the other side,
for which a specific number of receive work requests have already been 
posted
on the receiver-side RDMA device.

Without implementing a new thread, we cannot *initiate* a new exchange
of protocol messages *in the middle* of this blocking call path without 
causing the
number of receive work requests posted to the RDMA device to be incorrect.

The only way to unregister the memory (without a new thread) is to *mark*
the memory as "need to unregister" in the bitmap you see here and then
wait for the *current* RDMA transfer to complete and then when the coast
is clear, so to speak, then we can unregister all the memory that was 
previously
registered by piggybacking the unregistrations on top of the same call path
of NEW RDMA transfers that will happen in the future.

Does that make sense?

> +    if (wr_id == RDMA_WRID_RDMA_WRITE) {
> +        uint64_t chunk =
> +            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
> +        uint64_t index =
> +            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
> +        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
> +
> +        DDDPRINTF("completions %s (%" PRId64 ") left %d, "
> +                 "block %" PRIu64 ", chunk: %" PRIu64 "\n",
> +                 print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk);
> +
> +        clear_bit(chunk, block->transit_bitmap);
> +
> +        if (rdma->nb_sent > 0) {
> +            rdma->nb_sent--;
> +        }
>
> ?
>
>> +
>> +        ret = ibv_dereg_mr(block->pmr[chunk]);
>> +        block->pmr[chunk] = NULL;
>> +        block->remote_keys[chunk] = 0;
>> +
>> +        if (ret != 0) {
>> +            perror("unregistration chunk failed");
>> +            return -ret;
>> +        }
>> +        rdma->total_registrations--;
>> +
>> +        reg.key.chunk = chunk;
>> +        register_to_network(&reg);
>> +        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
>> +                                &resp, NULL, NULL);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/*
>> + * Set bit for unregistration in the next iteration.
>> + * We cannot transmit right here, but will unpin later.
>> + */
>> +static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
>> +                                        uint64_t chunk, uint64_t wr_id)
>> +{
>> +    if (rdma->unregistrations[rdma->unregister_next] != 0) {
>> +        fprintf(stderr, "rdma migration: queue is full!\n");
>> +    } else {
>> +        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
>> +
>> +        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
>> +            DDPRINTF("Appending unregister chunk %" PRIu64
>> +                    " at position %d\n", chunk, rdma->unregister_next);
>> +
>> +            rdma->unregistrations[rdma->unregister_next++] = wr_id;
>> +
>> +            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
>> +                rdma->unregister_next = 0;
>> +            }
>> +        } else {
>> +            DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
>> +                    chunk);
>> +        }
>> +    }
>> +}
>> +#endif
>>   static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
>>                                      uint8_t *data, RDMAControlHeader *resp,
>>                                      int *resp_idx,
>> @@ -1006,6 +1132,17 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out)
>>           if (rdma->nb_sent > 0) {
>>               rdma->nb_sent--;
>>           }
>> +        if (!rdma->pin_all) {
>> +            /*
>> +             * FYI: If one wanted to signal a specific chunk to be unregistered
>> +             * using LRU or workload-specific information, this is the function
>> +             * you would call to do so. That chunk would then get asynchronously
>> +             * unregistered later.
>> +             */
>> +#ifdef RDMA_UNREGISTRATION_EXAMPLE
>> +            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
>> +#endif
>> +        }
>>       } else {
>>           DDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
>>               print_wrid(wr_id), wr_id, rdma->nb_sent);
>> @@ -1423,6 +1560,12 @@ retry:
>>       chunk_start = ram_chunk_start(block, chunk);
>>       chunk_end = ram_chunk_end(block, chunk);
>>   
>> +    if (!rdma->pin_all) {
>> +#ifdef RDMA_UNREGISTRATION_EXAMPLE
>> +        qemu_rdma_unregister_waiting(rdma);
>> +#endif
>> +    }
>> +
>>       while (test_bit(chunk, block->transit_bitmap)) {
>>           (void)count;
>>           DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
>>
>

Patch

diff --git a/migration-rdma.c b/migration-rdma.c
index 0bd5e23..6218d48 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -944,6 +944,132 @@  const char *print_wrid(int wrid)
     return wrid_desc[wrid];
 }
 
+/*
+ * RDMA requires memory registration (mlock/pinning), but this is not good for
+ * overcommitment.
+ *
+ * In preparation for the future where LRU information or workload-specific
+ * writable working set memory access behavior is available to QEMU
+ * it would be nice to have in place the ability to UN-register/UN-pin
+ * particular memory regions from the RDMA hardware when it is determined that
+ * those regions of memory will likely not be accessed again in the near future.
+ *
+ * While we do not yet have such information right now, the following
+ * compile-time option allows us to perform a non-optimized version of this
+ * behavior.
+ *
+ * By uncommenting this option, you will cause *all* RDMA transfers to be
+ * unregistered immediately after the transfer completes on both sides of the
+ * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
+ *
+ * This will have a terrible impact on migration performance, so until future
+ * workload information or LRU information is available, do not attempt to use
+ * this feature except for basic testing.
+ */
+//#define RDMA_UNREGISTRATION_EXAMPLE
+
+/*
+ * Perform a non-optimized memory unregistration after every transfer
+ * for demonstration purposes, only if pin-all is not requested.
+ *
+ * Potential optimizations:
+ * 1. Start a new thread to run this function continuously
+        - for bit clearing
+        - and for receipt of unregister messages
+ * 2. Use an LRU.
+ * 3. Use workload hints.
+ */
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
+{
+    while (rdma->unregistrations[rdma->unregister_current]) {
+        int ret;
+        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
+        uint64_t chunk =
+            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
+        uint64_t index =
+            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
+        RDMALocalBlock *block =
+            &(rdma->local_ram_blocks.block[index]);
+        RDMARegister reg = { .current_index = index };
+        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
+                                 };
+        RDMAControlHeader head = { .len = sizeof(RDMARegister),
+                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
+                                   .repeat = 1,
+                                 };
+
+        DDPRINTF("Processing unregister for chunk: %" PRIu64 " at position %d\n",
+                    chunk, rdma->unregister_current);
+
+        rdma->unregistrations[rdma->unregister_current] = 0;
+        rdma->unregister_current++;
+
+        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
+            rdma->unregister_current = 0;
+        }
+
+        DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);
+
+        clear_bit(chunk, block->unregister_bitmap);
+
+        if (test_bit(chunk, block->transit_bitmap)) {
+            DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
+            continue;
+        }
+
+        ret = ibv_dereg_mr(block->pmr[chunk]);
+        block->pmr[chunk] = NULL;
+        block->remote_keys[chunk] = 0;
+
+        if (ret != 0) {
+            perror("unregistration chunk failed");
+            return -ret;
+        }
+        rdma->total_registrations--;
+
+        reg.key.chunk = chunk;
+        register_to_network(&reg);
+        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
+                                &resp, NULL, NULL);
+        if (ret < 0) {
+            return ret;
+        }
+
+        DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
+    }
+
+    return 0;
+}
+
+/*
+ * Set bit for unregistration in the next iteration.
+ * We cannot transmit right here, but will unpin later.
+ */
+static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
+                                        uint64_t chunk, uint64_t wr_id)
+{
+    if (rdma->unregistrations[rdma->unregister_next] != 0) {
+        fprintf(stderr, "rdma migration: queue is full!\n");
+    } else {
+        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
+
+        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
+            DDPRINTF("Appending unregister chunk %" PRIu64
+                    " at position %d\n", chunk, rdma->unregister_next);
+
+            rdma->unregistrations[rdma->unregister_next++] = wr_id;
+
+            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
+                rdma->unregister_next = 0;
+            }
+        } else {
+            DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
+                    chunk);
+        }
+    }
+}
+#endif
 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                    uint8_t *data, RDMAControlHeader *resp,
                                    int *resp_idx,
@@ -1006,6 +1132,17 @@  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out)
         if (rdma->nb_sent > 0) {
             rdma->nb_sent--;
         }
+        if (!rdma->pin_all) {
+            /*
+             * FYI: If one wanted to signal a specific chunk to be unregistered
+             * using LRU or workload-specific information, this is the function
+             * you would call to do so. That chunk would then get asynchronously
+             * unregistered later.
+             */
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
+#endif
+        }
     } else {
         DDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
             print_wrid(wr_id), wr_id, rdma->nb_sent);
@@ -1423,6 +1560,12 @@  retry:
     chunk_start = ram_chunk_start(block, chunk);
     chunk_end = ram_chunk_end(block, chunk);
 
+    if (!rdma->pin_all) {
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+        qemu_rdma_unregister_waiting(rdma);
+#endif
+    }
+
     while (test_bit(chunk, block->transit_bitmap)) {
         (void)count;
         DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64