Patchwork [PATCHv2,1/3] qemu: memory notifiers

login
register
mail settings
Submitter Michael S. Tsirkin
Date Jan. 4, 2010, 7:49 p.m.
Message ID <20100104194904.GB21299@redhat.com>
Download mbox | patch
Permalink /patch/42212/
State New
Headers show

Comments

Michael S. Tsirkin - Jan. 4, 2010, 7:49 p.m.
This adds notifiers for phys memory changes: a set of callbacks that
vhost can register and update kernel accordingly.  Down the road, kvm
code can be switched to use these as well, instead of calling kvm code
directly from exec.c as is done now.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 cpu-common.h |   19 ++++++++++
 exec.c       |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 127 insertions(+), 3 deletions(-)
Avi Kivity - Jan. 18, 2010, 1:02 p.m.
On 01/04/2010 09:49 PM, Michael S. Tsirkin wrote:
> This adds notifiers for phys memory changes: a set of callbacks that
> vhost can register and update kernel accordingly.  Down the road, kvm
> code can be switched to use these as well, instead of calling kvm code
> directly from exec.c as is done now.
>
>
> +
> +static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map,
> +                                         CPUPhysMemoryClient *client)
> +{
> +    PhysPageDesc *pd;
> +    int l1, l2;
> +
> +    for (l1 = 0; l1<  L1_SIZE; ++l1) {
> +        pd = phys_map[l1];
> +        if (!pd) {
> +            continue;
> +        }
> +        for (l2 = 0; l2<  L2_SIZE; ++l2) {
> +            if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) {
> +                continue;
> +            }
> +            client->set_memory(client, pd[l2].region_offset,
> +                               TARGET_PAGE_SIZE, pd[l2].phys_offset);
> +        }
> +    }
> +}
> +
> +static void phys_page_for_each(CPUPhysMemoryClient *client)
> +{
> +#if TARGET_PHYS_ADDR_SPACE_BITS>  32
> +
> +#if TARGET_PHYS_ADDR_SPACE_BITS>  (32 + L1_BITS)
> +#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
> +#endif
> +    void **phys_map = (void **)l1_phys_map;
> +    int l1;
> +    if (!l1_phys_map) {
> +        return;
> +    }
> +    for (l1 = 0; l1<  L1_SIZE; ++l1) {
> +        if (phys_map[l1]) {
> +            phys_page_for_each_in_l1_map(phys_map[l1], client);
> +        }
> +    }
> +#else
> +    if (!l1_phys_map) {
> +        return;
> +    }
> +    phys_page_for_each_in_l1_map(l1_phys_map, client);
> +#endif
> +}
>    

This looks pretty frightening.  What is it needed for?

I think we should stick with range operations, but maybe I misunderstood 
something here.

Otherwise, I like this patchset.
Michael S. Tsirkin - Jan. 18, 2010, 1:52 p.m.
On Mon, Jan 18, 2010 at 03:02:59PM +0200, Avi Kivity wrote:
> On 01/04/2010 09:49 PM, Michael S. Tsirkin wrote:
>> This adds notifiers for phys memory changes: a set of callbacks that
>> vhost can register and update kernel accordingly.  Down the road, kvm
>> code can be switched to use these as well, instead of calling kvm code
>> directly from exec.c as is done now.
>>
>>
>> +
>> +static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map,
>> +                                         CPUPhysMemoryClient *client)
>> +{
>> +    PhysPageDesc *pd;
>> +    int l1, l2;
>> +
>> +    for (l1 = 0; l1<  L1_SIZE; ++l1) {
>> +        pd = phys_map[l1];
>> +        if (!pd) {
>> +            continue;
>> +        }
>> +        for (l2 = 0; l2<  L2_SIZE; ++l2) {
>> +            if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) {
>> +                continue;
>> +            }
>> +            client->set_memory(client, pd[l2].region_offset,
>> +                               TARGET_PAGE_SIZE, pd[l2].phys_offset);
>> +        }
>> +    }
>> +}
>> +
>> +static void phys_page_for_each(CPUPhysMemoryClient *client)
>> +{
>> +#if TARGET_PHYS_ADDR_SPACE_BITS>  32
>> +
>> +#if TARGET_PHYS_ADDR_SPACE_BITS>  (32 + L1_BITS)
>> +#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
>> +#endif
>> +    void **phys_map = (void **)l1_phys_map;
>> +    int l1;
>> +    if (!l1_phys_map) {
>> +        return;
>> +    }
>> +    for (l1 = 0; l1<  L1_SIZE; ++l1) {
>> +        if (phys_map[l1]) {
>> +            phys_page_for_each_in_l1_map(phys_map[l1], client);
>> +        }
>> +    }
>> +#else
>> +    if (!l1_phys_map) {
>> +        return;
>> +    }
>> +    phys_page_for_each_in_l1_map(l1_phys_map, client);
>> +#endif
>> +}
>>    
>
> This looks pretty frightening.  What is it needed for?

The point is that clients can be registered at any point.

A client that registered when memory is present needs to
be notified about it.

>
> I think we should stick with range operations, but maybe I misunderstood  
> something here.
> Otherwise, I like this patchset.
>
> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - Jan. 18, 2010, 1:58 p.m.
On 01/18/2010 03:52 PM, Michael S. Tsirkin wrote:
> On Mon, Jan 18, 2010 at 03:02:59PM +0200, Avi Kivity wrote:
>    
>> On 01/04/2010 09:49 PM, Michael S. Tsirkin wrote:
>>      
>>> This adds notifiers for phys memory changes: a set of callbacks that
>>> vhost can register and update kernel accordingly.  Down the road, kvm
>>> code can be switched to use these as well, instead of calling kvm code
>>> directly from exec.c as is done now.
>>>
>>>
>>> +
>>> +static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map,
>>> +                                         CPUPhysMemoryClient *client)
>>> +{
>>> +    PhysPageDesc *pd;
>>> +    int l1, l2;
>>> +
>>> +    for (l1 = 0; l1<   L1_SIZE; ++l1) {
>>> +        pd = phys_map[l1];
>>> +        if (!pd) {
>>> +            continue;
>>> +        }
>>> +        for (l2 = 0; l2<   L2_SIZE; ++l2) {
>>> +            if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) {
>>> +                continue;
>>> +            }
>>> +            client->set_memory(client, pd[l2].region_offset,
>>> +                               TARGET_PAGE_SIZE, pd[l2].phys_offset);
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +static void phys_page_for_each(CPUPhysMemoryClient *client)
>>> +{
>>> +#if TARGET_PHYS_ADDR_SPACE_BITS>   32
>>> +
>>> +#if TARGET_PHYS_ADDR_SPACE_BITS>   (32 + L1_BITS)
>>> +#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
>>> +#endif
>>> +    void **phys_map = (void **)l1_phys_map;
>>> +    int l1;
>>> +    if (!l1_phys_map) {
>>> +        return;
>>> +    }
>>> +    for (l1 = 0; l1<   L1_SIZE; ++l1) {
>>> +        if (phys_map[l1]) {
>>> +            phys_page_for_each_in_l1_map(phys_map[l1], client);
>>> +        }
>>> +    }
>>> +#else
>>> +    if (!l1_phys_map) {
>>> +        return;
>>> +    }
>>> +    phys_page_for_each_in_l1_map(l1_phys_map, client);
>>> +#endif
>>> +}
>>>
>>>        
>> This looks pretty frightening.  What is it needed for?
>>      
> The point is that clients can be registered at any point.
>
> A client that registered when memory is present needs to
> be notified about it.
>    

It looks very expensive.  Maybe we mandate clients be registered at 
init-time?

Long term we need to move to a range based memory description.
Michael S. Tsirkin - Jan. 18, 2010, 2:44 p.m.
On Mon, Jan 18, 2010 at 03:58:51PM +0200, Avi Kivity wrote:
> On 01/18/2010 03:52 PM, Michael S. Tsirkin wrote:
>> On Mon, Jan 18, 2010 at 03:02:59PM +0200, Avi Kivity wrote:
>>    
>>> On 01/04/2010 09:49 PM, Michael S. Tsirkin wrote:
>>>      
>>>> This adds notifiers for phys memory changes: a set of callbacks that
>>>> vhost can register and update kernel accordingly.  Down the road, kvm
>>>> code can be switched to use these as well, instead of calling kvm code
>>>> directly from exec.c as is done now.
>>>>
>>>>
>>>> +
>>>> +static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map,
>>>> +                                         CPUPhysMemoryClient *client)
>>>> +{
>>>> +    PhysPageDesc *pd;
>>>> +    int l1, l2;
>>>> +
>>>> +    for (l1 = 0; l1<   L1_SIZE; ++l1) {
>>>> +        pd = phys_map[l1];
>>>> +        if (!pd) {
>>>> +            continue;
>>>> +        }
>>>> +        for (l2 = 0; l2<   L2_SIZE; ++l2) {
>>>> +            if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) {
>>>> +                continue;
>>>> +            }
>>>> +            client->set_memory(client, pd[l2].region_offset,
>>>> +                               TARGET_PAGE_SIZE, pd[l2].phys_offset);
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>> +static void phys_page_for_each(CPUPhysMemoryClient *client)
>>>> +{
>>>> +#if TARGET_PHYS_ADDR_SPACE_BITS>   32
>>>> +
>>>> +#if TARGET_PHYS_ADDR_SPACE_BITS>   (32 + L1_BITS)
>>>> +#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
>>>> +#endif
>>>> +    void **phys_map = (void **)l1_phys_map;
>>>> +    int l1;
>>>> +    if (!l1_phys_map) {
>>>> +        return;
>>>> +    }
>>>> +    for (l1 = 0; l1<   L1_SIZE; ++l1) {
>>>> +        if (phys_map[l1]) {
>>>> +            phys_page_for_each_in_l1_map(phys_map[l1], client);
>>>> +        }
>>>> +    }
>>>> +#else
>>>> +    if (!l1_phys_map) {
>>>> +        return;
>>>> +    }
>>>> +    phys_page_for_each_in_l1_map(l1_phys_map, client);
>>>> +#endif
>>>> +}
>>>>
>>>>        
>>> This looks pretty frightening.  What is it needed for?
>>>      
>> The point is that clients can be registered at any point.
>>
>> A client that registered when memory is present needs to
>> be notified about it.
>>    
>
> It looks very expensive.

Shouldn't be hard to optimize ...

> Maybe we mandate clients be registered at init-time?


This might be tricky - vhost currently only registers when the
first device is hot-added.

> Long term we need to move to a range based memory description.
>
> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - Jan. 18, 2010, 2:52 p.m.
On 01/18/2010 04:44 PM, Michael S. Tsirkin wrote:
>
>    
>>> The point is that clients can be registered at any point.
>>>
>>> A client that registered when memory is present needs to
>>> be notified about it.
>>>
>>>        
>> It looks very expensive.
>>      
> Shouldn't be hard to optimize ...
>    

It's O(memory size), which can be very big.

>> Maybe we mandate clients be registered at init-time?
>>      
>
> This might be tricky - vhost currently only registers when the
> first device is hot-added.
>    

I see.

Maybe coalesce adjacent pages and call the callback with the ranges?
Michael S. Tsirkin - Jan. 18, 2010, 3:08 p.m.
On Mon, Jan 18, 2010 at 04:52:10PM +0200, Avi Kivity wrote:
> On 01/18/2010 04:44 PM, Michael S. Tsirkin wrote:
>>
>>    
>>>> The point is that clients can be registered at any point.
>>>>
>>>> A client that registered when memory is present needs to
>>>> be notified about it.
>>>>
>>>>        
>>> It looks very expensive.
>>>      
>> Shouldn't be hard to optimize ...
>>    
>
> It's O(memory size), which can be very big.
>>> Maybe we mandate clients be registered at init-time?
>>>      
>>
>> This might be tricky - vhost currently only registers when the
>> first device is hot-added.
>>    
>
> I see.
>
> Maybe coalesce adjacent pages and call the callback with the ranges?

Yes, I'll do that.
Michael S. Tsirkin - Jan. 18, 2010, 3:45 p.m.
On Mon, Jan 18, 2010 at 04:52:10PM +0200, Avi Kivity wrote:
> On 01/18/2010 04:44 PM, Michael S. Tsirkin wrote:
>>
>>    
>>>> The point is that clients can be registered at any point.
>>>>
>>>> A client that registered when memory is present needs to
>>>> be notified about it.
>>>>
>>>>        
>>> It looks very expensive.
>>>      
>> Shouldn't be hard to optimize ...
>>    
>
> It's O(memory size), which can be very big.

cpu_register_physical_memory_offset already is O(memory size) btw.

>>> Maybe we mandate clients be registered at init-time?
>>>      
>>
>> This might be tricky - vhost currently only registers when the
>> first device is hot-added.
>>    
>
> I see.
>
> Maybe coalesce adjacent pages and call the callback with the ranges?

Hmm, it turns out to be tricky: it seems whether we can do this
really depends on what get_ram_ptr returns ...
Can't we just rely on callback to do the coalescing?


> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - Jan. 18, 2010, 4:04 p.m.
On 01/18/2010 05:45 PM, Michael S. Tsirkin wrote:
>
> cpu_register_physical_memory_offset already is O(memory size) btw.
>    

Right, but we'd like to replace it with a range API.

>    
>>>> Maybe we mandate clients be registered at init-time?
>>>>
>>>>          
>>> This might be tricky - vhost currently only registers when the
>>> first device is hot-added.
>>>
>>>        
>> I see.
>>
>> Maybe coalesce adjacent pages and call the callback with the ranges?
>>      
> Hmm, it turns out to be tricky: it seems whether we can do this
> really depends on what get_ram_ptr returns ...
> Can't we just rely on callback to do the coalescing?
>    

If the callback can do the coalescing, surely the caller can as well?

This way we don't introduce a new per-page API.
Michael S. Tsirkin - Jan. 18, 2010, 4:08 p.m.
On Mon, Jan 18, 2010 at 06:04:40PM +0200, Avi Kivity wrote:
> On 01/18/2010 05:45 PM, Michael S. Tsirkin wrote:
>>
>> cpu_register_physical_memory_offset already is O(memory size) btw.
>>    
>
> Right, but we'd like to replace it with a range API.

So, when we do the implementation of notifiers can follow?

>>>>> Maybe we mandate clients be registered at init-time?
>>>>>
>>>>>          
>>>> This might be tricky - vhost currently only registers when the
>>>> first device is hot-added.
>>>>
>>>>        
>>> I see.
>>>
>>> Maybe coalesce adjacent pages and call the callback with the ranges?
>>>      
>> Hmm, it turns out to be tricky: it seems whether we can do this
>> really depends on what get_ram_ptr returns ...
>> Can't we just rely on callback to do the coalescing?
>>    
>
> If the callback can do the coalescing, surely the caller can as well?

The callback calls qemu_ram_ptr and coalesces when the virtual memory
pointers are matching. caller can do this as well but it looks ugly: we
don't know this is what caller does ...

> This way we don't introduce a new per-page API.

What do you mean by new per-page API?
The API is range-based, implementation
currently scans all pages.

> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - Jan. 18, 2010, 4:15 p.m.
On 01/18/2010 06:08 PM, Michael S. Tsirkin wrote:
> On Mon, Jan 18, 2010 at 06:04:40PM +0200, Avi Kivity wrote:
>    
>> On 01/18/2010 05:45 PM, Michael S. Tsirkin wrote:
>>      
>>> cpu_register_physical_memory_offset already is O(memory size) btw.
>>>
>>>        
>> Right, but we'd like to replace it with a range API.
>>      
> So, when we do the implementation of notifiers can follow?
>    

Okay.

Patch

diff --git a/cpu-common.h b/cpu-common.h
index 6302372..0ec9b72 100644
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -8,6 +8,7 @@ 
 #endif
 
 #include "bswap.h"
+#include "qemu-queue.h"
 
 /* address in the RAM (different from a physical address) */
 typedef unsigned long ram_addr_t;
@@ -61,6 +62,24 @@  void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
 void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque));
 void cpu_unregister_map_client(void *cookie);
 
+struct CPUPhysMemoryClient;
+typedef struct CPUPhysMemoryClient CPUPhysMemoryClient;
+struct CPUPhysMemoryClient {
+    void (*set_memory)(struct CPUPhysMemoryClient *client,
+                       target_phys_addr_t start_addr,
+                       ram_addr_t size,
+                       ram_addr_t phys_offset);
+    int (*sync_dirty_bitmap)(struct CPUPhysMemoryClient *client,
+                             target_phys_addr_t start_addr,
+                             target_phys_addr_t end_addr);
+    int (*migration_log)(struct CPUPhysMemoryClient *client,
+                         int enable);
+    QLIST_ENTRY(CPUPhysMemoryClient) list;
+};
+
+void cpu_register_phys_memory_client(CPUPhysMemoryClient *);
+void cpu_unregister_phys_memory_client(CPUPhysMemoryClient *);
+
 uint32_t ldub_phys(target_phys_addr_t addr);
 uint32_t lduw_phys(target_phys_addr_t addr);
 uint32_t ldl_phys(target_phys_addr_t addr);
diff --git a/exec.c b/exec.c
index 1190591..56c347b 100644
--- a/exec.c
+++ b/exec.c
@@ -1623,6 +1623,99 @@  const CPULogItem cpu_log_items[] = {
     { 0, NULL, NULL },
 };
 
+static QLIST_HEAD(memory_client_list, CPUPhysMemoryClient) memory_client_list
+    = QLIST_HEAD_INITIALIZER(memory_client_list);
+
+static void cpu_notify_set_memory(target_phys_addr_t start_addr,
+				  ram_addr_t size,
+				  ram_addr_t phys_offset)
+{
+    CPUPhysMemoryClient *client;
+    QLIST_FOREACH(client, &memory_client_list, list) {
+        client->set_memory(client, start_addr, size, phys_offset);
+    }
+}
+
+static int cpu_notify_sync_dirty_bitmap(target_phys_addr_t start,
+					target_phys_addr_t end)
+{
+    CPUPhysMemoryClient *client;
+    QLIST_FOREACH(client, &memory_client_list, list) {
+        int r = client->sync_dirty_bitmap(client, start, end);
+        if (r < 0)
+            return r;
+    }
+    return 0;
+}
+
+static int cpu_notify_migration_log(int enable)
+{
+    CPUPhysMemoryClient *client;
+    QLIST_FOREACH(client, &memory_client_list, list) {
+        int r = client->migration_log(client, enable);
+        if (r < 0)
+            return r;
+    }
+    return 0;
+}
+
+static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map,
+                                         CPUPhysMemoryClient *client)
+{
+    PhysPageDesc *pd;
+    int l1, l2;
+
+    for (l1 = 0; l1 < L1_SIZE; ++l1) {
+        pd = phys_map[l1];
+        if (!pd) {
+            continue;
+        }
+        for (l2 = 0; l2 < L2_SIZE; ++l2) {
+            if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) {
+                continue;
+            }
+            client->set_memory(client, pd[l2].region_offset,
+                               TARGET_PAGE_SIZE, pd[l2].phys_offset);
+        }
+    }
+}
+
+static void phys_page_for_each(CPUPhysMemoryClient *client)
+{
+#if TARGET_PHYS_ADDR_SPACE_BITS > 32
+
+#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS)
+#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
+#endif
+    void **phys_map = (void **)l1_phys_map;
+    int l1;
+    if (!l1_phys_map) {
+        return;
+    }
+    for (l1 = 0; l1 < L1_SIZE; ++l1) {
+        if (phys_map[l1]) {
+            phys_page_for_each_in_l1_map(phys_map[l1], client);
+        }
+    }
+#else
+    if (!l1_phys_map) {
+        return;
+    }
+    phys_page_for_each_in_l1_map(l1_phys_map, client);
+#endif
+}
+
+void cpu_register_phys_memory_client(CPUPhysMemoryClient *client)
+{
+    QLIST_INSERT_HEAD(&memory_client_list, client, list);
+    phys_page_for_each(client);
+}
+
+void cpu_unregister_phys_memory_client(CPUPhysMemoryClient *client)
+{
+    QLIST_REMOVE(client, list);
+}
+
 static int cmp1(const char *s1, int n, const char *s2)
 {
     if (strlen(s2) != n)
@@ -1882,11 +1975,16 @@  void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
 
 int cpu_physical_memory_set_dirty_tracking(int enable)
 {
+    int ret = 0;
     in_migration = enable;
     if (kvm_enabled()) {
-        return kvm_set_migration_log(enable);
+        ret = kvm_set_migration_log(enable);
     }
-    return 0;
+    if (ret < 0) {
+        return ret;
+    }
+    ret = cpu_notify_migration_log(!!enable);
+    return ret;
 }
 
 int cpu_physical_memory_get_dirty_tracking(void)
@@ -1899,8 +1997,13 @@  int cpu_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
 {
     int ret = 0;
 
-    if (kvm_enabled())
+    if (kvm_enabled()) {
         ret = kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
+    }
+    if (ret < 0) {
+        return ret;
+    }
+    ret = cpu_notify_sync_dirty_bitmap(start_addr, end_addr);
     return ret;
 }
 
@@ -2315,6 +2418,8 @@  void cpu_register_physical_memory_offset(target_phys_addr_t start_addr,
     if (kvm_enabled())
         kvm_set_phys_mem(start_addr, size, phys_offset);
 
+    cpu_notify_set_memory(start_addr, size, phys_offset);
+
     if (phys_offset == IO_MEM_UNASSIGNED) {
         region_offset = start_addr;
     }