diff mbox series

[v4,33/66] i386/tdx: Make memory type private by default

Message ID 20240125032328.2522472-34-xiaoyao.li@intel.com
State New
Headers show
Series QEMU Guest memfd + QEMU TDX support | expand

Commit Message

Xiaoyao Li Jan. 25, 2024, 3:22 a.m. UTC
By default (due to the recent UPM change), restricted memory attribute is
shared.  Convert the memory region from shared to private at the memory
slot creation time.

add kvm region registering function to check the flag
and convert the region, and add memory listener to TDX guest code to set
the flag to the possible memory region.

Without this patch
- Secure-EPT violation on private area
- KVM_MEMORY_FAULT EXIT (kvm -> qemu)
- qemu converts the 4K page from shared to private
- Resume VCPU execution
- Secure-EPT violation again
- KVM resolves EPT Violation
This also prevents huge page because page conversion is done at 4K
granularity.  Although it's possible to merge 4K private mapping into
2M large page, it slows guest boot.

With this patch
- After memory slot creation, convert the region from private to shared
- Secure-EPT violation on private area.
- KVM resolves EPT Violation

Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
---
 include/exec/memory.h |  1 +
 target/i386/kvm/tdx.c | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

Comments

David Hildenbrand Jan. 26, 2024, 2:58 p.m. UTC | #1
On 25.01.24 04:22, Xiaoyao Li wrote:
> By default (due to the recent UPM change), restricted memory attribute is
> shared.  Convert the memory region from shared to private at the memory
> slot creation time.
> 
> add kvm region registering function to check the flag
> and convert the region, and add memory listener to TDX guest code to set
> the flag to the possible memory region.
> 
> Without this patch
> - Secure-EPT violation on private area
> - KVM_MEMORY_FAULT EXIT (kvm -> qemu)
> - qemu converts the 4K page from shared to private
> - Resume VCPU execution
> - Secure-EPT violation again
> - KVM resolves EPT Violation
> This also prevents huge page because page conversion is done at 4K
> granularity.  Although it's possible to merge 4K private mapping into
> 2M large page, it slows guest boot.
> 
> With this patch
> - After memory slot creation, convert the region from private to shared
> - Secure-EPT violation on private area.
> - KVM resolves EPT Violation
> 
> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
>   include/exec/memory.h |  1 +
>   target/i386/kvm/tdx.c | 20 ++++++++++++++++++++
>   2 files changed, 21 insertions(+)
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 7229fcc0415f..f25959f6d30f 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -850,6 +850,7 @@ struct IOMMUMemoryRegion {
>   #define MEMORY_LISTENER_PRIORITY_MIN            0
>   #define MEMORY_LISTENER_PRIORITY_ACCEL          10
>   #define MEMORY_LISTENER_PRIORITY_DEV_BACKEND    10
> +#define MEMORY_LISTENER_PRIORITY_ACCEL_HIGH     20
>   
>   /**
>    * struct MemoryListener: callbacks structure for updates to the physical memory map
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 7b250d80bc1d..f892551821ce 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -19,6 +19,7 @@
>   #include "standard-headers/asm-x86/kvm_para.h"
>   #include "sysemu/kvm.h"
>   #include "sysemu/sysemu.h"
> +#include "exec/address-spaces.h"
>   
>   #include "hw/i386/x86.h"
>   #include "kvm_i386.h"
> @@ -621,6 +622,19 @@ int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
>       return 0;
>   }
>   
> +static void tdx_guest_region_add(MemoryListener *listener,
> +                                 MemoryRegionSection *section)
> +{
> +    memory_region_set_default_private(section->mr);
> +}

That looks fishy. Why is TDX to decide what happens to other memory 
regions it doesn't own?

We should define that behavior when creating these memory region, and 
TDX could sanity check that they have been setup properly.

Let me ask differently: For which memory region where we have 
RAM_GUEST_MEMFD set would we *not* want to set private as default right 
from the start?
Xiaoyao Li Jan. 29, 2024, 2:18 a.m. UTC | #2
On 1/26/2024 10:58 PM, David Hildenbrand wrote:
> On 25.01.24 04:22, Xiaoyao Li wrote:
>> By default (due to the recent UPM change), restricted memory attribute is
>> shared.  Convert the memory region from shared to private at the memory
>> slot creation time.
>>
>> add kvm region registering function to check the flag
>> and convert the region, and add memory listener to TDX guest code to set
>> the flag to the possible memory region.
>>
>> Without this patch
>> - Secure-EPT violation on private area
>> - KVM_MEMORY_FAULT EXIT (kvm -> qemu)
>> - qemu converts the 4K page from shared to private
>> - Resume VCPU execution
>> - Secure-EPT violation again
>> - KVM resolves EPT Violation
>> This also prevents huge page because page conversion is done at 4K
>> granularity.  Although it's possible to merge 4K private mapping into
>> 2M large page, it slows guest boot.
>>
>> With this patch
>> - After memory slot creation, convert the region from private to shared
>> - Secure-EPT violation on private area.
>> - KVM resolves EPT Violation
>>
>> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> ---
>>   include/exec/memory.h |  1 +
>>   target/i386/kvm/tdx.c | 20 ++++++++++++++++++++
>>   2 files changed, 21 insertions(+)
>>
>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>> index 7229fcc0415f..f25959f6d30f 100644
>> --- a/include/exec/memory.h
>> +++ b/include/exec/memory.h
>> @@ -850,6 +850,7 @@ struct IOMMUMemoryRegion {
>>   #define MEMORY_LISTENER_PRIORITY_MIN            0
>>   #define MEMORY_LISTENER_PRIORITY_ACCEL          10
>>   #define MEMORY_LISTENER_PRIORITY_DEV_BACKEND    10
>> +#define MEMORY_LISTENER_PRIORITY_ACCEL_HIGH     20
>>   /**
>>    * struct MemoryListener: callbacks structure for updates to the 
>> physical memory map
>> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
>> index 7b250d80bc1d..f892551821ce 100644
>> --- a/target/i386/kvm/tdx.c
>> +++ b/target/i386/kvm/tdx.c
>> @@ -19,6 +19,7 @@
>>   #include "standard-headers/asm-x86/kvm_para.h"
>>   #include "sysemu/kvm.h"
>>   #include "sysemu/sysemu.h"
>> +#include "exec/address-spaces.h"
>>   #include "hw/i386/x86.h"
>>   #include "kvm_i386.h"
>> @@ -621,6 +622,19 @@ int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
>>       return 0;
>>   }
>> +static void tdx_guest_region_add(MemoryListener *listener,
>> +                                 MemoryRegionSection *section)
>> +{
>> +    memory_region_set_default_private(section->mr);
>> +}
> 
> That looks fishy. Why is TDX to decide what happens to other memory 
> regions it doesn't own?
> 
> We should define that behavior when creating these memory region, and 
> TDX could sanity check that they have been setup properly.
> 
> Let me ask differently: For which memory region where we have 
> RAM_GUEST_MEMFD set would we *not* want to set private as default right 
> from the start?

All memory regions have RAM_GUEST_MEMFD set will benefit from being 
private as default, for TDX guest.

I will update the implementation to set RAM_DEFAULT_PRIVATE flag when 
guest_memfd is created successfully, like

diff --git a/system/physmem.c b/system/physmem.c
index fc59470191ef..60676689c807 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1850,6 +1850,8 @@ static void ram_block_add(RAMBlock *new_block, 
Error **errp)
              qemu_mutex_unlock_ramlist();
              return;
          }
+
+        new_block->flags |= RAM_DEFAULT_PRIVATE;
      }

then this patch can be dropped, and the calling of 
memory_region_set_default_private(mr) of Patch 45 can be dropped too.

I think this is what you suggested, right?
David Hildenbrand Feb. 20, 2024, 3:08 p.m. UTC | #3
On 29.01.24 03:18, Xiaoyao Li wrote:
> On 1/26/2024 10:58 PM, David Hildenbrand wrote:
>> On 25.01.24 04:22, Xiaoyao Li wrote:
>>> By default (due to the recent UPM change), restricted memory attribute is
>>> shared.  Convert the memory region from shared to private at the memory
>>> slot creation time.
>>>
>>> add kvm region registering function to check the flag
>>> and convert the region, and add memory listener to TDX guest code to set
>>> the flag to the possible memory region.
>>>
>>> Without this patch
>>> - Secure-EPT violation on private area
>>> - KVM_MEMORY_FAULT EXIT (kvm -> qemu)
>>> - qemu converts the 4K page from shared to private
>>> - Resume VCPU execution
>>> - Secure-EPT violation again
>>> - KVM resolves EPT Violation
>>> This also prevents huge page because page conversion is done at 4K
>>> granularity.  Although it's possible to merge 4K private mapping into
>>> 2M large page, it slows guest boot.
>>>
>>> With this patch
>>> - After memory slot creation, convert the region from private to shared
>>> - Secure-EPT violation on private area.
>>> - KVM resolves EPT Violation
>>>
>>> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
>>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>> ---
>>>    include/exec/memory.h |  1 +
>>>    target/i386/kvm/tdx.c | 20 ++++++++++++++++++++
>>>    2 files changed, 21 insertions(+)
>>>
>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>> index 7229fcc0415f..f25959f6d30f 100644
>>> --- a/include/exec/memory.h
>>> +++ b/include/exec/memory.h
>>> @@ -850,6 +850,7 @@ struct IOMMUMemoryRegion {
>>>    #define MEMORY_LISTENER_PRIORITY_MIN            0
>>>    #define MEMORY_LISTENER_PRIORITY_ACCEL          10
>>>    #define MEMORY_LISTENER_PRIORITY_DEV_BACKEND    10
>>> +#define MEMORY_LISTENER_PRIORITY_ACCEL_HIGH     20
>>>    /**
>>>     * struct MemoryListener: callbacks structure for updates to the
>>> physical memory map
>>> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
>>> index 7b250d80bc1d..f892551821ce 100644
>>> --- a/target/i386/kvm/tdx.c
>>> +++ b/target/i386/kvm/tdx.c
>>> @@ -19,6 +19,7 @@
>>>    #include "standard-headers/asm-x86/kvm_para.h"
>>>    #include "sysemu/kvm.h"
>>>    #include "sysemu/sysemu.h"
>>> +#include "exec/address-spaces.h"
>>>    #include "hw/i386/x86.h"
>>>    #include "kvm_i386.h"
>>> @@ -621,6 +622,19 @@ int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
>>>        return 0;
>>>    }
>>> +static void tdx_guest_region_add(MemoryListener *listener,
>>> +                                 MemoryRegionSection *section)
>>> +{
>>> +    memory_region_set_default_private(section->mr);
>>> +}
>>
>> That looks fishy. Why is TDX to decide what happens to other memory
>> regions it doesn't own?
>>
>> We should define that behavior when creating these memory region, and
>> TDX could sanity check that they have been setup properly.
>>
>> Let me ask differently: For which memory region where we have
>> RAM_GUEST_MEMFD set would we *not* want to set private as default right
>> from the start?
> 
> All memory regions have RAM_GUEST_MEMFD set will benefit from being
> private as default, for TDX guest.
> 
> I will update the implementation to set RAM_DEFAULT_PRIVATE flag when
> guest_memfd is created successfully, like
> 
> diff --git a/system/physmem.c b/system/physmem.c
> index fc59470191ef..60676689c807 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -1850,6 +1850,8 @@ static void ram_block_add(RAMBlock *new_block,
> Error **errp)
>                qemu_mutex_unlock_ramlist();
>                return;
>            }
> +
> +        new_block->flags |= RAM_DEFAULT_PRIVATE;
>        }
> 
> then this patch can be dropped, and the calling of
> memory_region_set_default_private(mr) of Patch 45 can be dropped too.
> 
> I think this is what you suggested, right?

Yes, if that works, great!
diff mbox series

Patch

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 7229fcc0415f..f25959f6d30f 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -850,6 +850,7 @@  struct IOMMUMemoryRegion {
 #define MEMORY_LISTENER_PRIORITY_MIN            0
 #define MEMORY_LISTENER_PRIORITY_ACCEL          10
 #define MEMORY_LISTENER_PRIORITY_DEV_BACKEND    10
+#define MEMORY_LISTENER_PRIORITY_ACCEL_HIGH     20
 
 /**
  * struct MemoryListener: callbacks structure for updates to the physical memory map
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 7b250d80bc1d..f892551821ce 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -19,6 +19,7 @@ 
 #include "standard-headers/asm-x86/kvm_para.h"
 #include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
+#include "exec/address-spaces.h"
 
 #include "hw/i386/x86.h"
 #include "kvm_i386.h"
@@ -621,6 +622,19 @@  int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
     return 0;
 }
 
+static void tdx_guest_region_add(MemoryListener *listener,
+                                 MemoryRegionSection *section)
+{
+    memory_region_set_default_private(section->mr);
+}
+
+static MemoryListener tdx_memory_listener = {
+    .name = TYPE_TDX_GUEST,
+    .region_add = tdx_guest_region_add,
+    /* Higher than KVM memory listener = 10. */
+    .priority = MEMORY_LISTENER_PRIORITY_ACCEL_HIGH,
+};
+
 static bool tdx_guest_get_sept_ve_disable(Object *obj, Error **errp)
 {
     TdxGuest *tdx = TDX_GUEST(obj);
@@ -695,6 +709,12 @@  OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest,
 static void tdx_guest_init(Object *obj)
 {
     TdxGuest *tdx = TDX_GUEST(obj);
+    static bool memory_listener_registered = false;
+
+    if (!memory_listener_registered) {
+        memory_listener_register(&tdx_memory_listener, &address_space_memory);
+        memory_listener_registered = true;
+    }
 
     qemu_mutex_init(&tdx->lock);