
[2/5] UBUNTU: SAUCE: Support TDX+HCL (July 9, 2023)

Message ID 20230724170017.17988-3-tim.gardner@canonical.com
State New
Series Azure: TDX updates to support HCL

Commit Message

Tim Gardner July 24, 2023, 5 p.m. UTC
From: Dexuan Cui <decui@microsoft.com>

BugLink: https://bugs.launchpad.net/bugs/2028286

The changes will be further cleaned up and posted to LKML.

Tested the scenarios below; in each case the VM was able to boot up with 128 VPs:
1) TDX with the paravisor.
2) TDX without the paravisor.
3) SNP with the paravisor.
4) VBS.
5) Regular VMs.
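
The common thread in the diff below is the new hyperv_paravisor_present
flag: paths that previously keyed off hv_isolation_type_snp() or
hv_isolation_type_tdx() alone now distinguish an isolated VM with a
paravisor (HCL) from a fully enlightened one. For instance, the
GHCB-only MSR accessors become isolation-generic wrappers; a condensed
sketch of the write side (from the arch/x86/hyperv/ivm.c hunks below,
not the literal hunks):

  void hv_ivm_msr_write(u64 msr, u64 value)
  {
  	if (hv_isolation_type_tdx())
  		hv_tdx_write_msr(msr, value);	/* GHCI MSR-write hypercall */
  	else if (hv_isolation_type_snp())
  		hv_ghcb_msr_write(msr, value);	/* GHCB-based MSR write */
  }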

(cherry picked from commit 9893873bdef6f1e5574f784ed6e1d9d5bc54f1d8 https://github.com/dcui/linux/commit/9893873bdef6f1e5574f784ed6e1d9d5bc54f1d8)
Signed-off-by: Dexuan Cui <decui@microsoft.com>
(cherry picked from commit 52283f363634df9b096b94634100b1c945ea60eb https://github.com/dcui/linux)
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
---
 arch/x86/coco/core.c            |   5 +-
 arch/x86/hyperv/hv_apic.c       |  13 +++-
 arch/x86/hyperv/hv_init.c       |  97 ++++++++++++++++++++-----
 arch/x86/hyperv/ivm.c           | 124 +++++++++++++++++++++++++++-----
 arch/x86/include/asm/coco.h     |   1 +
 arch/x86/include/asm/mshyperv.h |  15 ++--
 arch/x86/kernel/cpu/mshyperv.c  |  41 +++++++----
 arch/x86/kernel/eisa.c          |  10 +++
 drivers/hv/connection.c         |  11 ++-
 drivers/hv/hv.c                 |  32 +++++----
 drivers/hv/hv_common.c          |   4 +-
 include/asm-generic/mshyperv.h  |   3 +-
 include/linux/cpuhotplug.h      |   1 +
 13 files changed, 280 insertions(+), 77 deletions(-)
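
One workaround is worth calling out for reviewers: with TDX plus the
paravisor, the initial EFER value on non-boot VPs already has EFER.LMA
set, so head_64.S skips the EFER write and the paravisor's hypercall
handler then mistakes the VP for 32-bit mode. The patch registers an
early CPU-hotplug callback that forces an EFER write on each VP; in
essence (condensed from the arch/x86/hyperv/hv_init.c hunk below, with
the cpuhp registration elided):

  static int hv_write_efer(unsigned int cpu)
  {
  	unsigned long long efer;

  	if (!hv_isolation_type_tdx() || !hyperv_paravisor_present)
  		return 0;

  	/* Rewrite the current value to force the EFER write that
  	 * head_64.S skips when EFER appears unchanged.
  	 */
  	rdmsrl(MSR_EFER, efer);
  	wrmsrl(MSR_EFER, efer);
  	return 0;
  }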

Patch

diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index f4f0625691fd..a39a92efb6de 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -14,16 +14,19 @@ 
 #include <asm/processor.h>
 
 static enum cc_vendor vendor __ro_after_init;
+bool cc_attr_cpu_hotplug_disabled __ro_after_init = true;
 static u64 cc_mask __ro_after_init;
 
 static bool intel_cc_platform_has(enum cc_attr attr)
 {
 	switch (attr) {
 	case CC_ATTR_GUEST_UNROLL_STRING_IO:
-	case CC_ATTR_HOTPLUG_DISABLED:
 	case CC_ATTR_GUEST_MEM_ENCRYPT:
 	case CC_ATTR_MEM_ENCRYPT:
 		return true;
+
+	case CC_ATTR_HOTPLUG_DISABLED:
+		return cc_attr_cpu_hotplug_disabled;
 	default:
 		return false;
 	}
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index fb8b2c088681..4c9be526cb02 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -173,8 +173,10 @@  static bool __send_ipi_mask(const struct cpumask *mask, int vector,
 	    (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
 		return true;
 
-	if (!hv_hypercall_pg)
-		return false;
+	if (!hv_hypercall_pg) {
+		if (!hv_isolation_type_tdx() || hyperv_paravisor_present)
+			return false;
+	}
 
 	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
 		return false;
@@ -227,9 +229,14 @@  static bool __send_ipi_one(int cpu, int vector)
 
 	trace_hyperv_send_ipi_one(cpu, vector);
 
-	if (!hv_hypercall_pg || (vp == VP_INVAL))
+	if (vp == VP_INVAL)
 		return false;
 
+	if (!hv_hypercall_pg) {
+		if (!hv_isolation_type_tdx() || hyperv_paravisor_present)
+			return false;
+	}
+
 	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
 		return false;
 
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 28ee240a2c90..400ec1573287 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -381,6 +381,36 @@  static void __init hv_get_partition_id(void)
 	local_irq_restore(flags);
 }
 
+static int hv_write_efer(unsigned int cpu)
+{
+	unsigned long long efer;
+
+	if (!hv_isolation_type_tdx() || !hyperv_paravisor_present)
+		return 0;
+
+	/*
+	 * Write EFER by force, otherwise the paravisor's hypercall
+	 * handler thinks that the VP is in 32-bit mode, and the
+	 * return RIP is truncated to 32 bits, causing a fatal
+	 * page fault. This is a TDX-specific issue because it looks
+	 * like the initial default value of EFER on non-boot VPs
+	 * already has the EFER.LMA bit set, and when the value of
+	 * EFER read on a non-boot VP is the same as the value of EFER
+	 * on VP0, Linux doesn't write the EFER register on a
+	 * non-boot VP: see the code in arch/x86/kernel/head_64.S
+	 * ("Avoid writing EFER if no change was made (for TDX guest)").
+	 * Also see commit 77a512e35db7 ("x86/boot: Avoid #VE during boot for TDX platforms").
+	 * Work around the issue for now by forcing an EFER write.
+	 *
+	 * XXX: This is a temporary hack. Need to figure out why the
+	 * initial default value of EFER on non-boot VPs is not zero.
+	 */
+	rdmsrl(MSR_EFER, efer);
+	wrmsrl(MSR_EFER, efer);
+
+	return 0;
+}
+
 /*
  * This function is to be invoked early in the boot sequence after the
  * hypervisor has been detected.
@@ -394,10 +424,19 @@  void __init hyperv_init(void)
 	u64 guest_id;
 	union hv_x64_msr_hypercall_contents hypercall_msr;
 	int cpuhp;
+	int ret;
 
 	if (x86_hyper_type != X86_HYPER_MS_HYPERV)
 		return;
 
+	if (hv_isolation_type_tdx() && hyperv_paravisor_present) {
+		ret = cpuhp_setup_state(CPUHP_AP_HYPERV_FORCE_EFER_WRITE,
+					"x86/hyperv_write_efer",
+					hv_write_efer, NULL);
+		if (WARN_ON(ret < 0))
+			return;
+	}
+
 	if (hv_common_init())
 		return;
 
@@ -429,24 +468,37 @@  void __init hyperv_init(void)
 			goto free_vp_assist_page;
 	}
 
-	cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online",
-				  hv_cpu_init, hv_cpu_die);
-	if (cpuhp < 0)
-		goto free_ghcb_page;
-
 	/*
 	 * Setup the hypercall page and enable hypercalls.
 	 * 1. Register the guest ID
 	 * 2. Enable the hypercall and register the hypercall page
+	 *
+	 * A TDX VM with no paravisor uses GHCI rather than hv_hypercall_pg.
+	 * When the VM needs to pass an input page to Hyper-V, the page must
+	 * be a shared page, e.g. hv_post_message() uses the per-CPU shared
+	 * page hyperv_pcpu_input_arg.
+	 *
+	 * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls,
+	 * which are handled by the paravisor, and a private input page must be
+	 * used, e.g. see hv_mark_gpa_visibility(). The VM uses GHCI for
+	 * two hypercalls: HVCALL_SIGNAL_EVENT (see vmbus_set_event()) and
+	 * HVCALL_POST_MESSAGE (the input page must be a shared page, i.e.
+	 * hv_post_message() uses the per-CPU shared hyperv_pcpu_input_arg.)
+	 * NOTE: we must initialize hv_hypercall_pg before hv_cpu_init(),
+	 * because hv_cpu_init() -> hv_common_cpu_init() -> set_memory_decrypted()
+	 * -> ... -> hv_vtom_set_host_visibility() -> ... -> hv_do_hypercall()
+	 * needs to use hv_hypercall_pg.
+	 */
+
+	/*
+	 * In the case of TDX with the paravisor, we should write the MSR
+	 * before hv_cpu_init(), which needs to call the paravisor-handled
+	 * HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY.
 	 */
 	guest_id = hv_generate_guest_id(LINUX_VERSION_CODE);
 	wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
 
-	/* Hyper-V requires to write guest os id via ghcb in SNP IVM. */
-	hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
-
-	/* A TDX guest uses the GHCI call rather than hv_hypercall_pg. */
-	if (hv_isolation_type_tdx())
+	if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
 		goto skip_hypercall_pg_init;
 
 	hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
@@ -454,7 +506,7 @@  void __init hyperv_init(void)
 			VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
 			__builtin_return_address(0));
 	if (hv_hypercall_pg == NULL)
-		goto clean_guest_os_id;
+		goto free_ghcb_page;
 
 	rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
 	hypercall_msr.enable = 1;
@@ -489,6 +541,18 @@  void __init hyperv_init(void)
 	}
 
 skip_hypercall_pg_init:
+	cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
+				  hv_cpu_init, hv_cpu_die);
+	if (cpuhp < 0)
+		goto clean_guest_os_id;
+
+	/*
+	 * In the case of SNP with the paravisor, we must write the MSR to
+	 * the hypervisor after hv_cpu_init(), which maps the hv_ghcb_pg first.
+	 */
+	if (hyperv_paravisor_present)
+		hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
 	/*
 	 * hyperv_init() is called before LAPIC is initialized: see
 	 * apic_intr_mode_init() -> x86_platform.apic_post_init() and
@@ -528,8 +592,8 @@  void __init hyperv_init(void)
 
 clean_guest_os_id:
 	wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
-	hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
-	cpuhp_remove_state(cpuhp);
+	if (hyperv_paravisor_present)
+		hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
 free_ghcb_page:
 	free_percpu(hv_ghcb_pg);
 free_vp_assist_page:
@@ -549,7 +613,8 @@  void hyperv_cleanup(void)
 
 	/* Reset our OS id */
 	wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
-	hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+	if (hyperv_paravisor_present)
+		hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
 
 	/*
 	 * Reset hypercall page reference before reset the page,
@@ -612,8 +677,8 @@  bool hv_is_hyperv_initialized(void)
 	if (x86_hyper_type != X86_HYPER_MS_HYPERV)
 		return false;
 
-	/* A TDX guest uses the GHCI call rather than hv_hypercall_pg. */
-	if (hv_isolation_type_tdx())
+	/* A TDX guest without a paravisor uses the GHCI call rather than hv_hypercall_pg. */
+	if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
 		return true;
 	/*
 	 * Verify that earlier initialization succeeded by checking
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 5b6de5449704..4e31677d1c02 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -57,7 +57,7 @@  union hv_ghcb {
 
 static u16 hv_ghcb_version __ro_after_init;
 
-u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
+static u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
 {
 	union hv_ghcb *hv_ghcb;
 	void **ghcb_base;
@@ -100,6 +100,31 @@  u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
 	return status;
 }
 
+
+u64 hv_ivm_hypercall(u64 control, void *input, void *output, u32 input_size)
+{
+	if (hv_isolation_type_tdx()) {
+		u64 input_address = input ? (virt_to_phys(input) | ms_hyperv.shared_gpa_boundary) : 0;
+		u64 output_address = output ? (virt_to_phys(output) | ms_hyperv.shared_gpa_boundary) : 0;
+		return hv_tdx_hypercall(control, input_address, output_address);
+	} else if (hv_isolation_type_snp()) {
+		return hv_ghcb_hypercall(control, input, output, input_size);
+	} else {
+		return HV_STATUS_INVALID_HYPERCALL_CODE;
+	}
+}
+
+u64 hv_tdx_hypercall_fast(u64 control, u64 input)
+{
+	u64 input_address = input;
+	u64 output_address = 0;
+
+	return hv_tdx_hypercall(control | HV_HYPERCALL_FAST_BIT,
+				input_address, output_address);
+}
+EXPORT_SYMBOL_GPL(hv_tdx_hypercall_fast);
+
+
 static inline u64 rd_ghcb_msr(void)
 {
 	return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
@@ -174,7 +199,38 @@  bool hv_ghcb_negotiate_protocol(void)
 	return true;
 }
 
-void hv_ghcb_msr_write(u64 msr, u64 value)
+#define EXIT_REASON_MSR_READ            31
+#define EXIT_REASON_MSR_WRITE           32
+
+static void hv_tdx_read_msr(u64 msr, u64 *val)
+{
+	struct tdx_hypercall_args args = {
+		.r10 = TDX_HYPERCALL_STANDARD,
+		.r11 = EXIT_REASON_MSR_READ,
+		.r12 = msr,
+	};
+
+	u64 ret = __tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT);
+	if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret))
+		*val = 0;
+	else
+		*val = args.r11;
+}
+
+static void hv_tdx_write_msr(u64 msr, u64 val)
+{
+	struct tdx_hypercall_args args = {
+		.r10 = TDX_HYPERCALL_STANDARD,
+		.r11 = EXIT_REASON_MSR_WRITE,
+		.r12 = msr,
+		.r13 = val,
+	};
+
+	u64 ret = __tdx_hypercall(&args, 0);
+	WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret);
+}
+
+static void hv_ghcb_msr_write(u64 msr, u64 value)
 {
 	union hv_ghcb *hv_ghcb;
 	void **ghcb_base;
@@ -202,9 +258,17 @@  void hv_ghcb_msr_write(u64 msr, u64 value)
 
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(hv_ghcb_msr_write);
 
-void hv_ghcb_msr_read(u64 msr, u64 *value)
+void hv_ivm_msr_write(u64 msr, u64 value)
+{
+	if (hv_isolation_type_tdx())
+		hv_tdx_write_msr(msr, value);
+	else if (hv_isolation_type_snp())
+		hv_ghcb_msr_write(msr, value);
+}
+EXPORT_SYMBOL_GPL(hv_ivm_msr_write);
+
+static void hv_ghcb_msr_read(u64 msr, u64 *value)
 {
 	union hv_ghcb *hv_ghcb;
 	void **ghcb_base;
@@ -234,7 +298,6 @@  void hv_ghcb_msr_read(u64 msr, u64 *value)
 			| ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32);
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
 
 #ifdef CONFIG_INTEL_TDX_GUEST
 DEFINE_STATIC_KEY_FALSE(isolation_type_tdx);
@@ -259,6 +322,17 @@  u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
 EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
 #endif
 
+void hv_ivm_msr_read(u64 msr, u64 *value)
+{
+	if (hv_isolation_type_tdx())
+		hv_tdx_read_msr(msr, value);
+	else if (hv_isolation_type_snp())
+		hv_ghcb_msr_read(msr, value);
+}
+EXPORT_SYMBOL_GPL(hv_ivm_msr_read);
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct hv_gpa_range_for_visibility,
+				   hv_gpa_range_for_visibility);
 /*
  * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
  *
@@ -266,10 +340,10 @@  EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
  * needs to set memory visible to host via hvcall before sharing memory
  * with host.
  */
-static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
+int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 			   enum hv_mem_host_visibility visibility)
 {
-	struct hv_gpa_range_for_visibility **input_pcpu, *input;
+	struct hv_gpa_range_for_visibility *input;
 	u16 pages_processed;
 	u64 hv_status;
 	unsigned long flags;
@@ -285,14 +359,13 @@  static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 	}
 
 	local_irq_save(flags);
-	input_pcpu = (struct hv_gpa_range_for_visibility **)
-			this_cpu_ptr(hyperv_pcpu_input_arg);
-	input = *input_pcpu;
-	if (unlikely(!input)) {
-		local_irq_restore(flags);
-		return -EINVAL;
-	}
-
+	/*
+	 * The page should be a private page, which is passed to the paravisor
+	 * and is not shared with the hypervisor. Note: we shouldn't use the
+	 * hyperv_pcpu_input_arg, which is a shared page in the case of
+	 * a TDX VM with the paravisor.
+	 */
+	input = this_cpu_ptr(&hv_gpa_range_for_visibility);
 	input->partition_id = HV_PARTITION_ID_SELF;
 	input->host_visibility = visibility;
 	input->reserved0 = 0;
@@ -381,13 +454,30 @@  static bool hv_is_private_mmio(u64 addr)
 
 void __init hv_vtom_init(void)
 {
+	enum hv_isolation_type type = hv_get_isolation_type();
 	/*
 	 * By design, a VM using vTOM doesn't see the SEV setting,
 	 * so SEV initialization is bypassed and sev_status isn't set.
 	 * Set it here to indicate a vTOM VM.
 	 */
-	sev_status = MSR_AMD64_SNP_VTOM;
-	cc_set_vendor(CC_VENDOR_AMD);
+	switch (type) {
+	case HV_ISOLATION_TYPE_VBS:
+		fallthrough;
+
+	case HV_ISOLATION_TYPE_SNP:
+		sev_status = MSR_AMD64_SNP_VTOM;
+		cc_set_vendor(CC_VENDOR_AMD);
+		break;
+
+	case HV_ISOLATION_TYPE_TDX:
+		cc_set_vendor(CC_VENDOR_INTEL);
+		cc_attr_cpu_hotplug_disabled = false;
+		break;
+
+	default:
+		panic("hv_vtom_init: unsupported isolation type %d\n", type);
+	}
+
 	cc_set_mask(ms_hyperv.shared_gpa_boundary);
 	physical_mask &= ms_hyperv.shared_gpa_boundary - 1;
 
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index d2c6a2e8d04d..2bab556b7092 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -12,6 +12,7 @@  enum cc_vendor {
 
 void cc_set_vendor(enum cc_vendor v);
 void cc_set_mask(u64 mask);
+extern bool cc_attr_cpu_hotplug_disabled;
 
 #ifdef CONFIG_ARCH_HAS_CC_PLATFORM
 u64 cc_mkenc(u64 val);
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 710b9e3cf2c7..734e94f4d3a8 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -33,6 +33,7 @@  void hyperv_vector_handler(struct pt_regs *regs);
 
 #if IS_ENABLED(CONFIG_HYPERV)
 extern int hyperv_init_cpuhp;
+extern bool hyperv_paravisor_present;
 
 extern void *hv_hypercall_pg;
 
@@ -59,7 +60,7 @@  static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	u64 hv_status;
 
 #ifdef CONFIG_X86_64
-	if (hv_isolation_type_tdx())
+	if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
 		return hv_tdx_hypercall(control,
 					cc_mkdec(input_address),
 					cc_mkdec(output_address));
@@ -106,7 +107,7 @@  static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
 	u64 hv_status;
 
 #ifdef CONFIG_X86_64
-	if (hv_isolation_type_tdx())
+	if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
 		return hv_tdx_hypercall(control, input1, 0);
 
 	{
@@ -154,7 +155,7 @@  static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
 	u64 hv_status;
 
 #ifdef CONFIG_X86_64
-	if (hv_isolation_type_tdx())
+	if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
 		return hv_tdx_hypercall(control, input1, input2);
 
 	{
@@ -236,14 +237,14 @@  int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
 int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
-void hv_ghcb_msr_write(u64 msr, u64 value);
-void hv_ghcb_msr_read(u64 msr, u64 *value);
+void hv_ivm_msr_write(u64 msr, u64 value);
+void hv_ivm_msr_read(u64 msr, u64 *value);
 bool hv_ghcb_negotiate_protocol(void);
 void hv_ghcb_terminate(unsigned int set, unsigned int reason);
 void hv_vtom_init(void);
 #else
-static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
-static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+static inline void hv_ivm_msr_write(u64 msr, u64 value) {}
+static inline void hv_ivm_msr_read(u64 msr, u64 *value) {}
 static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
 static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
 static inline void hv_vtom_init(void) {}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 9aad261d2843..63223d40aa03 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -39,6 +39,10 @@  bool hv_root_partition;
 bool hv_nested;
 struct ms_hyperv_info ms_hyperv;
 
+bool hyperv_paravisor_present;
+/* The variable is used in modules via hv_do_hypercall() */
+EXPORT_SYMBOL_GPL(hyperv_paravisor_present);
+
 #if IS_ENABLED(CONFIG_HYPERV)
 static inline unsigned int hv_get_nested_reg(unsigned int reg)
 {
@@ -64,8 +68,8 @@  u64 hv_get_non_nested_register(unsigned int reg)
 {
 	u64 value;
 
-	if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
-		hv_ghcb_msr_read(reg, &value);
+	if (hv_is_synic_reg(reg) && hyperv_paravisor_present)
+		hv_ivm_msr_read(reg, &value);
 	else
 		rdmsrl(reg, value);
 	return value;
@@ -74,8 +78,8 @@  EXPORT_SYMBOL_GPL(hv_get_non_nested_register);
 
 void hv_set_non_nested_register(unsigned int reg, u64 value)
 {
-	if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
-		hv_ghcb_msr_write(reg, value);
+	if (hv_is_synic_reg(reg) && hyperv_paravisor_present) {
+		hv_ivm_msr_write(reg, value);
 
 		/* Write proxy bit via wrmsl instruction */
 		if (reg >= HV_REGISTER_SINT0 &&
@@ -424,6 +428,8 @@  static void __init ms_hyperv_init_platform(void)
 			ms_hyperv.shared_gpa_boundary =
 				BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
 
+		hyperv_paravisor_present = ms_hyperv.paravisor_present;
+
 		pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
 			ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
 
@@ -434,17 +440,24 @@  static void __init ms_hyperv_init_platform(void)
 		    hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
 			static_branch_enable(&isolation_type_tdx);
 
-			/*
-			 * The GPAs of SynIC Event/Message pages and VMBus
-			 * Moniter pages need to be added by this offset.
-			 */
-			ms_hyperv.shared_gpa_boundary = cc_mkdec(0);
+			/* A TDX VM must use x2APIC and doesn't use lazy EOI. */
+			ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+
+			if (!hyperv_paravisor_present) {
+				/*
+				 * The GPAs of SynIC Event/Message pages and VMBus
+				 * Monitor pages need to be added by this offset.
+				 */
+				ms_hyperv.shared_gpa_boundary = cc_mkdec(0);
+
+				/* HV_REGISTER_CRASH_CTL is unsupported */
+				ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
 
-			/* HV_REGISTER_CRASH_CTL is unsupported */
-			ms_hyperv.misc_features &=
-				 ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+				/* Don't trust Hyper-V's TLB-flushing hypercalls */
+				ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
 
-			x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+				x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+			}
 		}
 	}
 
@@ -515,7 +528,7 @@  static void __init ms_hyperv_init_platform(void)
 
 #if IS_ENABLED(CONFIG_HYPERV)
 	if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
-	    (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
+	    hyperv_paravisor_present)
 		hv_vtom_init();
 	/*
 	 * Setup the hook to get control post apic initialization.
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
index e963344b0449..715b41968420 100644
--- a/arch/x86/kernel/eisa.c
+++ b/arch/x86/kernel/eisa.c
@@ -8,10 +8,20 @@ 
 
 #include <xen/xen.h>
 
+extern bool hyperv_paravisor_present;
+
 static __init int eisa_bus_probe(void)
 {
 	void __iomem *p;
 
+	/*
+	 * It looks like Hyper-V hasn't emulated this MMIO access yet for a TDX
+	 * VM with the paravisor: in such a VM, the "readl(p)" below causes a
+	 * soft lockup. Work around the issue for now.
+	 */
+	if (hyperv_paravisor_present)
+		return 0;
+
 	if (xen_pv_domain() && !xen_initial_domain())
 		return 0;
 
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 5978e9dbc286..91837f72e7e3 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -483,10 +483,17 @@  void vmbus_set_event(struct vmbus_channel *channel)
 
 	++channel->sig_events;
 
+	if (!hyperv_paravisor_present) {
+		hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event);
+		return;
+	}
+
 	if (hv_isolation_type_snp())
-		hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event,
+		hv_ivm_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event,
 				NULL, sizeof(channel->sig_event));
+	else if (hv_isolation_type_tdx())
+		hv_tdx_hypercall_fast(HVCALL_SIGNAL_EVENT, channel->sig_event);
 	else
-		hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event);
+		WARN_ON_ONCE(1);
 }
 EXPORT_SYMBOL_GPL(vmbus_set_event);
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index a6ecf742534f..a5d388f3706c 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -100,8 +100,8 @@  int hv_post_message(union hv_connection_id connection_id,
 	aligned_msg->payload_size = payload_size;
 	memcpy((void *)aligned_msg->payload, payload, payload_size);
 
-	if (hv_isolation_type_snp())
-		status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE,
+	if (hyperv_paravisor_present)
+		status = hv_ivm_hypercall(HVCALL_POST_MESSAGE,
 				(void *)aligned_msg, NULL,
 				sizeof(*aligned_msg));
 	else
@@ -146,7 +146,7 @@  int hv_synic_alloc(void)
 		 * Synic message and event pages are allocated by paravisor.
 		 * Skip these pages allocation here.
 		 */
-		if (!hv_isolation_type_snp() && !hv_root_partition) {
+		if (!hyperv_paravisor_present && !hv_root_partition) {
 			hv_cpu->synic_message_page =
 				(void *)get_zeroed_page(GFP_ATOMIC);
 			if (hv_cpu->synic_message_page == NULL) {
@@ -162,7 +162,8 @@  int hv_synic_alloc(void)
 			}
 		}
 
-		if (hv_isolation_type_tdx()) {
+		/* It's better to leak the page if the decryption fails. */
+		if (hv_isolation_type_tdx() && !hyperv_paravisor_present) {
 			ret = set_memory_decrypted(
 				(unsigned long)hv_cpu->synic_message_page, 1);
 			if (ret) {
@@ -198,12 +199,15 @@  void hv_synic_free(void)
 		struct hv_per_cpu_context *hv_cpu
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
-		if (hv_isolation_type_tdx()) {
-			ret = set_memory_encrypted(
-				(unsigned long)hv_cpu->synic_message_page, 1);
-			if (ret) {
-				pr_err("Failed to encrypt SYNIC msg page\n");
-				continue;
+		/* It's better to leak the page if the encryption fails. */
+		if (hv_isolation_type_tdx() && !hyperv_paravisor_present) {
+			if (hv_cpu->synic_message_page) {
+				ret = set_memory_encrypted((unsigned long)
+					hv_cpu->synic_message_page, 1);
+				if (ret) {
+					pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
+					hv_cpu->synic_message_page = NULL;
+				}
 			}
 
 			ret = set_memory_encrypted(
@@ -241,7 +245,7 @@  void hv_synic_enable_regs(unsigned int cpu)
 	simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
 	simp.simp_enabled = 1;
 
-	if (hv_isolation_type_snp() || hv_root_partition) {
+	if (hyperv_paravisor_present || hv_root_partition) {
 		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
 		u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
 				~ms_hyperv.shared_gpa_boundary;
@@ -264,7 +268,7 @@  void hv_synic_enable_regs(unsigned int cpu)
 	siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
 	siefp.siefp_enabled = 1;
 
-	if (hv_isolation_type_snp() || hv_root_partition) {
+	if (hyperv_paravisor_present || hv_root_partition) {
 		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
 		u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
 				~ms_hyperv.shared_gpa_boundary;
@@ -351,7 +355,7 @@  void hv_synic_disable_regs(unsigned int cpu)
 	 * addresses.
 	 */
 	simp.simp_enabled = 0;
-	if (hv_isolation_type_snp() || hv_root_partition) {
+	if (hyperv_paravisor_present || hv_root_partition) {
 		iounmap(hv_cpu->synic_message_page);
 		hv_cpu->synic_message_page = NULL;
 	} else {
@@ -363,7 +367,7 @@  void hv_synic_disable_regs(unsigned int cpu)
 	siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
 	siefp.siefp_enabled = 0;
 
-	if (hv_isolation_type_snp() || hv_root_partition) {
+	if (hyperv_paravisor_present || hv_root_partition) {
 		iounmap(hv_cpu->synic_event_page);
 		hv_cpu->synic_event_page = NULL;
 	} else {
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 7eb25345c904..6c1fcfc6894a 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -331,8 +331,8 @@  void __weak hyperv_cleanup(void)
 }
 EXPORT_SYMBOL_GPL(hyperv_cleanup);
 
-u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
+u64 __weak hv_ivm_hypercall(u64 control, void *input, void *output, u32 input_size)
 {
 	return HV_STATUS_INVALID_PARAMETER;
 }
-EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);
+EXPORT_SYMBOL_GPL(hv_ivm_hypercall);
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 3e48cdc02b74..e7e3445e99b7 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -284,7 +284,8 @@  enum hv_isolation_type hv_get_isolation_type(void);
 bool hv_is_isolation_supported(void);
 bool hv_set_memory_enc_dec_needed(void);
 bool hv_isolation_type_snp(void);
-u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
+u64 hv_ivm_hypercall(u64 control, void *input, void *output, u32 input_size);
+u64 hv_tdx_hypercall_fast(u64 control, u64 input);
 void hyperv_cleanup(void);
 bool hv_query_ext_cap(u64 cap_query);
 void hv_setup_dma_ops(struct device *dev, bool coherent);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0cd429ccfc7f..fb3299235fc7 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -141,6 +141,7 @@  enum cpuhp_state {
 	 */
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
+	CPUHP_AP_HYPERV_FORCE_EFER_WRITE,
 	CPUHP_AP_CACHECTRL_STARTING,
 	CPUHP_AP_SCHED_STARTING,
 	CPUHP_AP_RCUTREE_DYING,