diff mbox series

[RFT] x86/hpet: Use another crystalball to evaluate HPET usability

Message ID 87k0iy71rw.ffs@tglx
State New
Headers show
Series [RFT] x86/hpet: Use another crystalball to evaluate HPET usability | expand

Commit Message

Thomas Gleixner Sept. 30, 2021, 11:15 a.m. UTC
On recent Intel systems the HPET stops working when the system reaches PC10
idle state.

The approach of adding PCI ids to the early quirks to disable HPET on
these systems is a whack a mole game which makes no sense.

Check for PC10 instead and force disable HPET if supported. The check is
overbroad as it does not take ACPI, intel_idle enablement and command
line parameters into account. That's fine as long as there is at least
PMTIMER available to calibrate the TSC frequency. The decision can be
overruled by adding "hpet=force" on the kernel command line.

Remove the related early PCI quirks for affected Ice and Coffin Lake
systems as they are not longer required.

Fixes: Yet another hardware trainwreck
Reported-by: Jakub Kicinski <kuba@kernel.org>
Not-yet-signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
Notes: Completely untested. Use at your own peril.
---
 arch/x86/kernel/early-quirks.c |    6 --
 arch/x86/kernel/hpet.c         |   88 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 6 deletions(-)

Comments

Rafael J. Wysocki Sept. 30, 2021, 11:38 a.m. UTC | #1
On Thu, Sep 30, 2021 at 1:15 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On recent Intel systems the HPET stops working when the system reaches PC10
> idle state.
>
> The approach of adding PCI ids to the early quirks to disable HPET on
> these systems is a whack a mole game which makes no sense.
>
> Check for PC10 instead and force disable HPET if supported. The check is
> overbroad as it does not take ACPI, intel_idle enablement and command
> line parameters into account. That's fine as long as there is at least
> PMTIMER available to calibrate the TSC frequency. The decision can be
> overruled by adding "hpet=force" on the kernel command line.
>
> Remove the related early PCI quirks for affected Ice and Coffin Lake
> systems as they are not longer required.
>
> Fixes: Yet another hardware trainwreck
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Not-yet-signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
> Notes: Completely untested. Use at your own peril.
> ---
>  arch/x86/kernel/early-quirks.c |    6 --
>  arch/x86/kernel/hpet.c         |   88 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 88 insertions(+), 6 deletions(-)
>
> --- a/arch/x86/kernel/early-quirks.c
> +++ b/arch/x86/kernel/early-quirks.c
> @@ -714,12 +714,6 @@ static struct chipset early_qrk[] __init
>          */
>         { PCI_VENDOR_ID_INTEL, 0x0f00,
>                 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> -       { PCI_VENDOR_ID_INTEL, 0x3e20,
> -               PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> -       { PCI_VENDOR_ID_INTEL, 0x3ec4,
> -               PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> -       { PCI_VENDOR_ID_INTEL, 0x8a12,
> -               PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
>         { PCI_VENDOR_ID_BROADCOM, 0x4331,
>           PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
>         {}
> --- a/arch/x86/kernel/hpet.c
> +++ b/arch/x86/kernel/hpet.c
> @@ -10,6 +10,7 @@
>  #include <asm/irq_remapping.h>
>  #include <asm/hpet.h>
>  #include <asm/time.h>
> +#include <asm/mwait.h>
>
>  #undef  pr_fmt
>  #define pr_fmt(fmt) "hpet: " fmt
> @@ -916,6 +917,90 @@ static bool __init hpet_counting(void)
>         return false;
>  }
>
> +static bool __init get_mwait_substates(unsigned int *mwait_substates)
> +{
> +       unsigned int eax, ebx, ecx;
> +
> +       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
> +               return false;
> +
> +       if (!boot_cpu_has(X86_FEATURE_MWAIT))
> +               return false;
> +
> +       if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
> +               return false;
> +
> +       cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, mwait_substates);
> +
> +       if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
> +           !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
> +           !*mwait_substates)
> +               return false;

I would do

return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && (ecx &
CPUID5_ECX_INTERRUPT_BREAK) && *mwait_substates;

And this function could just return the mwait_substates value proper,
because returning 0 then would be equivalent to returning 'false' from
it as is.

LGTM otherwise.

> +
> +       return true;
> +}
> +
> +/*
> + * Check whether the system supports PC10. If so force disable HPET as that
> + * stops counting in PC10. This check is overbroad as it does not take any
> + * of the following into account:
> + *
> + *     - ACPI tables
> + *     - Enablement of intel_idle
> + *     - Command line arguments which limit intel_idle C-state support
> + *
> + * That's perfectly fine. HPET is a piece of hardware designed by committee
> + * and the only reasons why it is still in use on modern systems is the
> + * fact that it is impossible to reliably query the TSC frequency via
> + * CPUID or firmware.
> + *
> + * If HPET is functional it is useful for calibrating TSC, but this can be
> + * done via PMTIMER as well which seems to be the last remaining timer on
> + * X86/INTEL platforms that has not been completely wreckaged by feature
> + * creep.
> + *
> + * In theory HPET support should be removed altogether, but there are older
> + * systems out there which depend on it because TSC and APIC timer are
> + * dysfunctional in deeper C-states.
> + *
> + * It's only 20 years now that hardware people have been asked to provide
> + * reliable and discoverable facilities which can be used for timekeeping
> + * and per CPU timer interrupts.
> + *
> + * The probability that this problem is going to be solved in the
> + * forseeable future is close to zero, so the kernel has to be cluttered
> + * with heuristics to keep up with the ever growing amount of hardware and
> + * firmware trainwrecks. Hopefully some day hardware people will understand
> + * that the approach of "This can be fixed in software" is not sustainable.
> + * Hope dies last...
> + */
> +static bool __init hpet_is_pc10_damaged(void)
> +{
> +       unsigned int mwait_substates;
> +       unsigned long long pcfg;
> +
> +       if (!get_mwait_substates(&mwait_substates))
> +               return false;
> +
> +       /* Check whether PC10 substates are supported */
> +       if (!(mwait_substates & (0xF << 28)))
> +               return false;
> +
> +       /* Check whether PC10 is enabled in PKG C-state limit */
> +       rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
> +       if ((pcfg & 0xF) < 8)
> +               return false;
> +
> +       if (hpet_force_user) {
> +               pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n");
> +               return false;
> +       }
> +
> +       pr_info("HPET dysfunctional in PC10. Force disabled.\n");
> +       boot_hpet_disable = true;
> +       return true;
> +}
> +
>  /**
>   * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
>   */
> @@ -929,6 +1014,9 @@ int __init hpet_enable(void)
>         if (!is_hpet_capable())
>                 return 0;
>
> +       if (hpet_is_pc10_damaged())
> +               return 0;
> +
>         hpet_set_mapping();
>         if (!hpet_virt_address)
>                 return 0;
Thomas Gleixner Sept. 30, 2021, 5:07 p.m. UTC | #2
On Thu, Sep 30 2021 at 13:38, Rafael J. Wysocki wrote:
>> +static bool __init get_mwait_substates(unsigned int *mwait_substates)
>> +{
>> +       unsigned int eax, ebx, ecx;
>> +
>> +       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
>> +               return false;
>> +
>> +       if (!boot_cpu_has(X86_FEATURE_MWAIT))
>> +               return false;
>> +
>> +       if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
>> +               return false;
>> +
>> +       cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, mwait_substates);
>> +
>> +       if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
>> +           !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
>> +           !*mwait_substates)
>> +               return false;
>
> I would do
>
> return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && (ecx &
> CPUID5_ECX_INTERRUPT_BREAK) && *mwait_substates;
>
> And this function could just return the mwait_substates value proper,
> because returning 0 then would be equivalent to returning 'false' from
> it as is.

Let me move that substates check into the function which makes it even simpler.

Thanks,

        tglx
diff mbox series

Patch

--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -714,12 +714,6 @@  static struct chipset early_qrk[] __init
 	 */
 	{ PCI_VENDOR_ID_INTEL, 0x0f00,
 		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
-	{ PCI_VENDOR_ID_INTEL, 0x3e20,
-		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
-	{ PCI_VENDOR_ID_INTEL, 0x3ec4,
-		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
-	{ PCI_VENDOR_ID_INTEL, 0x8a12,
-		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
 	{ PCI_VENDOR_ID_BROADCOM, 0x4331,
 	  PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
 	{}
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -10,6 +10,7 @@ 
 #include <asm/irq_remapping.h>
 #include <asm/hpet.h>
 #include <asm/time.h>
+#include <asm/mwait.h>
 
 #undef  pr_fmt
 #define pr_fmt(fmt) "hpet: " fmt
@@ -916,6 +917,90 @@  static bool __init hpet_counting(void)
 	return false;
 }
 
+static bool __init get_mwait_substates(unsigned int *mwait_substates)
+{
+	unsigned int eax, ebx, ecx;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return false;
+
+	if (!boot_cpu_has(X86_FEATURE_MWAIT))
+		return false;
+
+	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+		return false;
+
+	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, mwait_substates);
+
+	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
+	    !*mwait_substates)
+		return false;
+
+	return true;
+}
+
+/*
+ * Check whether the system supports PC10. If so force disable HPET as that
+ * stops counting in PC10. This check is overbroad as it does not take any
+ * of the following into account:
+ *
+ *	- ACPI tables
+ *	- Enablement of intel_idle
+ *	- Command line arguments which limit intel_idle C-state support
+ *
+ * That's perfectly fine. HPET is a piece of hardware designed by committee
+ * and the only reasons why it is still in use on modern systems is the
+ * fact that it is impossible to reliably query the TSC frequency via
+ * CPUID or firmware.
+ *
+ * If HPET is functional it is useful for calibrating TSC, but this can be
+ * done via PMTIMER as well which seems to be the last remaining timer on
+ * X86/INTEL platforms that has not been completely wreckaged by feature
+ * creep.
+ *
+ * In theory HPET support should be removed altogether, but there are older
+ * systems out there which depend on it because TSC and APIC timer are
+ * dysfunctional in deeper C-states.
+ *
+ * It's only 20 years now that hardware people have been asked to provide
+ * reliable and discoverable facilities which can be used for timekeeping
+ * and per CPU timer interrupts.
+ *
+ * The probability that this problem is going to be solved in the
+ * forseeable future is close to zero, so the kernel has to be cluttered
+ * with heuristics to keep up with the ever growing amount of hardware and
+ * firmware trainwrecks. Hopefully some day hardware people will understand
+ * that the approach of "This can be fixed in software" is not sustainable.
+ * Hope dies last...
+ */
+static bool __init hpet_is_pc10_damaged(void)
+{
+	unsigned int mwait_substates;
+	unsigned long long pcfg;
+
+	if (!get_mwait_substates(&mwait_substates))
+		return false;
+
+	/* Check whether PC10 substates are supported */
+	if (!(mwait_substates & (0xF << 28)))
+		return false;
+
+	/* Check whether PC10 is enabled in PKG C-state limit */
+	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
+	if ((pcfg & 0xF) < 8)
+		return false;
+
+	if (hpet_force_user) {
+		pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n");
+		return false;
+	}
+
+	pr_info("HPET dysfunctional in PC10. Force disabled.\n");
+	boot_hpet_disable = true;
+	return true;
+}
+
 /**
  * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
  */
@@ -929,6 +1014,9 @@  int __init hpet_enable(void)
 	if (!is_hpet_capable())
 		return 0;
 
+	if (hpet_is_pc10_damaged())
+		return 0;
+
 	hpet_set_mapping();
 	if (!hpet_virt_address)
 		return 0;