diff mbox series

[1/4] apic: add support for x2APIC mode

Message ID 20230221160500.30336-2-minhquangbui99@gmail.com
State New
Headers show
Series Support x2APIC mode with TCG accelerator | expand

Commit Message

Bui Quang Minh Feb. 21, 2023, 4:04 p.m. UTC
This commit refactors APIC registers read/write function to support both
MMIO read/write in xAPIC mode and MSR read/write in x2APIC mode. Also,
support larger APIC ID, self IPI, new IPI destination determination in
x2APIC mode.

Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
 hw/intc/apic.c                  | 211 +++++++++++++++++++++++++-------
 hw/intc/apic_common.c           |   2 +-
 include/hw/i386/apic.h          |   5 +-
 include/hw/i386/apic_internal.h |   2 +-
 4 files changed, 172 insertions(+), 48 deletions(-)

Comments

Igor Mammedov Feb. 24, 2023, 2:29 p.m. UTC | #1
On Tue, 21 Feb 2023 23:04:57 +0700
Bui Quang Minh <minhquangbui99@gmail.com> wrote:

> This commit refactors APIC registers read/write function to support both
> MMIO read/write in xAPIC mode and MSR read/write in x2APIC mode. Also,
> support larger APIC ID, self IPI, new IPI destination determination in
> x2APIC mode.
> 
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> ---
>  hw/intc/apic.c                  | 211 +++++++++++++++++++++++++-------
>  hw/intc/apic_common.c           |   2 +-
>  include/hw/i386/apic.h          |   5 +-
>  include/hw/i386/apic_internal.h |   2 +-
>  4 files changed, 172 insertions(+), 48 deletions(-)
> 
> diff --git a/hw/intc/apic.c b/hw/intc/apic.c
> index 2d3e55f4e2..205d5923ec 100644
> --- a/hw/intc/apic.c
> +++ b/hw/intc/apic.c
> @@ -30,6 +30,7 @@
>  #include "hw/i386/apic-msidef.h"
>  #include "qapi/error.h"
>  #include "qom/object.h"
> +#include "tcg/helper-tcg.h"
>  
>  #define MAX_APICS 255

I'm curious how does it work without increasing ^^^?

>  #define MAX_APIC_WORDS 8
> @@ -48,7 +49,7 @@ DECLARE_INSTANCE_CHECKER(APICCommonState, APIC,
>  static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode);
>  static void apic_update_irq(APICCommonState *s);
>  static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
> -                                      uint8_t dest, uint8_t dest_mode);
> +                                      uint32_t dest, uint8_t dest_mode);
>  
>  /* Find first bit starting from msb */
>  static int apic_fls_bit(uint32_t value)
> @@ -275,7 +276,7 @@ static void apic_bus_deliver(const uint32_t *deliver_bitmask,
>                   apic_set_irq(apic_iter, vector_num, trigger_mode) );
>  }
>  
> -void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
> +void apic_deliver_irq(uint32_t dest, uint8_t dest_mode, uint8_t delivery_mode,
>                        uint8_t vector_num, uint8_t trigger_mode)
>  {
>      uint32_t deliver_bitmask[MAX_APIC_WORDS];
> @@ -287,8 +288,38 @@ void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
>      apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode);
>  }
>  
> +bool is_x2apic_mode(DeviceState *dev)
> +{
> +    APICCommonState *s = APIC(dev);
> +
> +    return s->apicbase & MSR_IA32_APICBASE_EXTD;
> +}
> +
>  static void apic_set_base(APICCommonState *s, uint64_t val)
>  {
> +    /*
> +     * Transition into invalid state
> +     * (s->apicbase & MSR_IA32_APICBASE_ENABLE == 0) &&
> +     * (s->apicbase & MSR_IA32_APICBASE_EXTD) == 1
> +     */
> +    if (!(val & MSR_IA32_APICBASE_ENABLE) &&
> +        (val & MSR_IA32_APICBASE_EXTD))
> +        raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +
> +    /* Invalid transition from disabled mode to x2APIC */
> +    if (!(s->apicbase & MSR_IA32_APICBASE_ENABLE) &&
> +        !(s->apicbase & MSR_IA32_APICBASE_EXTD) &&
> +        (val & MSR_IA32_APICBASE_ENABLE) &&
> +        (val & MSR_IA32_APICBASE_EXTD))
> +        raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +
> +    /* Invalid transition from x2APIC to xAPIC */
> +    if ((s->apicbase & MSR_IA32_APICBASE_ENABLE) &&
> +        (s->apicbase & MSR_IA32_APICBASE_EXTD) &&
> +        (val & MSR_IA32_APICBASE_ENABLE) &&
> +        !(val & MSR_IA32_APICBASE_EXTD))
> +        raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +
>      s->apicbase = (val & 0xfffff000) |
>          (s->apicbase & (MSR_IA32_APICBASE_BSP | MSR_IA32_APICBASE_ENABLE));
>      /* if disabled, cannot be enabled again */
> @@ -297,6 +328,21 @@ static void apic_set_base(APICCommonState *s, uint64_t val)
>          cpu_clear_apic_feature(&s->cpu->env);
>          s->spurious_vec &= ~APIC_SV_ENABLE;
>      }
> +
> +    /* Transition from xAPIC to x2APIC */
> +    if (cpu_has_x2apic_feature(&s->cpu->env) &&
> +        !(s->apicbase & MSR_IA32_APICBASE_EXTD) &&
> +        (val & MSR_IA32_APICBASE_EXTD)) {
> +        s->apicbase |= MSR_IA32_APICBASE_EXTD;
> +
> +        s->log_dest = ((s->initial_apic_id & 0xffff0) << 16) |
> +                      (1 << (s->initial_apic_id & 0xf));
> +
> +        /* disable MMIO interface */
> +        qemu_mutex_lock_iothread();
> +        memory_region_set_enabled(&s->io_memory, false);
> +        qemu_mutex_unlock_iothread();
> +    }
>  }
>  
>  static void apic_set_tpr(APICCommonState *s, uint8_t val)
> @@ -454,7 +500,7 @@ static int apic_find_dest(uint8_t dest)
>  }
>  
>  static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
> -                                      uint8_t dest, uint8_t dest_mode)
> +                                      uint32_t dest, uint8_t dest_mode)
>  {
>      APICCommonState *apic_iter;
>      int i;
> @@ -474,14 +520,22 @@ static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
>          for(i = 0; i < MAX_APICS; i++) {
>              apic_iter = local_apics[i];
>              if (apic_iter) {
> -                if (apic_iter->dest_mode == 0xf) {
> -                    if (dest & apic_iter->log_dest)
> -                        apic_set_bit(deliver_bitmask, i);
> -                } else if (apic_iter->dest_mode == 0x0) {
> -                    if ((dest & 0xf0) == (apic_iter->log_dest & 0xf0) &&
> -                        (dest & apic_iter->log_dest & 0x0f)) {
> +                /* x2APIC mode */
> +                if (apic_iter->apicbase & MSR_IA32_APICBASE_EXTD) {
> +                    if ((dest & 0xffff0000) == (apic_iter->log_dest & 0xffff0000) &&
> +                        (dest & apic_iter->log_dest & 0xffff)) {
>                          apic_set_bit(deliver_bitmask, i);
>                      }
> +                } else {
> +                    if (apic_iter->dest_mode == 0xf) {
> +                        if (dest & apic_iter->log_dest)
> +                            apic_set_bit(deliver_bitmask, i);
> +                    } else if (apic_iter->dest_mode == 0x0) {
> +                        if ((dest & 0xf0) == (apic_iter->log_dest & 0xf0) &&
> +                            (dest & apic_iter->log_dest & 0x0f)) {
> +                            apic_set_bit(deliver_bitmask, i);
> +                        }
> +                    }
>                  }
>              } else {
>                  break;
> @@ -508,13 +562,12 @@ void apic_sipi(DeviceState *dev)
>      s->wait_for_sipi = 0;
>  }
>  
> -static void apic_deliver(DeviceState *dev, uint8_t dest, uint8_t dest_mode,
> +static void apic_deliver(DeviceState *dev, uint32_t dest, uint8_t dest_mode,
>                           uint8_t delivery_mode, uint8_t vector_num,
> -                         uint8_t trigger_mode)
> +                         uint8_t trigger_mode, uint8_t dest_shorthand)
>  {
>      APICCommonState *s = APIC(dev);
>      uint32_t deliver_bitmask[MAX_APIC_WORDS];
> -    int dest_shorthand = (s->icr[0] >> 18) & 3;
>      APICCommonState *apic_iter;
>  
>      switch (dest_shorthand) {
> @@ -635,16 +688,11 @@ static void apic_timer(void *opaque)
>      apic_timer_update(s, s->next_time);
>  }
>  
> -static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
> +uint64_t apic_register_read(int index)
>  {
>      DeviceState *dev;
>      APICCommonState *s;
> -    uint32_t val;
> -    int index;
> -
> -    if (size < 4) {
> -        return 0;
> -    }
> +    uint64_t val;
>  
>      dev = cpu_get_current_apic();
>      if (!dev) {
> @@ -652,10 +700,12 @@ static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
>      }
>      s = APIC(dev);
>  
> -    index = (addr >> 4) & 0xff;
>      switch(index) {
>      case 0x02: /* id */
> -        val = s->id << 24;
> +        if (is_x2apic_mode(dev))
> +            val = s->initial_apic_id;
> +        else
> +            val = s->id << 24;
>          break;
>      case 0x03: /* version */
>          val = s->version | ((APIC_LVT_NB - 1) << 16);
> @@ -678,9 +728,16 @@ static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
>          val = 0;
>          break;
>      case 0x0d:
> -        val = s->log_dest << 24;
> +        if (is_x2apic_mode(dev))
> +            val = s->log_dest;
> +        else
> +            val = s->log_dest << 24;
>          break;
>      case 0x0e:
> +        if (is_x2apic_mode(dev)) {
> +            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +        }
> +
>          val = (s->dest_mode << 28) | 0xfffffff;
>          break;
>      case 0x0f:
> @@ -719,6 +776,22 @@ static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
>          val = 0;
>          break;
>      }
> +
> +    return val;
> +}
> +
> +static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    uint32_t val;
> +    int index;
> +
> +    if (size < 4) {
> +        return 0;
> +    }
> +
> +    index = (addr >> 4) & 0xff;
> +    val = (uint32_t) apic_register_read(index);
> +
>      trace_apic_mem_readl(addr, val);
>      return val;
>  }
> @@ -736,27 +809,10 @@ static void apic_send_msi(MSIMessage *msi)
>      apic_deliver_irq(dest, dest_mode, delivery, vector, trigger_mode);
>  }
>  
> -static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
> -                           unsigned size)
> +void apic_register_write(int index, uint64_t val)
>  {
>      DeviceState *dev;
>      APICCommonState *s;
> -    int index = (addr >> 4) & 0xff;
> -
> -    if (size < 4) {
> -        return;
> -    }
> -
> -    if (addr > 0xfff || !index) {
> -        /* MSI and MMIO APIC are at the same memory location,
> -         * but actually not on the global bus: MSI is on PCI bus
> -         * APIC is connected directly to the CPU.
> -         * Mapping them on the global bus happens to work because
> -         * MSI registers are reserved in APIC MMIO and vice versa. */
> -        MSIMessage msi = { .address = addr, .data = val };
> -        apic_send_msi(&msi);
> -        return;
> -    }
>  
>      dev = cpu_get_current_apic();
>      if (!dev) {
> @@ -764,10 +820,12 @@ static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
>      }
>      s = APIC(dev);
>  
> -    trace_apic_mem_writel(addr, val);
> -
>      switch(index) {
>      case 0x02:
> +        if (is_x2apic_mode(dev)) {
> +            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +        }
> +
>          s->id = (val >> 24);
>          break;
>      case 0x03:
> @@ -787,9 +845,17 @@ static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
>          apic_eoi(s);
>          break;
>      case 0x0d:
> +        if (is_x2apic_mode(dev)) {
> +            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +        }
> +
>          s->log_dest = val >> 24;
>          break;
>      case 0x0e:
> +        if (is_x2apic_mode(dev)) {
> +            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +        }
> +
>          s->dest_mode = val >> 28;
>          break;
>      case 0x0f:
> @@ -801,13 +867,27 @@ static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
>      case 0x20 ... 0x27:
>      case 0x28:
>          break;
> -    case 0x30:
> +    case 0x30: {
> +        uint32_t dest;
> +
>          s->icr[0] = val;
> -        apic_deliver(dev, (s->icr[1] >> 24) & 0xff, (s->icr[0] >> 11) & 1,
> +        if (is_x2apic_mode(dev)) {
> +            s->icr[1] = val >> 32;
> +            dest = s->icr[1];
> +        } else {
> +            dest = (s->icr[1] >> 24) & 0xff;
> +        }
> +
> +        apic_deliver(dev, dest, (s->icr[0] >> 11) & 1,
>                       (s->icr[0] >> 8) & 7, (s->icr[0] & 0xff),
> -                     (s->icr[0] >> 15) & 1);
> +                     (s->icr[0] >> 15) & 1, (s->icr[0] >> 18) & 3);
>          break;
> +    }
>      case 0x31:
> +        if (is_x2apic_mode(dev)) {
> +            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +        }
> +
>          s->icr[1] = val;
>          break;
>      case 0x32 ... 0x37:
> @@ -836,12 +916,53 @@ static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
>              s->count_shift = (v + 1) & 7;
>          }
>          break;
> +    case 0x3f: {
> +        int vector = val & 0xff;
> +
> +        if (is_x2apic_mode(dev)) {
> +            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
> +        }
> +
> +        /*
> +         * Self IPI is identical to IPI with
> +         * - Destination shorthand: 1 (Self)
> +         * - Trigger mode: 0 (Edge)
> +         * - Delivery mode: 0 (Fixed)
> +         */
> +        apic_deliver(dev, 0, 0, APIC_DM_FIXED, vector, 0, 1);
> +
> +        break;
> +    }
>      default:
>          s->esr |= APIC_ESR_ILLEGAL_ADDRESS;
>          break;
>      }
>  }
>  
> +static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
> +                           unsigned size)
> +{
> +    int index = (addr >> 4) & 0xff;
> +
> +    if (size < 4) {
> +        return;
> +    }
> +
> +    if (addr > 0xfff || !index) {
> +        /* MSI and MMIO APIC are at the same memory location,
> +         * but actually not on the global bus: MSI is on PCI bus
> +         * APIC is connected directly to the CPU.
> +         * Mapping them on the global bus happens to work because
> +         * MSI registers are reserved in APIC MMIO and vice versa. */
> +        MSIMessage msi = { .address = addr, .data = val };
> +        apic_send_msi(&msi);
> +        return;
> +    }
> +
> +    trace_apic_mem_writel(addr, val);
> +    apic_register_write(index, val);
> +}
> +
>  static void apic_pre_save(APICCommonState *s)
>  {
>      apic_sync_vapic(s, SYNC_FROM_VAPIC);
> diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c
> index 4a34f03047..9ea1397530 100644
> --- a/hw/intc/apic_common.c
> +++ b/hw/intc/apic_common.c
> @@ -366,7 +366,7 @@ static const VMStateDescription vmstate_apic_common = {
>          VMSTATE_UINT8(arb_id, APICCommonState),
>          VMSTATE_UINT8(tpr, APICCommonState),
>          VMSTATE_UINT32(spurious_vec, APICCommonState),
> -        VMSTATE_UINT8(log_dest, APICCommonState),
> +        VMSTATE_UINT32(log_dest, APICCommonState),
>          VMSTATE_UINT8(dest_mode, APICCommonState),
>          VMSTATE_UINT32_ARRAY(isr, APICCommonState, 8),
>          VMSTATE_UINT32_ARRAY(tmr, APICCommonState, 8),
> diff --git a/include/hw/i386/apic.h b/include/hw/i386/apic.h
> index bdc15a7a73..871ca94c5c 100644
> --- a/include/hw/i386/apic.h
> +++ b/include/hw/i386/apic.h
> @@ -3,7 +3,7 @@
>  
>  
>  /* apic.c */
> -void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
> +void apic_deliver_irq(uint32_t dest, uint8_t dest_mode, uint8_t delivery_mode,
>                        uint8_t vector_num, uint8_t trigger_mode);
>  int apic_accept_pic_intr(DeviceState *s);
>  void apic_deliver_pic_intr(DeviceState *s, int level);
> @@ -18,6 +18,9 @@ void apic_sipi(DeviceState *s);
>  void apic_poll_irq(DeviceState *d);
>  void apic_designate_bsp(DeviceState *d, bool bsp);
>  int apic_get_highest_priority_irr(DeviceState *dev);
> +uint64_t apic_register_read(int index);
> +void apic_register_write(int index, uint64_t val);
> +bool is_x2apic_mode(DeviceState *d);
>  
>  /* pc.c */
>  DeviceState *cpu_get_current_apic(void);
> diff --git a/include/hw/i386/apic_internal.h b/include/hw/i386/apic_internal.h
> index 5f2ba24bfc..5f60cd5e78 100644
> --- a/include/hw/i386/apic_internal.h
> +++ b/include/hw/i386/apic_internal.h
> @@ -164,7 +164,7 @@ struct APICCommonState {
>      uint8_t arb_id;
>      uint8_t tpr;
>      uint32_t spurious_vec;
> -    uint8_t log_dest;
> +    uint32_t log_dest;
>      uint8_t dest_mode;
>      uint32_t isr[8];  /* in service register */
>      uint32_t tmr[8];  /* trigger mode register */
Bui Quang Minh Feb. 25, 2023, 10:15 a.m. UTC | #2
On 2/24/23 21:29, Igor Mammedov wrote:
> On Tue, 21 Feb 2023 23:04:57 +0700
> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> 
>> This commit refactors APIC registers read/write function to support both
>> MMIO read/write in xAPIC mode and MSR read/write in x2APIC mode. Also,
>> support larger APIC ID, self IPI, new IPI destination determination in
>> x2APIC mode.
>>
>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>> ---
>>   hw/intc/apic.c                  | 211 +++++++++++++++++++++++++-------
>>   hw/intc/apic_common.c           |   2 +-
>>   include/hw/i386/apic.h          |   5 +-
>>   include/hw/i386/apic_internal.h |   2 +-
>>   4 files changed, 172 insertions(+), 48 deletions(-)
>>
>> diff --git a/hw/intc/apic.c b/hw/intc/apic.c
>> index 2d3e55f4e2..205d5923ec 100644
>> --- a/hw/intc/apic.c
>> +++ b/hw/intc/apic.c
>> @@ -30,6 +30,7 @@
>>   #include "hw/i386/apic-msidef.h"
>>   #include "qapi/error.h"
>>   #include "qom/object.h"
>> +#include "tcg/helper-tcg.h"
>>   
>>   #define MAX_APICS 255
> 
> I'm curious how does it work without increasing ^^^?

Hmm, my commit message is not entirely correct. In this series, some 
operations (send IPI, IPI destination determination) have been updated 
to support x2APIC mode. However, the emulated APIC still doesn't support 
APIC ID larger than 255 because currently, we use a fixed length (255 + 
1) array to manage local APICs. So to support larger APIC ID, I think we 
need to find any way to manage those, as the possible allocated APIC ID 
range is large and maybe the allocated APIC ID is sparse which makes 
fixed length array so wasteful.

Thanks,
Quang Minh.
Igor Mammedov Feb. 27, 2023, 4:07 p.m. UTC | #3
On Sat, 25 Feb 2023 17:15:17 +0700
Bui Quang Minh <minhquangbui99@gmail.com> wrote:

> On 2/24/23 21:29, Igor Mammedov wrote:
> > On Tue, 21 Feb 2023 23:04:57 +0700
> > Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> >   
> >> This commit refactors APIC registers read/write function to support both
> >> MMIO read/write in xAPIC mode and MSR read/write in x2APIC mode. Also,
> >> support larger APIC ID, self IPI, new IPI destination determination in
> >> x2APIC mode.
> >>
> >> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> >> ---
> >>   hw/intc/apic.c                  | 211 +++++++++++++++++++++++++-------
> >>   hw/intc/apic_common.c           |   2 +-
> >>   include/hw/i386/apic.h          |   5 +-
> >>   include/hw/i386/apic_internal.h |   2 +-
> >>   4 files changed, 172 insertions(+), 48 deletions(-)
> >>
> >> diff --git a/hw/intc/apic.c b/hw/intc/apic.c
> >> index 2d3e55f4e2..205d5923ec 100644
> >> --- a/hw/intc/apic.c
> >> +++ b/hw/intc/apic.c
> >> @@ -30,6 +30,7 @@
> >>   #include "hw/i386/apic-msidef.h"
> >>   #include "qapi/error.h"
> >>   #include "qom/object.h"
> >> +#include "tcg/helper-tcg.h"
> >>   
> >>   #define MAX_APICS 255  
> > 
> > I'm curious how does it work without increasing ^^^?  
> 
> Hmm, my commit message is not entirely correct. In this series, some 
> operations (send IPI, IPI destination determination) have been updated 
> to support x2APIC mode. However, the emulated APIC still doesn't support 
> APIC ID larger than 255 because currently, we use a fixed length (255 + 
> 1) array to manage local APICs. So to support larger APIC ID, I think we 
> need to find any way to manage those, as the possible allocated APIC ID 
> range is large and maybe the allocated APIC ID is sparse which makes 
> fixed length array so wasteful.
how much sparse it is? 

benefits of simple static array is simplicity in management and O(1) access time.
QEMU does know in advance max apic id so we can size array by dynamically
allocating it when 1st apic is created. Or if IDs are too sparse
switch to another structure to keep mapping.


> 
> Thanks,
> Quang Minh.
>
Bui Quang Minh Feb. 28, 2023, 2:34 p.m. UTC | #4
On 2/27/23 23:07, Igor Mammedov wrote:
> On Sat, 25 Feb 2023 17:15:17 +0700
> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> 
>> On 2/24/23 21:29, Igor Mammedov wrote:
>>> On Tue, 21 Feb 2023 23:04:57 +0700
>>> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>>>    
>>>> This commit refactors APIC registers read/write function to support both
>>>> MMIO read/write in xAPIC mode and MSR read/write in x2APIC mode. Also,
>>>> support larger APIC ID, self IPI, new IPI destination determination in
>>>> x2APIC mode.
>>>>
>>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>>>> ---
>>>>    hw/intc/apic.c                  | 211 +++++++++++++++++++++++++-------
>>>>    hw/intc/apic_common.c           |   2 +-
>>>>    include/hw/i386/apic.h          |   5 +-
>>>>    include/hw/i386/apic_internal.h |   2 +-
>>>>    4 files changed, 172 insertions(+), 48 deletions(-)
>>>>
>>>> diff --git a/hw/intc/apic.c b/hw/intc/apic.c
>>>> index 2d3e55f4e2..205d5923ec 100644
>>>> --- a/hw/intc/apic.c
>>>> +++ b/hw/intc/apic.c
>>>> @@ -30,6 +30,7 @@
>>>>    #include "hw/i386/apic-msidef.h"
>>>>    #include "qapi/error.h"
>>>>    #include "qom/object.h"
>>>> +#include "tcg/helper-tcg.h"
>>>>    
>>>>    #define MAX_APICS 255
>>>
>>> I'm curious how does it work without increasing ^^^?
>>
>> Hmm, my commit message is not entirely correct. In this series, some
>> operations (send IPI, IPI destination determination) have been updated
>> to support x2APIC mode. However, the emulated APIC still doesn't support
>> APIC ID larger than 255 because currently, we use a fixed length (255 +
>> 1) array to manage local APICs. So to support larger APIC ID, I think we
>> need to find any way to manage those, as the possible allocated APIC ID
>> range is large and maybe the allocated APIC ID is sparse which makes
>> fixed length array so wasteful.
> how much sparse it is?

As far as I know, QEMU allows to set CPU's APIC ID, so user can pass a 
very sparse APIC ID array.

> benefits of simple static array is simplicity in management and O(1) access time.
> QEMU does know in advance max apic id so we can size array by dynamically
> allocating it when 1st apic is created. Or if IDs are too sparse
> switch to another structure to keep mapping.

I totally agree with this.

I admit that my main focus on this series is to make x2APIC mode 
function correctly with TCG accelerator, so I skip the part of extending 
the support for higher APIC ID.

Thanks,
Quang Minh.
Igor Mammedov Feb. 28, 2023, 4:39 p.m. UTC | #5
On Tue, 28 Feb 2023 21:34:33 +0700
Bui Quang Minh <minhquangbui99@gmail.com> wrote:

> On 2/27/23 23:07, Igor Mammedov wrote:
> > On Sat, 25 Feb 2023 17:15:17 +0700
> > Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> >   
> >> On 2/24/23 21:29, Igor Mammedov wrote:  
> >>> On Tue, 21 Feb 2023 23:04:57 +0700
> >>> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> >>>      
> >>>> This commit refactors APIC registers read/write function to support both
> >>>> MMIO read/write in xAPIC mode and MSR read/write in x2APIC mode. Also,
> >>>> support larger APIC ID, self IPI, new IPI destination determination in
> >>>> x2APIC mode.
> >>>>
> >>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> >>>> ---
> >>>>    hw/intc/apic.c                  | 211 +++++++++++++++++++++++++-------
> >>>>    hw/intc/apic_common.c           |   2 +-
> >>>>    include/hw/i386/apic.h          |   5 +-
> >>>>    include/hw/i386/apic_internal.h |   2 +-
> >>>>    4 files changed, 172 insertions(+), 48 deletions(-)
> >>>>
> >>>> diff --git a/hw/intc/apic.c b/hw/intc/apic.c
> >>>> index 2d3e55f4e2..205d5923ec 100644
> >>>> --- a/hw/intc/apic.c
> >>>> +++ b/hw/intc/apic.c
> >>>> @@ -30,6 +30,7 @@
> >>>>    #include "hw/i386/apic-msidef.h"
> >>>>    #include "qapi/error.h"
> >>>>    #include "qom/object.h"
> >>>> +#include "tcg/helper-tcg.h"
> >>>>    
> >>>>    #define MAX_APICS 255  
> >>>
> >>> I'm curious how does it work without increasing ^^^?  
> >>
> >> Hmm, my commit message is not entirely correct. In this series, some
> >> operations (send IPI, IPI destination determination) have been updated
> >> to support x2APIC mode. However, the emulated APIC still doesn't support
> >> APIC ID larger than 255 because currently, we use a fixed length (255 +
> >> 1) array to manage local APICs. So to support larger APIC ID, I think we
> >> need to find any way to manage those, as the possible allocated APIC ID
> >> range is large and maybe the allocated APIC ID is sparse which makes
> >> fixed length array so wasteful.  
> > how much sparse it is?  
> 
> As far as I know, QEMU allows to set CPU's APIC ID, so user can pass a 
> very sparse APIC ID array.

I don't think that it does permit this (if it does it's a bug that should be fixed).

As far as I'm aware QEMU derives apic_id from '-smp' and possibly cpu type
(there was some differences between Intel and AMD in how apic id was encoded
notably AMD having threads or cores that lead to sparse apic id), though I don't
remember current state of affairs in x86 cpu topo code.

> > benefits of simple static array is simplicity in management and O(1) access time.
> > QEMU does know in advance max apic id so we can size array by dynamically
> > allocating it when 1st apic is created. Or if IDs are too sparse
> > switch to another structure to keep mapping.  
> 
> I totally agree with this.
> 
> I admit that my main focus on this series is to make x2APIC mode 
> function correctly with TCG accelerator, so I skip the part of extending 
> the support for higher APIC ID.
the tricky part in such half approach is making sure that the code is
'correct' and won't lead to exploits.
It would be easier to review if it was completed solution instead of partial.  


> Thanks,
> Quang Minh.
>
Bui Quang Minh March 4, 2023, 2:10 p.m. UTC | #6
On 2/28/23 23:39, Igor Mammedov wrote:
> On Tue, 28 Feb 2023 21:34:33 +0700
> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> 
>> On 2/27/23 23:07, Igor Mammedov wrote:
>>> On Sat, 25 Feb 2023 17:15:17 +0700
>>> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>>>    
>>>> On 2/24/23 21:29, Igor Mammedov wrote:
>>>>> On Tue, 21 Feb 2023 23:04:57 +0700
>>>>> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>>>>>       
>>>>>> This commit refactors APIC registers read/write function to support both
>>>>>> MMIO read/write in xAPIC mode and MSR read/write in x2APIC mode. Also,
>>>>>> support larger APIC ID, self IPI, new IPI destination determination in
>>>>>> x2APIC mode.
>>>>>>
>>>>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>>>>>> ---
>>>>>>     hw/intc/apic.c                  | 211 +++++++++++++++++++++++++-------
>>>>>>     hw/intc/apic_common.c           |   2 +-
>>>>>>     include/hw/i386/apic.h          |   5 +-
>>>>>>     include/hw/i386/apic_internal.h |   2 +-
>>>>>>     4 files changed, 172 insertions(+), 48 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/intc/apic.c b/hw/intc/apic.c
>>>>>> index 2d3e55f4e2..205d5923ec 100644
>>>>>> --- a/hw/intc/apic.c
>>>>>> +++ b/hw/intc/apic.c
>>>>>> @@ -30,6 +30,7 @@
>>>>>>     #include "hw/i386/apic-msidef.h"
>>>>>>     #include "qapi/error.h"
>>>>>>     #include "qom/object.h"
>>>>>> +#include "tcg/helper-tcg.h"
>>>>>>     
>>>>>>     #define MAX_APICS 255
>>>>>
>>>>> I'm curious how does it work without increasing ^^^?
>>>>
>>>> Hmm, my commit message is not entirely correct. In this series, some
>>>> operations (send IPI, IPI destination determination) have been updated
>>>> to support x2APIC mode. However, the emulated APIC still doesn't support
>>>> APIC ID larger than 255 because currently, we use a fixed length (255 +
>>>> 1) array to manage local APICs. So to support larger APIC ID, I think we
>>>> need to find any way to manage those, as the possible allocated APIC ID
>>>> range is large and maybe the allocated APIC ID is sparse which makes
>>>> fixed length array so wasteful.
>>> how much sparse it is?
>>
>> As far as I know, QEMU allows to set CPU's APIC ID, so user can pass a
>> very sparse APIC ID array.
> 
> I don't think that it does permit this (if it does it's a bug that should be fixed).
> 
> As far as I'm aware QEMU derives apic_id from '-smp' and possibly cpu type
> (there was some differences between Intel and AMD in how apic id was encoded
> notably AMD having threads or cores that lead to sparse apic id), though I don't
> remember current state of affairs in x86 cpu topo code.
> 
>>> benefits of simple static array is simplicity in management and O(1) access time.
>>> QEMU does know in advance max apic id so we can size array by dynamically
>>> allocating it when 1st apic is created. Or if IDs are too sparse
>>> switch to another structure to keep mapping.
>>
>> I totally agree with this.
>>
>> I admit that my main focus on this series is to make x2APIC mode
>> function correctly with TCG accelerator, so I skip the part of extending
>> the support for higher APIC ID.
> the tricky part in such half approach is making sure that the code is
> 'correct' and won't lead to exploits.
> It would be easier to review if it was completed solution instead of partial.

I looked around and found the way to dynamically allocate local_apics array

	void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
	{
		if (!kvm_irqchip_in_kernel()) {
         		apic_set_max_apic_id(x86ms->apic_id_limit);
     		}

	}

We already calculated apic_id_limit before creating CPU and local APIC 
so we can use that number to dynamically allocated local_apics.

However, there are still problems while trying to extending support to 
APIC ID larger than 255 because there are many places assume APIC ID is 
8-bit long. One of that is interrupt remapping which returns 32-bit 
destination ID but uses MSI (which has 8-bit destination) to send to 
APIC. I will look more into this.

Thanks,
Quang Minh.
Igor Mammedov March 6, 2023, 10:43 a.m. UTC | #7
On Sat, 4 Mar 2023 21:10:54 +0700
Bui Quang Minh <minhquangbui99@gmail.com> wrote:

> On 2/28/23 23:39, Igor Mammedov wrote:
> > On Tue, 28 Feb 2023 21:34:33 +0700
> > Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> >   
> >> On 2/27/23 23:07, Igor Mammedov wrote:  
> >>> On Sat, 25 Feb 2023 17:15:17 +0700
> >>> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> >>>      
> >>>> On 2/24/23 21:29, Igor Mammedov wrote:  
> >>>>> On Tue, 21 Feb 2023 23:04:57 +0700
> >>>>> Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> >>>>>         
> >>>>>> This commit refactors APIC registers read/write function to support both
> >>>>>> MMIO read/write in xAPIC mode and MSR read/write in x2APIC mode. Also,
> >>>>>> support larger APIC ID, self IPI, new IPI destination determination in
> >>>>>> x2APIC mode.
> >>>>>>
> >>>>>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> >>>>>> ---
> >>>>>>     hw/intc/apic.c                  | 211 +++++++++++++++++++++++++-------
> >>>>>>     hw/intc/apic_common.c           |   2 +-
> >>>>>>     include/hw/i386/apic.h          |   5 +-
> >>>>>>     include/hw/i386/apic_internal.h |   2 +-
> >>>>>>     4 files changed, 172 insertions(+), 48 deletions(-)
> >>>>>>
> >>>>>> diff --git a/hw/intc/apic.c b/hw/intc/apic.c
> >>>>>> index 2d3e55f4e2..205d5923ec 100644
> >>>>>> --- a/hw/intc/apic.c
> >>>>>> +++ b/hw/intc/apic.c
> >>>>>> @@ -30,6 +30,7 @@
> >>>>>>     #include "hw/i386/apic-msidef.h"
> >>>>>>     #include "qapi/error.h"
> >>>>>>     #include "qom/object.h"
> >>>>>> +#include "tcg/helper-tcg.h"
> >>>>>>     
> >>>>>>     #define MAX_APICS 255  
> >>>>>
> >>>>> I'm curious how does it work without increasing ^^^?  
> >>>>
> >>>> Hmm, my commit message is not entirely correct. In this series, some
> >>>> operations (send IPI, IPI destination determination) have been updated
> >>>> to support x2APIC mode. However, the emulated APIC still doesn't support
> >>>> APIC ID larger than 255 because currently, we use a fixed length (255 +
> >>>> 1) array to manage local APICs. So to support larger APIC ID, I think we
> >>>> need to find any way to manage those, as the possible allocated APIC ID
> >>>> range is large and maybe the allocated APIC ID is sparse which makes
> >>>> fixed length array so wasteful.  
> >>> how much sparse it is?  
> >>
> >> As far as I know, QEMU allows to set CPU's APIC ID, so user can pass a
> >> very sparse APIC ID array.  
> > 
> > I don't think that it does permit this (if it does it's a bug that should be fixed).
> > 
> > As far as I'm aware QEMU derives apic_id from '-smp' and possibly cpu type
> > (there was some differences between Intel and AMD in how apic id was encoded
> > notably AMD having threads or cores that lead to sparse apic id), though I don't
> > remember current state of affairs in x86 cpu topo code.
> >   
> >>> benefits of simple static array is simplicity in management and O(1) access time.
> >>> QEMU does know in advance max apic id so we can size array by dynamically
> >>> allocating it when 1st apic is created. Or if IDs are too sparse
> >>> switch to another structure to keep mapping.  
> >>
> >> I totally agree with this.
> >>
> >> I admit that my main focus on this series is to make x2APIC mode
> >> function correctly with TCG accelerator, so I skip the part of extending
> >> the support for higher APIC ID.  
> > the tricky part in such half approach is making sure that the code is
> > 'correct' and won't lead to exploits.
> > It would be easier to review if it was completed solution instead of partial.  
> 
> I looked around and found the way to dynamically allocate local_apics array
> 
> 	void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
> 	{
> 		if (!kvm_irqchip_in_kernel()) {
>          		apic_set_max_apic_id(x86ms->apic_id_limit);
>      		}
> 
> 	}
> 
> We already calculated apic_id_limit before creating CPU and local APIC 
> so we can use that number to dynamically allocated local_apics.
> 
> However, there are still problems while trying to extending support to 
> APIC ID larger than 255 because there are many places assume APIC ID is 
> 8-bit long.
that's what I was concerned about (i.e. just enabling x2apic without fixing
with all code that just assumes 8bit apicid).

> One of that is interrupt remapping which returns 32-bit 
> destination ID but uses MSI (which has 8-bit destination) to send to 
> APIC. I will look more into this.

Thanks!

> 
> Thanks,
> Quang Minh.
>
David Woodhouse March 6, 2023, 3:51 p.m. UTC | #8
On Mon, 2023-03-06 at 11:43 +0100, Igor Mammedov wrote:
> > However, there are still problems while trying to extending support to 
> > APIC ID larger than 255 because there are many places assume APIC ID is 
> > 8-bit long.
>
> that's what I was concerned about (i.e. just enabling x2apic without fixing
> with all code that just assumes 8bit apicid).

Even before you extend the physical APIC IDs past 254 or 255, there's
still the issue that 255 isn't a broadcast in X2APIC mode. And that
*logical* addressing will use more than 8 bits even when the physical
IDs are lower.

> > One of that is interrupt remapping which returns 32-bit 
> > destination ID but uses MSI (which has 8-bit destination) to send to 
> > APIC. I will look more into this.

The translated 'output' from the interrupt remapping doesn't "use MSI",
in the sense of a write transaction which happens to go to addresses in
the 0x00000000FEExxxxx range. The *input* to interrupt remapping comes
from that intercept.

The output is already "known" to be an MSI message, with a full 64-bit
address and 32-bit data, and the KVM API puts the high 24 bits of the
target APIC ID into the high word of the address.

If you look at the "generic" x86_iommu_irq_to_msi_message() in
hw/intc/x86-iommu.c you'll see it's also using the same trick:

    msg.__addr_hi = irq->dest & 0xffffff00;


That hack isn't guest-visible. There *is* a trick which is guest-
visible, implemented by both Hyper-V and KVM, which is to recognise
that actually there was an 'Extended Destination ID' field in bits 4-11
of the actual 32-bit MSI. Intel eventually used the low bit as the
selector for 'Remappable Format' MSI messages, but we've used the other
seven to extend the APIC ID to 15 bits even in a guest-visible way,
allowing up to 32768 CPUs without having to expose a virtual IOMMU.

But that should get translated to the KVM format with the high bits in
the top 32 bits of the address, fairly much transparently. All you need
to do is ensure that the TCG X2APIC support takes that KVM format, and
that it all enabled in the right circumstances.
David Woodhouse March 6, 2023, 4:01 p.m. UTC | #9
On Tue, 2023-02-21 at 23:04 +0700, Bui Quang Minh wrote:
> @@ -454,7 +500,7 @@ static int apic_find_dest(uint8_t dest)
>  }
>  
>  static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
> -                                      uint8_t dest, uint8_t dest_mode)
> +                                      uint32_t dest, uint8_t dest_mode)
>  {
>      APICCommonState *apic_iter;
>      int i;


I think somewhere here between these two hunks, you've forgotten to
stop interpreting 0xFF as broadcast when you're in X2APIC mode.

> @@ -474,14 +520,22 @@ static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
>          for(i = 0; i < MAX_APICS; i++) {
>              apic_iter = local_apics[i];
>              if (apic_iter) {
> -                if (apic_iter->dest_mode == 0xf) {
> -                    if (dest & apic_iter->log_dest)
> -                        apic_set_bit(deliver_bitmask, i);
> -                } else if (apic_iter->dest_mode == 0x0) {
> -                    if ((dest & 0xf0) == (apic_iter->log_dest & 0xf0) &&
> -                        (dest & apic_iter->log_dest & 0x0f)) {
> +                /* x2APIC mode */
> +                if (apic_iter->apicbase & MSR_IA32_APICBASE_EXTD) {
> +                    if ((dest & 0xffff0000) == (apic_iter->log_dest & 0xffff0000) &&
> +                        (dest & apic_iter->log_dest & 0xffff)) {
Bui Quang Minh March 6, 2023, 4:39 p.m. UTC | #10
On 3/6/23 22:51, David Woodhouse wrote:
> On Mon, 2023-03-06 at 11:43 +0100, Igor Mammedov wrote:
>>> However, there are still problems while trying to extending support to
>>> APIC ID larger than 255 because there are many places assume APIC ID is
>>> 8-bit long.
>>
>> that's what I was concerned about (i.e. just enabling x2apic without fixing
>> with all code that just assumes 8bit apicid).
> 
> Even before you extend the physical APIC IDs past 254 or 255, there's
> still the issue that 255 isn't a broadcast in X2APIC mode. And that
> *logical* addressing will use more than 8 bits even when the physical
> IDs are lower.
> 
>>> One of that is interrupt remapping which returns 32-bit
>>> destination ID but uses MSI (which has 8-bit destination) to send to
>>> APIC. I will look more into this.
> 
> The translated 'output' from the interrupt remapping doesn't "use MSI",
> in the sense of a write transaction which happens to go to addresses in
> the 0x00000000FEExxxxx range. The *input* to interrupt remapping comes
> from that intercept.
> 
> The output is already "known" to be an MSI message, with a full 64-bit
> address and 32-bit data, and the KVM API puts the high 24 bits of the
> target APIC ID into the high word of the address.
> 
> If you look at the "generic" x86_iommu_irq_to_msi_message() in
> hw/intc/x86-iommu.c you'll see it's also using the same trick:
> 
>      msg.__addr_hi = irq->dest & 0xffffff00;

Yeah, I see that trick too, with this hunk and temporarily disable 
broadcast destination id 0xff in physical mode, I am able to boot Linux 
with 255 CPUs (I can't see how to use few CPUs but configure with high 
APIC ID)

@@ -814,7 +816,12 @@ static void apic_send_msi(MSIMessage *msi)
  {
      uint64_t addr = msi->address;
      uint32_t data = msi->data;
-    uint8_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> 
MSI_ADDR_DEST_ID_SHIFT;
+    uint32_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> 
MSI_ADDR_DEST_ID_SHIFT;
+    /*
+     * The higher 3 bytes of destination id is stored in higher word of
+     * msi address. See x86_iommu_irq_to_msi_message()
+     */
+    dest = dest | (addr >> 32);

I am reading the manual now, looks like broadcast destination id in 
x2APIC is 0xffff_ffff in both physical and logic mode.

By the way, thank you very much for your review.
Quang Minh
David Woodhouse March 6, 2023, 4:50 p.m. UTC | #11
On Mon, 2023-03-06 at 23:39 +0700, Bui Quang Minh wrote:
> On 3/6/23 22:51, David Woodhouse wrote:
> > On Mon, 2023-03-06 at 11:43 +0100, Igor Mammedov wrote:
> > > > However, there are still problems while trying to extending support to
> > > > APIC ID larger than 255 because there are many places assume APIC ID is
> > > > 8-bit long.
> > > 
> > > that's what I was concerned about (i.e. just enabling x2apic without fixing
> > > with all code that just assumes 8bit apicid).
> > 
> > Even before you extend the physical APIC IDs past 254 or 255, there's
> > still the issue that 255 isn't a broadcast in X2APIC mode. And that
> > *logical* addressing will use more than 8 bits even when the physical
> > IDs are lower.
> > 
> > > > One of that is interrupt remapping which returns 32-bit
> > > > destination ID but uses MSI (which has 8-bit destination) to send to
> > > > APIC. I will look more into this.
> > 
> > The translated 'output' from the interrupt remapping doesn't "use MSI",
> > in the sense of a write transaction which happens to go to addresses in
> > the 0x00000000FEExxxxx range. The *input* to interrupt remapping comes
> > from that intercept.
> > 
> > The output is already "known" to be an MSI message, with a full 64-bit
> > address and 32-bit data, and the KVM API puts the high 24 bits of the
> > target APIC ID into the high word of the address.
> > 
> > If you look at the "generic" x86_iommu_irq_to_msi_message() in
> > hw/intc/x86-iommu.c you'll see it's also using the same trick:
> > 
> >      msg.__addr_hi = irq->dest & 0xffffff00;
> 
> Yeah, I see that trick too, with this hunk and temporarily disable 
> broadcast destination id 0xff in physical mode, I am able to boot Linux 
> with 255 CPUs (I can't see how to use few CPUs but configure with high 
> APIC ID)

I never worked out how to explicitly assign high APIC IDs but you can
at least spread them out by explicitly setting the topology to
something weird like sockets=17,cores=3,threads=3 so that some APIC IDs
get skipped.

Of course, that doesn't let you exercise the interesting corner case of
physical APIC ID 0xff though. I wonder if there's a way of doing it
such that only CPU#0 and CPU#255 are *online* at boot, even if the rest
theoretically exist? 

> @@ -814,7 +816,12 @@ static void apic_send_msi(MSIMessage *msi)
>   {
>       uint64_t addr = msi->address;
>       uint32_t data = msi->data;
> -    uint8_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
> +    uint32_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
> +    /*
> +     * The higher 3 bytes of destination id is stored in higher word of
> +     * msi address. See x86_iommu_irq_to_msi_message()
> +     */
> +    dest = dest | (addr >> 32);
> 
> I am reading the manual now, looks like broadcast destination id in 
> x2APIC is 0xffff_ffff in both physical and logic mode.

Yep, that looks about right.
Igor Mammedov March 7, 2023, 11:25 a.m. UTC | #12
On Mon, 06 Mar 2023 15:51:45 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> On Mon, 2023-03-06 at 11:43 +0100, Igor Mammedov wrote:
> > > However, there are still problems while trying to extending support to 
> > > APIC ID larger than 255 because there are many places assume APIC ID is 
> > > 8-bit long.  
> >
> > that's what I was concerned about (i.e. just enabling x2apic without fixing
> > with all code that just assumes 8bit apicid).  
> 
> Even before you extend the physical APIC IDs past 254 or 255, there's
> still the issue that 255 isn't a broadcast in X2APIC mode. And that
> *logical* addressing will use more than 8 bits even when the physical
> IDs are lower.
> 
> > > One of that is interrupt remapping which returns 32-bit 
> > > destination ID but uses MSI (which has 8-bit destination) to send to 
> > > APIC. I will look more into this.  
> 
> The translated 'output' from the interrupt remapping doesn't "use MSI",
> in the sense of a write transaction which happens to go to addresses in
> the 0x00000000FEExxxxx range. The *input* to interrupt remapping comes
> from that intercept.
> 
> The output is already "known" to be an MSI message, with a full 64-bit
> address and 32-bit data, and the KVM API puts the high 24 bits of the
> target APIC ID into the high word of the address.
> 
> If you look at the "generic" x86_iommu_irq_to_msi_message() in
> hw/intc/x86-iommu.c you'll see it's also using the same trick:
> 
>     msg.__addr_hi = irq->dest & 0xffffff00;
> 
> 
> That hack isn't guest-visible. There *is* a trick which is guest-
> visible, implemented by both Hyper-V and KVM, which is to recognise
> that actually there was an 'Extended Destination ID' field in bits 4-11
> of the actual 32-bit MSI. Intel eventually used the low bit as the
> selector for 'Remappable Format' MSI messages, but we've used the other
> seven to extend the APIC ID to 15 bits even in a guest-visible way,
> allowing up to 32768 CPUs without having to expose a virtual IOMMU.
> 
> But that should get translated to the KVM format with the high bits in
> the top 32 bits of the address, fairly much transparently. All you need
> to do is ensure that the TCG X2APIC support takes that KVM format, and
> that it all enabled in the right circumstances.
I wouldn't bother with 'Extended Destination ID' KVM trick for emulated
x2apic and just limit impl. to what real hardware does.
Though potentially supporting it in TCG mode could be used for CI tests
to make sure we do not regress it (if someone bothered to write test-cases
for it).
Igor Mammedov March 7, 2023, 11:34 a.m. UTC | #13
On Mon, 06 Mar 2023 16:50:29 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> On Mon, 2023-03-06 at 23:39 +0700, Bui Quang Minh wrote:
> > On 3/6/23 22:51, David Woodhouse wrote:  
> > > On Mon, 2023-03-06 at 11:43 +0100, Igor Mammedov wrote:  
> > > > > However, there are still problems while trying to extending support to
> > > > > APIC ID larger than 255 because there are many places assume APIC ID is
> > > > > 8-bit long.  
> > > > 
> > > > that's what I was concerned about (i.e. just enabling x2apic without fixing
> > > > with all code that just assumes 8bit apicid).  
> > > 
> > > Even before you extend the physical APIC IDs past 254 or 255, there's
> > > still the issue that 255 isn't a broadcast in X2APIC mode. And that
> > > *logical* addressing will use more than 8 bits even when the physical
> > > IDs are lower.
> > >   
> > > > > One of that is interrupt remapping which returns 32-bit
> > > > > destination ID but uses MSI (which has 8-bit destination) to send to
> > > > > APIC. I will look more into this.  
> > > 
> > > The translated 'output' from the interrupt remapping doesn't "use MSI",
> > > in the sense of a write transaction which happens to go to addresses in
> > > the 0x00000000FEExxxxx range. The *input* to interrupt remapping comes
> > > from that intercept.
> > > 
> > > The output is already "known" to be an MSI message, with a full 64-bit
> > > address and 32-bit data, and the KVM API puts the high 24 bits of the
> > > target APIC ID into the high word of the address.
> > > 
> > > If you look at the "generic" x86_iommu_irq_to_msi_message() in
> > > hw/intc/x86-iommu.c you'll see it's also using the same trick:
> > > 
> > >      msg.__addr_hi = irq->dest & 0xffffff00;  
> > 
> > Yeah, I see that trick too, with this hunk and temporarily disable 
> > broadcast destination id 0xff in physical mode, I am able to boot Linux 
> > with 255 CPUs (I can't see how to use few CPUs but configure with high 
> > APIC ID)  
> 
> I never worked out how to explicitly assign high APIC IDs but you can
> at least spread them out by explicitly setting the topology to
> something weird like sockets=17,cores=3,threads=3 so that some APIC IDs
> get skipped.
> 
> Of course, that doesn't let you exercise the interesting corner case of
> physical APIC ID 0xff though. I wonder if there's a way of doing it
> such that only CPU#0 and CPU#255 are *online* at boot, even if the rest
> theoretically exist? 

you can have arbitrary (withing -smp limits) vcpu at startup time by
using -device foo-cpu-type,topo-ids-here (modulo auto-created ones on
behalf -smp X value)

Possible vcpus for given -M/-smp/-cpu combination one can get using
hotpluggable-cpus HMP command or its QMP counterpart.
 
> > @@ -814,7 +816,12 @@ static void apic_send_msi(MSIMessage *msi)
> >   {
> >       uint64_t addr = msi->address;
> >       uint32_t data = msi->data;
> > -    uint8_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
> > +    uint32_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
> > +    /*
> > +     * The higher 3 bytes of destination id is stored in higher word of
> > +     * msi address. See x86_iommu_irq_to_msi_message()
> > +     */
> > +    dest = dest | (addr >> 32);
> > 
> > I am reading the manual now, looks like broadcast destination id in 
> > x2APIC is 0xffff_ffff in both physical and logic mode.  
> 
> Yep, that looks about right.
diff mbox series

Patch

diff --git a/hw/intc/apic.c b/hw/intc/apic.c
index 2d3e55f4e2..205d5923ec 100644
--- a/hw/intc/apic.c
+++ b/hw/intc/apic.c
@@ -30,6 +30,7 @@ 
 #include "hw/i386/apic-msidef.h"
 #include "qapi/error.h"
 #include "qom/object.h"
+#include "tcg/helper-tcg.h"
 
 #define MAX_APICS 255
 #define MAX_APIC_WORDS 8
@@ -48,7 +49,7 @@  DECLARE_INSTANCE_CHECKER(APICCommonState, APIC,
 static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode);
 static void apic_update_irq(APICCommonState *s);
 static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
-                                      uint8_t dest, uint8_t dest_mode);
+                                      uint32_t dest, uint8_t dest_mode);
 
 /* Find first bit starting from msb */
 static int apic_fls_bit(uint32_t value)
@@ -275,7 +276,7 @@  static void apic_bus_deliver(const uint32_t *deliver_bitmask,
                  apic_set_irq(apic_iter, vector_num, trigger_mode) );
 }
 
-void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
+void apic_deliver_irq(uint32_t dest, uint8_t dest_mode, uint8_t delivery_mode,
                       uint8_t vector_num, uint8_t trigger_mode)
 {
     uint32_t deliver_bitmask[MAX_APIC_WORDS];
@@ -287,8 +288,38 @@  void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
     apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode);
 }
 
+bool is_x2apic_mode(DeviceState *dev)
+{
+    APICCommonState *s = APIC(dev);
+
+    return s->apicbase & MSR_IA32_APICBASE_EXTD;
+}
+
 static void apic_set_base(APICCommonState *s, uint64_t val)
 {
+    /*
+     * Transition into invalid state
+     * (s->apicbase & MSR_IA32_APICBASE_ENABLE == 0) &&
+     * (s->apicbase & MSR_IA32_APICBASE_EXTD) == 1
+     */
+    if (!(val & MSR_IA32_APICBASE_ENABLE) &&
+        (val & MSR_IA32_APICBASE_EXTD))
+        raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+
+    /* Invalid transition from disabled mode to x2APIC */
+    if (!(s->apicbase & MSR_IA32_APICBASE_ENABLE) &&
+        !(s->apicbase & MSR_IA32_APICBASE_EXTD) &&
+        (val & MSR_IA32_APICBASE_ENABLE) &&
+        (val & MSR_IA32_APICBASE_EXTD))
+        raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+
+    /* Invalid transition from x2APIC to xAPIC */
+    if ((s->apicbase & MSR_IA32_APICBASE_ENABLE) &&
+        (s->apicbase & MSR_IA32_APICBASE_EXTD) &&
+        (val & MSR_IA32_APICBASE_ENABLE) &&
+        !(val & MSR_IA32_APICBASE_EXTD))
+        raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+
     s->apicbase = (val & 0xfffff000) |
         (s->apicbase & (MSR_IA32_APICBASE_BSP | MSR_IA32_APICBASE_ENABLE));
     /* if disabled, cannot be enabled again */
@@ -297,6 +328,21 @@  static void apic_set_base(APICCommonState *s, uint64_t val)
         cpu_clear_apic_feature(&s->cpu->env);
         s->spurious_vec &= ~APIC_SV_ENABLE;
     }
+
+    /* Transition from xAPIC to x2APIC */
+    if (cpu_has_x2apic_feature(&s->cpu->env) &&
+        !(s->apicbase & MSR_IA32_APICBASE_EXTD) &&
+        (val & MSR_IA32_APICBASE_EXTD)) {
+        s->apicbase |= MSR_IA32_APICBASE_EXTD;
+
+        s->log_dest = ((s->initial_apic_id & 0xffff0) << 16) |
+                      (1 << (s->initial_apic_id & 0xf));
+
+        /* disable MMIO interface */
+        qemu_mutex_lock_iothread();
+        memory_region_set_enabled(&s->io_memory, false);
+        qemu_mutex_unlock_iothread();
+    }
 }
 
 static void apic_set_tpr(APICCommonState *s, uint8_t val)
@@ -454,7 +500,7 @@  static int apic_find_dest(uint8_t dest)
 }
 
 static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
-                                      uint8_t dest, uint8_t dest_mode)
+                                      uint32_t dest, uint8_t dest_mode)
 {
     APICCommonState *apic_iter;
     int i;
@@ -474,14 +520,22 @@  static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
         for(i = 0; i < MAX_APICS; i++) {
             apic_iter = local_apics[i];
             if (apic_iter) {
-                if (apic_iter->dest_mode == 0xf) {
-                    if (dest & apic_iter->log_dest)
-                        apic_set_bit(deliver_bitmask, i);
-                } else if (apic_iter->dest_mode == 0x0) {
-                    if ((dest & 0xf0) == (apic_iter->log_dest & 0xf0) &&
-                        (dest & apic_iter->log_dest & 0x0f)) {
+                /* x2APIC mode */
+                if (apic_iter->apicbase & MSR_IA32_APICBASE_EXTD) {
+                    if ((dest & 0xffff0000) == (apic_iter->log_dest & 0xffff0000) &&
+                        (dest & apic_iter->log_dest & 0xffff)) {
                         apic_set_bit(deliver_bitmask, i);
                     }
+                } else {
+                    if (apic_iter->dest_mode == 0xf) {
+                        if (dest & apic_iter->log_dest)
+                            apic_set_bit(deliver_bitmask, i);
+                    } else if (apic_iter->dest_mode == 0x0) {
+                        if ((dest & 0xf0) == (apic_iter->log_dest & 0xf0) &&
+                            (dest & apic_iter->log_dest & 0x0f)) {
+                            apic_set_bit(deliver_bitmask, i);
+                        }
+                    }
                 }
             } else {
                 break;
@@ -508,13 +562,12 @@  void apic_sipi(DeviceState *dev)
     s->wait_for_sipi = 0;
 }
 
-static void apic_deliver(DeviceState *dev, uint8_t dest, uint8_t dest_mode,
+static void apic_deliver(DeviceState *dev, uint32_t dest, uint8_t dest_mode,
                          uint8_t delivery_mode, uint8_t vector_num,
-                         uint8_t trigger_mode)
+                         uint8_t trigger_mode, uint8_t dest_shorthand)
 {
     APICCommonState *s = APIC(dev);
     uint32_t deliver_bitmask[MAX_APIC_WORDS];
-    int dest_shorthand = (s->icr[0] >> 18) & 3;
     APICCommonState *apic_iter;
 
     switch (dest_shorthand) {
@@ -635,16 +688,11 @@  static void apic_timer(void *opaque)
     apic_timer_update(s, s->next_time);
 }
 
-static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
+uint64_t apic_register_read(int index)
 {
     DeviceState *dev;
     APICCommonState *s;
-    uint32_t val;
-    int index;
-
-    if (size < 4) {
-        return 0;
-    }
+    uint64_t val;
 
     dev = cpu_get_current_apic();
     if (!dev) {
@@ -652,10 +700,12 @@  static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
     }
     s = APIC(dev);
 
-    index = (addr >> 4) & 0xff;
     switch(index) {
     case 0x02: /* id */
-        val = s->id << 24;
+        if (is_x2apic_mode(dev))
+            val = s->initial_apic_id;
+        else
+            val = s->id << 24;
         break;
     case 0x03: /* version */
         val = s->version | ((APIC_LVT_NB - 1) << 16);
@@ -678,9 +728,16 @@  static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
         val = 0;
         break;
     case 0x0d:
-        val = s->log_dest << 24;
+        if (is_x2apic_mode(dev))
+            val = s->log_dest;
+        else
+            val = s->log_dest << 24;
         break;
     case 0x0e:
+        if (is_x2apic_mode(dev)) {
+            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+        }
+
         val = (s->dest_mode << 28) | 0xfffffff;
         break;
     case 0x0f:
@@ -719,6 +776,22 @@  static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
         val = 0;
         break;
     }
+
+    return val;
+}
+
+static uint64_t apic_mem_read(void *opaque, hwaddr addr, unsigned size)
+{
+    uint32_t val;
+    int index;
+
+    if (size < 4) {
+        return 0;
+    }
+
+    index = (addr >> 4) & 0xff;
+    val = (uint32_t) apic_register_read(index);
+
     trace_apic_mem_readl(addr, val);
     return val;
 }
@@ -736,27 +809,10 @@  static void apic_send_msi(MSIMessage *msi)
     apic_deliver_irq(dest, dest_mode, delivery, vector, trigger_mode);
 }
 
-static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
-                           unsigned size)
+void apic_register_write(int index, uint64_t val)
 {
     DeviceState *dev;
     APICCommonState *s;
-    int index = (addr >> 4) & 0xff;
-
-    if (size < 4) {
-        return;
-    }
-
-    if (addr > 0xfff || !index) {
-        /* MSI and MMIO APIC are at the same memory location,
-         * but actually not on the global bus: MSI is on PCI bus
-         * APIC is connected directly to the CPU.
-         * Mapping them on the global bus happens to work because
-         * MSI registers are reserved in APIC MMIO and vice versa. */
-        MSIMessage msi = { .address = addr, .data = val };
-        apic_send_msi(&msi);
-        return;
-    }
 
     dev = cpu_get_current_apic();
     if (!dev) {
@@ -764,10 +820,12 @@  static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
     }
     s = APIC(dev);
 
-    trace_apic_mem_writel(addr, val);
-
     switch(index) {
     case 0x02:
+        if (is_x2apic_mode(dev)) {
+            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+        }
+
         s->id = (val >> 24);
         break;
     case 0x03:
@@ -787,9 +845,17 @@  static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
         apic_eoi(s);
         break;
     case 0x0d:
+        if (is_x2apic_mode(dev)) {
+            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+        }
+
         s->log_dest = val >> 24;
         break;
     case 0x0e:
+        if (is_x2apic_mode(dev)) {
+            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+        }
+
         s->dest_mode = val >> 28;
         break;
     case 0x0f:
@@ -801,13 +867,27 @@  static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
     case 0x20 ... 0x27:
     case 0x28:
         break;
-    case 0x30:
+    case 0x30: {
+        uint32_t dest;
+
         s->icr[0] = val;
-        apic_deliver(dev, (s->icr[1] >> 24) & 0xff, (s->icr[0] >> 11) & 1,
+        if (is_x2apic_mode(dev)) {
+            s->icr[1] = val >> 32;
+            dest = s->icr[1];
+        } else {
+            dest = (s->icr[1] >> 24) & 0xff;
+        }
+
+        apic_deliver(dev, dest, (s->icr[0] >> 11) & 1,
                      (s->icr[0] >> 8) & 7, (s->icr[0] & 0xff),
-                     (s->icr[0] >> 15) & 1);
+                     (s->icr[0] >> 15) & 1, (s->icr[0] >> 18) & 3);
         break;
+    }
     case 0x31:
+        if (is_x2apic_mode(dev)) {
+            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+        }
+
         s->icr[1] = val;
         break;
     case 0x32 ... 0x37:
@@ -836,12 +916,53 @@  static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
             s->count_shift = (v + 1) & 7;
         }
         break;
+    case 0x3f: {
+        int vector = val & 0xff;
+
+        if (is_x2apic_mode(dev)) {
+            raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
+        }
+
+        /*
+         * Self IPI is identical to IPI with
+         * - Destination shorthand: 1 (Self)
+         * - Trigger mode: 0 (Edge)
+         * - Delivery mode: 0 (Fixed)
+         */
+        apic_deliver(dev, 0, 0, APIC_DM_FIXED, vector, 0, 1);
+
+        break;
+    }
     default:
         s->esr |= APIC_ESR_ILLEGAL_ADDRESS;
         break;
     }
 }
 
+static void apic_mem_write(void *opaque, hwaddr addr, uint64_t val,
+                           unsigned size)
+{
+    int index = (addr >> 4) & 0xff;
+
+    if (size < 4) {
+        return;
+    }
+
+    if (addr > 0xfff || !index) {
+        /* MSI and MMIO APIC are at the same memory location,
+         * but actually not on the global bus: MSI is on PCI bus
+         * APIC is connected directly to the CPU.
+         * Mapping them on the global bus happens to work because
+         * MSI registers are reserved in APIC MMIO and vice versa. */
+        MSIMessage msi = { .address = addr, .data = val };
+        apic_send_msi(&msi);
+        return;
+    }
+
+    trace_apic_mem_writel(addr, val);
+    apic_register_write(index, val);
+}
+
 static void apic_pre_save(APICCommonState *s)
 {
     apic_sync_vapic(s, SYNC_FROM_VAPIC);
diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c
index 4a34f03047..9ea1397530 100644
--- a/hw/intc/apic_common.c
+++ b/hw/intc/apic_common.c
@@ -366,7 +366,7 @@  static const VMStateDescription vmstate_apic_common = {
         VMSTATE_UINT8(arb_id, APICCommonState),
         VMSTATE_UINT8(tpr, APICCommonState),
         VMSTATE_UINT32(spurious_vec, APICCommonState),
-        VMSTATE_UINT8(log_dest, APICCommonState),
+        VMSTATE_UINT32(log_dest, APICCommonState),
         VMSTATE_UINT8(dest_mode, APICCommonState),
         VMSTATE_UINT32_ARRAY(isr, APICCommonState, 8),
         VMSTATE_UINT32_ARRAY(tmr, APICCommonState, 8),
diff --git a/include/hw/i386/apic.h b/include/hw/i386/apic.h
index bdc15a7a73..871ca94c5c 100644
--- a/include/hw/i386/apic.h
+++ b/include/hw/i386/apic.h
@@ -3,7 +3,7 @@ 
 
 
 /* apic.c */
-void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
+void apic_deliver_irq(uint32_t dest, uint8_t dest_mode, uint8_t delivery_mode,
                       uint8_t vector_num, uint8_t trigger_mode);
 int apic_accept_pic_intr(DeviceState *s);
 void apic_deliver_pic_intr(DeviceState *s, int level);
@@ -18,6 +18,9 @@  void apic_sipi(DeviceState *s);
 void apic_poll_irq(DeviceState *d);
 void apic_designate_bsp(DeviceState *d, bool bsp);
 int apic_get_highest_priority_irr(DeviceState *dev);
+uint64_t apic_register_read(int index);
+void apic_register_write(int index, uint64_t val);
+bool is_x2apic_mode(DeviceState *d);
 
 /* pc.c */
 DeviceState *cpu_get_current_apic(void);
diff --git a/include/hw/i386/apic_internal.h b/include/hw/i386/apic_internal.h
index 5f2ba24bfc..5f60cd5e78 100644
--- a/include/hw/i386/apic_internal.h
+++ b/include/hw/i386/apic_internal.h
@@ -164,7 +164,7 @@  struct APICCommonState {
     uint8_t arb_id;
     uint8_t tpr;
     uint32_t spurious_vec;
-    uint8_t log_dest;
+    uint32_t log_dest;
     uint8_t dest_mode;
     uint32_t isr[8];  /* in service register */
     uint32_t tmr[8];  /* trigger mode register */