Message ID | 1336625347-10169-9-git-send-email-benh@kernel.crashing.org |
---|---|
State | New |
Headers | show |
On 05/09/2012 11:49 PM, Benjamin Herrenschmidt wrote: > From: David Gibson<david@gibson.dropbear.id.au> > > This patch adds the basic infrastructure necessary to emulate an IOMMU > visible to the guest. The DMAContext structure is extended with > information and a callback describing the translation, and the various > DMA functions used by devices will now perform IOMMU translation using > this callback. > > Cc: Michael S. Tsirkin<mst@redhat.com> > Cc: Richard Henderson<rth@twiddle.net> > > Signed-off-by: Eduard - Gabriel Munteanu<eduard.munteanu@linux360.ro> > Signed-off-by: David Gibson<david@gibson.dropbear.id.au> > Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org> > --- > dma-helpers.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > dma.h | 108 ++++++++++++++++++++++------- > hw/qdev-dma.h | 4 +- > 3 files changed, 299 insertions(+), 27 deletions(-) > > diff --git a/dma-helpers.c b/dma-helpers.c > index 2dc4691..09591ef 100644 > --- a/dma-helpers.c > +++ b/dma-helpers.c > @@ -9,6 +9,10 @@ > > #include "dma.h" > #include "trace.h" > +#include "range.h" > +#include "qemu-thread.h" > + > +/* #define DEBUG_IOMMU */ > > void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma) > { > @@ -244,3 +248,213 @@ void dma_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, > { > bdrv_acct_start(bs, cookie, sg->size, type); > } > + > +bool iommu_dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len, > + DMADirection dir) > +{ > + target_phys_addr_t paddr, plen; > + > +#ifdef DEBUG_IOMMU > + fprintf(stderr, "dma_memory_check context=%p addr=0x" DMA_ADDR_FMT > + " len=0x" DMA_ADDR_FMT " dir=%d\n", dma, addr, len, dir); > +#endif > + > + while (len) { > + if (dma->translate(dma, addr,&paddr,&plen, dir) != 0) { > + return false; > + } > + > + /* The translation might be valid for larger regions. 
*/ > + if (plen> len) { > + plen = len; > + } > + > + len -= plen; > + addr += plen; > + } > + > + return true; > +} > + > +int iommu_dma_memory_rw(DMAContext *dma, dma_addr_t addr, > + void *buf, dma_addr_t len, DMADirection dir) > +{ > + target_phys_addr_t paddr, plen; > + int err; > + > +#ifdef DEBUG_IOMMU > + fprintf(stderr, "dma_memory_rw context=%p addr=0x" DMA_ADDR_FMT " len=0x" > + DMA_ADDR_FMT " dir=%d\n", dma, addr, len, dir); > +#endif > + > + while (len) { > + err = dma->translate(dma, addr,&paddr,&plen, dir); > + if (err) { > + return -1; > + } > + > + /* The translation might be valid for larger regions. */ > + if (plen> len) { > + plen = len; > + } > + > + cpu_physical_memory_rw(paddr, buf, plen, > + dir == DMA_DIRECTION_FROM_DEVICE); > + > + len -= plen; > + addr += plen; > + buf += plen; > + } > + > + return 0; > +} > + > +int iommu_dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len) > +{ > + target_phys_addr_t paddr, plen; > + int err; > + > +#ifdef DEBUG_IOMMU > + fprintf(stderr, "dma_memory_zero context=%p addr=0x" DMA_ADDR_FMT > + " len=0x" DMA_ADDR_FMT "\n", dma, addr, len); > +#endif > + > + while (len) { > + err = dma->translate(dma, addr,&paddr,&plen, > + DMA_DIRECTION_FROM_DEVICE); > + if (err) { > + return err; > + } > + > + /* The translation might be valid for larger regions. 
*/ > + if (plen> len) { > + plen = len; > + } > + > + cpu_physical_memory_zero(paddr, plen); > + > + len -= plen; > + addr += plen; > + } > + > + return 0; > +} > + > +typedef struct { > + unsigned long count; > + QemuCond cond; > +} DMAInvalidationState; > + > +typedef struct DMAMemoryMap DMAMemoryMap; > +struct DMAMemoryMap { > + dma_addr_t addr; > + size_t len; > + void *buf; > + > + DMAInvalidationState *invalidate; > + QLIST_ENTRY(DMAMemoryMap) list; > +}; > + > +void dma_context_init(DMAContext *dma, DMATranslateFunc fn) > +{ > +#ifdef DEBUG_IOMMU > + fprintf(stderr, "dma_context_init(%p, %p)\n", dma, fn); > +#endif > + dma->translate = fn; > + QLIST_INIT(&dma->memory_maps); > +} > + > +void *iommu_dma_memory_map(DMAContext *dma, dma_addr_t addr, dma_addr_t *len, > + DMADirection dir) > +{ > + int err; > + target_phys_addr_t paddr, plen; > + void *buf; > + DMAMemoryMap *map; > + > + plen = *len; > + err = dma->translate(dma, addr,&paddr,&plen, dir); > + if (err) { > + return NULL; > + } > + > + /* > + * If this is true, the virtual region is contiguous, > + * but the translated physical region isn't. We just > + * clamp *len, much like cpu_physical_memory_map() does. > + */ > + if (plen< *len) { > + *len = plen; > + } > + > + buf = cpu_physical_memory_map(paddr,&plen, > + dir == DMA_DIRECTION_FROM_DEVICE); > + *len = plen; > + > + /* We treat maps as remote TLBs to cope with stuff like AIO. 
*/ > + map = g_malloc(sizeof(DMAMemoryMap)); > + map->addr = addr; > + map->len = *len; > + map->buf = buf; > + map->invalidate = NULL; > + > + QLIST_INSERT_HEAD(&dma->memory_maps, map, list); > + > + return buf; > +} > + > +void iommu_dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len, > + DMADirection dir, dma_addr_t access_len) > +{ > + DMAMemoryMap *map; > + > + cpu_physical_memory_unmap(buffer, len, > + dir == DMA_DIRECTION_FROM_DEVICE, > + access_len); > + > + QLIST_FOREACH(map,&dma->memory_maps, list) { > + if ((map->buf == buffer)&& (map->len == len)) { > + QLIST_REMOVE(map, list); > + > + if (map->invalidate) { > + /* If this mapping was invalidated */ > + if (--map->invalidate->count == 0) { > + /* And we're the last mapping invalidated at the time */ > + /* Then wake up whoever was waiting for the > + * invalidation to complete */ > + qemu_cond_signal(&map->invalidate->cond); > + } > + } > + > + free(map); > + } > + } > + > + > + /* unmap called on a buffer that wasn't mapped */ > + assert(false); > +} > + > +extern QemuMutex qemu_global_mutex; > + > +void iommu_wait_for_invalidated_maps(DMAContext *dma, > + dma_addr_t addr, dma_addr_t len) > +{ > + DMAMemoryMap *map; > + DMAInvalidationState is; > + > + is.count = 0; > + qemu_cond_init(&is.cond); > + > + QLIST_FOREACH(map,&dma->memory_maps, list) { > + if (ranges_overlap(addr, len, map->addr, map->len)) { > + is.count++; > + map->invalidate =&is; > + } > + } > + > + if (is.count) { > + qemu_cond_wait(&is.cond,&qemu_global_mutex); > + } > + assert(is.count == 0); > +} I don't get what's going on here but I don't think it can possibly be right. What is the purpose of this function? Regards, Anthony Liguori
On Mon, May 14, 2012 at 07:49:16PM -0500, Anthony Liguori wrote: [snip] > >+void iommu_wait_for_invalidated_maps(DMAContext *dma, > >+ dma_addr_t addr, dma_addr_t len) > >+{ > >+ DMAMemoryMap *map; > >+ DMAInvalidationState is; > >+ > >+ is.count = 0; > >+ qemu_cond_init(&is.cond); > >+ > >+ QLIST_FOREACH(map,&dma->memory_maps, list) { > >+ if (ranges_overlap(addr, len, map->addr, map->len)) { > >+ is.count++; > >+ map->invalidate =&is; > >+ } > >+ } > >+ > >+ if (is.count) { > >+ qemu_cond_wait(&is.cond,&qemu_global_mutex); > >+ } > >+ assert(is.count == 0); > >+} > > I don't get what's going on here but I don't think it can possibly > be right. What is the purpose of this function? So. This is a function to be used by individual iommu implementations. When IOMMU mappings are updated on real hardware, there may be some lag in the effect, particularly for in-flight DMAs, due to shadow TLBs or other things. But generally, there will be some way to synchronize the IOMMU that once completed will ensure that no further DMA access to the old translations may occur. For the sPAPR TCE MMU, this actually happens after every PUT_TCE hcall. In our software implementation this is a problem if existing drivers have done a dma_memory_map() and haven't yet done a dma_memory_unmap(): they will have a real pointer to the translated memory which can't be intercepted. However, memory maps are supposed to be transient, so this helper function invalidates memory maps based on an IOVA address range, and blocks until they expire. This function would be called from CPU thread context, the dma_memory_unmap() would come from the IO thread (in the only existing case from AIO completion callbacks in the block IO code). This gives the IOMMU implementation a way of blocking the CPU initiating a sync operation until it really is safe to assume that no further DMA operations may hit the invalidated mappings. 
Note that if we actually hit the blocking path here, that almost certainly indicates the guest has done something wrong, or at least unusual - DMA devices should be stopped before removing their IOMMU mappings from under them. However, one of the points of the IOMMU is the ability to be able to forcibly stop DMAs, so we do need to implement this behaviour for that case. With difficulty, I've traced through qemu's difficult-to-follow thread synchronization logic and I'm about 75% convinced this works correctly with it.
On 05/14/2012 08:42 PM, David Gibson wrote: > On Mon, May 14, 2012 at 07:49:16PM -0500, Anthony Liguori wrote: > [snip] >>> +void iommu_wait_for_invalidated_maps(DMAContext *dma, >>> + dma_addr_t addr, dma_addr_t len) >>> +{ >>> + DMAMemoryMap *map; >>> + DMAInvalidationState is; >>> + >>> + is.count = 0; >>> + qemu_cond_init(&is.cond); >>> + >>> + QLIST_FOREACH(map,&dma->memory_maps, list) { >>> + if (ranges_overlap(addr, len, map->addr, map->len)) { >>> + is.count++; >>> + map->invalidate =&is; >>> + } >>> + } >>> + >>> + if (is.count) { >>> + qemu_cond_wait(&is.cond,&qemu_global_mutex); >>> + } >>> + assert(is.count == 0); >>> +} >> >> I don't get what's going on here but I don't think it can possibly >> be right. What is the purpose of this function? > > So. This is a function to be used by individual iommu > implementations. When IOMMU mappings are updated on real hardware, > there may be some lag in th effect, particularly for in-flight DMAs, > due to shadow TLBs or other things. But generally, there will be some > way to synchronize the IOMMU that once completed will ensure that no > further DMA access to the old translations may occur. For the sPAPR > TCE MMU, this actually happens after every PUT_TCE hcall. > > In our software implementation this is a problem if existing drivers > have done a dma_memory_map() and haven't yet done a > dma_memory_unmap(): they will have a real pointer to the translated > memory which can't be intercepted. However, memory maps are supposed > to be transient, so this helper function invalidates memory maps based > on an IOVA address range, and blocks until they expire. This function > would be called from CPU thread context, the dma_memory_unmap() would > come from the IO thread (in the only existing case from AIO completion > callbacks in the block IO code). 
> > This gives the IOMMU implementation a way of blocking the CPU > initiating a sync operation until it really is safe to assume that no > further DMA operations may hit the invalidated mappings. Note that if > we actually hit the blocking path here, that almost certainly > indicates the guest has done something wrong, or at least unusual - So the CPU thread runs in lock-step with the I/O thread. Dropping the CPU thread lock to let the I/O thread run is a dangerous thing to do in a place like this. Also, I think you'd effectively block the CPU until pending DMA operations complete? This could be many, many, milliseconds, no? That's going to make guests very upset. Regards, Anthony Liguori > DMA devices should be stopped before removing their IOMMU mappings > from under them. However, one of the points of the IOMMU is the > ability to be able to forcibly stop DMAs, so we do need to implement > this behaviour for that case. > > With difficulty, I've traced through qemu's difficult-to-follow thread > synchronization logic and I'm about 75% convinced this works correctly > with it. >
On Mon, 2012-05-14 at 21:03 -0500, Anthony Liguori wrote: > So the CPU thread runs in lock-step with the I/O thread. Dropping the CPU > thread lock to let the I/O thread run is a dangerous thing to do in a place like > this. > > Also, I think you'd effectively block the CPU until pending DMA operations > complete? This could be many, many, milliseconds, no? That's going to make > guests very upset. Do you see any other option ? IE. When the guest invalidates iommu translations, it must have a way to synchronize with anything that may have used such translations. IE. It must have a way to guarantee that - The translation is no longer used - Any physical address obtained as a result of looking at the translation table/tree/... is no longer used This is true regardless of the iommu model. For the PAPR TCE model, we need to provide such synchronization whenever we clear a TCE entry, but if I was to emulate the HW iommu of a G5 for example, I would have to provide similar guarantees in my emulation of MMIO accesses to the iommu TLB invalidate register. This is a problem with devices using the map/unmap calls. This is going to be even more of a problem as we start being more threaded (which from discussions I've read here or there seem to be the goal) since synchronizing with the IO thread isn't going to be enough. As long as the actual data transfer through the iommu is under control of the iommu code, then the iommu implementation can use whatever locking it needs to ensure this synchronization. But map/unmap defeats that. David's approach may not be the best long term, but provided it's not totally broken (I don't know qemu locking well enough to judge how dangerous it is) then it might be a "good enough" first step until we come up with something better ? The normal case will be that no map exist, ie, it will almost always be a guest programming error to remove an iommu mapping while a device is actively using it, so having this case be slow is probably a non-issue. 
Cheers, Ben.
On 05/14/2012 09:32 PM, Benjamin Herrenschmidt wrote: > On Mon, 2012-05-14 at 21:03 -0500, Anthony Liguori wrote: >> So the CPU thread runs in lock-step with the I/O thread. Dropping the CPU >> thread lock to let the I/O thread run is a dangerous thing to do in a place like >> this. >> >> Also, I think you'd effectively block the CPU until pending DMA operations >> complete? This could be many, many, milliseconds, no? That's going to make >> guests very upset. > > Do you see any other option ? Yes, ignore it. I have a hard time believing software depends on changing DMA translation mid-way through a transaction. > > IE. When the guest invalidates iommu translations, it must have a way to > synchronize with anything that may have used such translations. IE. It > must have a way to guarantee that > > - The translation is no longer used You can certainly prevent future uses of the translation. We're only talking about pending mapped requests. My assertion is that they don't matter. > - Any physical address obtained as a result of looking at > the translation table/tree/... is no longer used Why does this need to be guaranteed? How can software depend on this in a meaningful way? > This is true regardless of the iommu model. For the PAPR TCE model, we > need to provide such synchronization whenever we clear a TCE entry, but > if I was to emulate the HW iommu of a G5 for example, I would have to > provide similar guarantees in my emulation of MMIO accesses to the iommu > TLB invalidate register. > > This is a problem with devices using the map/unmap calls. This is going > to be even more of a problem as we start being more threaded (which from > discussions I've read here or there seem to be the goal) since > synchronizing with the IO thread isn't going to be enough. > > As long as the actual data transfer through the iommu is under control > of the iommu code, then the iommu implementation can use whatever > locking it needs to ensure this synchronization. 
> > But map/unmap defeats that. > > David's approach may not be the best long term, but provided it's not > totally broken (I don't know qemu locking well enough to judge how > dangerous it is) then it might be a "good enough" first step until we > come up with something better ? No, it's definitely not good enough. Dropping the global mutex in random places is asking for worlds of hurt. If this is really important, then we need some sort of cancellation API to go along with map/unmap although I doubt that's really possible. MMIO/PIO operations cannot block. Regards, Anthony Liguori > > The normal case will be that no map exist, ie, it will almost always be > a guest programming error to remove an iommu mapping while a device is > actively using it, so having this case be slow is probably a non-issue. > > Cheers, > Ben. > >
On Mon, 2012-05-14 at 21:50 -0500, Anthony Liguori wrote: > On 05/14/2012 09:32 PM, Benjamin Herrenschmidt wrote: > > On Mon, 2012-05-14 at 21:03 -0500, Anthony Liguori wrote: > >> So the CPU thread runs in lock-step with the I/O thread. Dropping the CPU > >> thread lock to let the I/O thread run is a dangerous thing to do in a place like > >> this. > >> > >> Also, I think you'd effectively block the CPU until pending DMA operations > >> complete? This could be many, many, milliseconds, no? That's going to make > >> guests very upset. > > > > Do you see any other option ? > > Yes, ignore it. > > I have a hard time believing software depends on changing DMA translation > mid-way through a transaction. It's a correctness issue. It won't happen in normal circumstances but it can, and thus should be handled gracefully. Cases where that matter are unloading of a (broken) driver, kexec/kdump from one guest to another etc... all involve potentially clearing all iommu tables while a driver might have left a device DMA'ing. The expectation is that the device will get target aborts from the iommu until the situation gets "cleaned up" in SW. > > IE. When the guest invalidates iommu translations, it must have a way to > > synchronize with anything that may have used such translations. IE. It > > must have a way to guarantee that > > > > - The translation is no longer used > > You can certainly prevent future uses of the translation. We're only talking > about pending mapped requests. My assertion is that they don't matter. > > > - Any physical address obtained as a result of looking at > > the translation table/tree/... is no longer used > > Why does this need to be guaranteed? How can software depend on this in a > meaningful way? 
The same as TLB invalidations :-) In real HW, this is a property of the HW itself, ie, whatever MMIO is used to invalidate the HW TLB provides a way to ensure (usually by reading back) that any request pending in the iommu pipeline has either been completed or canned. When we start having page fault capable iommu's this will be even more important as faults will be be part of the non-error case. For now, it's strictly error handling but it still should be done correctly. > > This is true regardless of the iommu model. For the PAPR TCE model, we > > need to provide such synchronization whenever we clear a TCE entry, but > > if I was to emulate the HW iommu of a G5 for example, I would have to > > provide similar guarantees in my emulation of MMIO accesses to the iommu > > TLB invalidate register. > > > > This is a problem with devices using the map/unmap calls. This is going > > to be even more of a problem as we start being more threaded (which from > > discussions I've read here or there seem to be the goal) since > > synchronizing with the IO thread isn't going to be enough. > > > > As long as the actual data transfer through the iommu is under control > > of the iommu code, then the iommu implementation can use whatever > > locking it needs to ensure this synchronization. > > > > But map/unmap defeats that. > > > > David's approach may not be the best long term, but provided it's not > > totally broken (I don't know qemu locking well enough to judge how > > dangerous it is) then it might be a "good enough" first step until we > > come up with something better ? > > No, it's definitely not good enough. Dropping the global mutex in random places > is asking for worlds of hurt. > > If this is really important, then we need some sort of cancellation API to go > along with map/unmap although I doubt that's really possible. > > MMIO/PIO operations cannot block. 
Well, there's a truckload of cases in real HW where an MMIO/PIO read is used to synchronize some sort of HW operation.... I suppose nothing that involves blocking at this stage in qemu but I would be careful with your expectations here... writes are usually pipelined but blocking on a read response does make a lot of sense. In any case, for the problem at hand, I can just drop the wait for now and maybe just print a warning if I see an existing map. We still need some kind of either locking or barrier to simply ensure that the updates to the TCE table are visible to other processors but that can be done in the backend. But I wouldn't just forget about the issue, it's going to come back and bite... Cheers, Ben. > Regards, > > Anthony Liguori > > > > > The normal case will be that no map exist, ie, it will almost always be > > a guest programming error to remove an iommu mapping while a device is > > actively using it, so having this case be slow is probably a non-issue. > > > > Cheers, > > Ben. > > > >
On 05/14/2012 10:02 PM, Benjamin Herrenschmidt wrote: > On Mon, 2012-05-14 at 21:50 -0500, Anthony Liguori wrote: >> On 05/14/2012 09:32 PM, Benjamin Herrenschmidt wrote: >>> On Mon, 2012-05-14 at 21:03 -0500, Anthony Liguori wrote: >>>> So the CPU thread runs in lock-step with the I/O thread. Dropping the CPU >>>> thread lock to let the I/O thread run is a dangerous thing to do in a place like >>>> this. >>>> >>>> Also, I think you'd effectively block the CPU until pending DMA operations >>>> complete? This could be many, many, milliseconds, no? That's going to make >>>> guests very upset. >>> >>> Do you see any other option ? >> >> Yes, ignore it. >> >> I have a hard time believing software depends on changing DMA translation >> mid-way through a transaction. > > It's a correctness issue. It won't happen in normal circumstances but it > can, and thus should be handled gracefully. I think the crux of your argument is that upon a change to the translation table, the operation acts as a barrier such that the exact moment it returns, you're guaranteed that no DMAs are in flight with the old translation mapping. That's not my understanding of at least VT-d and I have a hard time believing it's true for other IOMMUs as that kind of synchronization seems like it would be very expensive to implement in hardware. Rather, when the IOTLB is flushed, I believe the only guarantee that you have is that future IOTLB lookups will return the new mapping. But that doesn't mean that there isn't a request in flight that uses the old mapping. I will grant you that PCI transactions are typically much smaller than QEMU transactions such that we may continue to use the old mappings for much longer than real hardware would. But I think that still puts us well within the realm of correctness. > Cases where that matter are unloading of a (broken) driver, kexec/kdump > from one guest to another etc... 
all involve potentially clearing all > iommu tables while a driver might have left a device DMA'ing. The > expectation is that the device will get target aborts from the iommu > until the situation gets "cleaned up" in SW. Yes, this would be worse in QEMU than on bare metal because we essentially have a much larger translation TLB. But as I said above, I think we're well within the specified behavior here. >> Why does this need to be guaranteed? How can software depend on this in a >> meaningful way? > > The same as TLB invalidations :-) > > In real HW, this is a property of the HW itself, ie, whatever MMIO is > used to invalidate the HW TLB provides a way to ensure (usually by > reading back) that any request pending in the iommu pipeline has either > been completed or canned. Can you point to a spec that says this? This doesn't match my understanding. > When we start having page fault capable iommu's this will be even more > important as faults will be be part of the non-error case. We can revisit this discussion after every PCI device is changed to cope with a page fault capable IOMMU ;-) >>> David's approach may not be the best long term, but provided it's not >>> totally broken (I don't know qemu locking well enough to judge how >>> dangerous it is) then it might be a "good enough" first step until we >>> come up with something better ? >> >> No, it's definitely not good enough. Dropping the global mutex in random places >> is asking for worlds of hurt. >> >> If this is really important, then we need some sort of cancellation API to go >> along with map/unmap although I doubt that's really possible. >> >> MMIO/PIO operations cannot block. > > Well, there's a truckload of cases in real HW where an MMIO/PIO read is > used to synchronize some sort of HW operation.... I suppose nothing that > involves blocking at this stage in qemu but I would be careful with your > expectations here... 
writes are usually pipelined but blocking on a read > response does make a lot of sense. Blocking on an MMIO/PIO request effectively freezes a CPU. All sorts of badness results from that. Best case scenario, you trigger soft lockup warnings. > In any case, for the problem at hand, I can just drop the wait for now > and maybe just print a warning if I see an existing map. > > We still need some kind of either locking or barrier to simply ensure > that the updates to the TCE table are visible to other processors but > that can be done in the backend. > > But I wouldn't just forget about the issue, it's going to come back and > bite... I think working out the exact semantics of what we need to do is absolutely important. But I think you're taking an overly conservative approach to what we need to provide here. Regards, Anthony Liguori > > Cheers, > Ben. > >> Regards, >> >> Anthony Liguori >> >>> >>> The normal case will be that no map exist, ie, it will almost always be >>> a guest programming error to remove an iommu mapping while a device is >>> actively using it, so having this case be slow is probably a non-issue. >>> >>> Cheers, >>> Ben. >>> >>> > >
On Tue, 2012-05-15 at 09:02 -0500, Anthony Liguori wrote: > I think the crux of your argument is that upon a change to the translation > table, the operation acts as a barrier such that the exact moment it returns, > you're guaranteed that no DMAs are in flight with the old translation mapping. Not when the translation is changed in memory but whenever the translation cache are invalidated or whatever other mechanism the HW provides to do that synchronization. On PAPR, this guarantee is provided by the H_PUT_TCE hypervisor call which we use to manipulate translations. [ Note that for performance reasons, it might end up being very impractical to provide that guarantee since it prevents us from handling H_PUT_TCE entirely in kernel real mode like we to today... we'll have to figure our what we want to do here for the TCE backend implementation, maybe have qemu mark "in use" translations and cause exist when those are modified ... ] > That's not my understanding of at least VT-d and I have a hard time believing > it's true for other IOMMUs as that kind of synchronization seems like it would > be very expensive to implement in hardware. How so ? It's perfectly standard stuff ... it's usually part of the TLB flushing op. > Rather, when the IOTLB is flushed, I believe the only guarantee that you have is > that future IOTLB lookups will return the new mapping. But that doesn't mean > that there isn't a request in flight that uses the old mapping. I would be very surprised if that was the case :-) I don't think any sane HW implementation would fail to provide full synchronization with invalidations. That's how MMUs operate and I don't see any reason why an iommu shouldn't be held to the same standards. If it didn't, you'd have a nice host attack... have a guest doing pass-through start a very long transaction and immediately commit suicide. KVM starts reclaiming the pages, they go back to the host, might be re-used immediately ... while still being DMAed to. 
> I will grant you that PCI transactions are typically much smaller than QEMU > transactions such that we may continue to use the old mappings for much longer > than real hardware would. But I think that still puts us well within the realm > of correctness. No, a "random amount of time after invalidation" is not and will never be correct. On large SMP machines, the time between a page being freed and that page being re-used can be very small. The memory being re-used by something like kexec can happen almost immediately while qemu is blocked on an AIO that takes milliseconds ... etc.... At least because this is emulated iommu, qemu only writes to virtual addresses mapping the guest space, so this isn't a host attack (unlike with a real HW iommu however where the lack of such synchronization would definitely be, as I described earlier). > > Cases where that matter are unloading of a (broken) driver, kexec/kdump > > from one guest to another etc... all involve potentially clearing all > > iommu tables while a driver might have left a device DMA'ing. The > > expectation is that the device will get target aborts from the iommu > > until the situation gets "cleaned up" in SW. > > Yes, this would be worse in QEMU than on bare metal because we essentially have > a much larger translation TLB. But as I said above, I think we're well within > the specified behavior here. No :-) > >> Why does this need to be guaranteed? How can software depend on this in a > >> meaningful way? > > > > The same as TLB invalidations :-) > > > > In real HW, this is a property of the HW itself, ie, whatever MMIO is > > used to invalidate the HW TLB provides a way to ensure (usually by > > reading back) that any request pending in the iommu pipeline has either > > been completed or canned. > > Can you point to a spec that says this? This doesn't match my understanding. Appart from common sense ? 
I'd have to dig to get you actual specs but it should be plain obvious that you need that sort of sync or you simply cannot trust your iommu to do virtualization. > > When we start having page fault capable iommu's this will be even more > > important as faults will be be part of the non-error case. > > We can revisit this discussion after every PCI device is changed to cope with a > page fault capable IOMMU ;-) Heh, well, the point is that is still part of the base iommu model, page faulting is just going to make the problem worse. > >>> David's approach may not be the best long term, but provided it's not > >>> totally broken (I don't know qemu locking well enough to judge how > >>> dangerous it is) then it might be a "good enough" first step until we > >>> come up with something better ? > >> > >> No, it's definitely not good enough. Dropping the global mutex in random places > >> is asking for worlds of hurt. > >> > >> If this is really important, then we need some sort of cancellation API to go > >> along with map/unmap although I doubt that's really possible. > >> > >> MMIO/PIO operations cannot block. > > > > Well, there's a truckload of cases in real HW where an MMIO/PIO read is > > used to synchronize some sort of HW operation.... I suppose nothing that > > involves blocking at this stage in qemu but I would be careful with your > > expectations here... writes are usually pipelined but blocking on a read > > response does make a lot of sense. > > Blocking on an MMIO/PIO request effectively freezes a CPU. All sorts of badness > results from that. Best case scenario, you trigger soft lockup warnings. Well, that's exactly what happens in HW on PIO accesses and MMIO reads waiting for a reply... > > In any case, for the problem at hand, I can just drop the wait for now > > and maybe just print a warning if I see an existing map. 
> > > > We still need some kind of either locking or barrier to simply ensure > > that the updates to the TCE table are visible to other processors but > > that can be done in the backend. > > > > But I wouldn't just forget about the issue, it's going to come back and > > bite... > > I think working out the exact semantics of what we need to do is absolutely > important. But I think you're taking an overly conservative approach to what we > need to provide here. I'm happy to have the patches merged without that for now, it will get us going with USB emulation etc... which we need for graphics, but we do need to sort this out eventually. I'll re-submit without it. Cheers, Ben. > Regards, > > Anthony Liguori > > > > > Cheers, > > Ben. > > > >> Regards, > >> > >> Anthony Liguori > >> > >>> > >>> The normal case will be that no map exist, ie, it will almost always be > >>> a guest programming error to remove an iommu mapping while a device is > >>> actively using it, so having this case be slow is probably a non-issue. > >>> > >>> Cheers, > >>> Ben. > >>> > >>> > > > >
On 05/15/2012 04:55 PM, Benjamin Herrenschmidt wrote: > On Tue, 2012-05-15 at 09:02 -0500, Anthony Liguori wrote: > >> I think the crux of your argument is that upon a change to the translation >> table, the operation acts as a barrier such that the exact moment it returns, >> you're guaranteed that no DMAs are in flight with the old translation mapping. > > Not when the translation is changed in memory but whenever the > translation cache are invalidated or whatever other mechanism the HW > provides to do that synchronization. On PAPR, this guarantee is provided > by the H_PUT_TCE hypervisor call which we use to manipulate > translations. So this is from the VT-d spec: "6.2.1 Register Based Invalidation Interface The register based invalidations provides a synchronous hardware interface for invalidations. Software is expected to write to the IOTLB registers to submit invalidation command and may poll on these registers to check for invalidation completion. For optimal performance, hardware implementations are recommended to complete an invalidation request with minimal latency" This makes perfect sense. You write to an MMIO location to request invalidation and then *poll* on a separate register for completion. It's not a single MMIO operation that has an indefinitely return duration. Regards, Anthony Liguori
On Tue, 2012-05-15 at 17:02 -0500, Anthony Liguori wrote: > > "6.2.1 Register Based Invalidation Interface > The register based invalidations provides a synchronous hardware interface for > invalidations. Software is expected to write to the IOTLB registers to submit > invalidation command and may poll on these registers to check for invalidation > completion. For optimal performance, hardware implementations are recommended to > complete an invalidation request with minimal latency" > > This makes perfect sense. You write to an MMIO location to request invalidation > and then *poll* on a separate register for completion. > > It's not a single MMIO operation that has an indefinitely return duration. Sure, it's an implementation detail, I never meant that it had to be a single blocking register access, all I said is that the HW must provide such a mechanism that is typically used synchronously by the operating system. Polling for completion is a perfectly legit way to do it, that's how we do it on the Apple G5 "DART" iommu as well. The fact that MMIO operations can block is orthogonal, it is possible however, especially with ancient PIO devices. In our case (TCEs) it's a hypervisor call, not an MMIO op, so to some extent it's even more likely to do "blocking" things. It would have been possible to implement a "busy" return status with the guest having to try again, unfortunately that's not how Linux has implemented it, so we are stuck with the current semantics. Now, if you think that dropping the lock isn't good, what do you reckon I should do ? Cheers, Ben.
On 05/15/2012 06:08 PM, Benjamin Herrenschmidt wrote: > On Tue, 2012-05-15 at 17:02 -0500, Anthony Liguori wrote: >> >> "6.2.1 Register Based Invalidation Interface >> The register based invalidations provides a synchronous hardware interface for >> invalidations. Software is expected to write to the IOTLB registers to submit >> invalidation command and may poll on these registers to check for invalidation >> completion. For optimal performance, hardware implementations are recommended to >> complete an invalidation request with minimal latency" >> >> This makes perfect sense. You write to an MMIO location to request invalidation >> and then *poll* on a separate register for completion. >> >> It's not a single MMIO operation that has an indefinitely return duration. > > Sure, it's an implementation detail, I never meant that it had to be a > single blocking register access, all I said is that the HW must provide > such a mechanism that is typically used synchronously by the operating > system. Polling for completion is a perfectly legit way to do it, that's > how we do it on the Apple G5 "DART" iommu as well. > > The fact that MMIO operations can block is orthogonal, it is possible > however, especially with ancient PIO devices. Even ancient PIO devices really don't block indefinitely. > In our case (TCEs) it's a hypervisor call, not an MMIO op, so to some > extent it's even more likely to do "blocking" things. Yes, so I think the right thing to do is not model hypercalls for sPAPR as synchronous calls but rather as asynchronous calls. Obviously, simply ones can use a synchronous implementation... This is a matter of setting hlt=1 before dispatching the hypercall and passing a continuation to the call that when executed, prepare the CPUState for the hypercall return and then set hlt=0 to resume the CPU. 
> It would have been possible to implement a "busy" return status with the > guest having to try again, unfortunately that's not how Linux has > implemented it, so we are stuck with the current semantics. > > Now, if you think that dropping the lock isn't good, what do you reckon > I should do ? Add a reference count to dma map calls and a flush_pending flag. If flush_pending && ref > 0, return NULL for all map calls. Decrement ref on unmap and if ref = 0 and flush_pending, clear flush_pending. You could add a flush_notifier too for this event. dma_flush() sets flush_pending if ref > 0. Your TCE flush hypercall would register for flush notifications and squirrel away the hypercall completion continuation. VT-d actually has a concept of a invalidation completion queue which delivers interrupt based notification of invalidation completion events. The above flush_notify would be the natural way to support this since in this case, there is no VCPU event that's directly involved in the completion event. Regards, Anthony Liguori > Cheers, > Ben. > >
On Tue, 2012-05-15 at 18:58 -0500, Anthony Liguori wrote: > Even ancient PIO devices really don't block indefinitely. > > > In our case (TCEs) it's a hypervisor call, not an MMIO op, so to some > > extent it's even more likely to do "blocking" things. > > Yes, so I think the right thing to do is not model hypercalls for sPAPR as > synchronous calls but rather as asynchronous calls. Obviously, simply ones can > use a synchronous implementation... > > This is a matter of setting hlt=1 before dispatching the hypercall and passing a > continuation to the call that when executed, prepare the CPUState for the > hypercall return and then set hlt=0 to resume the CPU. Is there any reason not to set that hlt after the dispatch ? IE. from within the hypercall, for the very few that want to do asynchronous completion, do something like spapr_hcall_suspend() before returning ? > > It would have been possible to implement a "busy" return status with the > > guest having to try again, unfortunately that's not how Linux has > > implemented it, so we are stuck with the current semantics. > > > > Now, if you think that dropping the lock isn't good, what do you reckon > > I should do ? > > Add a reference count to dma map calls and a flush_pending flag. If > flush_pending && ref > 0, return NULL for all map calls. > > Decrement ref on unmap and if ref = 0 and flush_pending, clear flush_pending. > You could add a flush_notifier too for this event. > > dma_flush() sets flush_pending if ref > 0. Your TCE flush hypercall would > register for flush notifications and squirrel away the hypercall completion > continuation. Ok, I'll look into it, thanks. Any good example to look at for how that continuation stuff works ? > VT-d actually has a concept of a invalidation completion queue which delivers > interrupt based notification of invalidation completion events. 
The above > flush_notify would be the natural way to support this since in this case, there > is no VCPU event that's directly involved in the completion event. Cheers, Ben. > Regards, > > Anthony Liguori > > > Cheers, > > Ben. > > > >
On 05/15/2012 07:41 PM, Benjamin Herrenschmidt wrote: > On Tue, 2012-05-15 at 18:58 -0500, Anthony Liguori wrote: > >> Even ancient PIO devices really don't block indefinitely. >> >>> In our case (TCEs) it's a hypervisor call, not an MMIO op, so to some >>> extent it's even more likely to do "blocking" things. >> >> Yes, so I think the right thing to do is not model hypercalls for sPAPR as >> synchronous calls but rather as asynchronous calls. Obviously, simply ones can >> use a synchronous implementation... >> >> This is a matter of setting hlt=1 before dispatching the hypercall and passing a >> continuation to the call that when executed, prepare the CPUState for the >> hypercall return and then set hlt=0 to resume the CPU. > > Is there any reason not to set that hlt after the dispatch ? IE. from > within the hypercall, for the very few that want to do asynchronous > completion, do something like spapr_hcall_suspend() before returning ? You certainly could do that but it may get a little weird dealing with the return path. You'd have to return something like -EWOULDBLOCK and make sure you handle that in the dispatch code appropriately. >>> It would have been possible to implement a "busy" return status with the >>> guest having to try again, unfortunately that's not how Linux has >>> implemented it, so we are stuck with the current semantics. >>> >>> Now, if you think that dropping the lock isn't good, what do you reckon >>> I should do ? >> >> Add a reference count to dma map calls and a flush_pending flag. If >> flush_pending&& ref> 0, return NULL for all map calls. >> >> Decrement ref on unmap and if ref = 0 and flush_pending, clear flush_pending. >> You could add a flush_notifier too for this event. >> >> dma_flush() sets flush_pending if ref> 0. Your TCE flush hypercall would >> register for flush notifications and squirrel away the hypercall completion >> continuation. > > Ok, I'll look into it, thanks. 
Any good example to look at for how that > continuation stuff works ? Just a callback and an opaque. You could look at the AIOCB's in the block layer. Regards, Anthony Liguori >> VT-d actually has a concept of a invalidation completion queue which delivers >> interrupt based notification of invalidation completion events. The above >> flush_notify would be the natural way to support this since in this case, there >> is no VCPU event that's directly involved in the completion event. > > Cheers, > Ben. > >> Regards, >> >> Anthony Liguori >> >>> Cheers, >>> Ben. >>> >>> > >
On Tue, 2012-05-15 at 19:54 -0500, Anthony Liguori wrote: > > You certainly could do that but it may get a little weird dealing with the > return path. You'd have to return something like -EWOULDBLOCK and make sure you > handle that in the dispatch code appropriately. Hrm, our implementation of kvm_arch_handle_exit() always return "1" after a hypercall, forcing kvm_cpu_exec() to return back to the thread loop, so we should be ok to just set env->halted and return no ? Cheers, Ben.
On 05/15/2012 08:20 PM, Benjamin Herrenschmidt wrote: > On Tue, 2012-05-15 at 19:54 -0500, Anthony Liguori wrote: >> >> You certainly could do that but it may get a little weird dealing with the >> return path. You'd have to return something like -EWOULDBLOCK and make sure you >> handle that in the dispatch code appropriately. > > Hrm, our implementation of kvm_arch_handle_exit() always return "1" > after a hypercall, forcing kvm_cpu_exec() to return back to the thread > loop, so we should be ok to just set env->halted and return no ? I meant, if you wanted to have a synchronous hypercall function to dispatch, and then later call "hypercall_finish()", you would need a way to return an error in the synchronous hypercall to identify that something else would eventually call "hypercall_finish()." The sync hypercall would need to return something like -EWOULDBLOCK. Setting env->halted=1 ought to be enough to delay returning to the guest although I'd have to go through the code to verify. Regards, Anthony Liguori > > Cheers, > Ben. > >
diff --git a/dma-helpers.c b/dma-helpers.c index 2dc4691..09591ef 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -9,6 +9,10 @@ #include "dma.h" #include "trace.h" +#include "range.h" +#include "qemu-thread.h" + +/* #define DEBUG_IOMMU */ void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma) { @@ -244,3 +248,213 @@ void dma_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, { bdrv_acct_start(bs, cookie, sg->size, type); } + +bool iommu_dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len, + DMADirection dir) +{ + target_phys_addr_t paddr, plen; + +#ifdef DEBUG_IOMMU + fprintf(stderr, "dma_memory_check context=%p addr=0x" DMA_ADDR_FMT + " len=0x" DMA_ADDR_FMT " dir=%d\n", dma, addr, len, dir); +#endif + + while (len) { + if (dma->translate(dma, addr, &paddr, &plen, dir) != 0) { + return false; + } + + /* The translation might be valid for larger regions. */ + if (plen > len) { + plen = len; + } + + len -= plen; + addr += plen; + } + + return true; +} + +int iommu_dma_memory_rw(DMAContext *dma, dma_addr_t addr, + void *buf, dma_addr_t len, DMADirection dir) +{ + target_phys_addr_t paddr, plen; + int err; + +#ifdef DEBUG_IOMMU + fprintf(stderr, "dma_memory_rw context=%p addr=0x" DMA_ADDR_FMT " len=0x" + DMA_ADDR_FMT " dir=%d\n", dma, addr, len, dir); +#endif + + while (len) { + err = dma->translate(dma, addr, &paddr, &plen, dir); + if (err) { + return -1; + } + + /* The translation might be valid for larger regions. 
*/ + if (plen > len) { + plen = len; + } + + cpu_physical_memory_rw(paddr, buf, plen, + dir == DMA_DIRECTION_FROM_DEVICE); + + len -= plen; + addr += plen; + buf += plen; + } + + return 0; +} + +int iommu_dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len) +{ + target_phys_addr_t paddr, plen; + int err; + +#ifdef DEBUG_IOMMU + fprintf(stderr, "dma_memory_zero context=%p addr=0x" DMA_ADDR_FMT + " len=0x" DMA_ADDR_FMT "\n", dma, addr, len); +#endif + + while (len) { + err = dma->translate(dma, addr, &paddr, &plen, + DMA_DIRECTION_FROM_DEVICE); + if (err) { + return err; + } + + /* The translation might be valid for larger regions. */ + if (plen > len) { + plen = len; + } + + cpu_physical_memory_zero(paddr, plen); + + len -= plen; + addr += plen; + } + + return 0; +} + +typedef struct { + unsigned long count; + QemuCond cond; +} DMAInvalidationState; + +typedef struct DMAMemoryMap DMAMemoryMap; +struct DMAMemoryMap { + dma_addr_t addr; + size_t len; + void *buf; + + DMAInvalidationState *invalidate; + QLIST_ENTRY(DMAMemoryMap) list; +}; + +void dma_context_init(DMAContext *dma, DMATranslateFunc fn) +{ +#ifdef DEBUG_IOMMU + fprintf(stderr, "dma_context_init(%p, %p)\n", dma, fn); +#endif + dma->translate = fn; + QLIST_INIT(&dma->memory_maps); +} + +void *iommu_dma_memory_map(DMAContext *dma, dma_addr_t addr, dma_addr_t *len, + DMADirection dir) +{ + int err; + target_phys_addr_t paddr, plen; + void *buf; + DMAMemoryMap *map; + + plen = *len; + err = dma->translate(dma, addr, &paddr, &plen, dir); + if (err) { + return NULL; + } + + /* + * If this is true, the virtual region is contiguous, + * but the translated physical region isn't. We just + * clamp *len, much like cpu_physical_memory_map() does. + */ + if (plen < *len) { + *len = plen; + } + + buf = cpu_physical_memory_map(paddr, &plen, + dir == DMA_DIRECTION_FROM_DEVICE); + *len = plen; + + /* We treat maps as remote TLBs to cope with stuff like AIO. 
*/ + map = g_malloc(sizeof(DMAMemoryMap)); + map->addr = addr; + map->len = *len; + map->buf = buf; + map->invalidate = NULL; + + QLIST_INSERT_HEAD(&dma->memory_maps, map, list); + + return buf; +} + +void iommu_dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len, + DMADirection dir, dma_addr_t access_len) +{ + DMAMemoryMap *map; + + cpu_physical_memory_unmap(buffer, len, + dir == DMA_DIRECTION_FROM_DEVICE, + access_len); + + QLIST_FOREACH(map, &dma->memory_maps, list) { + if ((map->buf == buffer) && (map->len == len)) { + QLIST_REMOVE(map, list); + + if (map->invalidate) { + /* If this mapping was invalidated */ + if (--map->invalidate->count == 0) { + /* And we're the last mapping invalidated at the time */ + /* Then wake up whoever was waiting for the + * invalidation to complete */ + qemu_cond_signal(&map->invalidate->cond); + } + } + + free(map); + } + } + + + /* unmap called on a buffer that wasn't mapped */ + assert(false); +} + +extern QemuMutex qemu_global_mutex; + +void iommu_wait_for_invalidated_maps(DMAContext *dma, + dma_addr_t addr, dma_addr_t len) +{ + DMAMemoryMap *map; + DMAInvalidationState is; + + is.count = 0; + qemu_cond_init(&is.cond); + + QLIST_FOREACH(map, &dma->memory_maps, list) { + if (ranges_overlap(addr, len, map->addr, map->len)) { + is.count++; + map->invalidate = &is; + } + } + + if (is.count) { + qemu_cond_wait(&is.cond, &qemu_global_mutex); + } + assert(is.count == 0); +} diff --git a/dma.h b/dma.h index 876aea4..b57d72f 100644 --- a/dma.h +++ b/dma.h @@ -14,6 +14,7 @@ #include "hw/hw.h" #include "block.h" +typedef struct DMAContext DMAContext; typedef struct ScatterGatherEntry ScatterGatherEntry; typedef enum { @@ -30,28 +31,64 @@ struct QEMUSGList { }; #if defined(TARGET_PHYS_ADDR_BITS) -typedef target_phys_addr_t dma_addr_t; -#define DMA_ADDR_BITS TARGET_PHYS_ADDR_BITS -#define DMA_ADDR_FMT TARGET_FMT_plx +/* + * When an IOMMU is present, bus addresses become distinct from + * CPU/memory physical addresses and may be 
a different size. Because + * the IOVA size depends more on the bus than on the platform, we more + * or less have to treat these as 64-bit always to cover all (or at + * least most) cases. + */ +typedef uint64_t dma_addr_t; + +#define DMA_ADDR_BITS 64 +#define DMA_ADDR_FMT "%" PRIx64 + +typedef int DMATranslateFunc(DMAContext *dma, + dma_addr_t addr, + target_phys_addr_t *paddr, + target_phys_addr_t *len, + DMADirection dir); + +typedef struct DMAContext { + DMATranslateFunc *translate; + QLIST_HEAD(memory_maps, DMAMemoryMap) memory_maps; +} DMAContext; + +static inline bool dma_has_iommu(DMAContext *dma) +{ + return !!dma; +} /* Checks that the given range of addresses is valid for DMA. This is * useful for certain cases, but usually you should just use * dma_memory_{read,write}() and check for errors */ -static inline bool dma_memory_valid(DMAContext *dma, dma_addr_t addr, - dma_addr_t len, DMADirection dir) +bool iommu_dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len, + DMADirection dir); +static inline bool dma_memory_valid(DMAContext *dma, + dma_addr_t addr, dma_addr_t len, + DMADirection dir) { - /* Stub version, with no iommu we assume all bus addresses are valid */ - return true; + if (!dma_has_iommu(dma)) { + return true; + } else { + return iommu_dma_memory_valid(dma, addr, len, dir); + } } +int iommu_dma_memory_rw(DMAContext *dma, dma_addr_t addr, + void *buf, dma_addr_t len, DMADirection dir); static inline int dma_memory_rw(DMAContext *dma, dma_addr_t addr, void *buf, dma_addr_t len, DMADirection dir) { - /* Stub version when we have no iommu support */ - cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len, - dir == DMA_DIRECTION_FROM_DEVICE); - return 0; + if (!dma_has_iommu(dma)) { + /* Fast-path for no IOMMU */ + cpu_physical_memory_rw(addr, buf, len, + dir == DMA_DIRECTION_FROM_DEVICE); + return 0; + } else { + return iommu_dma_memory_rw(dma, addr, buf, len, dir); + } } static inline int dma_memory_read(DMAContext *dma, 
dma_addr_t addr, @@ -67,34 +104,53 @@ static inline int dma_memory_write(DMAContext *dma, dma_addr_t addr, DMA_DIRECTION_FROM_DEVICE); } +int iommu_dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len); static inline int dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len) { - /* Stub version when we have no iommu support */ - cpu_physical_memory_zero(addr, len); - return 0; + if (!dma_has_iommu(dma)) { + /* Fast-path for no IOMMU */ + cpu_physical_memory_zero(addr, len); + return 0; + } else { + return iommu_dma_memory_zero(dma, addr, len); + } } +void *iommu_dma_memory_map(DMAContext *dma, + dma_addr_t addr, dma_addr_t *len, + DMADirection dir); static inline void *dma_memory_map(DMAContext *dma, dma_addr_t addr, dma_addr_t *len, DMADirection dir) { - target_phys_addr_t xlen = *len; - void *p; - - p = cpu_physical_memory_map(addr, &xlen, - dir == DMA_DIRECTION_FROM_DEVICE); - *len = xlen; - return p; + if (!dma_has_iommu(dma)) { + target_phys_addr_t xlen = *len; + void *p; + + p = cpu_physical_memory_map(addr, &xlen, + dir == DMA_DIRECTION_FROM_DEVICE); + *len = xlen; + return p; + } else { + return iommu_dma_memory_map(dma, addr, len, dir); + } } +void iommu_dma_memory_unmap(DMAContext *dma, + void *buffer, dma_addr_t len, + DMADirection dir, dma_addr_t access_len); static inline void dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len, DMADirection dir, dma_addr_t access_len) { - return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len, - dir == DMA_DIRECTION_FROM_DEVICE, - access_len); + if (!dma_has_iommu(dma)) { + return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len, + dir == DMA_DIRECTION_FROM_DEVICE, + access_len); + } else { + iommu_dma_memory_unmap(dma, buffer, len, dir, access_len); + } } #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \ @@ -135,6 +191,10 @@ DEFINE_LDST_DMA(q, q, 64, be); #undef DEFINE_LDST_DMA +void dma_context_init(DMAContext *dma, DMATranslateFunc fn); +void 
iommu_wait_for_invalidated_maps(DMAContext *dma, + dma_addr_t addr, dma_addr_t len); + struct ScatterGatherEntry { dma_addr_t base; dma_addr_t len; diff --git a/hw/qdev-dma.h b/hw/qdev-dma.h index f0ff558..6812735 100644 --- a/hw/qdev-dma.h +++ b/hw/qdev-dma.h @@ -6,7 +6,5 @@ * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. */ -#include "qdev-addr.h" - #define DEFINE_PROP_DMAADDR(_n, _s, _f, _d) \ - DEFINE_PROP_TADDR(_n, _s, _f, _d) + DEFINE_PROP_HEX64(_n, _s, _f, _d)