$NetBSD: patch-XSA321,v 1.1 2020/07/16 09:56:47 bouyer Exp $

From: Jan Beulich <jbeulich@suse.com>
Subject: vtd: improve IOMMU TLB flush

Do not limit PSI (page-selective invalidation) flushes to order-0 pages;
this avoids falling back to a full TLB flush when the passed-in region
has an order greater than 0 and is suitably aligned. This should improve
the performance of IOMMU TLB flushes for page orders greater than 0.

This is part of XSA-321.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
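
For illustration only (this block is not part of the applied diff, and
can_use_psi is a made-up name): the new condition permits a page-selective
(PSI) flush only for a naturally aligned, power-of-two number of pages with
a valid DFN; anything else falls back to a domain-selective (DSI) flush.
A minimal standalone C sketch of that predicate:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper mirroring the new flush-selection logic. */
static bool can_use_psi(uint64_t dfn, unsigned long page_count)
{
    /* page_count must be a non-zero power of two... */
    if ( !page_count || (page_count & (page_count - 1)) )
        return false;

    /* ...and the first frame must be aligned to the region size. */
    return !(dfn & (page_count - 1));
}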

--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -570,13 +570,14 @@ static int __must_check iommu_flush_iotl
         if ( iommu_domid == -1 )
             continue;
 
-        if ( page_count != 1 || dfn_eq(dfn, INVALID_DFN) )
+        if ( !page_count || (page_count & (page_count - 1)) ||
+             dfn_eq(dfn, INVALID_DFN) || !IS_ALIGNED(dfn_x(dfn), page_count) )
             rc = iommu_flush_iotlb_dsi(iommu, iommu_domid,
                                        0, flush_dev_iotlb);
         else
             rc = iommu_flush_iotlb_psi(iommu, iommu_domid,
                                        dfn_to_daddr(dfn),
-                                       PAGE_ORDER_4K,
+                                       get_order_from_pages(page_count),
                                        !dma_old_pte_present,
                                        flush_dev_iotlb);
 
From: <security@xenproject.org>
Subject: vtd: prune (and rename) cache flush functions

Rename __iommu_flush_cache to iommu_sync_cache and remove
iommu_flush_cache_page. Also remove the iommu_flush_cache_entry
wrapper and just use iommu_sync_cache instead. Note that the _entry
suffix was meaningless, as the wrapper already took a size parameter
in bytes. While there, also constify the addr parameter.

No functional change intended.

This is part of XSA-321.

Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- xen/drivers/passthrough/vtd/extern.h.orig
+++ xen/drivers/passthrough/vtd/extern.h
@@ -43,8 +43,7 @@ void disable_qinval(struct vtd_iommu *io
 int enable_intremap(struct vtd_iommu *iommu, int eim);
 void disable_intremap(struct vtd_iommu *iommu);
 
-void iommu_flush_cache_entry(void *addr, unsigned int size);
-void iommu_flush_cache_page(void *addr, unsigned long npages);
+void iommu_sync_cache(const void *addr, unsigned int size);
 int iommu_alloc(struct acpi_drhd_unit *drhd);
 void iommu_free(struct acpi_drhd_unit *drhd);
 
--- xen/drivers/passthrough/vtd/intremap.c.orig
+++ xen/drivers/passthrough/vtd/intremap.c
@@ -230,7 +230,7 @@ static void free_remap_entry(struct vtd_
                      iremap_entries, iremap_entry);
 
     update_irte(iommu, iremap_entry, &new_ire, false);
-    iommu_flush_cache_entry(iremap_entry, sizeof(*iremap_entry));
+    iommu_sync_cache(iremap_entry, sizeof(*iremap_entry));
     iommu_flush_iec_index(iommu, 0, index);
 
     unmap_vtd_domain_page(iremap_entries);
@@ -406,7 +406,7 @@ static int ioapic_rte_to_remap_entry(str
     }
 
     update_irte(iommu, iremap_entry, &new_ire, !init);
-    iommu_flush_cache_entry(iremap_entry, sizeof(*iremap_entry));
+    iommu_sync_cache(iremap_entry, sizeof(*iremap_entry));
     iommu_flush_iec_index(iommu, 0, index);
 
     unmap_vtd_domain_page(iremap_entries);
@@ -695,7 +695,7 @@ static int msi_msg_to_remap_entry(
     update_irte(iommu, iremap_entry, &new_ire, msi_desc->irte_initialized);
     msi_desc->irte_initialized = true;
 
-    iommu_flush_cache_entry(iremap_entry, sizeof(*iremap_entry));
+    iommu_sync_cache(iremap_entry, sizeof(*iremap_entry));
     iommu_flush_iec_index(iommu, 0, index);
 
     unmap_vtd_domain_page(iremap_entries);
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -140,7 +140,8 @@ static int context_get_domain_id(struct
 }
 
 static int iommus_incoherent;
-static void __iommu_flush_cache(void *addr, unsigned int size)
+
+void iommu_sync_cache(const void *addr, unsigned int size)
 {
     int i;
     static unsigned int clflush_size = 0;
@@ -155,16 +156,6 @@ static void __iommu_flush_cache(void *ad
         cacheline_flush((char *)addr + i);
 }
 
-void iommu_flush_cache_entry(void *addr, unsigned int size)
-{
-    __iommu_flush_cache(addr, size);
-}
-
-void iommu_flush_cache_page(void *addr, unsigned long npages)
-{
-    __iommu_flush_cache(addr, PAGE_SIZE * npages);
-}
-
 /* Allocate page table, return its machine address */
 uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
 {
@@ -183,7 +174,7 @@ uint64_t alloc_pgtable_maddr(unsigned lo
         vaddr = __map_domain_page(cur_pg);
         memset(vaddr, 0, PAGE_SIZE);
 
-        iommu_flush_cache_page(vaddr, 1);
+        iommu_sync_cache(vaddr, PAGE_SIZE);
         unmap_domain_page(vaddr);
         cur_pg++;
     }
@@ -216,7 +207,7 @@ static u64 bus_to_context_maddr(struct v
         }
         set_root_value(*root, maddr);
         set_root_present(*root);
-        iommu_flush_cache_entry(root, sizeof(struct root_entry));
+        iommu_sync_cache(root, sizeof(struct root_entry));
     }
     maddr = (u64) get_context_addr(*root);
     unmap_vtd_domain_page(root_entries);
@@ -263,7 +254,7 @@ static u64 addr_to_dma_page_maddr(struct
              */
             dma_set_pte_readable(*pte);
             dma_set_pte_writable(*pte);
-            iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+            iommu_sync_cache(pte, sizeof(struct dma_pte));
         }
 
         if ( level == 2 )
@@ -640,7 +631,7 @@ static int __must_check dma_pte_clear_on
     *flush_flags |= IOMMU_FLUSHF_modified;
 
     spin_unlock(&hd->arch.mapping_lock);
-    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+    iommu_sync_cache(pte, sizeof(struct dma_pte));
 
     unmap_vtd_domain_page(page);
 
@@ -679,7 +670,7 @@ static void iommu_free_page_table(struct
             iommu_free_pagetable(dma_pte_addr(*pte), next_level);
 
         dma_clear_pte(*pte);
-        iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+        iommu_sync_cache(pte, sizeof(struct dma_pte));
     }
 
     unmap_vtd_domain_page(pt_vaddr);
@@ -1400,7 +1391,7 @@ int domain_context_mapping_one(
     context_set_address_width(*context, agaw);
     context_set_fault_enable(*context);
     context_set_present(*context);
-    iommu_flush_cache_entry(context, sizeof(struct context_entry));
+    iommu_sync_cache(context, sizeof(struct context_entry));
     spin_unlock(&iommu->lock);
 
     /* Context entry was previously non-present (with domid 0). */
@@ -1564,7 +1555,7 @@ int domain_context_unmap_one(
 
     context_clear_present(*context);
     context_clear_entry(*context);
-    iommu_flush_cache_entry(context, sizeof(struct context_entry));
+    iommu_sync_cache(context, sizeof(struct context_entry));
 
     iommu_domid= domain_iommu_domid(domain, iommu);
     if ( iommu_domid == -1 )
@@ -1791,7 +1782,7 @@ static int __must_check intel_iommu_map_
 
     *pte = new;
 
-    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+    iommu_sync_cache(pte, sizeof(struct dma_pte));
     spin_unlock(&hd->arch.mapping_lock);
     unmap_vtd_domain_page(page);
 
@@ -1866,7 +1857,7 @@ int iommu_pte_flush(struct domain *d, ui
     int iommu_domid;
     int rc = 0;
 
-    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+    iommu_sync_cache(pte, sizeof(struct dma_pte));
 
     for_each_drhd_unit ( drhd )
     {
@@ -2724,7 +2715,7 @@ static int __init intel_iommu_quarantine
             dma_set_pte_addr(*pte, maddr);
             dma_set_pte_readable(*pte);
         }
-        iommu_flush_cache_page(parent, 1);
+        iommu_sync_cache(parent, PAGE_SIZE);
 
         unmap_vtd_domain_page(parent);
         parent = map_vtd_domain_page(maddr);
From: <security@xenproject.org>
Subject: x86/iommu: introduce a cache sync hook

The hook is only implemented for VT-d, where it reuses the already
existing iommu_sync_cache function. The new hook is added so that code
outside of VT-d can flush the cache when page tables are shared with
the IOMMU.

Note that alloc_pgtable_maddr must use the now locally defined
sync_cache function, because the IOMMU ops are not yet set up the
first time the function is called during IOMMU initialization.

No functional change intended.

This is part of XSA-321.

Reviewed-by: Jan Beulich <jbeulich@suse.com>
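
For illustration only (not part of the applied diff; my_iommu_ops and
my_sync_cache are made-up names): the hook follows the usual
optional-function-pointer pattern, where the wrapper checks for presence
before dispatching, so platforms without the hook get a no-op. A minimal
standalone C sketch:

/* Hypothetical ops table carrying an optional cache-sync hook. */
struct my_iommu_ops {
    void (*sync_cache)(const void *addr, unsigned int size);
};

/* Dispatch wrapper: silently does nothing if the hook is absent. */
static void my_sync_cache(const struct my_iommu_ops *ops,
                          const void *addr, unsigned int size)
{
    if ( ops->sync_cache )
        ops->sync_cache(addr, size);
}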

--- xen/drivers/passthrough/vtd/extern.h.orig
+++ xen/drivers/passthrough/vtd/extern.h
@@ -43,7 +43,6 @@ void disable_qinval(struct vtd_iommu *io
 int enable_intremap(struct vtd_iommu *iommu, int eim);
 void disable_intremap(struct vtd_iommu *iommu);
 
-void iommu_sync_cache(const void *addr, unsigned int size);
 int iommu_alloc(struct acpi_drhd_unit *drhd);
 void iommu_free(struct acpi_drhd_unit *drhd);
 
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -141,7 +141,7 @@ static int context_get_domain_id(struct
 
 static int iommus_incoherent;
 
-void iommu_sync_cache(const void *addr, unsigned int size)
+static void sync_cache(const void *addr, unsigned int size)
 {
     int i;
     static unsigned int clflush_size = 0;
@@ -174,7 +174,7 @@ uint64_t alloc_pgtable_maddr(unsigned lo
         vaddr = __map_domain_page(cur_pg);
         memset(vaddr, 0, PAGE_SIZE);
 
-        iommu_sync_cache(vaddr, PAGE_SIZE);
+        sync_cache(vaddr, PAGE_SIZE);
         unmap_domain_page(vaddr);
         cur_pg++;
     }
@@ -2763,6 +2763,7 @@ const struct iommu_ops __initconstrel in
     .iotlb_flush_all = iommu_flush_iotlb_all,
     .get_reserved_device_memory = intel_iommu_get_reserved_device_memory,
     .dump_p2m_table = vtd_dump_p2m_table,
+    .sync_cache = sync_cache,
 };
 
 const struct iommu_init_ops __initconstrel intel_iommu_init_ops = {
--- xen/include/asm-x86/iommu.h.orig
+++ xen/include/asm-x86/iommu.h
@@ -121,6 +121,13 @@ extern bool untrusted_msi;
 int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq,
                    const uint8_t gvec);
 
+#define iommu_sync_cache(addr, size) ({                 \
+    const struct iommu_ops *ops = iommu_get_ops();      \
+                                                        \
+    if ( ops->sync_cache )                              \
+        iommu_vcall(ops, sync_cache, addr, size);       \
+})
+
 #endif /* !__ARCH_X86_IOMMU_H__ */
 /*
  * Local variables:
--- xen/include/xen/iommu.h.orig
+++ xen/include/xen/iommu.h
@@ -250,6 +250,7 @@ struct iommu_ops {
     int (*setup_hpet_msi)(struct msi_desc *);
 
     int (*adjust_irq_affinities)(void);
+    void (*sync_cache)(const void *addr, unsigned int size);
 #endif /* CONFIG_X86 */
 
     int __must_check (*suspend)(void);
From: <security@xenproject.org>
Subject: vtd: don't assume addresses are aligned in sync_cache

The current code in sync_cache assumes that the address passed in is
cache-line aligned. Fix the code to support arbitrary addresses that
are not necessarily aligned to a cache line.

This is part of XSA-321.

Reviewed-by: Jan Beulich <jbeulich@suse.com>
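
For illustration only (not part of the applied diff; flush_one_line is a
made-up stand-in for the real cache line flush): the fix rounds the start
address down to a cache-line boundary and then flushes line by line until
reaching addr + size, so a range straddling line boundaries is covered in
full. A standalone C sketch, assuming clflush_size is a power of two:

#include <stdint.h>

static void flush_one_line(const char *addr) { (void)addr; }

static void sync_range(const char *addr, unsigned int size,
                       unsigned long clflush_size)
{
    const char *end = addr + size;

    /* Round down to the start of the cache line containing addr. */
    addr -= (uintptr_t)addr & (clflush_size - 1);
    for ( ; addr < end; addr += clflush_size )
        flush_one_line(addr);
}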

--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -143,8 +143,8 @@ static int iommus_incoherent;
 
 static void sync_cache(const void *addr, unsigned int size)
 {
-    int i;
-    static unsigned int clflush_size = 0;
+    static unsigned long clflush_size = 0;
+    const void *end = addr + size;
 
     if ( !iommus_incoherent )
         return;
@@ -152,8 +152,9 @@ static void sync_cache(const void *addr,
     if ( clflush_size == 0 )
         clflush_size = get_cache_line_size();
 
-    for ( i = 0; i < size; i += clflush_size )
-        cacheline_flush((char *)addr + i);
+    addr -= (unsigned long)addr & (clflush_size - 1);
+    for ( ; addr < end; addr += clflush_size )
+        cacheline_flush((char *)addr);
 }
 
 /* Allocate page table, return its machine address */
From: <security@xenproject.org>
Subject: x86/alternative: introduce alternative_2

It's based on alternative_io_2 without inputs or outputs but with an
added memory clobber.

This is part of XSA-321.

Acked-by: Jan Beulich <jbeulich@suse.com>
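
For context, the "vtd: optimize CPU cache sync" patch further below uses
the new macro to patch in an sfence whenever one of the weakly ordered
flush instructions is in use (the snippet relies on Xen's alternative and
cpufeature headers, so it is not standalone):

    alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
                      "sfence", X86_FEATURE_CLWB);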

--- xen/include/asm-x86/alternative.h.orig
+++ xen/include/asm-x86/alternative.h
@@ -114,6 +114,11 @@ extern void alternative_branches(void);
 #define alternative(oldinstr, newinstr, feature)                        \
         asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+	asm volatile (ALTERNATIVE_2(oldinstr, newinstr1, feature1,	\
+				    newinstr2, feature2)		\
+		      : : : "memory")
+
 /*
  * Alternative inline assembly with input.
  *
From: <security@xenproject.org>
Subject: vtd: optimize CPU cache sync

Some VT-d IOMMUs are non-coherent, so a cache write-back is required
for changes made by the CPU to be visible to the IOMMU. This
write-back was unconditionally done using clflush, but more efficient
instructions (clflushopt, clwb) exist, so implement support for them
using the alternative framework.

This is part of XSA-321.

Reviewed-by: Jan Beulich <jbeulich@suse.com>
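
For illustration only (not part of the applied diff): the same instruction
selection can be sketched outside Xen with compiler intrinsics. clwb is
preferred because it writes the line back without evicting it, clflushopt
is next best, and clflush is the universal fallback; the first two are
weakly ordered and therefore need an sfence once the whole range has been
flushed. Building the sketch needs the matching target flags (e.g.
"gcc -mclwb -mclflushopt"):

#include <immintrin.h>

/* Hypothetical standalone helper; the patch instead selects the
 * instruction at boot time via the alternative framework. */
static void flush_line(const void *p, int has_clwb, int has_clflushopt)
{
    if ( has_clwb )
        _mm_clwb((void *)p);
    else if ( has_clflushopt )
        _mm_clflushopt((void *)p);
    else
        _mm_clflush(p);
}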

--- xen/drivers/passthrough/vtd/extern.h.orig
+++ xen/drivers/passthrough/vtd/extern.h
@@ -68,7 +68,6 @@ int __must_check qinval_device_iotlb_syn
                                           u16 did, u16 size, u64 addr);
 
 unsigned int get_cache_line_size(void);
-void cacheline_flush(char *);
 void flush_all_cache(void);
 
 uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -31,6 +31,7 @@
 #include <xen/pci_regs.h>
 #include <xen/keyhandler.h>
 #include <asm/msi.h>
+#include <asm/nops.h>
 #include <asm/irq.h>
 #include <asm/hvm/vmx/vmx.h>
 #include <asm/p2m.h>
@@ -154,7 +155,42 @@ static void sync_cache(const void *addr,
 
     addr -= (unsigned long)addr & (clflush_size - 1);
     for ( ; addr < end; addr += clflush_size )
-        cacheline_flush((char *)addr);
+/*
+ * The arguments to a macro must not include preprocessor directives. Doing so
+ * results in undefined behavior, so we have to create some defines here in
+ * order to avoid it.
+ */
+#if defined(HAVE_AS_CLWB)
+# define CLWB_ENCODING "clwb %[p]"
+#elif defined(HAVE_AS_XSAVEOPT)
+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
+#else
+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
+#endif
+
+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
+# define INPUT BASE_INPUT
+#else
+# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
+#endif
+        /*
+         * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
+         * + prefix than a clflush + nop, and hence the prefix is added instead
+         * of letting the alternative framework fill the gap by appending nops.
+         */
+        alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
+                         "data16 clflush %[p]", /* clflushopt */
+                         X86_FEATURE_CLFLUSHOPT,
+                         CLWB_ENCODING,
+                         X86_FEATURE_CLWB, /* no outputs */,
+                         INPUT(addr));
+#undef INPUT
+#undef BASE_INPUT
+#undef CLWB_ENCODING
+
+    alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
+                      "sfence", X86_FEATURE_CLWB);
 }
 
 /* Allocate page table, return its machine address */
--- xen/drivers/passthrough/vtd/x86/vtd.c.orig
+++ xen/drivers/passthrough/vtd/x86/vtd.c
@@ -51,11 +51,6 @@ unsigned int get_cache_line_size(void)
     return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
 }
 
-void cacheline_flush(char * addr)
-{
-    clflush(addr);
-}
-
 void flush_all_cache()
 {
     wbinvd();
From: <security@xenproject.org>
Subject: x86/ept: flush cache when modifying PTEs and sharing page tables

Modifications made to the page tables by EPT code need to be written
back to memory when the page tables are shared with the IOMMU, as
Intel IOMMUs can be non-coherent and would otherwise not see the
changes.

To achieve this, make sure data is written back to memory after
writing an EPT entry when the recalc bit is not set in
atomic_write_ept_entry. If that bit is set, the entry will be
adjusted and atomic_write_ept_entry will be called a second time
without the recalc bit set. Note that when splitting a super page the
new tables resulting from the split should also be written back.

Failure to do so can allow devices behind the IOMMU access to the
stale super page, or cause coherency issues as changes made by the
processor to the page tables are not visible to the IOMMU.

This allows removing the VT-d-specific iommu_pte_flush helper, since
the cache write-back is now performed by atomic_write_ept_entry, and
hence iommu_iotlb_flush can be used to flush the IOMMU TLB. The newly
used method (iommu_iotlb_flush) can result in fewer flushes, since it
may legitimately be called with 0 flags, in which case it becomes a
no-op.

This is part of XSA-321.

Reviewed-by: Jan Beulich <jbeulich@suse.com>
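
For illustration only (not part of the applied diff; write_pte_coherent
and sync_cache_line are made-up names): the pattern added to
atomic_write_ept_entry is "atomically write the PTE, then write the cache
line back if the page tables are shared with a possibly non-coherent
IOMMU and no recalc pass is going to rewrite the entry anyway". A
standalone C sketch:

#include <stdbool.h>
#include <stdint.h>

static void sync_cache_line(const volatile void *p) { (void)p; }

static void write_pte_coherent(volatile uint64_t *pte, uint64_t new_pte,
                               bool recalc, bool shared_with_iommu)
{
    __atomic_store_n(pte, new_pte, __ATOMIC_RELEASE);

    /* A recalc entry will be rewritten (and synced) shortly afterwards,
     * so the write-back can be skipped here. */
    if ( !recalc && shared_with_iommu )
        sync_cache_line(pte);
}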

--- xen/arch/x86/mm/p2m-ept.c.orig
+++ xen/arch/x86/mm/p2m-ept.c
@@ -58,6 +58,19 @@ static int atomic_write_ept_entry(struct
 
     write_atomic(&entryptr->epte, new.epte);
 
+    /*
+     * The recalc field on the EPT is used to signal either that a
+     * recalculation of the EMT field is required (which doesn't effect the
+     * IOMMU), or a type change. Type changes can only be between ram_rw,
+     * logdirty and ioreq_server: changes to/from logdirty won't work well with
+     * an IOMMU anyway, as IOMMU #PFs are not synchronous and will lead to
+     * aborts, and changes to/from ioreq_server are already fully flushed
+     * before returning to guest context (see
+     * XEN_DMOP_map_mem_type_to_ioreq_server).
+     */
+    if ( !new.recalc && iommu_use_hap_pt(p2m->domain) )
+        iommu_sync_cache(entryptr, sizeof(*entryptr));
+
     return 0;
 }
 
@@ -278,6 +291,9 @@ static bool_t ept_split_super_page(struc
             break;
     }
 
+    if ( iommu_use_hap_pt(p2m->domain) )
+        iommu_sync_cache(table, EPT_PAGETABLE_ENTRIES * sizeof(ept_entry_t));
+
     unmap_domain_page(table);
 
     /* Even failed we should install the newly allocated ept page. */
@@ -821,7 +840,10 @@ out:
          need_modify_vtd_table )
     {
         if ( iommu_use_hap_pt(d) )
-            rc = iommu_pte_flush(d, gfn, &ept_entry->epte, order, vtd_pte_present);
+            rc = iommu_iotlb_flush(d, _dfn(gfn), (1u << order),
+                                   (iommu_flags ? IOMMU_FLUSHF_added : 0) |
+                                   (vtd_pte_present ? IOMMU_FLUSHF_modified
+                                                    : 0));
         else if ( need_iommu_pt_sync(d) )
             rc = iommu_flags ?
                 iommu_legacy_map(d, _dfn(gfn), mfn, order, iommu_flags) :
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -1884,53 +1884,6 @@ static int intel_iommu_lookup_page(struc
     return 0;
 }
 
-int iommu_pte_flush(struct domain *d, uint64_t dfn, uint64_t *pte,
-                    int order, int present)
-{
-    struct acpi_drhd_unit *drhd;
-    struct vtd_iommu *iommu = NULL;
-    struct domain_iommu *hd = dom_iommu(d);
-    bool_t flush_dev_iotlb;
-    int iommu_domid;
-    int rc = 0;
-
-    iommu_sync_cache(pte, sizeof(struct dma_pte));
-
-    for_each_drhd_unit ( drhd )
-    {
-        iommu = drhd->iommu;
-        if ( !test_bit(iommu->index, &hd->arch.iommu_bitmap) )
-            continue;
-
-        flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
-        iommu_domid= domain_iommu_domid(d, iommu);
-        if ( iommu_domid == -1 )
-            continue;
-
-        rc = iommu_flush_iotlb_psi(iommu, iommu_domid,
-                                   __dfn_to_daddr(dfn),
-                                   order, !present, flush_dev_iotlb);
-        if ( rc > 0 )
-        {
-            iommu_flush_write_buffer(iommu);
-            rc = 0;
-        }
-    }
-
-    if ( unlikely(rc) )
-    {
-        if ( !d->is_shutting_down && printk_ratelimit() )
-            printk(XENLOG_ERR VTDPREFIX
-                   " d%d: IOMMU pages flush failed: %d\n",
-                   d->domain_id, rc);
-
-        if ( !is_hardware_domain(d) )
-            domain_crash(d);
-    }
-
-    return rc;
-}
-
 static int __init vtd_ept_page_compatible(struct vtd_iommu *iommu)
 {
     u64 ept_cap, vtd_cap = iommu->cap;
--- xen/include/asm-x86/iommu.h.orig
+++ xen/include/asm-x86/iommu.h
@@ -97,10 +97,6 @@ static inline int iommu_adjust_irq_affin
            : 0;
 }
 
-/* While VT-d specific, this must get declared in a generic header. */
-int __must_check iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte,
-                                 int order, int present);
-
 static inline bool iommu_supports_x2apic(void)
 {
     return iommu_init_ops && iommu_init_ops->supports_x2apic
