$NetBSD: patch-XSA346,v 1.1 2020/10/21 09:03:05 bouyer Exp $

From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU: suppress "iommu_dont_flush_iotlb" when about to free a page

Deferring flushes to a single, wide range one - as is done when
handling XENMAPSPACE_gmfn_range - is okay only as long as
pages don't get freed ahead of the eventual flush. While the only
function setting the flag (xenmem_add_to_physmap()) suggests by its name
that it's only mapping new entries, in reality the way
xenmem_add_to_physmap_one() works means an unmap would happen not only
for the page being moved (but not freed) but, if the destination GFN is
populated, also for the page being displaced from that GFN. Collapsing
the two flushes for this GFN into just one (end even more so deferring
it to a batched invocation) is not correct.

This is part of XSA-346.

Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Acked-by: Julien Grall <jgrall@amazon.com>

--- xen/common/memory.c.orig
+++ xen/common/memory.c
@@ -298,7 +298,10 @@ int guest_remove_page(struct domain *d,
     p2m_type_t p2mt;
 #endif
     mfn_t mfn;
+#ifdef CONFIG_HAS_PASSTHROUGH
+    bool *dont_flush_p, dont_flush;
     int rc;
+#endif
 
 #ifdef CONFIG_X86
     mfn = get_gfn_query(d, gmfn, &p2mt);
@@ -376,8 +379,22 @@ int guest_remove_page(struct domain *d,
         return -ENXIO;
     }
 
+#ifdef CONFIG_HAS_PASSTHROUGH
+    /*
+     * Since we're likely to free the page below, we need to suspend
+     * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes.
+     */
+    dont_flush_p = &this_cpu(iommu_dont_flush_iotlb);
+    dont_flush = *dont_flush_p;
+    *dont_flush_p = false;
+#endif
+
     rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0);
 
+#ifdef CONFIG_HAS_PASSTHROUGH
+    *dont_flush_p = dont_flush;
+#endif
+
     /*
      * With the lack of an IOMMU on some platforms, domains with DMA-capable
      * device must retrieve the same pfn when the hypercall populate_physmap
From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU: hold page ref until after deferred TLB flush

When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB
flush for the "from" GFN range requires that the page remains allocated
to the guest until the TLB flush has actually occurred. Otherwise a
parallel hypercall to remove the page would only flush the TLB for the
GFN it has been moved to, but not the one is was mapped at originally.

This is part of XSA-346.

Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
Reported-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Julien Grall <jgrall@amazon.com>

--- xen/arch/arm/mm.c.orig
+++ xen/arch/arm/mm.c
@@ -1222,7 +1222,7 @@ void share_xen_page_with_guest(struct pa
 int xenmem_add_to_physmap_one(
     struct domain *d,
     unsigned int space,
-    union xen_add_to_physmap_batch_extra extra,
+    union add_to_physmap_extra extra,
     unsigned long idx,
     gfn_t gfn)
 {
@@ -1294,10 +1294,6 @@ int xenmem_add_to_physmap_one(
         break;
     }
     case XENMAPSPACE_dev_mmio:
-        /* extra should be 0. Reserved for future use. */
-        if ( extra.res0 )
-            return -EOPNOTSUPP;
-
         rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx));
         return rc;
 
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -4634,7 +4634,7 @@ static int handle_iomem_range(unsigned l
 int xenmem_add_to_physmap_one(
     struct domain *d,
     unsigned int space,
-    union xen_add_to_physmap_batch_extra extra,
+    union add_to_physmap_extra extra,
     unsigned long idx,
     gfn_t gpfn)
 {
@@ -4721,9 +4721,20 @@ int xenmem_add_to_physmap_one(
         rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K);
 
  put_both:
-    /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */
+    /*
+     * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top.
+     * We also may need to transfer ownership of the page reference to our
+     * caller.
+     */
     if ( space == XENMAPSPACE_gmfn )
+    {
         put_gfn(d, gfn);
+        if ( !rc && extra.ppage )
+        {
+            *extra.ppage = page;
+            page = NULL;
+        }
+    }
 
     if ( page )
         put_page(page);
--- xen/common/memory.c.orig
+++ xen/common/memory.c
@@ -811,11 +811,10 @@ int xenmem_add_to_physmap(struct domain
 {
     unsigned int done = 0;
     long rc = 0;
-    union xen_add_to_physmap_batch_extra extra;
+    union add_to_physmap_extra extra = {};
+    struct page_info *pages[16];
 
-    if ( xatp->space != XENMAPSPACE_gmfn_foreign )
-        extra.res0 = 0;
-    else
+    if ( xatp->space == XENMAPSPACE_gmfn_foreign )
         extra.foreign_domid = DOMID_INVALID;
 
     if ( xatp->space != XENMAPSPACE_gmfn_range )
@@ -831,7 +830,10 @@ int xenmem_add_to_physmap(struct domain
 
 #ifdef CONFIG_HAS_PASSTHROUGH
     if ( need_iommu(d) )
+    {
         this_cpu(iommu_dont_flush_iotlb) = 1;
+        extra.ppage = &pages[0];
+    }
 #endif
 
     while ( xatp->size > done )
@@ -844,8 +846,12 @@ int xenmem_add_to_physmap(struct domain
         xatp->idx++;
         xatp->gpfn++;
 
+        if ( extra.ppage )
+            ++extra.ppage;
+
         /* Check for continuation if it's not the last iteration. */
-        if ( xatp->size > ++done && hypercall_preempt_check() )
+        if ( (++done > ARRAY_SIZE(pages) && extra.ppage) ||
+             (xatp->size > done && hypercall_preempt_check()) )
         {
             rc = start + done;
             break;
@@ -856,6 +862,7 @@ int xenmem_add_to_physmap(struct domain
     if ( need_iommu(d) )
     {
         int ret;
+        unsigned int i;
 
         this_cpu(iommu_dont_flush_iotlb) = 0;
 
@@ -863,6 +870,15 @@ int xenmem_add_to_physmap(struct domain
         if ( unlikely(ret) && rc >= 0 )
             rc = ret;
 
+        /*
+         * Now that the IOMMU TLB flush was done for the original GFN, drop
+         * the page references. The 2nd flush below is fine to make later, as
+         * whoever removes the page again from its new GFN will have to do
+         * another flush anyway.
+         */
+        for ( i = 0; i < done; ++i )
+            put_page(pages[i]);
+
         ret = iommu_iotlb_flush(d, xatp->gpfn - done, done);
         if ( unlikely(ret) && rc >= 0 )
             rc = ret;
@@ -876,6 +892,8 @@ static int xenmem_add_to_physmap_batch(s
                                        struct xen_add_to_physmap_batch *xatpb,
                                        unsigned int extent)
 {
+    union add_to_physmap_extra extra = {};
+
     if ( xatpb->size < extent )
         return -EILSEQ;
 
@@ -884,6 +902,19 @@ static int xenmem_add_to_physmap_batch(s
          !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) )
         return -EFAULT;
 
+    switch ( xatpb->space )
+    {
+    case XENMAPSPACE_dev_mmio:
+        /* res0 is reserved for future use. */
+        if ( xatpb->u.res0 )
+            return -EOPNOTSUPP;
+        break;
+
+    case XENMAPSPACE_gmfn_foreign:
+        extra.foreign_domid = xatpb->u.foreign_domid;
+        break;
+    }
+
     while ( xatpb->size > extent )
     {
         xen_ulong_t idx;
@@ -896,8 +927,7 @@ static int xenmem_add_to_physmap_batch(s
                                                extent, 1)) )
             return -EFAULT;
 
-        rc = xenmem_add_to_physmap_one(d, xatpb->space,
-                                       xatpb->u,
+        rc = xenmem_add_to_physmap_one(d, xatpb->space, extra,
                                        idx, _gfn(gpfn));
 
         if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) )
--- xen/include/xen/mm.h.orig
+++ xen/include/xen/mm.h
@@ -577,8 +577,22 @@ void scrub_one_page(struct page_info *);
                       &(d)->xenpage_list : &(d)->page_list)
 #endif
 
+union add_to_physmap_extra {
+    /*
+     * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs
+     * to be kept until after the flush, so the page can't get removed from
+     * the domain (and re-used for another purpose) beforehand. By passing
+     * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants
+     * to have ownership of such a reference transferred in the success case.
+     */
+    struct page_info **ppage;
+
+    /* XENMAPSPACE_gmfn_foreign */
+    domid_t foreign_domid;
+};
+
 int xenmem_add_to_physmap_one(struct domain *d, unsigned int space,
-                              union xen_add_to_physmap_batch_extra extra,
+                              union add_to_physmap_extra extra,
                               unsigned long idx, gfn_t gfn);
 
 int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp,
