$NetBSD: patch-CVE-2013-1918_9,v 1.1 2013/05/03 16:48:38 drochner Exp $
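
Part of the fix for CVE-2013-1918 (Xen Security Advisory 45): several
long-latency page-table operations in the hypervisor were not
preemptible, allowing a malicious or buggy PV guest kernel to tie up a
physical CPU for extended periods (denial of service).

As can be seen from the hunks below, this portion of the fix makes the
teardown and replacement of guest top-level page tables preemptible:
a page whose de-validation must be deferred is recorded in
v->arch.old_guest_table, put_old_guest_table() resumes the deferred
work, and do_mmu_update()/do_mmuext_op() restart via hypercall
continuations, passing the final return value in place of the
no-longer-needed "foreigndom" argument.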

--- xen/arch/x86/mm.c.orig	2013-05-03 13:38:09.000000000 +0000
+++ xen/arch/x86/mm.c
@@ -1183,7 +1183,16 @@ static int put_page_from_l3e(l3_pgentry_
 #endif
 
     if ( unlikely(partial > 0) )
+    {
+        ASSERT(preemptible >= 0);
         return __put_page_type(l3e_get_page(l3e), preemptible);
+    }
+
+    if ( preemptible < 0 )
+    {
+        current->arch.old_guest_table = l3e_get_page(l3e);
+        return 0;
+    }
 
     return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
 }
@@ -1196,7 +1205,17 @@ static int put_page_from_l4e(l4_pgentry_
          (l4e_get_pfn(l4e) != pfn) )
     {
         if ( unlikely(partial > 0) )
+        {
+            ASSERT(preemptible >= 0);
             return __put_page_type(l4e_get_page(l4e), preemptible);
+        }
+
+        if ( preemptible < 0 )
+        {
+            current->arch.old_guest_table = l4e_get_page(l4e);
+            return 0;
+        }
+
         return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
     }
     return 1;
@@ -1486,12 +1505,17 @@ static int alloc_l3_table(struct page_in
     if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
     {
         MEM_LOG("Failure in alloc_l3_table: entry %d", i);
+        if ( i )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = 0;
+            current->arch.old_guest_table = page;
+        }
         while ( i-- > 0 )
         {
             if ( !is_guest_l3_slot(i) )
                 continue;
             unadjust_guest_l3e(pl3e[i], d);
-            put_page_from_l3e(pl3e[i], pfn, 0, 0);
         }
     }
 
@@ -1521,22 +1545,24 @@ static int alloc_l4_table(struct page_in
             page->nr_validated_ptes = i;
             page->partial_pte = partial ?: 1;
         }
-        else if ( rc == -EINTR )
+        else if ( rc < 0 )
         {
+            if ( rc != -EINTR )
+                MEM_LOG("Failure in alloc_l4_table: entry %d", i);
             if ( i )
             {
                 page->nr_validated_ptes = i;
                 page->partial_pte = 0;
-                rc = -EAGAIN;
+                if ( rc == -EINTR )
+                    rc = -EAGAIN;
+                else
+                {
+                    if ( current->arch.old_guest_table )
+                        page->nr_validated_ptes++;
+                    current->arch.old_guest_table = page;
+                }
             }
         }
-        else if ( rc < 0 )
-        {
-            MEM_LOG("Failure in alloc_l4_table: entry %d", i);
-            while ( i-- > 0 )
-                if ( is_guest_l4_slot(d, i) )
-                    put_page_from_l4e(pl4e[i], pfn, 0, 0);
-        }
         if ( rc < 0 )
             return rc;
 
@@ -1966,7 +1992,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
         pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
     }
 
-    put_page_from_l3e(ol3e, pfn, 0, 0);
+    put_page_from_l3e(ol3e, pfn, 0, -preemptible);
     return rc;
 }
 
@@ -2029,7 +2055,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
         return -EFAULT;
     }
 
-    put_page_from_l4e(ol4e, pfn, 0, 0);
+    put_page_from_l4e(ol4e, pfn, 0, -preemptible);
     return rc;
 }
 
@@ -2187,7 +2213,15 @@ static int alloc_page_type(struct page_i
                 PRtype_info ": caf=%08lx taf=%" PRtype_info,
                 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
                 type, page->count_info, page->u.inuse.type_info);
-        page->u.inuse.type_info = 0;
+        if ( page != current->arch.old_guest_table )
+            page->u.inuse.type_info = 0;
+        else
+        {
+            ASSERT((page->u.inuse.type_info &
+                    (PGT_count_mask | PGT_validated)) == 1);
+            get_page_light(page);
+            page->u.inuse.type_info |= PGT_partial;
+        }
     }
     else
     {
@@ -2725,49 +2759,150 @@ static void put_superpage(unsigned long 
 
 #endif
 
+static int put_old_guest_table(struct vcpu *v)
+{
+    int rc;
+
+    if ( !v->arch.old_guest_table )
+        return 0;
+
+    switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table, 1) )
+    {
+    case -EINTR:
+    case -EAGAIN:
+        return -EAGAIN;
+    }
+
+    v->arch.old_guest_table = NULL;
+
+    return rc;
+}
+
+int vcpu_destroy_pagetables(struct vcpu *v)
+{
+    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
+    struct page_info *page;
+    int rc = put_old_guest_table(v);
+
+    if ( rc )
+        return rc;
+
+#ifdef __x86_64__
+    if ( is_pv_32on64_vcpu(v) )
+        mfn = l4e_get_pfn(*(l4_pgentry_t *)mfn_to_virt(mfn));
+#endif
+
+    if ( mfn )
+    {
+        page = mfn_to_page(mfn);
+        if ( paging_mode_refcounts(v->domain) )
+            put_page(page);
+        else
+            rc = put_page_and_type_preemptible(page, 1);
+    }
+
+#ifdef __x86_64__
+    if ( is_pv_32on64_vcpu(v) )
+    {
+        if ( !rc )
+            l4e_write(
+                (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
+                l4e_empty());
+    }
+    else
+#endif
+    if ( !rc )
+    {
+        v->arch.guest_table = pagetable_null();
+
+#ifdef __x86_64__
+        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+        mfn = pagetable_get_pfn(v->arch.guest_table_user);
+        if ( mfn )
+        {
+            page = mfn_to_page(mfn);
+            if ( paging_mode_refcounts(v->domain) )
+                put_page(page);
+            else
+                rc = put_page_and_type_preemptible(page, 1);
+        }
+        if ( !rc )
+            v->arch.guest_table_user = pagetable_null();
+#endif
+    }
+
+    v->arch.cr3 = 0;
+
+    return rc;
+}
 
 int new_guest_cr3(unsigned long mfn)
 {
     struct vcpu *curr = current;
     struct domain *d = curr->domain;
-    int okay;
+    int rc;
     unsigned long old_base_mfn;
 
 #ifdef __x86_64__
     if ( is_pv_32on64_domain(d) )
     {
-        okay = paging_mode_refcounts(d)
-            ? 0 /* Old code was broken, but what should it be? */
-            : mod_l4_entry(
+        rc = paging_mode_refcounts(d)
+             ? -EINVAL /* Old code was broken, but what should it be? */
+             : mod_l4_entry(
                     __va(pagetable_get_paddr(curr->arch.guest_table)),
                     l4e_from_pfn(
                         mfn,
                         (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
-                    pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
-        if ( unlikely(!okay) )
+                    pagetable_get_pfn(curr->arch.guest_table), 0, 1, curr);
+        switch ( rc )
         {
+        case 0:
+            break;
+        case -EINTR:
+        case -EAGAIN:
+            return -EAGAIN;
+        default:
             MEM_LOG("Error while installing new compat baseptr %lx", mfn);
-            return 0;
+            return rc;
         }
 
         invalidate_shadow_ldt(curr, 0);
         write_ptbase(curr);
 
-        return 1;
+        return 0;
     }
 #endif
-    okay = paging_mode_refcounts(d)
-        ? get_page_from_pagenr(mfn, d)
-        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
-    if ( unlikely(!okay) )
+    rc = put_old_guest_table(curr);
+    if ( unlikely(rc) )
+        return rc;
+
+    old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
+    /*
+     * This is particularly important when getting restarted after the
+     * previous attempt got preempted in the put-old-MFN phase.
+     */
+    if ( old_base_mfn == mfn )
     {
-        MEM_LOG("Error while installing new baseptr %lx", mfn);
+        write_ptbase(curr);
         return 0;
     }
 
-    invalidate_shadow_ldt(curr, 0);
+    rc = paging_mode_refcounts(d)
+         ? (get_page_from_pagenr(mfn, d) ? 0 : -EINVAL)
+         : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 1);
+    switch ( rc )
+    {
+    case 0:
+        break;
+    case -EINTR:
+    case -EAGAIN:
+        return -EAGAIN;
+    default:
+        MEM_LOG("Error while installing new baseptr %lx", mfn);
+        return rc;
+    }
 
-    old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
+    invalidate_shadow_ldt(curr, 0);
 
     curr->arch.guest_table = pagetable_from_pfn(mfn);
     update_cr3(curr);
@@ -2776,13 +2911,25 @@ int new_guest_cr3(unsigned long mfn)
 
     if ( likely(old_base_mfn != 0) )
     {
+        struct page_info *page = mfn_to_page(old_base_mfn);
+
         if ( paging_mode_refcounts(d) )
-            put_page(mfn_to_page(old_base_mfn));
+            put_page(page);
         else
-            put_page_and_type(mfn_to_page(old_base_mfn));
+            switch ( rc = put_page_and_type_preemptible(page, 1) )
+            {
+            case -EINTR:
+                rc = -EAGAIN;
+            case -EAGAIN:
+                curr->arch.old_guest_table = page;
+                break;
+            default:
+                BUG_ON(rc);
+                break;
+            }
     }
 
-    return 1;
+    return rc;
 }
 
 static struct domain *get_pg_owner(domid_t domid)
@@ -2911,12 +3058,29 @@ long do_mmuext_op(
     unsigned int foreigndom)
 {
     struct mmuext_op op;
-    int rc = 0, i = 0, okay;
     unsigned long type;
-    unsigned int done = 0;
+    unsigned int i = 0, done = 0;
     struct vcpu *curr = current;
     struct domain *d = curr->domain;
     struct domain *pg_owner;
+    int okay, rc = put_old_guest_table(curr);
+
+    if ( unlikely(rc) )
+    {
+        if ( likely(rc == -EAGAIN) )
+            rc = hypercall_create_continuation(
+                     __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
+                     foreigndom);
+        return rc;
+    }
+
+    if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+         likely(guest_handle_is_null(uops)) )
+    {
+        /* See the curr->arch.old_guest_table related
+         * hypercall_create_continuation() below. */
+        return (int)foreigndom;
+    }
 
     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
     {
@@ -2941,7 +3105,7 @@ long do_mmuext_op(
 
     for ( i = 0; i < count; i++ )
     {
-        if ( hypercall_preempt_check() )
+        if ( curr->arch.old_guest_table || hypercall_preempt_check() )
         {
             rc = -EAGAIN;
             break;
@@ -3001,21 +3165,17 @@ long do_mmuext_op(
             page = mfn_to_page(mfn);
 
             if ( (rc = xsm_memory_pin_page(d, page)) != 0 )
-            {
-                put_page_and_type(page);
                 okay = 0;
-                break;
-            }
-
-            if ( unlikely(test_and_set_bit(_PGT_pinned,
-                                           &page->u.inuse.type_info)) )
+            else if ( unlikely(test_and_set_bit(_PGT_pinned,
+                                                &page->u.inuse.type_info)) )
             {
                 MEM_LOG("Mfn %lx already pinned", mfn);
-                put_page_and_type(page);
                 okay = 0;
-                break;
             }
 
+            if ( unlikely(!okay) )
+                goto pin_drop;
+
             /* A page is dirtied when its pin status is set. */
             paging_mark_dirty(pg_owner, mfn);
            
@@ -3029,7 +3189,13 @@ long do_mmuext_op(
                                                &page->u.inuse.type_info));
                 spin_unlock(&pg_owner->page_alloc_lock);
                 if ( drop_ref )
-                    put_page_and_type(page);
+                {
+        pin_drop:
+                    if ( type == PGT_l1_page_table )
+                        put_page_and_type(page);
+                    else
+                        curr->arch.old_guest_table = page;
+                }
             }
 
             break;
@@ -3059,7 +3225,17 @@ long do_mmuext_op(
                 break;
             }
 
-            put_page_and_type(page);
+            switch ( rc = put_page_and_type_preemptible(page, 1) )
+            {
+            case -EINTR:
+            case -EAGAIN:
+                curr->arch.old_guest_table = page;
+                rc = 0;
+                break;
+            default:
+                BUG_ON(rc);
+                break;
+            }
             put_page(page);
 
             /* A page is dirtied when its pin status is cleared. */
@@ -3069,7 +3245,8 @@ long do_mmuext_op(
         }
 
         case MMUEXT_NEW_BASEPTR:
-            okay = new_guest_cr3(gmfn_to_mfn(d, op.arg1.mfn));
+            rc = new_guest_cr3(gmfn_to_mfn(d, op.arg1.mfn));
+            okay = !rc;
             break;
         
 #ifdef __x86_64__
@@ -3077,29 +3254,55 @@ long do_mmuext_op(
             unsigned long old_mfn, mfn;
 
             mfn = gmfn_to_mfn(d, op.arg1.mfn);
+            old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
+            /*
+             * This is particularly important when getting restarted after the
+             * previous attempt got preempted in the put-old-MFN phase.
+             */
+            if ( old_mfn == mfn )
+                break;
+
             if ( mfn != 0 )
             {
                 if ( paging_mode_refcounts(d) )
                     okay = get_page_from_pagenr(mfn, d);
                 else
-                    okay = !get_page_and_type_from_pagenr(
-                        mfn, PGT_root_page_table, d, 0, 0);
+                {
+                    rc = get_page_and_type_from_pagenr(
+                        mfn, PGT_root_page_table, d, 0, 1);
+                    okay = !rc;
+                }
                 if ( unlikely(!okay) )
                 {
-                    MEM_LOG("Error while installing new mfn %lx", mfn);
+                    if ( rc == -EINTR )
+                        rc = -EAGAIN;
+                    else if ( rc != -EAGAIN )
+                        MEM_LOG("Error while installing new mfn %lx", mfn);
                     break;
                 }
             }
 
-            old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
             curr->arch.guest_table_user = pagetable_from_pfn(mfn);
 
             if ( old_mfn != 0 )
             {
+                struct page_info *page = mfn_to_page(old_mfn);
+
                 if ( paging_mode_refcounts(d) )
-                    put_page(mfn_to_page(old_mfn));
+                    put_page(page);
                 else
-                    put_page_and_type(mfn_to_page(old_mfn));
+                    switch ( rc = put_page_and_type_preemptible(page, 1) )
+                    {
+                    case -EINTR:
+                        rc = -EAGAIN;
+                    case -EAGAIN:
+                        curr->arch.old_guest_table = page;
+                        okay = 0;
+                        break;
+                    default:
+                        BUG_ON(rc);
+                        break;
+                    }
             }
 
             break;
@@ -3338,9 +3541,27 @@ long do_mmuext_op(
     }
 
     if ( rc == -EAGAIN )
+    {
+        ASSERT(i < count);
         rc = hypercall_create_continuation(
             __HYPERVISOR_mmuext_op, "hihi",
             uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+    }
+    else if ( curr->arch.old_guest_table )
+    {
+        XEN_GUEST_HANDLE(void) null;
+
+        ASSERT(rc || i == count);
+        set_xen_guest_handle(null, NULL);
+        /*
+         * In order to have a way to communicate the final return value to
+         * our continuation, we pass this in place of "foreigndom", building
+         * on the fact that this argument isn't needed anymore.
+         */
+        rc = hypercall_create_continuation(
+                __HYPERVISOR_mmuext_op, "hihi", null,
+                MMU_UPDATE_PREEMPTED, null, rc);
+    }
 
     put_pg_owner(pg_owner);
 
@@ -3367,11 +3588,28 @@ long do_mmu_update(
     void *va;
     unsigned long gpfn, gmfn, mfn;
     struct page_info *page;
-    int rc = 0, okay = 1, i = 0;
-    unsigned int cmd, done = 0, pt_dom;
-    struct vcpu *v = current;
+    unsigned int cmd, i = 0, done = 0, pt_dom;
+    struct vcpu *curr = current, *v = curr;
     struct domain *d = v->domain, *pt_owner = d, *pg_owner;
     struct domain_mmap_cache mapcache;
+    int rc = put_old_guest_table(curr), okay = 1;
+
+    if ( unlikely(rc) )
+    {
+        if ( likely(rc == -EAGAIN) )
+            rc = hypercall_create_continuation(
+                     __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
+                     foreigndom);
+        return rc;
+    }
+
+    if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+         likely(guest_handle_is_null(ureqs)) )
+    {
+        /* See the curr->arch.old_guest_table related
+         * hypercall_create_continuation() below. */
+        return (int)foreigndom;
+    }
 
     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
     {
@@ -3420,7 +3658,7 @@ long do_mmu_update(
 
     for ( i = 0; i < count; i++ )
     {
-        if ( hypercall_preempt_check() )
+        if ( curr->arch.old_guest_table || hypercall_preempt_check() )
         {
             rc = -EAGAIN;
             break;
@@ -3685,9 +3923,27 @@ long do_mmu_update(
     }
 
     if ( rc == -EAGAIN )
+    {
+        ASSERT(i < count);
         rc = hypercall_create_continuation(
             __HYPERVISOR_mmu_update, "hihi",
             ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+    }
+    else if ( curr->arch.old_guest_table )
+    {
+        XEN_GUEST_HANDLE(void) null;
+
+        ASSERT(rc || i == count);
+        set_xen_guest_handle(null, NULL);
+        /*
+         * In order to have a way to communicate the final return value to
+         * our continuation, we pass this in place of "foreigndom", building
+         * on the fact that this argument isn't needed anymore.
+         */
+        rc = hypercall_create_continuation(
+                __HYPERVISOR_mmu_update, "hihi", null,
+                MMU_UPDATE_PREEMPTED, null, rc);
+    }
 
     put_pg_owner(pg_owner);
 
