$NetBSD: patch-XSA348,v 1.1 2020/12/16 17:15:22 bouyer Exp $ 

From: Jan Beulich <jbeulich@suse.com>
Subject: x86: replace reset_stack_and_jump_nolp()

Move the necessary check into check_for_livepatch_work() itself, rather
than mostly duplicating reset_stack_and_jump() for this purpose. This
prevents a proliferation of reset_stack_and_jump() flavors.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
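
For illustration only (not part of the patch): with the _nolp flavor
gone, every user goes through the one remaining macro. On x86-64,
where __OP expands to "r" and CONFIG_LIVEPATCH makes
CHECK_FOR_LIVEPATCH_WORK a call, reset_stack_and_jump(fn) emits
roughly:

    asm volatile (
        "mov %0, %%rsp;"                 /* reset to the stack top  */
        "call check_for_livepatch_work;" /* now self-guarding, see  */
                                         /* the livepatch.c hunk    */
        "jmp %c1"                        /* tail-jump to fn         */
        : : "r" (guest_cpu_user_regs()), "i" (fn) : "memory" );

The guard added at the top of check_for_livepatch_work() turns the
call into a no-op in exactly the contexts that previously had to use
reset_stack_and_jump_nolp().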

--- xen/arch/x86/domain.c.orig	2020-10-30 17:22:39.000000000 +0100
+++ xen/arch/x86/domain.c	2020-11-10 17:51:10.894525721 +0100
@@ -192,7 +192,7 @@ static void noreturn continue_idle_domai
 {
     /* Idle vcpus might be attached to non-idle units! */
     if ( !is_idle_domain(v->sched_unit->domain) )
-        reset_stack_and_jump_nolp(guest_idle_loop);
+        reset_stack_and_jump(guest_idle_loop);
 
     reset_stack_and_jump(idle_loop);
 }
--- xen/arch/x86/hvm/svm/svm.c.orig	2020-10-30 17:22:39.000000000 +0100
+++ xen/arch/x86/hvm/svm/svm.c	2020-11-10 17:51:10.898525723 +0100
@@ -1032,7 +1032,7 @@ static void noreturn svm_do_resume(struc
 
     hvm_do_resume(v);
 
-    reset_stack_and_jump_nolp(svm_asm_do_resume);
+    reset_stack_and_jump(svm_asm_do_resume);
 }
 
 void svm_vmenter_helper(const struct cpu_user_regs *regs)
--- xen/arch/x86/hvm/vmx/vmcs.c.orig	2020-05-18 18:53:09.000000000 +0200
+++ xen/arch/x86/hvm/vmx/vmcs.c	2020-11-10 17:51:10.898525723 +0100
@@ -1889,7 +1889,7 @@ void vmx_do_resume(struct vcpu *v)
     if ( host_cr4 != read_cr4() )
         __vmwrite(HOST_CR4, read_cr4());
 
-    reset_stack_and_jump_nolp(vmx_asm_do_vmentry);
+    reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
 static inline unsigned long vmr(unsigned long field)
--- xen/arch/x86/pv/domain.c.orig	2020-10-30 17:22:39.000000000 +0100
+++ xen/arch/x86/pv/domain.c	2020-11-10 17:51:10.898525723 +0100
@@ -61,7 +61,7 @@ custom_runtime_param("pcid", parse_pcid)
 static void noreturn continue_nonidle_domain(struct vcpu *v)
 {
     check_wakeup_from_wait();
-    reset_stack_and_jump_nolp(ret_from_intr);
+    reset_stack_and_jump(ret_from_intr);
 }
 
 static int setup_compat_l4(struct vcpu *v)
--- xen/arch/x86/setup.c.orig	2020-05-18 18:53:09.000000000 +0200
+++ xen/arch/x86/setup.c	2020-11-10 17:51:10.898525723 +0100
@@ -631,7 +631,7 @@ static void __init noreturn reinit_bsp_s
     stack_base[0] = stack;
     memguard_guard_stack(stack);
 
-    reset_stack_and_jump_nolp(init_done);
+    reset_stack_and_jump(init_done);
 }
 
 /*
--- xen/common/livepatch.c.orig	2020-05-18 18:53:09.000000000 +0200
+++ xen/common/livepatch.c	2020-11-10 17:51:10.898525723 +0100
@@ -1300,6 +1300,11 @@ void check_for_livepatch_work(void)
     s_time_t timeout;
     unsigned long flags;
 
+    /* Only do any work when invoked in truly idle state. */
+    if ( system_state != SYS_STATE_active ||
+         !is_idle_domain(current->sched_unit->domain) )
+        return;
+
     /* Fast path: no work to do. */
     if ( !per_cpu(work_to_do, cpu ) )
         return;
--- xen/include/asm-x86/current.h.orig	2019-12-18 16:18:59.000000000 +0100
+++ xen/include/asm-x86/current.h	2020-11-10 17:51:10.902525725 +0100
@@ -129,22 +129,16 @@ unsigned long get_stack_dump_bottom (uns
 # define CHECK_FOR_LIVEPATCH_WORK ""
 #endif
 
-#define switch_stack_and_jump(fn, instr)                                \
+#define reset_stack_and_jump(fn)                                        \
     ({                                                                  \
         __asm__ __volatile__ (                                          \
             "mov %0,%%"__OP"sp;"                                        \
-            instr                                                       \
+            CHECK_FOR_LIVEPATCH_WORK                                    \
              "jmp %c1"                                                  \
             : : "r" (guest_cpu_user_regs()), "i" (fn) : "memory" );     \
         unreachable();                                                  \
     })
 
-#define reset_stack_and_jump(fn)                                        \
-    switch_stack_and_jump(fn, CHECK_FOR_LIVEPATCH_WORK)
-
-#define reset_stack_and_jump_nolp(fn)                                   \
-    switch_stack_and_jump(fn, "")
-
 /*
  * Which VCPU's state is currently running on each CPU?
  * This is not necesasrily the same as 'current' as a CPU may be
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: fold guest_idle_loop() into idle_loop()

The latter can easily be made to cover both cases. This is in
preparation for using idle_loop directly to populate idle_csw.tail.

While at it, also adjust indentation / spacing in the involved code.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
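
For illustration only (not part of the patch): with the diff markers
stripped, the folded loop in the hunk below reads as follows, keyed
off a single flag computed on entry:

    static void idle_loop(void)
    {
        unsigned int cpu = smp_processor_id();
        /* Idle vcpus might be attached to non-idle units! */
        bool guest = !is_idle_domain(current->sched_unit->domain);

        for ( ; ; )
        {
            if ( cpu_is_offline(cpu) )
            {
                ASSERT(!guest);          /* guest units never offline */
                play_dead();
            }

            if ( !guest && unlikely(tasklet_work_to_do(cpu)) )
                do_tasklet();            /* may kick off livepatching */
            else if ( !softirq_pending(cpu) && !scrub_free_pages() &&
                      !softirq_pending(cpu) )  /* re-check: scrubbing
                                                  can take a while    */
            {
                if ( guest )
                    sched_guest_idle(pm_idle, cpu);
                else
                    pm_idle();
            }
            do_softirq();
        }
    }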

--- xen/arch/x86/domain.c.orig	2020-11-10 17:51:10.894525721 +0100
+++ xen/arch/x86/domain.c	2020-11-10 17:51:46.354546349 +0100
@@ -133,14 +133,22 @@ void play_dead(void)
 static void idle_loop(void)
 {
     unsigned int cpu = smp_processor_id();
+    /*
+     * Idle vcpus might be attached to non-idle units! We don't do any
+     * standard idle work like tasklets or livepatching in this case.
+     */
+    bool guest = !is_idle_domain(current->sched_unit->domain);
 
     for ( ; ; )
     {
         if ( cpu_is_offline(cpu) )
+        {
+            ASSERT(!guest);
             play_dead();
+        }
 
         /* Are we here for running vcpu context tasklets, or for idling? */
-        if ( unlikely(tasklet_work_to_do(cpu)) )
+        if ( !guest && unlikely(tasklet_work_to_do(cpu)) )
         {
             do_tasklet();
             /* Livepatch work is always kicked off via a tasklet. */
@@ -151,28 +159,14 @@ static void idle_loop(void)
          * and then, after it is done, whether softirqs became pending
          * while we were scrubbing.
          */
-        else if ( !softirq_pending(cpu) && !scrub_free_pages()  &&
-                    !softirq_pending(cpu) )
-            pm_idle();
-        do_softirq();
-    }
-}
-
-/*
- * Idle loop for siblings in active schedule units.
- * We don't do any standard idle work like tasklets or livepatching.
- */
-static void guest_idle_loop(void)
-{
-    unsigned int cpu = smp_processor_id();
-
-    for ( ; ; )
-    {
-        ASSERT(!cpu_is_offline(cpu));
-
-        if ( !softirq_pending(cpu) && !scrub_free_pages() &&
-             !softirq_pending(cpu))
-            sched_guest_idle(pm_idle, cpu);
+        else if ( !softirq_pending(cpu) && !scrub_free_pages() &&
+                  !softirq_pending(cpu) )
+        {
+            if ( guest )
+                sched_guest_idle(pm_idle, cpu);
+            else
+                pm_idle();
+        }
         do_softirq();
     }
 }
@@ -190,10 +184,6 @@ void startup_cpu_idle_loop(void)
 
 static void noreturn continue_idle_domain(struct vcpu *v)
 {
-    /* Idle vcpus might be attached to non-idle units! */
-    if ( !is_idle_domain(v->sched_unit->domain) )
-        reset_stack_and_jump(guest_idle_loop);
-
     reset_stack_and_jump(idle_loop);
 }
 
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: avoid calling {svm,vmx}_do_resume()

These functions may invoke themselves recursively, via the path
hvm_do_resume() -> handle_hvm_io_completion() -> hvm_wait_for_io() ->
wait_on_xen_event_channel() -> do_softirq() -> schedule() ->
sched_context_switch() -> continue_running(). If this happens often
enough, a stack overflow results.

Prevent this by also resetting the stack at the
->arch.ctxt_switch->tail() invocations (in both places for consistency)
and thus jumping to the functions instead of calling them.

This is XSA-348 / CVE-2020-29566.

Reported-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
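
For illustration only (not part of the patch): reset_stack_and_jump()
cannot be used here because ->tail is a function pointer, not a
link-time constant, so the patch introduces an indirect flavor. On
x86-64 it expands to roughly:

    /* Sketch of reset_stack_and_jump_ind(fn): the "b" constraint
       pins fn into %rbx, a call-preserved register, so the value
       survives the check_for_livepatch_work() call that follows the
       stack switch (hence the comment in the current.h hunk below). */
    asm volatile (
        "mov %0, %%rsp;"
        "call check_for_livepatch_work;"
        "INDIRECT_JMP %1"  /* Xen's speculation-hardened indirect jmp */
        : : "r" (guest_cpu_user_regs()), "b" (fn) : "memory" );

Since context_switch() and continue_running() now jump to the tail
hooks instead of calling them, every pass through the path above
restarts from the top of the stack and frames can no longer pile up.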

--- xen/arch/x86/domain.c.orig	2020-11-10 17:51:46.354546349 +0100
+++ xen/arch/x86/domain.c	2020-11-10 17:56:58.758730088 +0100
@@ -130,7 +130,7 @@ void play_dead(void)
         dead_idle();
 }
 
-static void idle_loop(void)
+static void noreturn idle_loop(void)
 {
     unsigned int cpu = smp_processor_id();
     /*
@@ -182,11 +182,6 @@ void startup_cpu_idle_loop(void)
     reset_stack_and_jump(idle_loop);
 }
 
-static void noreturn continue_idle_domain(struct vcpu *v)
-{
-    reset_stack_and_jump(idle_loop);
-}
-
 void init_hypercall_page(struct domain *d, void *ptr)
 {
     memset(ptr, 0xcc, PAGE_SIZE);
@@ -535,7 +530,7 @@ int arch_domain_create(struct domain *d,
         static const struct arch_csw idle_csw = {
             .from = paravirt_ctxt_switch_from,
             .to   = paravirt_ctxt_switch_to,
-            .tail = continue_idle_domain,
+            .tail = idle_loop,
         };
 
         d->arch.ctxt_switch = &idle_csw;
@@ -1833,20 +1828,12 @@ void context_switch(struct vcpu *prev, s
     /* Ensure that the vcpu has an up-to-date time base. */
     update_vcpu_system_time(next);
 
-    /*
-     * Schedule tail *should* be a terminal function pointer, but leave a
-     * bug frame around just in case it returns, to save going back into the
-     * context switching code and leaving a far more subtle crash to diagnose.
-     */
-    nextd->arch.ctxt_switch->tail(next);
-    BUG();
+    reset_stack_and_jump_ind(nextd->arch.ctxt_switch->tail);
 }
 
 void continue_running(struct vcpu *same)
 {
-    /* See the comment above. */
-    same->domain->arch.ctxt_switch->tail(same);
-    BUG();
+    reset_stack_and_jump_ind(same->domain->arch.ctxt_switch->tail);
 }
 
 int __sync_local_execstate(void)
--- xen/arch/x86/hvm/svm/svm.c.orig	2020-11-10 17:51:10.898525723 +0100
+++ xen/arch/x86/hvm/svm/svm.c	2020-11-10 17:56:58.762730090 +0100
@@ -987,8 +987,9 @@ static void svm_ctxt_switch_to(struct vc
         wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
 }
 
-static void noreturn svm_do_resume(struct vcpu *v)
+static void noreturn svm_do_resume(void)
 {
+    struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb;
     bool debug_state = (v->domain->debugger_attached ||
                         v->domain->arch.monitor.software_breakpoint_enabled ||
--- xen/arch/x86/hvm/vmx/vmcs.c.orig	2020-11-10 17:51:10.898525723 +0100
+++ xen/arch/x86/hvm/vmx/vmcs.c	2020-11-10 17:56:58.762730090 +0100
@@ -1830,8 +1830,9 @@ void vmx_vmentry_failure(void)
     domain_crash(curr->domain);
 }
 
-void vmx_do_resume(struct vcpu *v)
+void vmx_do_resume(void)
 {
+    struct vcpu *v = current;
     bool_t debug_state;
     unsigned long host_cr4;
 
--- xen/arch/x86/pv/domain.c.orig	2020-11-10 17:51:10.898525723 +0100
+++ xen/arch/x86/pv/domain.c	2020-11-10 17:56:58.762730090 +0100
@@ -58,7 +58,7 @@ static int parse_pcid(const char *s)
 }
 custom_runtime_param("pcid", parse_pcid);
 
-static void noreturn continue_nonidle_domain(struct vcpu *v)
+static void noreturn continue_nonidle_domain(void)
 {
     check_wakeup_from_wait();
     reset_stack_and_jump(ret_from_intr);
--- xen/include/asm-x86/current.h.orig	2020-11-10 17:51:10.902525725 +0100
+++ xen/include/asm-x86/current.h	2020-11-10 17:56:58.762730090 +0100
@@ -129,16 +129,23 @@ unsigned long get_stack_dump_bottom (uns
 # define CHECK_FOR_LIVEPATCH_WORK ""
 #endif
 
-#define reset_stack_and_jump(fn)                                        \
+#define switch_stack_and_jump(fn, instr, constr)                        \
     ({                                                                  \
         __asm__ __volatile__ (                                          \
             "mov %0,%%"__OP"sp;"                                        \
             CHECK_FOR_LIVEPATCH_WORK                                    \
-             "jmp %c1"                                                  \
-            : : "r" (guest_cpu_user_regs()), "i" (fn) : "memory" );     \
+            instr "1"                                                   \
+            : : "r" (guest_cpu_user_regs()), constr (fn) : "memory" );  \
         unreachable();                                                  \
     })
 
+#define reset_stack_and_jump(fn)                                        \
+    switch_stack_and_jump(fn, "jmp %c", "i")
+
+/* The constraint may only specify non-call-clobbered registers. */
+#define reset_stack_and_jump_ind(fn)                                    \
+    switch_stack_and_jump(fn, "INDIRECT_JMP %", "b")
+
 /*
  * Which VCPU's state is currently running on each CPU?
  * This is not necesasrily the same as 'current' as a CPU may be
--- xen/include/asm-x86/domain.h.orig	2020-10-30 17:22:39.000000000 +0100
+++ xen/include/asm-x86/domain.h	2020-11-10 17:56:58.762730090 +0100
@@ -313,7 +313,7 @@ struct arch_domain
     const struct arch_csw {
         void (*from)(struct vcpu *);
         void (*to)(struct vcpu *);
-        void (*tail)(struct vcpu *);
+        void noreturn (*tail)(void);
     } *ctxt_switch;
 
 #ifdef CONFIG_HVM
--- xen/include/asm-x86/hvm/vmx/vmx.h.orig	2019-12-18 16:18:59.000000000 +0100
+++ xen/include/asm-x86/hvm/vmx/vmx.h	2020-11-10 17:56:58.762730090 +0100
@@ -95,7 +95,7 @@ typedef enum {
 void vmx_asm_vmexit_handler(struct cpu_user_regs);
 void vmx_asm_do_vmentry(void);
 void vmx_intr_assist(void);
-void noreturn vmx_do_resume(struct vcpu *);
+void noreturn vmx_do_resume(void);
 void vmx_vlapic_msr_changed(struct vcpu *v);
 void vmx_realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt);
 void vmx_realmode(struct cpu_user_regs *regs);
