Work around the halt/wakeup race in barrier_halt_wait() (issue #13)

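There is an unavoidable race between one core halting after decrementing the barrier count and another core sending it the wakeup NMI. This can only occur if the core sending the wakeup is running at many times the speed of the core halting, but it has been observed on an Intel Ice Lake mobile processor. Work around it by coding the decrement-and-halt sequence in assembler and having the NMI handler detect when the wakeup arrived before the HLT was executed, in which case it adjusts the interrupt return address to skip the HLT.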
2025-02-25 18:55:23 -06:00 · 2022-03-08 23:14:26 +00:00 · 2022-03-08 23:14:26 +00:00 · f6da06b117
commit f6da06b117
parent 0076e63885
2 changed files with 48 additions and 5 deletions
--- a/app/interrupt.c
+++ b/app/interrupt.c
@ -10,6 +10,7 @@

 #include <stdint.h>

+#include "cpuid.h"
 #include "hwctrl.h"
 #include "keyboard.h"
 #include "screen.h"
@ -25,6 +26,7 @@
 //------------------------------------------------------------------------------

 #define HLT_OPCODE  0xf4
+#define JE_OPCODE   0x74

 #ifdef __x86_64__
 #define REG_PREFIX  "r"
@ -120,7 +122,27 @@ void interrupt(struct trap_regs *trap_regs)
    if (trap_regs->vect == 2) {
        uint8_t *pc = (uint8_t *)trap_regs->ip;
        if (pc[-1] == HLT_OPCODE) {
-            // Assume this is a wakeup signal sent via IPI.
+            // Assume this is a barrier wakeup signal sent via IPI.
+            return;
+        }
+        // Catch the rare case that a core will fail to reach the HLT instruction before
+        // its wakeup signal arrives. The barrier code contains an atomic decrement, a JE
+        // instruction (two bytes), and a HLT instruction (one byte). The atomic decrement
+        // must have completed if another core has reached the point of sending the wakeup
+        // signals, so we should find the HLT opcode either at pc[0] or at pc[2]. If we find
+        // it, adjust the interrupt return address to point to the following instruction.
+        if (pc[0] == HLT_OPCODE || (pc[0] == JE_OPCODE && pc[2] == HLT_OPCODE)) {
+            uintptr_t *return_addr;
+            if (cpuid_info.flags.lm == 1) {
+                return_addr = (uintptr_t *)(trap_regs->sp - 40);
+            } else {
+                return_addr = (uintptr_t *)(trap_regs->sp - 12);
+            }
+            if (pc[2] == HLT_OPCODE) {
+                *return_addr += 3;
+            } else {
+                *return_addr += 1;
+            }
            return;
        }
 #if REPORT_PARITY_ERRORS
--- a/lib/barrier.c
+++ b/lib/barrier.c
@ -66,10 +66,29 @@ void barrier_halt_wait(barrier_t *barrier)
    local_flag_t *waiting_flags = local_flags(barrier->flag_num);
    int my_cpu = smp_my_cpu_num();
    waiting_flags[my_cpu].flag = true;
-    if (__sync_sub_and_fetch(&barrier->count, 1) != 0) {
-        __asm__ __volatile__ ("hlt");
-        return;
-    }
+    //
+    // There is a small window of opportunity for the wakeup signal to arrive
+    // between us decrementing the barrier count and halting. So code the
+    // following in assembler, both to ensure the window of opportunity is as
+    // small as possible, and also to allow us to detect and skip over the
+    // halt in the interrupt handler.
+    //
+    // if (__sync_sub_and_fetch(&barrier->count, 1) != 0) {
+    //     __asm__ __volatile__ ("hlt");
+    //     return;
+    // }
+    //
+    __asm__ goto ("\t"
+        "lock decl %0 \n\t"
+        "je 0f        \n\t"
+        "hlt          \n\t"
+        "jmp %l[end]  \n"
+        "0:           \n"
+        : /* no outputs */
+        : "m" (barrier->count)
+        : /* no clobbers */
+        : end
+    );
    // Last one here, so reset the barrier and wake the others.
    barrier->count = barrier->num_threads;
    __sync_synchronize();
@ -80,4 +99,6 @@ void barrier_halt_wait(barrier_t *barrier)
            smp_send_nmi(cpu_num);
        }
    }
+end:
+    return;
 }