diff --git a/app/interrupt.c b/app/interrupt.c
index ab26782..2266a85 100644
--- a/app/interrupt.c
+++ b/app/interrupt.c
@@ -10,6 +10,7 @@
 
 #include <stdint.h>
 
+#include "cpuid.h"
 #include "hwctrl.h"
 #include "keyboard.h"
 #include "screen.h"
@@ -25,6 +26,7 @@
 //------------------------------------------------------------------------------
 
 #define HLT_OPCODE  0xf4
+#define JE_OPCODE   0x74
 
 #ifdef __x86_64__
 #define REG_PREFIX  "r"
@@ -120,7 +122,27 @@ void interrupt(struct trap_regs *trap_regs)
     if (trap_regs->vect == 2) {
         uint8_t *pc = (uint8_t *)trap_regs->ip;
         if (pc[-1] == HLT_OPCODE) {
-            // Assume this is a wakeup signal sent via IPI.
+            // Assume this is a barrier wakeup signal sent via IPI.
+            return;
+        }
+        // Catch the rare case where a core's wakeup signal arrives before the core has
+        // reached the HLT instruction. The barrier code contains an atomic decrement, a JE
+        // instruction (two bytes), and a HLT instruction (one byte). The atomic decrement
+        // must have completed if another core has reached the point of sending the wakeup
+        // signals, so we should find the HLT opcode either at pc[0] or at pc[2]. If we find
+        // it, adjust the interrupt return address to point to the instruction after the HLT.
+        if (pc[0] == HLT_OPCODE || (pc[0] == JE_OPCODE && pc[2] == HLT_OPCODE)) {
+            uintptr_t *return_addr;
+            if (cpuid_info.flags.lm == 1) {
+                return_addr = (uintptr_t *)(trap_regs->sp - 40);
+            } else {
+                return_addr = (uintptr_t *)(trap_regs->sp - 12);
+            }
+            if (pc[2] == HLT_OPCODE) {
+                *return_addr += 3;
+            } else {
+                *return_addr += 1;
+            }
             return;
         }
 #if REPORT_PARITY_ERRORS
diff --git a/lib/barrier.c b/lib/barrier.c
index 9a4983a..17b8eb8 100644
--- a/lib/barrier.c
+++ b/lib/barrier.c
@@ -66,10 +66,29 @@ void barrier_halt_wait(barrier_t *barrier)
     local_flag_t *waiting_flags = local_flags(barrier->flag_num);
     int my_cpu = smp_my_cpu_num();
     waiting_flags[my_cpu].flag = true;
-    if (__sync_sub_and_fetch(&barrier->count, 1) != 0) {
-        __asm__ __volatile__ ("hlt");
-        return;
-    }
+    //
+    // There is a small window of opportunity for the wakeup signal to arrive
+    // between us decrementing the barrier count and halting. So code the
+    // following in assembler, both to ensure the window of opportunity is as
+    // small as possible, and also to allow us to detect and skip over the
+    // halt in the interrupt handler.
+    //
+    // if (__sync_sub_and_fetch(&barrier->count, 1) != 0) {
+    //     __asm__ __volatile__ ("hlt");
+    //     return;
+    // }
+    //
+    __asm__ goto ("\t"
+        "lock decl %0 \n\t"
+        "je 0f        \n\t"
+        "hlt          \n\t"
+        "jmp %l[end]  \n"
+        "0:           \n"
+        : /* no outputs */
+        : "m" (barrier->count)
+        : /* no clobbers */
+        : end
+    );
     // Last one here, so reset the barrier and wake the others.
     barrier->count = barrier->num_threads;
     __sync_synchronize();
@@ -80,4 +99,6 @@ void barrier_halt_wait(barrier_t *barrier)
             smp_send_nmi(cpu_num);
         }
     }
+end:
+    return;
 }