memtest86plus/boot/startup64.S

// SPDX-License-Identifier: GPL-2.0
//
// startup64.S contains the 64-bit startup code for both the BSP and APs.
// It initialises stacks, memory management, and exception handling, clears
// the BSS, completes relocation, and finally calls the main application.
// It supports both the 32-bit and 64-bit Linux boot protocols and EFI boot
// for the first boot of the BSP.
//
// Copyright (C) 2020-2022 Martin Whitaker.
//
// Derived from memtest86+ head.S:
//
// linux/boot/head.S
// Copyright (C) 1991, 1992 Linus Torvalds
// 1-Jan-96 Modified by Chris Brady for use as a boot/loader for MemTest-86.
// Set up the memory management for flat non-paged linear addressing.
// 17 May 2004 : Added X86_PWRCAP for AMD64 (Memtest86+ - Samuel D.)
#define __ASSEMBLY__
#include "boot.h"
#define NUM_INT_VEC 20
.text
.code32
# The Linux 32-bit boot entry point.
.globl startup32
startup32:
cld
cli
# Get the load address.
movl 0x214(%esi), %ebx # bootparams.code32_start
# Save the boot params pointer.
movl %esi, (boot_params_addr - startup32)(%ebx)
# Use the startup stack until we pick the correct one.
leal (startup_stack_top - startup32)(%ebx), %esp
# Initialise the pml4 and pdp tables.
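# Each entry holds the physical address of the next-level table ORed with
# flag bits 0x3 (present + writable). Only the first pml4 entry is filled
# in; its pdp maps the first 4 GB via the four page directories.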
leal (pml4 - startup32)(%ebx), %ecx
leal (pdp - startup32)(%ebx), %edx
movl %edx, %eax
addl $0x3, %eax
movl %eax, 0(%ecx)
leal (pd0 - startup32)(%ebx), %eax
addl $0x3, %eax
movl %eax, 0(%edx)
leal (pd1 - startup32)(%ebx), %eax
addl $0x3, %eax
movl %eax, 8(%edx)
leal (pd2 - startup32)(%ebx), %eax
addl $0x3, %eax
movl %eax, 16(%edx)
leal (pd3 - startup32)(%ebx), %eax
addl $0x3, %eax
movl %eax, 24(%edx)
# Set the page directory base address.
movl %ecx, %cr3
# Enable PAE.
movl %cr4, %eax
orl $0x20, %eax
movl %eax, %cr4
# Enable long mode.
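# (0xc0000080 is the EFER MSR; setting bit 8 (LME) arms long mode, which
# takes effect when paging is enabled below.)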
movl $0xc0000080, %ecx
rdmsr
orl $0x00000100, %eax
wrmsr
# Enable paging and protection.
movl %cr0, %eax
orl $0x80000001, %eax
movl %eax, %cr0
# Initialise the 64-bit GDT descriptor.
leal (gdt - startup32)(%ebx), %eax
movl %eax, 2 + (gdt_descr - startup32)(%ebx)
# Load the GDT and enter long mode.
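# There is no instruction to load CS directly, so a far jump is used. The
# indirect form needs its offset:selector pair in memory, so it is
# assembled in the otherwise unused bytes just below the stack pointer.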
lgdt (gdt_descr - startup32)(%ebx)
leal (startup - startup32)(%ebx), %eax
movw $KERNEL_CS, -2(%esp)
movl %eax, -6(%esp)
ljmp *-6(%esp)
.code64
# The EFI PE32+ boot entry point.
.org 0x1e0
.globl efi_boot
efi_boot:
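# EFI transfers control using the Microsoft x64 calling convention, so the
# arguments arrive in %rcx and %rdx; move them to the System V registers
# used by the rest of the code.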
movq %rcx, %rdi # the EFI image handle
movq %rdx, %rsi # the EFI system table pointer
movq $0, %rdx # the boot params pointer (0 = not yet allocated)
jmp efi_handover
# The Linux 64-bit boot entry point.
.org 0x200
.globl startup64
startup64:
cld
cli
# Save the boot params pointer.
movq %rsi, boot_params_addr(%rip)
jmp startup
# The Linux 64-bit EFI handover point.
.org 0x210
.globl efi_handover
efi_handover:
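# Align the stack to 16 bytes, as required by the System V ABI, before
# calling into C code.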
andq $~0xf, %rsp
call efi_setup
# Save the boot params pointer.
movq %rax, boot_params_addr(%rip)
# The 64-bit entry point for AP boot and for restart after relocation.
.globl startup
startup:
# Use the startup stack until we pick the correct one. We
# need to take a mutex to protect our use of the stack.
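# ("lock bts" atomically tests and sets bit 0 of the mutex; if the bit was
# already set, another CPU is using the stack, so spin until it is cleared.)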
0: lock bts $0, startup_stack_mutex(%rip)
jc 0b
leaq startup_stack_top(%rip), %rsp
# Pick the correct stack. The stacks are allocated immediately
# after the end of the loaded program, BSP first, then APs.
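# The stack top for CPU n is therefore _end + BSP_STACK_SIZE + n * AP_STACK_SIZE,
# where the BSP is CPU 0.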
xorq %rax, %rax
call smp_my_pcpu_num
movl $AP_STACK_SIZE, %edx
mul %edx
addq $BSP_STACK_SIZE, %rax
leaq _end(%rip), %rsp
addq %rax, %rsp
# Release the mutex that protects the startup stack.
movl $0, startup_stack_mutex(%rip)
# Initialise the pml4 and pdp tables.
leaq pml4(%rip), %rcx
leaq pdp(%rip), %rdx
movq %rdx, %rax
addq $0x3, %rax
movq %rax, 0(%rcx)
leaq pd0(%rip), %rax
addq $0x3, %rax
movq %rax, 0(%rdx)
leaq pd1(%rip), %rax
addq $0x3, %rax
movq %rax, 8(%rdx)
leaq pd2(%rip), %rax
addq $0x3, %rax
movq %rax, 16(%rdx)
leaq pd3(%rip), %rax
addq $0x3, %rax
movq %rax, 24(%rdx)
# Set the page directory base address.
movq %rcx, %cr3
# Initialise the GDT descriptor.
leaq gdt(%rip), %rax
movq %rax, 2 + gdt_descr(%rip)
# Load the GDT and the segment registers.
lgdt gdt_descr(%rip)
leaq flush(%rip), %rax
movw $KERNEL_CS, -2(%rsp)
movl %eax, -6(%rsp)
ljmp *-6(%rsp)
flush: movw $KERNEL_DS, %ax
movw %ax, %ds
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
movw %ax, %ss
# Initialise the IDT.
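# Each IDT entry is a 16-byte 64-bit interrupt gate:
#   bytes 0-1   handler offset[15:0]
#   bytes 2-3   code segment selector (KERNEL_CS)
#   bytes 4-5   attributes 0x8E00 (present, DPL 0, interrupt gate, IST 0)
#   bytes 6-7   handler offset[31:16]
#   bytes 8-11  handler offset[63:32]
#   bytes 12-15 reserved (zero)
# The loop below builds one gate for each of the NUM_INT_VEC handlers.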
leaq idt(%rip), %rdi
leaq vec0(%rip), %rsi
movw $NUM_INT_VEC, %cx
0: movq %rsi, %rdx
movl $(KERNEL_CS << 16), %eax
movw %dx, %ax # selector = 0x0010 = cs
movw $0x8E00, %dx # interrupt gate - dpl=0, present
movl %eax, (%rdi)
movl %edx, 4(%rdi)
shrq $32, %rdx
movl %edx, 8(%rdi)
movl $0, 12(%rdi)
addq $(vec1-vec0), %rsi
addq $16, %rdi
dec %cx
jnz 0b
# Initialise the IDT descriptor.
leaq idt(%rip), %rax
movq %rax, 2 + idt_descr(%rip)
# Load the IDT.
lidt idt_descr(%rip)
# Zero the BSS (if first boot).
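# (The clear loop assumes the BSS size is a non-zero multiple of 8 bytes,
# which the linker script is expected to guarantee.)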
cmpl $1, first_boot(%rip)
jnz 1f
xorq %rax, %rax
leaq _bss(%rip), %rdi
leaq _end(%rip), %rcx
subq %rdi, %rcx
0: movq %rax, (%rdi)
addq $8, %rdi
subq $8, %rcx
jnz 0b
movl $0, first_boot(%rip)
1:
# Initialise the FPU.
finit
#if 0
# Enable SSE.
movq %cr0, %rax
andw $0xfffb, %ax # clear coprocessor emulation bit
orw $0x0002, %ax # set coprocessor monitoring bit
mov %rax, %cr0
movq %cr4, %rax
orw $0x0600, %ax # set OSFXSR and OSXMMEXCPT
movq %rax, %cr4
#endif
# Call the dynamic linker to fix up the addresses in the GOT.
call reloc
# Run the application.
call main
# In case we return, simulate an exception.
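# Build a minimal interrupt stack frame (RFLAGS, CS, and a return address
# pushed by the call below), then enter the common handler with 257 as an
# out-of-range vector number to mark this case.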
pushfq
xorq %rax, %rax
movw %cs, %ax
pushq %rax
call 0f
0: pushq $0 # error code
pushq $257 # vector
jmp int_handler
# Individual interrupt vector handlers. These need to be spaced equally, to
# allow the IDT initialisation loop above to work, so we use no-ops to pad
# out where required.
vec0:
pushq $0 # error code
pushq $0 # vector
jmp int_handler
vec1:
pushq $0 # error code
pushq $1 # vector
jmp int_handler
vec2:
pushq $0 # error code
pushq $2 # vector
jmp int_handler
vec3:
pushq $0 # error code
pushq $3 # vector
jmp int_handler
vec4:
pushq $0 # error code
pushq $4 # vector
jmp int_handler
vec5:
pushq $0 # error code
pushq $5 # vector
jmp int_handler
vec6:
pushq $0 # error code
pushq $6 # vector
jmp int_handler
vec7:
pushq $0 # error code
pushq $7 # vector
jmp int_handler
vec8:
nop;nop # error code already provided
pushq $8 # vector
jmp int_handler
vec9:
pushq $0 # error code
pushq $9 # vector
jmp int_handler
vec10:
nop;nop # error code already provided
pushq $10 # vector
jmp int_handler
vec11:
nop;nop # error code already provided
pushq $11 # vector
jmp int_handler
vec12:
nop;nop # error code already provided
pushq $12 # vector
jmp int_handler
vec13:
nop;nop # error code already provided
pushq $13 # vector
jmp int_handler
vec14:
nop;nop # error code already provided
pushq $14 # vector
jmp int_handler
vec15:
pushq $0 # error code
pushq $15 # vector
jmp int_handler
vec16:
pushq $0 # error code
pushq $16 # vector
jmp int_handler
vec17:
nop;nop # error code already provided
pushq $17 # vector
jmp int_handler
vec18:
pushq $0 # error code
pushq $18 # vector
jmp int_handler
vec19:
pushq $0 # error code
pushq $19 # vector
jmp int_handler
# The common interrupt handler code. Pass the register state to the
# application interrupt handler.
int_handler:
pushq %rax
pushq %rbx
pushq %rcx
pushq %rdx
pushq %rdi
pushq %rsi
pushq %rbp
# original stack pointer
leaq 96(%rsp), %rax
pushq %rax
xorq %rax, %rax
movw %ds, %ax
pushq %rax
movw %es, %ax
pushq %rax
movw %ss, %ax
pushq %rax
movq %rsp, %rdi # pointer to trap regs struct on the stack
call interrupt
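# Discard the ss, es, ds and stack-pointer words pushed above (4 x 8 bytes).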
addq $32, %rsp
popq %rbp
popq %rsi
popq %rdi
popq %rdx
popq %rcx
popq %rbx
popq %rax
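# Discard the vector number and error code before returning.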
addq $16, %rsp
iretq
# The interrupt descriptor table.
.align 4
.word 0 # for alignment
idt_descr:
.word idt_end - idt - 1 # size
.quad 0 # addr: filled in at run time
idt:
.fill NUM_INT_VEC*2, 8, 0 # filled in at run time
idt_end:
# The global descriptor table.
.word 0 # for alignment
gdt_descr:
.word gdt_end - gdt - 1 # size
.quad 0 # addr: filled in at run time
.align 4
.globl gdt
gdt:
.quad 0x0000000000000000 # NULL descriptor
.quad 0x0000000000000000 # not used
.quad 0x00209a0000000000 # 0x10 64-bit code at 0x000000
.quad 0x0000920000000000 # 0x18 64-bit data at 0x000000
.globl gdt_end
gdt_end:
.data
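# Each ptes64 invocation emits 64 x 8 = 512 page directory entries, mapping
# a contiguous 1 GB region with 2 MB pages. The 0x83 flag bits are
# present + writable + page size (2 MB).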
.macro ptes64 start, count=64
.quad \start + 0x0000000 + 0x83
.quad \start + 0x0200000 + 0x83
.quad \start + 0x0400000 + 0x83
.quad \start + 0x0600000 + 0x83
.quad \start + 0x0800000 + 0x83
.quad \start + 0x0A00000 + 0x83
.quad \start + 0x0C00000 + 0x83
.quad \start + 0x0E00000 + 0x83
.if \count-1
ptes64 "(\start+0x01000000)",\count-1
.endif
.endm
.macro maxdepth depth=1
.if \depth-1
maxdepth \depth-1
.endif
.endm
maxdepth
# The level 4 page map table.
.align 4096
.globl pml4
pml4:
.quad 0 # filled in at run time
# Page Directory Pointer Table:
# 4 Entries, pointing to the Page Directory Tables.
.align 4096
.globl pdp
pdp:
.quad 0 # filled in at run time
.quad 0 # filled in at run time
.quad 0 # filled in at run time
.quad 0 # filled in at run time
# Page Directory Tables:
# There are 4 tables, each mapping 1 GB of memory with 2 MB pages, so only
# the Page Directory Tables are needed (no page tables). As assembled they
# identity-map the first 4 GB. The third is remapped at run time to access
# the rest of memory in 1 GB segments. The fourth is reserved for mapping
# the video frame buffer.
.align 4096
.globl pd0
pd0:
ptes64 0x0000000000000000
.align 4096
.globl pd1
pd1:
ptes64 0x0000000040000000
.align 4096
.globl pd2
pd2:
ptes64 0x0000000080000000
.align 4096
.globl pd3
pd3:
ptes64 0x00000000C0000000
.previous
# ap_trampoline is the entry point for CPUs other than the bootstrap
# CPU (BSP). It gets copied to a page in low memory, to enable the APs
# to boot when the main program has been loaded in high memory.
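# APs start executing in real mode at the address given by the startup IPI
# vector, which is why this code is 16-bit and must be placed below 1 MB.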
.code16
.align 4
.globl ap_trampoline
ap_trampoline:
movw %cs, %ax
movw %ax, %ds
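# ap_startup_addr holds the run-time address of the 64-bit startup entry
# point; the GDT and page table addresses are derived from it below using
# their link-time offsets from startup.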
# Patch the jump address.
movl (ap_startup_addr - ap_trampoline), %ebx
movl %ebx, (ap_jump - ap_trampoline + 2)
# Patch and load the GDT descriptor. It should point to the main GDT,
# which has already been initialised by the BSP.
movl %ebx, %eax
addl $(gdt - startup), %eax
movl %eax, (ap_gdt_descr - ap_trampoline + 2)
lgdt ap_gdt_descr - ap_trampoline
# Set the page directory base address.
movl %ebx, %eax
addl $(pml4 - startup), %eax
movl %eax, %cr3
# Enable PAE.
movl %cr4, %eax
orl $0x20, %eax
movl %eax, %cr4
# Enable long mode.
movl $0xc0000080, %ecx
rdmsr
orl $0x00000100, %eax
wrmsr
# Enable paging and protection.
movl %cr0, %eax
orl $0x80000001, %eax
movl %eax, %cr0
# Jump to the 64-bit entry point.
ap_jump:
data32 ljmp $KERNEL_CS, $0
.align 4
.word 0 # for alignment
ap_gdt_descr:
.word gdt_end - gdt - 1 # gdt limit
.long 0 # gdt base - filled in at run time
.globl ap_startup_addr
ap_startup_addr:
.long 0 # filled in at run time
.globl ap_trampoline_end
ap_trampoline_end:
.previous
# Variables.
.data
.align 4
.globl boot_params_addr
boot_params_addr:
.quad 0
startup_stack_mutex:
.long 0
first_boot:
.long 1
.previous
# Startup stack.
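# This small stack is shared by all CPUs and is only used between taking
# startup_stack_mutex and switching to the per-CPU stack in startup above.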
.bss
.align 16
startup_stack_base:
. = . + 64
startup_stack_top:
.previous