memtest86plus/boot/startup64.S
Lionel Debroux 71c7bdfc00 WIP BROKEN NX enablement, for now only for the second page directory.
TODO:
    * selective NX enablement on pd0, pd1 and pd3.
      Unconditional NX on the whole pd3 makes memtest86+ reboot in a QEMU-emulated computer.
    * if NX is supported on all x86_64 CPUs, simply enable long mode and NX simultaneously? A real K8 dual-core processor, at least, did not seem to object.
    * startup code: NX enablement for x86, on capable computers (CPUID 0x80000001, edx bit 20).
    * set the appropriate flag in the headers.
2023-10-15 21:34:00 +02:00

655 lines
12 KiB
ArmAsm

// SPDX-License-Identifier: GPL-2.0
//
// startup64.S contains the 64-bit startup code for both the BSP and APs.
// It initialises stacks, memory management, and exception handling, clears
// the BSS, completes relocation, and finally calls the main application.
// It supports both the 32-bit and 64-bit Linux boot protocols and EFI boot
// for the first boot of the BSP.
//
// Copyright (C) 2020-2022 Martin Whitaker.
//
// Derived from memtest86+ head.S:
//
// linux/boot/head.S
// Copyright (C) 1991, 1992 Linus Torvalds
// 1-Jan-96 Modified by Chris Brady for use as a boot/loader for MemTest-86.
// Set up the memory management for flat non-paged linear addressing.
// 17 May 2004 : Added X86_PWRCAP for AMD64 (Memtest86+ - Samuel D.)
#define __ASSEMBLY__
#include "boot.h"
#define NUM_INT_VEC 20
.text
.code32
# The Linux 32-bit boot entry point.
#
# On entry %esi holds the boot params pointer. The load address is read from
# that structure and kept in %ebx, and every symbol is addressed as
# (symbol - startup32)(%ebx), so the code does not depend on where the image
# was loaded. The routine fills in the pml4 and pdp tables, switches the CPU
# into long mode and far-jumps to startup.
.globl startup32
startup32:
cld
cli
# Get the load address.
movl 0x214(%esi), %ebx # bootparams.code32_start
# Save the boot params pointer.
movl %esi, (boot_params_addr - startup32)(%ebx)
# Use the startup stack until we pick the correct one.
leal (startup_stack_top - startup32)(%ebx), %esp
# Initialise the pml4 and pdp tables. Each entry is the address of the next
# level table plus 0x3 (present + writable); the tables are 4096-byte aligned,
# so the addition cannot carry into the address bits. Only the low 32 bits of
# each 8-byte entry are written here; the high halves keep their assembled
# value of zero.
leal (pml4 - startup32)(%ebx), %ecx
leal (pdp - startup32)(%ebx), %edx
movl %edx, %eax
addl $0x3, %eax
movl %eax, 0(%ecx)
leal (pd0 - startup32)(%ebx), %eax
addl $0x3, %eax
movl %eax, 0(%edx)
leal (pd1 - startup32)(%ebx), %eax
addl $0x3, %eax
movl %eax, 8(%edx)
leal (pd2 - startup32)(%ebx), %eax
addl $0x3, %eax
movl %eax, 16(%edx)
leal (pd3 - startup32)(%ebx), %eax
addl $0x3, %eax
movl %eax, 24(%edx)
# Set the page directory base address.
movl %ecx, %cr3
# Enable PAE (CR4 bit 5).
movl %cr4, %eax
orb $0x20, %al
movl %eax, %cr4
# Enable long mode: set LME (bit 8) in the EFER MSR (0xc0000080).
movl $0xc0000080, %ecx
rdmsr
orb $0x01, %ah
wrmsr
# Enable paging and protection (CR0.PG and CR0.PE).
# NOTE(review): the pd2 entries are assembled with bit 63 set, which is a
# reserved bit until EFER.NXE is enabled. This path turns paging on without
# NXE, so nothing may touch the pd2 address range before NXE has been set -
# confirm that this holds.
movl %cr0, %eax
orl $0x80000001, %eax
movl %eax, %cr0
# Initialise the 64-bit GDT descriptor. Only the low 32 bits of its base
# field are written; the upper half keeps its assembled value of zero.
leal (gdt - startup32)(%ebx), %eax
movl %eax, 2 + (gdt_descr - startup32)(%ebx)
# Load the GDT and enter long mode. The far jump goes through a 6-byte
# pointer (32-bit offset, 16-bit selector) built just below the stack pointer.
lgdt (gdt_descr - startup32)(%ebx)
leal (startup - startup32)(%ebx), %eax
movw $KERNEL_CS, -2(%esp)
movl %eax, -6(%esp)
ljmp *-6(%esp)
.code64
# EFI PE32+ boot entry point, pinned at offset 0x1e0.
#
# Arrives with the EFI image handle in %rcx and the EFI system table pointer
# in %rdx. They are moved into %rdi and %rsi, %rdx is cleared to signal that
# no boot params structure has been allocated yet, and control passes to
# efi_handover.
.org 0x1e0
.globl efi_boot
efi_boot:
	movq	%rdx, %rsi		# EFI system table pointer (before %rdx is reused)
	movq	%rcx, %rdi		# EFI image handle
	xorl	%edx, %edx		# boot params pointer = 0 (not yet allocated)
	jmp	efi_handover
# Linux 64-bit boot protocol entry point, pinned at offset 0x200.
#
# Arrives with the boot params pointer in %rsi. Interrupts are masked, the
# direction flag is cleared, the pointer is recorded in boot_params_addr and
# control passes to the common startup code.
.org 0x200
.globl startup64
startup64:
	cli
	cld
	movq	%rsi, boot_params_addr(%rip)	# remember the boot params pointer
	jmp	startup
# Linux 64-bit EFI handover entry point, pinned at offset 0x210.
#
# When reached from efi_boot, %rdi, %rsi and %rdx carry the image handle, the
# system table pointer and the boot params pointer. The stack pointer is
# rounded down to a 16-byte boundary before efi_setup is called, and the value
# efi_setup returns in %rax is stored as the boot params pointer.
# Execution then falls through into startup.
.org 0x210
.globl efi_handover
efi_handover:
	andq	$-16, %rsp			# 16-byte align the stack for the call
	call	efi_setup
	movq	%rax, boot_params_addr(%rip)	# save the boot params pointer
# The 64-bit entry point for AP boot and for restart after relocation. The
# boot entry points earlier in this file also end up here.
#
# This first part serialises entry, selects the per-CPU stack, enables NX
# where the CPU implements it, publishes the page tables and the GDT, and
# far-jumps to flush in order to reload %cs.
.globl startup
startup:
	# Some of the startup actions are not thread safe. Use a mutex
	# to protect this section of code. Bit 0 of startup_mutex is the lock.
0:	lock btsl $0, startup_mutex(%rip)
	jc	0b

	# Use the startup stack until we pick the correct one.
	leaq	startup_stack_top(%rip), %rsp

	# Pick the correct stack.
	# New %rsp = _stacks + cpu_num * AP_STACK_SIZE + BSP_STACK_SIZE - LOCALS_SIZE
	# NOTE(review): assumes smp_my_cpu_num returns the CPU number in %eax.
	xorq	%rax, %rax
	call	smp_my_cpu_num
	movl	$AP_STACK_SIZE, %edx
	mul	%edx
	addq	$(BSP_STACK_SIZE - LOCALS_SIZE), %rax
	leaq	_stacks(%rip), %rsp
	addq	%rax, %rsp

	# Enable NX, but only on CPUs that implement it (CPUID 0x80000001,
	# EDX bit 20). Writing EFER.NXE on a CPU without NX raises #GP. The
	# extended leaf always exists on a CPU able to run this 64-bit code,
	# since the long mode flag is reported in the same leaf.
	movl	$0x80000001, %eax
	cpuid				# clobbers %eax, %ebx, %ecx, %edx
	btl	$20, %edx
	jnc	2f
	movl	$0xc0000080, %ecx	# EFER MSR
	rdmsr
	orb	$0x08, %ah		# EFER.NXE (bit 11)
	wrmsr
	jmp	5f

	# No NX: bit 63 of a page table entry is then a reserved bit, and
	# translating through such an entry faults. Strip it from the 512
	# entries of pd2, the only table assembled with it. Entries that are
	# already clean are not written, so repeating this on every pass
	# through startup is harmless. This runs before %cr3 is loaded below.
	# NOTE(review): code elsewhere that rewrites pd2 entries must
	# likewise leave bit 63 clear on such CPUs - confirm.
2:	leaq	pd2(%rip), %rdi
	movl	$512, %ecx
3:	btq	$63, (%rdi)
	jnc	4f
	btrq	$63, (%rdi)
4:	addq	$8, %rdi
	decl	%ecx
	jnz	3b
5:

	# Initialise the pml4 and pdp tables. Each entry is the address of
	# the next level table plus 0x3 (present + writable).
	leaq	pml4(%rip), %rcx
	leaq	pdp(%rip), %rdx
	movq	%rdx, %rax
	addq	$0x3, %rax
	movq	%rax, 0(%rcx)
	leaq	pd0(%rip), %rax
	addq	$0x3, %rax
	movq	%rax, 0(%rdx)
	leaq	pd1(%rip), %rax
	addq	$0x3, %rax
	movq	%rax, 8(%rdx)
	leaq	pd2(%rip), %rax
	addq	$0x3, %rax
	movq	%rax, 16(%rdx)
	leaq	pd3(%rip), %rax
	addq	$0x3, %rax
	movq	%rax, 24(%rdx)

	# Set the page directory base address.
	movq	%rcx, %cr3

	# Initialise the GDT descriptor.
	leaq	gdt(%rip), %rax
	movq	%rax, 2 + gdt_descr(%rip)

	# Load the GDT and the segment registers. The far jump goes through
	# a 6-byte pointer (32-bit offset, 16-bit selector) built just below
	# the stack pointer, so only the low 32 bits of the address of flush
	# are used.
	lgdt	gdt_descr(%rip)
	leaq	flush(%rip), %rax
	movw	$KERNEL_CS, -2(%rsp)
	movl	%eax, -6(%rsp)
	ljmp	*-6(%rsp)
# Far jump target: %cs now holds KERNEL_CS. Reload all the data segment
# registers with KERNEL_DS.
flush: movw $KERNEL_DS, %ax
movw %ax, %ds
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
movw %ax, %ss
# Initialise the IDT. One 16-byte interrupt gate is written per vector:
#   bytes 0-3   handler offset bits 15:0, selector in the upper word
#   bytes 4-7   0x8E00 in the lower word, handler offset bits 31:16 above it
#   bytes 8-11  handler offset bits 63:32
#   bytes 12-15 zero
# %rsi walks the handler stubs, which are (vec1 - vec0) bytes apart, and %rdi
# walks the table. %cx counts the remaining vectors.
leaq idt(%rip), %rdi
leaq vec0(%rip), %rsi
movw $NUM_INT_VEC, %cx
0: movq %rsi, %rdx
movl $(KERNEL_CS << 16), %eax
movw %dx, %ax # selector = 0x0010 = cs
movw $0x8E00, %dx # interrupt gate - dpl=0, present
movl %eax, (%rdi)
movl %edx, 4(%rdi)
shrq $32, %rdx
movl %edx, 8(%rdi)
movl $0, 12(%rdi)
addq $(vec1-vec0), %rsi
addq $16, %rdi
dec %cx
jnz 0b
# Initialise the IDT descriptor.
leaq idt(%rip), %rax
movq %rax, 2 + idt_descr(%rip)
# Load the IDT.
lidt idt_descr(%rip)
# Zero the BSS (if first boot). The range _bss to _end is cleared 8 bytes at
# a time. The loop ends only when the byte count reaches exactly zero, so the
# range must be a non-zero multiple of 8 bytes long.
cmpl $1, first_boot(%rip)
jne 1f
xorq %rax, %rax
leaq _bss(%rip), %rdi
leaq _end(%rip), %rcx
subq %rdi, %rcx
0: movq %rax, (%rdi)
addq $8, %rdi
subq $8, %rcx
jnz 0b
movl $0, first_boot(%rip)
1:
# Initialise the FPU.
finit
#if 0
# Enable SSE.
movq %cr0, %rax
andb $0xfb, %al # clear coprocessor emulation bit
orb $0x02, %al # set coprocessor monitoring bit
mov %rax, %cr0
movq %cr4, %rax
orw $0x0600, %ax # set OSFXSR and OSXMMEXCPT
movq %rax, %cr4
#endif
# Call the dynamic linker to fix up the addresses in the GOT.
call reloc
# Release the startup mutex.
movl $0, startup_mutex(%rip)
# Run the application.
call main
# In case we return, simulate an exception. The flags, the code segment and
# a return address (pushed by the call to the next instruction) are put on
# the stack, followed by a zero error code and the vector number 257.
# NOTE(review): unlike a hardware interrupt frame, no ss and rsp are pushed
# here, so the iretq at the end of int_handler could not resume from this
# frame. Presumably interrupt never returns for this vector - confirm.
pushfq
xorq %rax, %rax
movw %cs, %ax
pushq %rax
call 0f
0: pushq $0 # error code
pushq $257 # vector
jmp int_handler
# Individual interrupt vector handlers.
#
# The IDT initialisation loop steps from one stub to the next by adding
# (vec1 - vec0), so every stub must have the same size. Two macros generate
# the stubs to keep them uniform:
#
#   trap_stub_zero_code N   for vectors where the CPU pushes no error code;
#                           a zero error code is pushed in its place.
#   trap_stub_cpu_code N    for vectors where the CPU has already pushed an
#                           error code; two one-byte nops stand in for the
#                           two-byte push so the stub size is unchanged.
#
# Both then push the vector number and jump to the common handler. Each macro
# defines the label vecN for its stub.

	.macro	trap_stub_zero_code num
vec\num:
	pushq	$0			# placeholder error code
	pushq	$\num			# vector number
	jmp	int_handler
	.endm

	.macro	trap_stub_cpu_code num
vec\num:
	nop				# padding: error code already on the stack
	nop
	pushq	$\num			# vector number
	jmp	int_handler
	.endm

	trap_stub_zero_code	0
	trap_stub_zero_code	1
	trap_stub_zero_code	2
	trap_stub_zero_code	3
	trap_stub_zero_code	4
	trap_stub_zero_code	5
	trap_stub_zero_code	6
	trap_stub_zero_code	7
	trap_stub_cpu_code	8
	trap_stub_zero_code	9
	trap_stub_cpu_code	10
	trap_stub_cpu_code	11
	trap_stub_cpu_code	12
	trap_stub_cpu_code	13
	trap_stub_cpu_code	14
	trap_stub_zero_code	15
	trap_stub_zero_code	16
	trap_stub_cpu_code	17
	trap_stub_zero_code	18
	trap_stub_zero_code	19
# The common interrupt handler code. Pass the register state to the application
# interrupt handler. On entry this expects the stack to contain:
#
# rsp+30 ss
# rsp+28 rsp
# rsp+20 rflags
# rsp+18 cs
# rsp+10 rip
# rsp+08 error code
# rsp+00 vector number
#
# (offsets in hexadecimal). It adds the additional state expected by the
# application to the bottom of the stack frame, in this order from the lowest
# address upwards: ds, es, ss, rax, rbx, rcx, rdx, rdi, rsi, rbp. A pointer to
# that frame is passed to interrupt in %rdi.
#
# The registers r8-r11 are not part of that frame, but the called function is
# free to overwrite them, so they are saved below the frame for the duration
# of the call. The frame layout seen by interrupt is unaffected. Every
# general purpose register the called function may clobber is therefore
# restored before the iretq.
int_handler:
	pushq	%rbp
	pushq	%rsi
	pushq	%rdi
	pushq	%rdx
	pushq	%rcx
	pushq	%rbx
	pushq	%rax
	xorq	%rax, %rax
	movw	%ss, %ax
	pushq	%rax
	movw	%es, %ax
	pushq	%rax
	movw	%ds, %ax
	pushq	%rax
	movq	%rsp, %rdi		# pointer to trap regs struct on the stack

	# Preserve the remaining caller-saved registers across the call.
	pushq	%r8
	pushq	%r9
	pushq	%r10
	pushq	%r11

	cld
	call	interrupt

	popq	%r11
	popq	%r10
	popq	%r9
	popq	%r8

	addq	$24, %rsp		# discard the saved ds, es and ss
	popq	%rax
	popq	%rbx
	popq	%rcx
	popq	%rdx
	popq	%rdi
	popq	%rsi
	popq	%rbp
	addq	$16, %rsp		# discard the vector number and error code
	iretq
# The interrupt descriptor table.
#
# The descriptor is a 2-byte limit followed by an 8-byte base address. The
# padding word in front of it places the base field on a 4-byte boundary. The
# base field and the table itself are written by the startup code, so this
# data is modified at run time although it is assembled into the same
# section as the code.
.align 4
.word 0 # for alignment
idt_descr:
.word idt_end - idt - 1 # size
.quad 0 # addr: filled in at run time
# One 16-byte gate (two quadwords) per vector.
idt:
.fill NUM_INT_VEC*2, 8, 0 # filled in at run time
idt_end:
# The global descriptor table.
#
# Same descriptor layout as above: padding word, 2-byte limit, 8-byte base.
# The trailing comment on each table entry gives the selector value that
# refers to it.
.word 0 # for alignment
gdt_descr:
.word gdt_end - gdt - 1 # size
.quad 0 # addr: filled in at run time
.align 4
.globl gdt
gdt:
.quad 0x0000000000000000 # NULL descriptor
.quad 0x0000000000000000 # not used
.quad 0x00209a0000000000 # 0x10 64-bit code at 0x000000
.quad 0x0000920000000000 # 0x18 64-bit data at 0x000000
.globl gdt_end
gdt_end:
.data
# ptes64 start, flags, count=64
#
# Emits page directory entries for 2 MB pages. Each expansion writes eight
# entries covering the 16 MB that begin at start, every entry being the page
# address plus flags, and then invokes itself for the next 16 MB with the
# count reduced by one, stopping once the count has reached one. With the
# default count of 64 this produces 512 entries, i.e. one 4096-byte table
# covering 1 GB.
.macro ptes64 start, flags, count=64
.quad \start + 0x0000000 + \flags
.quad \start + 0x0200000 + \flags
.quad \start + 0x0400000 + \flags
.quad \start + 0x0600000 + \flags
.quad \start + 0x0800000 + \flags
.quad \start + 0x0A00000 + \flags
.quad \start + 0x0C00000 + \flags
.quad \start + 0x0E00000 + \flags
.if \count-1
ptes64 "(\start+0x01000000)", \flags, \count-1
.endif
.endm
# maxdepth depth=1
#
# Invokes itself with depth reduced by one until depth reaches one, and emits
# nothing.
# NOTE(review): as invoked below, without an argument, it expands to nothing
# at all. Its purpose is not evident from this file - confirm before removing.
.macro maxdepth depth=1
.if \depth-1
maxdepth \depth-1
.endif
.endm
maxdepth
# The level 4 page map table. Only the first entry is defined; it is set to
# point at pdp by the startup code. The following .align pads the rest of
# the page.
.align 4096
.globl pml4
pml4:
.quad 0 # filled in at run time
# Page Directory Pointer Table:
# 4 Entries, pointing to the Page Directory Tables.
.align 4096
.globl pdp
pdp:
.quad 0 # filled in at run time
.quad 0 # filled in at run time
.quad 0 # filled in at run time
.quad 0 # filled in at run time
# Page Directory Tables:
# There are 4 tables. The first two map the first 2 GB of memory. The third
# is used with PAE to map the rest of memory in 1 GB segments. The fourth is
# reserved for mapping the video frame buffer. We use 2 MB pages so only the
# Page Directory Table is used (no page tables).
#
# As assembled, each table identity-maps the 1 GB that starts at the address
# given as the first macro argument. The flags value 0x83 marks each entry as
# present, writable and a 2 MB page. The third table (pd2) additionally sets
# bit 63, the no-execute bit. That bit is only valid while EFER.NXE is set;
# with NXE clear it is a reserved bit and using such an entry faults.
.align 4096
.globl pd0
pd0:
ptes64 0x0000000000000000,0x0000000000000083
.align 4096
.globl pd1
pd1:
ptes64 0x0000000040000000,0x0000000000000083
.align 4096
.globl pd2
pd2:
ptes64 0x0000000080000000,0x8000000000000083
.align 4096
.globl pd3
pd3:
ptes64 0x00000000C0000000,0x0000000000000083
.previous
# ap_trampoline is the entry point for CPUs other than the bootstrap
# CPU (BSP). It gets copied to a page in low memory, to enable the APs
# to boot when the main program has been loaded in high memory.
#
# The code runs in 16-bit mode with %ds set equal to %cs, so its own data is
# addressed by offsets relative to ap_trampoline. The address of the 64-bit
# startup entry point is read from ap_startup_addr, which must have been
# filled in beforehand; the locations of gdt and pml4 are derived from it.
.code16
.align 4
.globl ap_trampoline
ap_trampoline:
	movw	%cs, %ax
	movw	%ax, %ds

	# Patch the jump address. The offset operand of the far jump starts
	# two bytes into the instruction, after the prefix and opcode bytes.
	movl	(ap_startup_addr - ap_trampoline), %ebx
	movl	%ebx, (ap_jump - ap_trampoline + 2)

	# Patch and load the GDT descriptor. It should point to the main
	# GDT descriptor, which has already been initialised by the BSP.
	movl	%ebx, %eax
	addl	$(gdt - startup), %eax
	movl	%eax, (ap_gdt_descr - ap_trampoline + 2)
	data32 lgdt ap_gdt_descr - ap_trampoline

	# Set the page directory base address.
	movl	%ebx, %eax
	addl	$(pml4 - startup), %eax
	movl	%eax, %cr3

	# Enable PAE (CR4 bit 5).
	movl	%cr4, %eax
	orb	$0x20, %al
	movl	%eax, %cr4

	# Enable long mode and, where the CPU implements it, NX, in a single
	# update of the EFER MSR made before paging is switched on. NX support
	# is reported by CPUID 0x80000001, EDX bit 20; writing EFER.NXE on a
	# CPU without NX raises #GP. Setting NXE ahead of paging ensures that
	# page table entries carrying the no-execute bit are never seen while
	# that bit is still reserved. %ebx is no longer needed at this point,
	# so cpuid may overwrite it.
	movl	$0x80000001, %eax
	cpuid				# clobbers %eax, %ebx, %ecx, %edx
	movl	%edx, %esi		# keep the feature flags across rdmsr
	movl	$0xc0000080, %ecx	# EFER MSR
	rdmsr
	orb	$0x01, %ah		# EFER.LME (bit 8)
	btl	$20, %esi
	jnc	1f
	orb	$0x08, %ah		# EFER.NXE (bit 11)
1:	wrmsr

	# Enable paging and protection (CR0.PG and CR0.PE).
	movl	%cr0, %eax
	orl	$0x80000001, %eax
	movl	%eax, %cr0

	# Jump to the 64-bit entry point. The zero offset below is replaced
	# by the patch made at the top of this routine.
ap_jump:
	data32 ljmp $KERNEL_CS, $0

	.align 4
	.word 0				# for alignment
ap_gdt_descr:
	.word gdt_end - gdt - 1		# gdt limit
	.long 0				# gdt base - filled in at run time
	.globl ap_startup_addr
ap_startup_addr:
	.long 0				# filled in at run time
	.globl ap_trampoline_end
ap_trampoline_end:
.previous
# Variables. These are placed in the data section rather than the BSS, so
# they keep their values when the startup code zeroes the BSS.
.data
.align 4
# The boot params pointer saved by the boot entry points.
.globl boot_params_addr
boot_params_addr:
.quad 0
# Bit 0 is the lock taken on entry to startup; the whole word is cleared to
# release it.
startup_mutex:
.long 0
# Holds 1 until the BSS has been zeroed for the first time, then 0.
first_boot:
.long 1
.previous
# Startup stack. A 64-byte area used by the boot entry code only until the
# per-CPU stack has been selected. The stack grows down from
# startup_stack_top.
.bss
.align 16
startup_stack_base:
. = . + 64
startup_stack_top:
.previous
# Main stack area. Reserves STACKS_SIZE bytes in a section of its own that
# occupies no space in the image (nobits).
# NOTE(review): the startup code addresses this area through the symbol
# _stacks, which is not defined in this file - presumably it is provided by
# the linker script; confirm.
.section ".stacks", "aw", @nobits
.align 16
. = . + STACKS_SIZE
.previous