When writing my OS I used two separate files for the different stages. It makes your code more easily readable and you can figure out where everything is much faster. Here are my bootloaders.
Stage 1
org 0x7c00
bits 16
; BOOTLOADER STAGE 1
; NASM syntax, 16-bit code assembled for offset 0x7c00.
; STAGE2_ADDRESS and STAGE2_SIZE are not defined in this file; they have to be
; supplied from outside (for example with -D on the NASM command line).
;
; Make sure our assumptions about addressing are ok.
; Stage 1 puts its stack at 0x7ffe, so stage 2 must not start below 0x8000.
%if STAGE2_ADDRESS < 0x8000
%error Stage 2 load address must be >= 0x8000.
%endif
; Declare the boot sector for a 1.44M floppy with FAT 12.
; BOOT_RESERVED_SECTORS is this boot sector plus the STAGE2_SIZE sectors of
; stage 2. Both defines are presumably consumed by boot_sector.inc -- confirm
; against that file.
%define DECLARE_FAT12_BOOT_SECTOR
%define BOOT_RESERVED_SECTORS (STAGE2_SIZE + 1)
%include "boot_sector.inc"
; Loader entry point
;
; The BIOS loads this sector at physical 0x7c00, but it may get here as
; 0000:7c00 or as 07c0:0000. Every absolute address in this file is assembled
; with org 0x7c00, which is only correct when the segment base is 0. Copying
; CS into DS/ES/SS would therefore break data accesses and the stack location
; on a BIOS that enters with CS = 0x07c0. Force CS to 0 with a far jump, then
; load the other segment registers with 0 explicitly.
        jmp     0x0000:stage1_main
stage1_main:
; Prepare segment registers
        xor     ax, ax
        mov     ds, ax
        mov     es, ax
; Create a stack above the code right below the 0x8000 line. The second stage
; MUST be loaded no lower than 0x8000. SP is loaded directly after SS so that
; no interrupt can run on a half-initialised SS:SP pair.
        mov     ss, ax
        mov     sp, 0x7ffe
; The direction flag is not guaranteed on entry; the string instructions used
; later in this sector need it clear.
        cld
; The BIOS stores drive number in DL on startup. We'll need this later. To
; maintain stack alignment, we'll store it as a word. The upper byte won't ever
; get used, so we don't care what it is. Store DX.
        mov     [drive_id], dx
; Enable A20 to access memory above the 1 MB real mode limit.
; enable_a20 reports failure through the carry flag.
        call    enable_a20
        jc      failed
; Read the second stage into memory.
; The five arguments for read_sectors are passed on the stack, pushed in the
; order below: destination segment first, drive ID last.
push WORD STAGE2_SEGMENT ; Segment address for the second stage
push WORD STAGE2_OFFSET ; Offset into that segment
push WORD STAGE2_SIZE ; The size of the second stage in sectors
push WORD 1 ; Start with the sector right after this one
push WORD [drive_id] ; This drive's ID
call read_sectors ; Load it.
; NOTE(review): nothing here removes the five argument words after the call;
; presumably read_sectors pops them itself -- confirm in read_sectors.inc.
;
; If the carry flag is set something failed, so if something went wrong while
; reading we don't jump to the second stage and will fall straight through to
; the error handler.
; DX is reloaded with the saved drive word first, presumably so that stage 2
; receives the boot drive in DL. MOV does not modify flags, so the carry flag
; returned by read_sectors is still what JNC tests.
mov dx, [drive_id]
jnc STAGE2_ADDRESS
failed:
; Fatal error handler for stage 1. Reached by falling through after a failed
; stage 2 read, or by a jump when enable_a20 reports an error. Paints the text
; screen, prints fail_message in the top-left corner and halts. Never returns.
;
; Make sure STOSW/LODSB move forward; nothing guarantees DF is clear here.
        cld
; Clear screen, set attributes to white text on red background.
; 0x4f20 = attribute 0x4f + space character; 2000 cells = 80 x 25.
        mov     ax, 0x4f20
        mov     cx, 2000
        push    WORD 0xb800
        pop     es
        xor     di, di
        rep     stosw
; Print out error string. The string has no terminator, so its length is
; computed from the end_msg label. AH keeps the attribute for every STOSW.
        mov     ah, 0x4f
        mov     si, fail_message
        xor     di, di
        mov     cx, (end_msg - fail_message)
.print_loop:
        lodsb
        stosw
        loop    .print_loop
; Halt for good. CLI masks ordinary interrupts, but an NMI or SMI can still
; wake the CPU from HLT; loop back so execution can never run on into the code
; that follows this handler.
        cli
.halt:
        hlt
        jmp     .halt
; Other code
%include "read_sectors.inc" ; Read logical sectors on the disk.
%include "enable_a20.inc" ; Enable the A20 line to access memory > 1MiB
; ------------------------------------------------------------------------------
; DATA AREA
;
; Boot drive as handed over by the BIOS in DL, stored as a full word.
drive_id:
        dw      0x0000
; Error text shown by the failure handler. It has no terminator; the handler
; derives its length from end_msg, which must directly follow the string.
fail_message:
        db      "FATAL ERROR: Could not read from disk."
end_msg:
; Pad the sector to 510 bytes with 0xcc, then append the boot signature.
; The word 0xaa55 is stored little-endian, i.e. as the bytes 0x55, 0xaa.
        times   (510 - ($ - $$)) db 0xcc
        dw      0xaa55
Stage 2
org STAGE2_ADDRESS
bits 16
; BOOTLOADER STAGE 2 (NASM syntax, entered in 16-bit real mode)
;
; Initialises the FAT driver, clears the screen, then looks up the kernel file
; in the root directory. On success execution continues into the cluster load
; loop that follows this block; on failure it jumps to the matching handler
; under `errors`.
stage_2:
; Set up registers and stack.
; All addresses in this file are assembled relative to STAGE2_ADDRESS with a
; segment base of 0 (the GDT pointer and the protected-mode jump rely on that
; too). Stage 1 reaches this code with a near jump, so CS is whatever stage 1
; was running with; load 0 explicitly instead of copying CS.
        xor     ax, ax
        mov     ds, ax
        mov     es, ax
        mov     ss, ax
; 128-byte stack above this code
        mov     sp, (end_stage_2 + 128)
; print, video_init and copy_to_kernel use LODSB/STOSW/MOVSB and expect to walk
; forward through memory; do not rely on the direction flag left by the BIOS.
        cld
; Initialize the FAT driver
        call    config_fat_driver
        jc      errors.fat_init_failed
; Clear screen - white text on blue background
        call    video_init
; Print startup message
        mov     si, msg_startup
        call    print
; Find the kernel file.
        mov     si, ROOT_OFFSET
        mov     di, kernel_name
        call    find_file
; If the carry flag is set the file wasn't found.
        jc      errors.kernel_not_found
; If we get here, then the kernel file was found. SI points to the dirent,
; and AX contains the number of the first cluster in the file.
.load_loop:
; Load the kernel one cluster at a time.
; In:  AX = cluster number to load.
; Each pass reads cluster AX into the DTA, appends it to the kernel image and
; then classifies the next-cluster value left in AX.
        mov     si, FAT_OFFSET          ; DS:SI points to FAT
        les     di, [dta_addr]          ; ES:DI points to buffer
        call    load_cluster            ; Cluster to load is in AX.
        jc      errors.load_failed      ; Barf on an error
        call    copy_to_kernel          ; Copy the data in the DTA to the kernel
; AX contains the number of the next cluster to read. Classify it using the
; FAT12 entry values:
;   0x000-0x001  never valid inside a chain       -> error
;   0x002-0xff6  next cluster of the file         -> keep loading
;   0xff7        bad-cluster marker               -> error
;   0xff8-0xfff  end-of-chain marker              -> file complete
; The previous test (jb 0xfef) stopped one cluster early and treated the
; bad-cluster marker as a successful end of file.
        cmp     ax, 0x0002
        jb      errors.load_failed
        cmp     ax, 0x0ff7
        jb      .load_loop
        je      errors.load_failed
; AX >= 0xff8: end of chain. Fall through and boot the kernel.
; This is what calls the kernel. It sets up a basic GDT, jumps into protected
; mode, resets the segment registers, and creates a 4K stack. After doing a few
; other things to give the kernel information, it jumps to the actual kernel.
;
; Once the kernel returns, it pops out of protected mode, resets the segment
; registers, and prints out a message telling the user they can turn off the
; computer. After that, it just halts. Eventually we'll make it actually turn
; the computer off, but for now we'll just let the users do that themselves.
kernel_boot:
; Interrupts must be off before protected mode is enabled: no protected-mode
; IDT exists yet, so the first hardware interrupt after PE is set would be
; dispatched through garbage and reset the machine. The kernel re-enables
; interrupts once it has installed its own IDT.
        cli
; Create the GDT and jump into protected mode
        lgdt    [gdt_descriptor]        ; Load GDT
        mov     eax, cr0                ; Get CR0 into EAX
        or      eax, 1                  ; Set the PMODE bit
        mov     cr0, eax                ; Put the modified value into CR0
; Long jump so that the GDT takes effect. Selector 0x08 is the kernel code
; descriptor; the jump also discards instructions prefetched in real mode.
        jmp     0x08:_pmode_start
; Fatal error handlers for stage 2. Each entry selects a message and joins the
; shared .crash tail, which prints it and halts. None of them return.
errors:
.fat_init_failed:
        mov     si, msg_fat_failed
        jmp     .crash
.kernel_not_found:
        mov     si, msg_no_kernel
        jmp     .crash
.load_failed:
        mov     si, msg_load_failed
.crash:
        call    print                   ; SI = message to show
; Halt for good. CLI masks ordinary interrupts, but an NMI or SMI can still
; wake the CPU from HLT; loop back so execution can never run on into the
; routine that follows.
        cli
.hang:
        hlt
        jmp     .hang
; ------------------------------------------------------------------------------
; video_init
; Switches to BIOS video mode 3, fills all 2000 cells of the text buffer at
; segment 0xb800 with blanks in the colour stored in [attrib], and resets the
; print position to the top-left corner.
; In:    nothing
; Out:   [prn_addr] = 0
; Saved: all general registers and ES
video_init:
        push    es
        pusha
        mov     ax, 0x0003              ; int 0x10 with AH = 0: set mode, AL = 3
        int     0x10
        mov     ax, 0xb800
        mov     es, ax                  ; ES -> text buffer
        mov     al, ' '
        mov     ah, [attrib]            ; AX = blank cell in the current colour
        mov     cx, 2000
        xor     di, di
        rep     stosw
        mov     WORD [prn_addr], 0
        popa
        pop     es
        ret
; print
; Writes a zero-terminated string into the text buffer at segment 0xb800,
; starting at the offset kept in [prn_addr] and using the colour in [attrib].
; A byte of 0x0a moves the output position to the start of the next row.
; In:    DS:SI = string
; Out:   [prn_addr] = offset just past the last cell written
; Saved: all general registers and ES
print:
        push    es
        pusha
        mov     ax, 0xb800
        mov     es, ax
        mov     di, [prn_addr]          ; resume where the last call stopped
.next_char:
        lodsb                           ; AL = next byte of the string
        cmp     al, 0
        je      .finished               ; terminator: stop
        cmp     al, 0x0a
        je      .newline
        mov     ah, [attrib]            ; AH:AL = attribute:character
        stosw
        jmp     .next_char
.newline:
; One row is 160 bytes. DI mod 160 is the byte offset inside the current row;
; dropping it and adding a full row lands on the first cell of the next row.
        mov     ax, di
        xor     dx, dx
        mov     cx, 160
        div     cx                      ; DX = DI mod 160
        sub     di, dx
        add     di, 160
        jmp     .next_char
.finished:
        mov     [prn_addr], di
        popa
        pop     es
        ret
; copy_to_kernel
; Appends one cluster from the disk transfer area to the kernel image.
; In:    [dta_addr]          = far pointer to the freshly loaded cluster
;        [krnl_addr]         = far pointer to the next free byte of the kernel
;        [bytes_per_cluster] = number of bytes to copy
; Out:   [krnl_addr.offset] advanced by the number of bytes copied
; Saved: all general registers, DS and ES (flags are not preserved)
;
; Every variable is read while DS still addresses this loader's data, and DS is
; restored before the updated offset is written back. Loading DS:SI first, as
; the code used to, made the later reads and the final store go through the DTA
; segment, which is only harmless when that segment equals the loader's DS.
copy_to_kernel:
        pusha
        push    es
        push    ds
; Set up pointers
        mov     cx, [bytes_per_cluster] ; Copy one cluster at a time
        les     di, [krnl_addr]         ; Resume at last load location in dest.
        lds     si, [dta_addr]          ; DTA buffer is the source (DS changes here)
        cld                             ; copy upwards
        rep     movsb                   ; Copy the data
; Update the offset so we can resume loading at the next cluster.
        pop     ds                      ; back to the loader's data segment
        mov     [krnl_addr.offset], di
        pop     es
        popa
        ret
; ------------------------------------------------------------------------------
; OTHER CODE
%include "realfatdrv.inc"
; ==============================================================================
; 32-BIT CODE
; Everything from here on is assembled as 32-bit code. _pmode_start is the
; target of the far jump (selector 0x08) taken right after the PE bit is set.
bits 32
_pmode_start:
; Set up segment registers to the data descriptor
; (selector 0x10 = third GDT entry, gdt_kernel_data).
mov ax, 0x10
mov ds, ax
mov es, ax
mov ss, ax
; Figure out how much memory we have and set the stack accordingly. For now
; we'll assume that the kernel is less than 64K and we'll put the stack at
; the top of that. If we load the kernel at 1MB, 1MB+64K gives the base of
; the stack at 0x110000. Allocate a 4K stack, so the top is at 0x111000.
; NOTE(review): no memory size is actually probed here; ESP is a fixed value.
mov esp, 0x111000
; FS and GS point to the kernel's DTA (data transfer area), a 512K buffer
; used by the BIOS for disk accesses.
; (selector 0x18 = fourth GDT entry, gdt_kernel_dta).
mov ax, 0x18
mov fs, ax
mov gs, ax
; Disable interrupts until the kernel can set up its own IDT. Then it'll re-
; enable them.
; NOTE(review): the CLI below is commented out and no other CLI is visible on
; the path from stage_2 to this point -- confirm interrupts really are off
; before the kernel runs.
; cli
; Near call into the kernel; if the kernel returns, execution continues with
; the code directly after this call.
call KERNEL_ADDRESS
; _halt_pc / no_pmode
; Reached when the kernel's entry routine returns. Clears the PE bit, repaints
; the text screen in reverse video, prints the turn_off message at the top-left
; corner and halts. Never returns.
; NOTE(review): this code is still assembled under `bits 32` and no 16-bit
; descriptors are loaded before PE is cleared -- confirm this return path
; against the CPU manual's procedure for switching back to real-address mode.
_halt_pc:
; The kernel main routine has returned. Break out of protected mode.
mov eax, cr0
and eax, 0xfffffffe
mov cr0, eax
; Far jump to reload CS with 0 after PE has been cleared.
jmp 0x00:no_pmode
no_pmode:
; Clear the screen, reverse video.
; NOTE(review): a 16-bit PUSH paired with POP ES in 32-bit code may not be
; stack-balanced; harmless here only because the routine never returns.
push WORD 0xb800
pop es
; 0x7020 = attribute 0x70 + space character; 2000 cells = 80 x 25.
mov ax, 0x7020
mov ecx, 2000
xor edi, edi
rep stosw
; Print shutdown message
mov esi, turn_off
xor edi, edi
.prn_loop:
lodsb
cmp al, 0
je .finish
; LODSB only loads AL, so the attribute is put back in AH for every cell.
mov ah, 0x70
stosw
jmp .prn_loop
; Halt
.finish:
cli
hlt
; ------------------------------------------------------------------------------
; CONSTANT DATA
;
; Messages for `print`: each one ends with a zero byte, and a leading 0x0a
; makes `print` start the message on the next screen row.
msg_startup:
        db      "Loading kernel...", 0
msg_fat_failed:
        db      0x0a, "FATAL ERROR: Failed to load FAT driver.", 0
msg_no_kernel:
        db      0x0a, "FATAL ERROR: Kernel file not found.", 0
msg_load_failed:
        db      0x0a, "FATAL ERROR: Could not load kernel into memory.", 0
dbg_success:
        db      0x0a, "[DEBUG]: Success.", 0
; Shown after the kernel has returned.
turn_off:
        db      "It is now safe to turn off your computer.", 0
; Eleven-byte name handed to find_file (eight name characters followed by three
; extension characters, no dot).
kernel_name:
        db      "OSKERNELSYS"
; Text attribute used by video_init and print (white on blue).
attrib:
        db      0x1f
; ------------------------------------------------------------------------------
; VARIABLE DATA
;
; Byte offset into the video buffer where `print` continues writing.
prn_addr:
        dw      0
; Far pointer (offset first, then segment) to the disk transfer area.
dta_addr:
.offset:
        dw      DTA_OFFSET
.segment:
        dw      DTA_SEGMENT
; Far pointer (offset first, then segment) to the next free byte of the kernel
; image; copy_to_kernel advances the offset part.
krnl_addr:
.offset:
        dw      KERNEL_OFFSET
.segment:
        dw      KERNEL_SEGMENT
; ------------------------------------------------------------------------------
; GDT descriptors
;
; Operand for LGDT: 16-bit table limit (size - 1) followed by the 32-bit linear
; address of the table.
gdt_descriptor:
gdt_size:       dw (gdt_end - gdt_start - 1)
gdt_offset:     dd gdt_start
; Selectors: 0x00 null, 0x08 kernel code, 0x10 kernel data, 0x18 kernel DTA.
gdt_start:
gdt_null:       dq 0
; KERNEL CODE SEGMENT - base 0, limit 0xfffff pages (flat 4 GiB)
gdt_kernel_code:
.limit_low:             dw 0xffff
.base_0_15:             dw 0x0000
.base_16_23:            db 0x00
                        ; Present, ring 0, code/data descriptor, executable,
                        ; conforming, readable, not accessed
.access:                db 1_00_1_1_1_1_0b
                        ; 4K granularity, 32-bit code, limit bits 16-19 = 0xf
.flags_limit_16_19:     db 1_1_00_1111b
.base_24_31:            db 0x00
; KERNEL DATA SEGMENT - base 0, limit 0xfffff pages (flat 4 GiB)
gdt_kernel_data:
.limit_low:             dw 0xffff
.base_0_15:             dw 0x0000
.base_16_23:            db 0x00
                        ; Present, ring 0, data, grows up, writable, not
                        ; accessed
.access:                db 1_00_1_0_0_1_0b
                        ; 4K granularity, 32-bit data addressing
.flags_limit_16_19:     db 1_1_00_1111b
.base_24_31:            db 0x00
; KERNEL DISK TRANSFER AREA - USE FS/GS FOR THIS
; For disk I/O, the kernel will drop out of protected mode, execute the BIOS
; functions needed, and then jump back into protected mode. All data will be
; in this segment, which begins at 0x8000 and is 512K long.
;
; The access byte is the same as for gdt_kernel_data. The limit is given in
; bytes (granularity bit clear): 0x7ffff is the last valid offset, which makes
; the segment exactly 512K. The earlier encoding (limit 0x80000 with 4K
; granularity) described a segment of roughly 2 GiB instead.
gdt_kernel_dta:
.limit_low:             dw 0xffff
.base_0_15:             dw 0x8000
.base_16_23:            db 0x00
.access:                db 1_00_1_0_0_1_0b
                        ; Byte granularity, 32-bit data addressing,
                        ; limit bits 16-19 = 0x7
.flags_limit_16_19:     db 0_1_00_0111b
.base_24_31:            db 0x00
gdt_end:
; Pad stage 2 with 0xcc bytes to exactly STAGE2_SIZE sectors of 512 bytes.
; If the code and data above outgrow that size the repeat count goes negative
; and the file no longer assembles.
times (512 * STAGE2_SIZE)-($-$$) db 0xcc
; First address past the padded image. The stage 2 entry code places its
; 128-byte stack directly above this label.
end_stage_2:
There are a few constants that I have defined in my makefile like STAGE2_SIZE so it won't work right out of the box, but you get the idea.
The 8086 could only address one megabyte of memory even though it used a 16-bit segment and a 16-bit offset, which together could theoretically address 4 GB. The way it calculated addresses was quite stupid: it shifted the segment left by four bits and added the offset. Thus, the same location in memory can have up to 4096 different segment:offset addresses. For example, these are all the same address:
0000:7c00
0001:7bf0
07c0:0000
...
Enabling the A20 line removes this restriction and switches to a linear addressing mode, so this doesn't happen. Addresses are now unique. You can now use 32-bit registers to address 4 GB of memory. This by itself doesn't disable interrupts, and in fact enabling A20 alone won't get you into protected mode. Once you do jump into protected mode, BIOS interrupts are unavailable unless you do something clever about it.
As for relative jumps, NASM uses $ to represent the address of the current instruction, so a relative jump looks like this:
jmp $+5