/*P:900
 * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
 * both the Host and Guest to do the low-level Guest<->Host switch. It is as
 * simple as it can be made, but it's naturally very specific to x86.
 *
 * You have now completed Preparation. If this has whetted your appetite; if you
 * are feeling invigorated and refreshed then the next, more challenging stage
 * can be found in "make Guest".
 :*/

/*M:012
 * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
 * gain at least 1% more performance. Since neither LOC nor performance can be
 * measured beforehand, it generally means implementing a feature then deciding
 * if it's worth it. And once it's implemented, who can say no?
 *
 * This is why I haven't implemented this idea myself. I want to, but I
 * haven't. You could, though.
 *
 * The main place where lguest performance sucks is Guest page faulting. When
 * a Guest userspace process hits an unmapped page we switch back to the Host,
 * walk the page tables, find it's not mapped, switch back to the Guest page
 * fault handler, which calls a hypercall to set the page table entry, then
 * finally returns to userspace. That's two round-trips.
 *
 * If we had a small walker in the Switcher, we could quickly check the Guest
 * page table and if the page isn't mapped, immediately reflect the fault back
 * into the Guest. This means the Switcher would have to know the top of the
 * Guest page table and the page fault handler address.
 *
 * For simplicity, the Guest should only handle the case where the privilege
 * level of the fault is 3 and probably only not present or write faults. It
 * should also detect recursive faults, and hand the original fault to the
 * Host (which is actually really easy).
 *
 * Two questions remain. Would the performance gain outweigh the complexity?
 * And who would write the verse documenting it?
:*/

/*M:011
 * Lguest64 handles NMI. This gave me NMI envy (until I looked at their
 * code). It's worth doing though, since it would let us use oprofile in the
 * Host when a Guest is running.
:*/

/*S:100
 * Welcome to the Switcher itself!
 *
 * This file contains the low-level code which changes the CPU to run the Guest
 * code, and returns to the Host when something happens. Understand this, and
 * you understand the heart of our journey.
 *
 * Because this is in assembler rather than C, our tale switches from prose to
 * verse. First I tried limericks:
 *
 *	There once was an eax reg,
 *	To which our pointer was fed,
 *	It needed an add,
 *	Which asm-offsets.h had
 *	But this limerick is hurting my head.
 *
 * Next I tried haikus, but fitting the required reference to the seasons in
 * every stanza was quickly becoming tiresome:
 *
 *	The %eax reg
 *	Holds "struct lguest_pages" now:
 *	Cherry blossoms fall.
 *
 * Then I started with Heroic Verse, but the rhyming requirement leeched away
 * the content density and led to some uniquely awful oblique rhymes:
 *
 *	These constants are coming from struct offsets
 *	For use within the asm switcher text.
 *
 * Finally, I settled for something between heroic hexameter, and normal prose
 * with inappropriate linebreaks. Anyway, it ain't no Shakespeare.
 */

// Not all kernel headers work from assembler
// But these ones are needed: the ENTRY() define
// And constants extracted from struct offsets
// To avoid magic numbers and breakage:
// Should they change the compiler can't save us
// Down here in the depths of assembler code.
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/page.h>
#include <asm/segment.h>
#include <asm/lguest.h>

// We mark the start of the code to copy
// It's placed in .text tho it's never run here
// You'll see the trick macro at the end
// Which interleaves data and text to effect.
.text
ENTRY(start_switcher_text)

// When we reach switch_to_guest we have just left
// The safe and comforting shores of C code
// %eax has the "struct lguest_pages" to use
// Where we save state and still see it from the Guest
// And %ebx holds the Guest shadow pagetable:
// Once set we have truly left Host behind.
ENTRY(switch_to_guest)
	// We told gcc all its regs could fade,
	// Clobbered by our journey into the Guest
	// We could have saved them, if we tried
	// But time is our master and cycles count.

	// Segment registers must be saved for the Host
	// We push them on the Host stack for later
	pushl	%es
	pushl	%ds
	pushl	%gs
	pushl	%fs
	// But the compiler is fickle, and heeds
	// No warning of %ebp clobbers
	// When frame pointers are used. That register
	// Must be saved and restored or chaos strikes.
	pushl	%ebp
	// The Host's stack is done, now save it away
	// In our "struct lguest_pages" at offset
	// Distilled into asm-offsets.h
	movl	%esp, LGUEST_PAGES_host_sp(%eax)

	// All saved and there's now five steps before us:
	// Stack, GDT, IDT, TSS
	// Then last of all the page tables are flipped.

	// Yet beware that our stack pointer must be
	// Always valid lest an NMI hits
	// %edx does the duty here as we juggle
	// %eax is lguest_pages: our stack lies within.
	movl	%eax, %edx
	addl	$LGUEST_PAGES_regs, %edx
	movl	%edx, %esp

	// The Guest's GDT we so carefully
	// Placed in the "struct lguest_pages" before
	lgdt	LGUEST_PAGES_guest_gdt_desc(%eax)

	// The Guest's IDT we did partially
	// Copy to "struct lguest_pages" as well.
	lidt	LGUEST_PAGES_guest_idt_desc(%eax)

	// The TSS entry which controls traps
	// Must be loaded up with "ltr" now:
	// The GDT entry that TSS uses
	// Changes type when we load it: damn Intel!
	// For after we switch over our page tables
	// That entry will be read-only: we'd crash.
	movl	$(GDT_ENTRY_TSS*8), %edx
	ltr	%dx

	// Look back now, before we take this last step!
	// The Host's TSS entry was also marked used;
	// Let's clear it again for our return.
	// The GDT descriptor of the Host
	// Points to the table after two "size" bytes
	movl	(LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
	// Clear "used" from type field (byte 5, second bit: mask 0x02)
	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)

	// Once our page table's switched, the Guest is live!
	// The Host fades as we run this final step.
	// Our "struct lguest_pages" is now read-only.
	movl	%ebx, %cr3

	// The page table change did one tricky thing:
	// The Guest's register page has been mapped
	// Writable under our %esp (stack) --
	// We can simply pop off all Guest regs.
	// (The order mirrors the pushes in SWITCH_TO_HOST.)
	popl	%eax
	popl	%ebx
	popl	%ecx
	popl	%edx
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%gs
	popl	%fs
	popl	%ds
	popl	%es

	// Near the base of the stack lurk two strange fields
	// Which we fill as we exit the Guest
	// These are the trap number and its error
	// We can simply step past them on our way.
	addl	$8, %esp

	// The last five stack slots hold return address
	// And everything needed to switch privilege
	// From Switcher's level 0 to Guest's 1,
	// And the stack where the Guest had last left it.
	// Interrupts are turned back on: we are Guest.
	iret

// We tread two paths to switch back to the Host
// Yet both must save Guest state and restore Host
// So we put the routine in a macro.
//
// When the macro is done, the registers stand thus:
// %eax holds the "struct lguest_pages" address,
// %ebx the trap number, %edx is mere scratch;
// %esp is the Host stack saved by switch_to_guest,
// From which %ebp and the segment regs were popped.
#define SWITCH_TO_HOST							\
	/* We save the Guest state: all registers first			\
	 * Laid out just as "struct lguest_regs" defines */		\
	pushl	%es;							\
	pushl	%ds;							\
	pushl	%fs;							\
	pushl	%gs;							\
	pushl	%ebp;							\
	pushl	%edi;							\
	pushl	%esi;							\
	pushl	%edx;							\
	pushl	%ecx;							\
	pushl	%ebx;							\
	pushl	%eax;							\
	/* Our stack and our code are using segments			\
	 * Set in the TSS and IDT					\
	 * Yet if we were to touch data we'd use			\
	 * Whatever data segment the Guest had.				\
	 * Load the lguest ds segment for now. */			\
	movl	$(LGUEST_DS), %eax;					\
	movl	%eax, %ds;						\
	/* So where are we? Which CPU, which struct?			\
	 * The stack is our clue: our TSS starts			\
	 * It at the end of "struct lguest_pages".			\
	 * Or we may have stumbled while restoring			\
	 * Our Guest segment regs while in switch_to_guest,		\
	 * The fault pushed atop that part-unwound stack.		\
	 * If we round the stack down to the page start			\
	 * We're at the start of "struct lguest_pages".			\
	 * (The mask is parenthesised in full: the assembler		\
	 * binds << tighter than -, while C does the reverse.) */	\
	movl	%esp, %eax;						\
	andl	$(~((1 << PAGE_SHIFT) - 1)), %eax;			\
	/* Save our trap number: the switch will obscure it		\
	 * (In the Host the Guest regs are not mapped here)		\
	 * %ebx holds it safe for deliver_to_host */			\
	movl	LGUEST_PAGES_regs_trapnum(%eax), %ebx;			\
	/* The Host GDT, IDT and stack!					\
	 * All these lie safely hidden from the Guest:			\
	 * We must return to the Host page tables			\
	 * (Hence that was saved in struct lguest_pages) */		\
	movl	LGUEST_PAGES_host_cr3(%eax), %edx;			\
	movl	%edx, %cr3;						\
	/* As before, when we looked back at the Host			\
	 * As we left and marked TSS unused				\
	 * So must we now for the Guest left behind. */			\
	andb	$0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
	/* Switch to Host's GDT, IDT. */				\
	lgdt	LGUEST_PAGES_host_gdt_desc(%eax);			\
	lidt	LGUEST_PAGES_host_idt_desc(%eax);			\
	/* Restore the Host's stack where its saved regs lie */		\
	movl	LGUEST_PAGES_host_sp(%eax), %esp;			\
	/* Last the TSS: our Host is returned */			\
	movl	$(GDT_ENTRY_TSS*8), %edx;				\
	ltr	%dx;							\
	/* Restore now the regs saved right at the first. */		\
	popl	%ebp;							\
	popl	%fs;							\
	popl	%gs;							\
	popl	%ds;							\
	popl	%es

// The first path is trod when the Guest has trapped:
// (Which trap it was has been pushed on the stack).
// We need only switch back, and the Host will decode
// Why we came home, and what needs to be done.
return_to_host:
	SWITCH_TO_HOST
	iret

// We are led to the second path like so:
// An interrupt, with some cause external
// Has ajerked us rudely from the Guest's code
// Again we must return home to the Host
deliver_to_host:
	SWITCH_TO_HOST
	// But now we must go home via that place
	// Where that interrupt was supposed to go
	// Had we not been ensconced, running the Guest.
	// Here we see the trickness of run_guest_once():
	// The Host stack is formed like an interrupt
	// With EIP, CS and EFLAGS layered.
	// Interrupt handlers end with "iret"
	// And that will take us home at long long last.

	// But first we must find the handler to call!
	// The IDT descriptor for the Host
	// Has two bytes for size, and four for address:
	// %edx will hold it for us for now.
	movl	(LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
	// We now know the table address we need,
	// And saved the trap's number inside %ebx.
	// Yet the pointer to the handler is smeared
	// Across the bits of the table entry.
	// What oracle can tell us how to extract
	// From such a convoluted encoding?
	// I consulted gcc, and it gave
	// These instructions, which I gladly credit:
	// (Low word of the entry gives the low sixteen bits,
	// The top half of its second long the high sixteen.)
	leal	(%edx,%ebx,8), %eax
	movzwl	(%eax),%edx
	movl	4(%eax), %eax
	xorw	%ax, %ax
	orl	%eax, %edx
	// Now the address of the handler's in %edx
	// We call it now: its "iret" drops us home.
	jmp	*%edx

// Every interrupt can come to us here
// But we must truly tell each apart.
// They number two hundred and fifty six
// And each must land in a different spot,
// Push its number on stack, and join the stream.

// And worse, a mere seven of the traps stand apart
// And push on their stack an addition:
// An error number, thirty two bits long
// So we punish the other two forty-nine
// And make them push a zero so they match.

// Yet two fifty six entries is long
// And all will look most the same as the last
// So we create a macro which can make
// As many entries as we need to fill.

// Note the change to .data then .text:
// We plant the address of each entry
// Into a (data) table for the Host
// To know where each Guest interrupt should go.
.macro IRQ_STUB N TARGET
	.data; .long 1f; .text; 1:
 // Trap eight, ten through fourteen and seventeen
 // Supply an error number. Else zero.
 .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
	pushl	$0
 .endif
	pushl	$\N
	jmp	\TARGET
	ALIGN
.endm

// This macro creates numerous entries
// Using GAS macros which out-power C's.
.macro IRQ_STUBS FIRST LAST TARGET
 irq=\FIRST
 .rept \LAST-\FIRST+1
	IRQ_STUB irq \TARGET
  irq=irq+1
 .endr
.endm

// Here's the marker for our pointer table
// Laid in the data section just before
// Each macro places the address of code
// Forming an array: each one points to text
// Which handles interrupt in its turn.
// The table of stub addresses begins at this label:
// Each IRQ_STUB below appends one .long to .data
// And lays its matching stub in .text, in order,
// So entry N of the table points to stub N.
.data
.global default_idt_entries
default_idt_entries:
.text
	// The first two traps go straight back to the Host
	IRQ_STUBS 0 1 return_to_host
	// We'll say nothing, yet, about NMI
	IRQ_STUB 2 handle_nmi
	// Other traps also return to the Host
	IRQ_STUBS 3 31 return_to_host
	// All interrupts go via their handlers
	IRQ_STUBS 32 127 deliver_to_host
	// 'Cept system calls coming from userspace
	// Are to go to the Guest, never the Host.
	IRQ_STUB 128 return_to_host
	IRQ_STUBS 129 255 deliver_to_host

// The NMI, what a fabulous beast
// Which swoops in and stops us no matter that
// We're suspended between heaven and hell,
// (Or more likely between the Host and Guest)
// When in it comes! We are dazed and confused
// So we do the simplest thing which one can.
// Though we've pushed the trap number and zero
// We discard them, return, and hope we live.
handle_nmi:
	addl	$8, %esp
	iret

// We are done; all that's left is Mastery
// And "make Mastery" is a journey long
// Designed to make your fingers itch to code.

// Here ends the text, the file and poem.
ENTRY(end_switcher_text)