xv6 Lab3: page tables
Lapin Gris Lv3

Page table 用来把进程的虚拟地址映射到物理地址。有了 page table,进程看到的只是虚拟地址;对操作系统来说,有了 page table 就可以实现更多管理层面的控制,例如隔离(isolation)、保护(guarding)等。

Speed up system calls

加速 sys_getpid 这个系统调用。资源抽象有很多维度,我们内核对外提供服务的抽象就是 proc(process 的缩写)。proc 是 xv6 内核提供的抽象,其标识符就是 pid。也就是说 pid 保存在内核里,用户态想获取自身的 pid 只能通过 syscall 来实现。proc 的 pid 是放在内核页里的,用户态看不到;我们加速 getpid syscall 的思路很简单:把 pid 另存一份到用户态可读的页里就行了。

Linux 内核获取时间的系统调用 gettimeofday() 也是这个思路:在用户态映射一个只读的页,里面存储时间。在 Linux 上这种优化手段叫做 vsyscall/vDSO。

Linux vdso speed up

xv6 里,我们这个页叫做 USYSCALL,被安排在紧挨 TRAPFRAME 页下方的那一页(地址比 TRAPFRAME 低一个 PGSIZE)。既然 USYSCALL 这个页已经映射到用户态,那么获取 pid 的逻辑也要在用户态实现一遍。xv6 里,它实现在用户库 user/ulib.c 中(ugetpid())。

这种加速思路还可以用来加速各种各样的 syscall。USYSCALL 这个页有 4 KB 大小,现在只使用了一个 int 字段,还有很多空间可以放置其他字段,用于加速其他 syscall。

Show me the code

diff --git a/kernel/proc.c b/kernel/proc.c
index 58a8a0b..cf059d3 100644
--- a/kernel/proc.c
+++ b/kernel/proc.c
@@ -132,6 +132,14 @@ allocproc(void)
return 0;
}

+ // Allocate a USYSCALL page.
+ if((p->usyscall = (struct usyscall *)kalloc()) == 0){
+ freeproc(p);
+ release(&p->lock);
+ return 0;
+ }
+ p->usyscall->pid = p->pid;
+
// An empty user page table.
p->pagetable = proc_pagetable(p);
if(p->pagetable == 0){
@@ -158,6 +166,9 @@ freeproc(struct proc *p)
if(p->trapframe)
kfree((void*)p->trapframe);
p->trapframe = 0;
+ if(p->usyscall)
+ kfree((void*)p->usyscall);
+ p->usyscall = 0;
if(p->pagetable)
proc_freepagetable(p->pagetable, p->sz);
p->pagetable = 0;
@@ -202,6 +213,15 @@ proc_pagetable(struct proc *p)
return 0;
}

+  // Map the USYSCALL page just below the TRAPFRAME page, read-only and
+  // user-accessible, so user code can read it without entering the kernel.
+  if(mappages(pagetable, USYSCALL, PGSIZE,
+              (uint64)(p->usyscall), PTE_R | PTE_U) < 0){
+    // The USYSCALL mapping is the one that failed, so there is nothing
+    // to unmap for it. Instead, remove the TRAMPOLINE and TRAPFRAME
+    // mappings (the same pair proc_freepagetable() removes) before
+    // freeing the page-table pages.
+    // NOTE(review): assumes both are already mapped at this point in
+    // proc_pagetable() -- confirm against the full function.
+    uvmunmap(pagetable, TRAMPOLINE, 1, 0);
+    uvmunmap(pagetable, TRAPFRAME, 1, 0);
+    uvmfree(pagetable, 0);
+    return 0;
+  }
+
return pagetable;
}

@@ -212,6 +232,7 @@ proc_freepagetable(pagetable_t pagetable, uint64 sz)
{
uvmunmap(pagetable, TRAMPOLINE, 1, 0);
uvmunmap(pagetable, TRAPFRAME, 1, 0);
+ uvmunmap(pagetable, USYSCALL, 1, 0);
uvmfree(pagetable, sz);
}

diff --git a/kernel/proc.h b/kernel/proc.h
index d021857..155da44 100644
--- a/kernel/proc.h
+++ b/kernel/proc.h
@@ -100,6 +100,7 @@ struct proc {
uint64 sz; // Size of process memory (bytes)
pagetable_t pagetable; // User page table
struct trapframe *trapframe; // data page for trampoline.S
+ struct usyscall *usyscall; // page for fast syscall
struct context context; // swtch() here to run process
struct file *ofile[NOFILE]; // Open files
struct inode *cwd; // Current directory

为什么 pagetable 是 3 层?

硬件设计使然,看 xv6-book 里面的解释

As Figure 3.2 shows, a RISC-V CPU translates a virtual address into a physical in three steps. A page table is stored in physical memory as a three-level tree. The root of the tree is a 4096-byte page-table page that contains 512 PTEs, which contain the physical addresses for page-table pages in the next level of the tree. Each of those pages contains 512 PTEs for the final level in the tree. The paging hardware uses the top 9 bits of the 27 bits to select a PTE in the root page-table page, the middle 9 bits to select a PTE in a page-table page in the next level of the tree, and the bottom 9 bits to select the final PTE. (In Sv48 RISC-V a page table has four levels, and bits 39 through 47 of a virtual address index into the top-level.)

需要注意的是,当 PTE (pagetable entry) 的 V 位有效、而 R/W/X 均没有置位的时候,代表它指向的是下一级 pagetable。

if((pte & PTE_V) && (pte & (PTE_R|PTE_W|PTE_X)) == 0){
// this PTE points to a lower-level page table.
}

Show me the code

diff --git a/kernel/defs.h b/kernel/defs.h
index a3c962b..bdbd654 100644
--- a/kernel/defs.h
+++ b/kernel/defs.h
@@ -173,6 +173,7 @@ uint64 walkaddr(pagetable_t, uint64);
int copyout(pagetable_t, uint64, char *, uint64);
int copyin(pagetable_t, char *, uint64, uint64);
int copyinstr(pagetable_t, char *, uint64, uint64);
+void vmprint(pagetable_t pagetable);

// plic.c
void plicinit(void);
diff --git a/kernel/exec.c b/kernel/exec.c
index e18bbb6..b5adef2 100644
--- a/kernel/exec.c
+++ b/kernel/exec.c
@@ -128,6 +128,11 @@ exec(char *path, char **argv)
p->trapframe->sp = sp; // initial stack pointer
proc_freepagetable(oldpagetable, oldsz);

+ // print pagetable
+ if(p->pid == 1) {
+ vmprint(pagetable);
+ }
+
return argc; // this ends up in a0, the first argument to main(argc, argv)

bad:
diff --git a/kernel/vm.c b/kernel/vm.c
index 5c31e87..0d7338d 100644
--- a/kernel/vm.c
+++ b/kernel/vm.c
@@ -449,3 +449,29 @@ copyinstr(pagetable_t pagetable, char *dst, uint64 srcva, uint64 max)
return -1;
}
}
+
+// Print every valid PTE in `pagetable`, then recurse into each entry
+// that points to a lower-level page table.
+//   pagetable: the page-table page to dump
+//   level:     depth below the root (0 for the root); every line is
+//              prefixed with level+1 copies of ".."
+void pte_print_walk(pagetable_t pagetable, int level) {
+  // there are 2^9 = 512 PTEs in a page table.
+  for (int idx = 0; idx < 512; idx++) {
+    pte_t pte = pagetable[idx];
+    if (!(pte & PTE_V)) {
+      continue;
+    }
+
+    // The indent counter has its own name so that it does not shadow
+    // the PTE index printed on the next line.
+    for (int depth = 0; depth <= level; depth++) {
+      printf("..");
+    }
+    printf("%d: pte %p pa %p\n", idx, pte, PTE2PA(pte));
+
+    if ((pte & (PTE_R | PTE_W | PTE_X)) == 0) {
+      // this PTE points to a lower-level page table.
+      uint64 child = PTE2PA(pte);
+      pte_print_walk((pagetable_t)child, level + 1);
+    }
+  }
+}
+
+// Print the address of the root page table followed by a dump of all
+// valid PTEs reachable from it.
+void vmprint(pagetable_t pagetable) {
+  printf("page table %p\n", pagetable);
+  pte_print_walk(pagetable, 0);
+}

Detect which pages have been accessed

查看 xv6-book,其中 Figure 3.2 写了 RISC-V 分页(paging)的细节。
其中 PTE 的 bit 6 就是本次实验要用的 accessed bit(A 位)。
image.png

根据最后的实现,我们只声明了 pte 的 bit 6 为 PTE_A,但并没有看到任何的置位代码。所以,PTE_A 应该是由硬件(CPU)置位的:因为 xv6 里地址翻译是硬件完成的,访问页面时置位自然也由硬件来做。(RISC-V 特权规范也允许实现不自动置位、而是在 A 位为 0 时触发 page fault 交给软件处理;本实验运行的 QEMU 采用的是硬件自动置位的方式。)

Show me the code

diff --git a/kernel/riscv.h b/kernel/riscv.h
index 20a01db..100895e 100644
--- a/kernel/riscv.h
+++ b/kernel/riscv.h
@@ -343,6 +343,7 @@ typedef uint64 *pagetable_t; // 512 PTEs
#define PTE_W (1L << 2)
#define PTE_X (1L << 3)
#define PTE_U (1L << 4) // user can access
+#define PTE_A (1L << 6)

// shift a physical address to the right place for a PTE.
#define PA2PTE(pa) ((((uint64)pa) >> 12) << 10)
diff --git a/kernel/sysproc.c b/kernel/sysproc.c
index 88644b2..296935f 100644
--- a/kernel/sysproc.c
+++ b/kernel/sysproc.c
@@ -71,11 +71,36 @@ sys_sleep(void)


#ifdef LAB_PGTBL
-int
-sys_pgaccess(void)
-{
- // lab pgtbl: your code here.
- return 0;
+// Report which pages in a range of the calling process have been accessed.
+//
+// Syscall arguments:
+//   0: user virtual address of the first page to check
+//   1: number of pages to check
+//   2: user address of a bitmask buffer; bit i is set when page i had
+//      PTE_A set (page 0 is the least significant bit)
+//
+// PTE_A is cleared on every page that is reported as accessed.
+// Returns 0 on success, -1 on an out-of-range argument or a failed copyout.
+int sys_pgaccess(void) {
+  uint64 baseaddr;
+  int upper;
+  uint64 outaddr;
+  uint64 abits = 0;
+
+  argaddr(0, &baseaddr);
+  argint(1, &upper);
+  argaddr(2, &outaddr);
+
+  // abits carries one bit per page, so at most 64 pages can be reported.
+  if (upper < 0 || upper > 8 * (int)sizeof(abits)) {
+    return -1;
+  }
+
+  struct proc *p = myproc();
+
+  for (int i = 0; i < upper; i++) {
+    uint64 va = baseaddr + (uint64)i * PGSIZE;
+
+    // Reject addresses outside the virtual address space before
+    // handing them to walk().
+    if (va >= MAXVA) {
+      return -1;
+    }
+
+    pte_t *pte = walk(p->pagetable, va, 0);
+    // No PTE, or an invalid one: nothing is mapped there, so the page
+    // is reported as not accessed.
+    if (pte == 0 || (*pte & PTE_V) == 0) {
+      continue;
+    }
+
+    if (*pte & PTE_A) {
+      abits |= (uint64)1 << i;  // 64-bit shift: `1 << i` is an int shift
+      *pte &= ~PTE_A;           // reset PTE_A
+    }
+  }
+
+  // Copy out only the bytes that carry result bits, so a caller that
+  // asked about N pages needs a buffer of just ceil(N/8) bytes.
+  // Taking the leading bytes of abits relies on little-endian layout.
+  int nbytes = (upper + 7) / 8;
+  if (copyout(p->pagetable, outaddr, (char *)&abits, nbytes) < 0) {
+    return -1;
+  }
+
+  return 0;
+}
#endif

测试结果

最后,make grade 查看结果,

== Test pgtbltest ==
$ make qemu-gdb
(3.8s)
== Test pgtbltest: ugetpid ==
pgtbltest: ugetpid: OK
== Test pgtbltest: pgaccess ==
pgtbltest: pgaccess: OK
== Test pte printout ==
$ make qemu-gdb
pte printout: OK (0.7s)
== Test answers-pgtbl.txt ==
answers-pgtbl.txt: OK
== Test usertests ==
$ make qemu-gdb
(141.1s)
== Test usertests: all tests ==
usertests: all tests: OK
== Test time ==
time: OK
Score: 46/46

Reference

1、6.181: Q&A Labs (PGTBL) 2023

2、Implementing virtual system calls