userfaultfdについてのメモ
まえがき
TWDのシーズン10がNetflixで公開されてたので,見ていたら遅れました(多分).
この記事はIPFactory Advent Calendar 2020の3日目の分です.
IPFactoryというサークルについてはこちらをご覧ください.
ネタが無かったので,最近Userfaultfdについて調べたものの,日本語の資料がほぼ無くて困った時に書いたメモをまとめておきます.
環境
vagrant@ubuntu2004:~$ uname -r
5.4.0-47-generic
userfaultfd
ここに書いてある事はmanと公式ドキュメントを読めばわかる.
あくまでメモである為,参考程度に考えて欲しい.
#include <sys/types.h>
#include <linux/userfaultfd.h>
int userfaultfd(int flags);
glibcのラッパーはまだ無い.
ユーザ空間でページフォルトのハンドリングをする為のオブジェクトを生成し,そのファイルディスクリプタを返す.
flags
に指定出来るのは以下の値
O_CLOEXEC
O_NONBLOCK
ここら辺はopen
と同じなので省略
作成はuserfaultfd
だけど,操作はread
とかfcntl
とかいつものioctl
とか.
userfaultfd
に対するioctl
についてはman 2 ioctl_userfaultfd
見ればいい.
man 2 userfaultfdに書いてあるPoCの解説
APIの説明とかは,文章で説明するよりもコードで読んだ方がわかりやすいと個人的に思っているので,そうする.
非常にありがたい事に,manにPoCが置いてあるので,そのコードを元に動作の説明をする.
ちょっと冗長な実装だったりしたところを書き換えているが本質的な機能は変わっていないはず.err(3)
を使っているくらい.
Code
#define _GNU_SOURCE
#include <sys/types.h>
#include <stdio.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <poll.h>
#include <err.h>
static int page_size;
/*
 * Thread entry point that services page faults reported through a
 * userfaultfd descriptor.
 *
 * arg: the userfaultfd file descriptor, passed as an integer cast to a
 *      pointer by the thread's creator.
 *
 * The thread loops forever: it waits for an event with poll(), reads one
 * struct uffd_msg, and resolves the fault by copying a prepared page onto
 * the faulting page with UFFDIO_COPY. Every failure terminates the whole
 * process with exit status 1, so the function never returns.
 */
static void *fault_handler_thread(void *arg)
{
    static struct uffd_msg msg;  /* data read from uffd */
    static int fault_cnt = 0;    /* number of faults handled so far */
    static char *page = NULL;    /* source page handed to UFFDIO_COPY */
    long uffd = (long)arg;

    /* Allocate the source page once, on first entry. */
    if (page == NULL) {
        page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED)
            err(1, "mmap");
    }

    for (;;) {
        struct pollfd pollfd;
        struct uffdio_copy uffdio_copy;
        ssize_t nread;
        int nready;

        /* Block until the descriptor has an event to report. */
        pollfd.fd = uffd;
        pollfd.events = POLLIN;
        nready = poll(&pollfd, 1, -1);
        if (nready == -1)
            err(1, "poll");

        printf("\nfault_handler_thread():\n");
        printf("\tpoll() returns: nready = %d; "
               "POLLIN = %d; POLLERR = %d\n", nready,
               (pollfd.revents & POLLIN) != 0,
               (pollfd.revents & POLLERR) != 0);

        /* Read one event from the uffd. */
        nread = read(uffd, &msg, sizeof(msg));
        if (nread == 0)
            /* errx, not err: errno is not meaningful at EOF. */
            errx(1, "EOF on userfaultfd!");
        if (nread == -1)
            err(1, "read");

        /* Only page-fault events are expected here. */
        if (msg.event != UFFD_EVENT_PAGEFAULT)
            /* errx, not err: no system call failed, so errno is stale. */
            errx(1, "Unexpected event on userfaultfd");

        /* Report the page-fault event. */
        printf("\tUFFD_EVENT_PAGEFAULT event: ");
        printf("flags = %llx; ", msg.arg.pagefault.flags);
        printf("address = %llx\n", msg.arg.pagefault.address);

        /*
         * Fill the source page with a byte that changes on every fault,
         * so it is obvious that each fault is handled separately. The
         * sequence starts at 'A' to match the sample output (A, B, C, ...).
         */
        memset(page, 'A' + fault_cnt % 20, page_size);
        fault_cnt++;

        uffdio_copy.src = (unsigned long)page;
        /*
         * Faults are resolved in units of pages, so round the faulting
         * address down to a page boundary.
         */
        uffdio_copy.dst = (unsigned long)msg.arg.pagefault.address & ~(page_size - 1);
        uffdio_copy.len = page_size;
        uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1)
            err(1, "ioctl(UFFDIO_COPY)");

        printf("\t(uffdio_copy.copy returned %lld)\n", uffdio_copy.copy);
    }
}
/*
 * Demonstrates userfaultfd: maps num-pages anonymous pages, registers them
 * with a userfaultfd object, starts fault_handler_thread() to service the
 * faults, and then reads the mapping every 1024 bytes to trigger them.
 *
 * argv[1]: number of pages to map (parsed by strtoul with base 0).
 * Exits with status 1 on a usage error or on any failed call.
 */
int main(int argc, char **argv)
{
    long uffd;
    char *addr;
    char *end;
    unsigned long npages;
    unsigned long len;
    pthread_t thr;
    struct uffdio_api uffdio_api;
    struct uffdio_register uffdio_register;
    int s;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s num-pages\n", argv[0]);
        return 1;
    }

    page_size = sysconf(_SC_PAGE_SIZE);
    if (page_size <= 0)
        errx(1, "sysconf(_SC_PAGE_SIZE) failed");

    /* Reject non-numeric input, zero, and page counts whose byte size overflows. */
    errno = 0;
    npages = strtoul(argv[1], &end, 0);
    if (errno != 0 || end == argv[1] || *end != '\0' ||
        npages == 0 || npages > ~0UL / (unsigned long)page_size)
        errx(1, "invalid num-pages: %s", argv[1]);
    len = npages * (unsigned long)page_size;

    /* Create and enable the userfaultfd object. */
    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
        err(1, "userfaultfd");

    uffdio_api.api = UFFD_API;
    uffdio_api.features = 0;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
        err(1, "ioctl(UFFDIO_API)");

    /*
     * Create a private anonymous mapping. The memory is demand-zero
     * paged, that is, not yet allocated. When we actually touch the
     * memory, it will be allocated via the userfaultfd.
     */
    addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED)
        err(1, "mmap");
    printf("Address returned by mmap() = %p\n", (void *)addr);

    /*
     * Register the memory range of the mapping we just created for
     * handling by the uffd object. In mode, we request to track
     * missing pages (i.e., pages that have not yet been faulted in).
     */
    uffdio_register.range.start = (unsigned long)addr;
    uffdio_register.range.len = len;
    uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
        err(1, "ioctl(UFFDIO_REGISTER)");

    /* Create a thread that will process the userfaultfd events. */
    s = pthread_create(&thr, NULL, fault_handler_thread, (void *)uffd);
    if (s != 0) {
        /* pthread_create returns the error number instead of setting errno. */
        errno = s;
        err(1, "pthread_create");
    }

    /*
     * Read one byte every 1024 bytes. The offset has the same unsigned
     * type as len, so the comparison is exact for any mapping size and
     * every access stays inside the mapping.
     */
    for (unsigned long off = 0xf; off < len; off += 1024) {
        char c = addr[off];

        printf("Read address %p in main: ", (void *)(addr + off));
        printf("%c\n", c);
        usleep(100000);
    }

    exit(EXIT_SUCCESS);
}
Usage
userfaultfd
はマルチスレッドプログラムに於いて他のスレッドのページングをユーザ空間で行う事が出来る.
プロセスのスレッド内でページフォルトが発生した時,userfaultfd
に登録されている場合,faultが発生したスレッドはスリープ状態になり,イベントが生成される.
faultを処理するスレッドは,発生したイベントをioctl
で操作する.
userfaultfd
でfdを取得
long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
struct uffdio_api
をioctl UFFDIO_API
で取得したfdに設定する.
今回はページフォルトのイベントのみを取得したい.
ページフォルトの通知機能はデフォルトで有効化されているのでuffdio_api.features
は0でいい.
struct uffdio_api uffdio_api;
uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
err(1, "ioctl(UFFDIO_API)");
適切な権限でmapした領域を登録する
/* Size of the region: the page size times the page count given in argv[1]. */
unsigned long len = sysconf(_SC_PAGE_SIZE) * strtoul(argv[1], NULL, 0);
char *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED)
    err(1, "mmap");
/* Ask the uffd object to track missing pages in [addr, addr + len). */
struct uffdio_register uffdio_register;
uffdio_register.range.start = (unsigned long)addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
    err(1, "ioctl(UFFDIO_REGISTER)");
以下はハンドリングする側のスレッドの操作
uffd
からの情報はstruct uffd_msg
,poll
とかで良い感じのタイミングに読む
static struct uffd_msg msg;
read(uffd, &msg, sizeof(msg));
if (msg.event == UFFD_EVENT_PAGEFAULT)
printf("PAGEFAULT at %llx\n", msg.arg.pagefault.address);
で,面白いのがここから.
static char *page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
memset(page , 'A', page_size);
良い感じのデータを用意して,
struct uffdio_copy uffdio_copy;
uffdio_copy.src = (unsigned long)page;
uffdio_copy.dst = (unsigned long)msg.arg.pagefault.address & ~(page_size - 1);
uffdio_copy.len = page_size;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1)
err(1, "ioctl(UFFDIO_COPY)");
やると,なんとuffdio_copy
の名前の通りデータをコピーできる.すごいね.
上の処理をする関数を別スレッドで生やす.
s = pthread_create(&thr, NULL, fault_handler_thread, (void*)uffd);
if (s != 0)
(errno = s), err(1, "pthread_create");
メインスレッドでは以下の処理でページフォルトを発生させる.
int l;
l = 0xf;
while (l < len) {
char c = addr[l];
printf("Read address %p in main: ", addr + l);
print("%c \n", c);
l += 1024;
usleep(100000);
}
mmap
したページに対する初回アクセス時にPage Faultが発生するのを利用して,そのFaultをトリガーに先程作成したスレッドでデータを書き換えている.
Page Faultはページサイズ毎に発生するので,例えばページサイズが4096だった場合,4096/1024
で4回毎にfault_handler_thread
が実行される.
Address returned by mmap() = 0x7fa0c4891000
fault_handler_thread():
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fa0c4891000
(uffdio_copy.copy returned 4096)
Read address 0x7fa0c489100f in main: A
Read address 0x7fa0c489140f in main: A
Read address 0x7fa0c489180f in main: A
Read address 0x7fa0c4891c0f in main: A
fault_handler_thread():
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fa0c4892000
(uffdio_copy.copy returned 4096)
Read address 0x7fa0c489200f in main: B
Read address 0x7fa0c489240f in main: B
Read address 0x7fa0c489280f in main: B
Read address 0x7fa0c4892c0f in main: B
fault_handler_thread():
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fa0c4893000
(uffdio_copy.copy returned 4096)
Read address 0x7fa0c489300f in main: C
Read address 0x7fa0c489340f in main: C
Read address 0x7fa0c489380f in main: C
Read address 0x7fa0c4893c0f in main: C
まとめ
実はmmap
で確保した領域へのアクセス時にページフォルトが発生してるのを知らずにびっくりした.
デマンドページングとか概念は知ってたけど,まさかページフォルトが発生してるとは思わなんだ.
明日12/4はry0kvnによる「WOW64でのSystemCallをトレースしてみる」です.ご期待下さい.
以下.マジのメモです.ブログ用にまとめたりしてないので,今後書き加えるかもしれません.
PoCに無い部分
event
userfaultfd
で確認,対処できるイベントは
pagefault
fork
remap
remove
unmap
で,読んできたuffd_msg.event
にUFFD_EVENT_PAGEFAULT
みたいに設定されている.
それぞれ有効化するには,uffdio_api.features
にフラグを設定してioctl
する必要がある.
uffd_msg
eventは名前の通りeventの種類が,
event毎の情報は共用体としてuffd_msg.arg
にある
/*
 * Message read from a userfaultfd descriptor. `event` identifies the kind
 * of event; `arg` is a union holding the payload for that event kind.
 */
struct uffd_msg {
/* Event kind; the handler in this article compares it with UFFD_EVENT_PAGEFAULT. */
__u8 event;
__u8 reserved1;
__u16 reserved2;
__u32 reserved3;
/* Per-event payload; only the member matching `event` should be read. */
union {
/* Page-fault payload: the fault flags and the faulting address. */
struct {
__u64 flags;
__u64 address;
union {
/* NOTE(review): presumably filled only when UFFD_FEATURE_THREAD_ID is requested - confirm. */
__u32 ptid;
} feat;
} pagefault;
/* Payload of the fork event. NOTE(review): ufd looks like a new userfaultfd descriptor - confirm. */
struct {
__u32 ufd;
} fork;
/* Payload of the remap event. NOTE(review): presumably old address, new address, length - confirm. */
struct {
__u64 from;
__u64 to;
__u64 len;
} remap;
/* Payload named `remove`. NOTE(review): presumably the affected address range - confirm. */
struct {
__u64 start;
__u64 end;
} remove;
struct {
/* unused reserved fields */
__u64 reserved1;
__u64 reserved2;
__u64 reserved3;
} reserved;
} arg;
} __packed;
uffdio_api
/*
 * Argument of the UFFDIO_API ioctl. Userland fills in `api` and the
 * requested `features`; the kernel answers with the features it supports.
 */
struct uffdio_api {
/* userland asks for an API number and the features to enable */
__u64 api;
/*
* Kernel answers below with the all available features for
* the API, this notifies userland of which events and/or
* which flags for each event are enabled in the current
* kernel.
*
* Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
* are to be considered implicitly always enabled in all kernels as
* long as the uffdio_api.api requested matches UFFD_API.
*
* UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER
* with UFFDIO_REGISTER_MODE_MISSING mode will succeed on
* hugetlbfs virtual memory ranges. Adding or not adding
* UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has
* no real functional effect after UFFDIO_API returns, but
* it's only useful for an initial feature set probe at
* UFFDIO_API time. There are two ways to use it:
*
* 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the
* uffdio_api.features before calling UFFDIO_API, an error
* will be returned by UFFDIO_API on a kernel without
* hugetlbfs missing support
*
* 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in
* uffdio_api.features and instead it will be set by the
* kernel in the uffdio_api.features if the kernel supports
* it, so userland can later check if the feature flag is
* present in uffdio_api.features after UFFDIO_API
* succeeded.
*
* UFFD_FEATURE_MISSING_SHMEM works the same as
* UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
* (i.e. tmpfs and other shmem based APIs).
*
* UFFD_FEATURE_SIGBUS feature means no page-fault
* (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
* a SIGBUS signal will be sent to the faulting process.
*
* UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
* be returned, if feature is not requested 0 will be returned.
*/
/* Feature bits that can be set in or reported through `features`. */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
#define UFFD_FEATURE_EVENT_REMAP (1<<2)
#define UFFD_FEATURE_EVENT_REMOVE (1<<3)
#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4)
#define UFFD_FEATURE_MISSING_SHMEM (1<<5)
#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
#define UFFD_FEATURE_SIGBUS (1<<7)
#define UFFD_FEATURE_THREAD_ID (1<<8)
/* Bitmask of the UFFD_FEATURE_* values above; 0 is enough for plain page-fault events. */
__u64 features;
/* NOTE(review): presumably a kernel-filled mask of supported ioctls - confirm. */
__u64 ioctls;
};
現在のカーネルでサポートしているイベントなどの情報が得られる.
基本的にUFFD_EVENT_PAGEFAULT
とUFFD_PAGEFAULT_FLAG_WRITE
はデフォルトで有効化されているのでPoCのようなコードは動く.
uffdio_register
/*
 * Argument of the UFFDIO_REGISTER ioctl: the address range to register
 * with the userfaultfd object and the tracking mode for that range.
 */
struct uffdio_register {
/* Range to register (start address and length). */
struct uffdio_range range;
/* MODE_MISSING tracks pages that have not yet been faulted in. */
#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
/* NOTE(review): presumably write-protect tracking - confirm. */
#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
/* One of the UFFDIO_REGISTER_MODE_* values above. */
__u64 mode;
/*
* kernel answers which ioctl commands are available for the
* range, keep at the end as the last 8 bytes aren't read.
*/
__u64 ioctls;
};
uffdio_range
/*
 * Address range used by the userfaultfd ioctls: `start` is the first
 * address of the range and `len` is its length in bytes.
 */
struct uffdio_range {
    __u64 start;
    __u64 len;
};
Comments