まえがき

TWDのシーズン10がNetflixで公開されてたので,見ていたら遅れました(多分).

この記事はIPFactory Advent Calendar 2020の3日目の分です.

IPFactoryというサークルについてはこちらをご覧ください.

ネタが無かったので,最近Userfaultfdについて調べたものの,日本語の資料がほぼ無くて困った時に書いたメモをまとめておきます.

環境

vagrant@ubuntu2004:~$ uname -r
5.4.0-47-generic

userfaultfd

ここに書いてある事はmanや公式ドキュメントを読めばわかる.

あくまでメモである為,参考程度に考えて欲しい.

#include <sys/types.h>
#include <linux/userfaultfd.h>

int userfaultfd(int flags);

glibcのラッパーはまだ無い.

ユーザ空間でページフォルトのハンドリングをする為のオブジェクトを生成し,そのファイルディスクリプタを返す. flagsに指定出来るのは以下の値: O_CLOEXEC,O_NONBLOCK.

ここら辺はopenと同じなので省略

作成はuserfaultfdだけど,操作はreadとかfcntlとかいつものioctlとか.

userfaultfdに対するioctlについてはman 2 ioctl_userfaultfd見ればいい.

man 2 userfaultfdに書いてあるPoCの解説

APIの説明とかは,文章で説明するよりもコードで読んだ方がわかりやすいと個人的に思っているので,そうする.

非常にありがたい事に,manにPoCが置いてあるので,そのコードを元に動作の説明をする.

ちょっと冗長な実装だったりしたところを書き換えているが本質的な機能は変わっていないはず.err(3)を使っているくらい.

Code

#define _GNU_SOURCE
#include <sys/types.h>
#include <stdio.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <poll.h>
#include <err.h>

/* System page size in bytes; main() sets it before starting the handler thread. */
static int page_size;

/*
 * Thread entry point that services page faults reported by a userfaultfd.
 *
 * arg: the userfaultfd file descriptor, passed as an integer cast to void *.
 *
 * Loops forever: waits on the descriptor with poll(), reads one uffd_msg,
 * and resolves the fault with UFFDIO_COPY using a scratch page whose fill
 * byte changes on every fault. Never returns; any failure terminates the
 * whole process via err()/errx().
 */
static void* fault_handler_thread(void *arg) {
    static struct uffd_msg msg;     // data read from uffd
    static int fault_cnt = 0;       // number of faults handled so far
    static char *page = NULL;       // scratch page used as the copy source
    long uffd = (long)arg;
    struct uffdio_copy uffdio_copy;
    ssize_t nread;

    // Allocate the scratch page once; it is reused for every fault.
    if (page == NULL) {
        page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED)
            err(1, "mmap");
    }

    while (1) {
        struct pollfd pollfd;
        int nready;

        pollfd.fd = uffd;
        pollfd.events = POLLIN;
        nready = poll(&pollfd, 1, -1);
        if (nready == -1)
            err(1, "poll");

        printf("\nfault_handler_thread():\n");
        printf("\tpoll() returns: nready = %d; "
                "POLLIN = %d; POLLERR = %d\n", nready,
                (pollfd.revents & POLLIN) != 0,
                (pollfd.revents & POLLERR) != 0);

        // Read an event from the uffd.
        nread = read(uffd, &msg, sizeof(msg));
        if (nread == 0)
            errx(1, "EOF on userfaultfd!");     // errno is not set here, so use errx()

        if (nread == -1)
            err(1, "read");

        // Only page-fault events are expected; errno is meaningless here too.
        if (msg.event != UFFD_EVENT_PAGEFAULT)
            errx(1, "Unexpected event on userfaultfd");

        // Display information about the page-fault event.
        printf("\tUFFD_EVENT_PAGEFAULT event: ");
        printf("flags = %llx; ", (unsigned long long)msg.arg.pagefault.flags);
        printf("address = %llx\n", (unsigned long long)msg.arg.pagefault.address);

        /* Fill the scratch page before copying it into the faulting
         * region. The fill byte varies ('A', 'B', 'C', ...) so that it
         * is obvious that each fault is handled separately.
         */
        memset(page, 'A' + fault_cnt % 20, page_size);
        fault_cnt++;

        uffdio_copy.src = (unsigned long)page;

        /* Page faults must be resolved in units of pages, so round the
         * faulting address down to a page boundary.
         */
        uffdio_copy.dst = (unsigned long)msg.arg.pagefault.address
                & ~((unsigned long)page_size - 1);

        uffdio_copy.len = page_size;
        uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1)
            err(1, "ioctl(UFFDIO_COPY)");

        printf("\t(uffdio_copy.copy returned %lld)\n", (long long)uffdio_copy.copy);
    }
}

/*
 * Demo driver for userfaultfd.
 *
 * Usage: prog num-pages
 *
 * Creates a userfaultfd, maps num-pages anonymous pages, registers the
 * mapping for missing-page tracking, starts fault_handler_thread() and then
 * reads one byte every 1024 bytes of the mapping so that the first access to
 * each page raises a fault that the handler thread resolves.
 */
int main(int argc, char **argv) {
    long uffd;
    char *addr;
    char *end;
    unsigned long num_pages;
    unsigned long len;
    pthread_t thr;
    struct uffdio_api uffdio_api;
    struct uffdio_register uffdio_register;
    int s;

    if (argc != 2)
        return fprintf(stderr, "Usage: %s num-pages\n", *argv), 1;

    page_size = sysconf(_SC_PAGE_SIZE);
    if (page_size <= 0)
        errx(1, "sysconf(_SC_PAGE_SIZE)");

    // Reject non-numeric, zero and overflowing page counts up front.
    errno = 0;
    num_pages = strtoul(argv[1], &end, 0);
    if (errno != 0 || end == argv[1] || *end != '\0' || num_pages == 0
            || num_pages > ~0UL / (unsigned long)page_size)
        errx(1, "invalid num-pages: %s", argv[1]);
    len = num_pages * (unsigned long)page_size;

    // Create and enable the uffd object.
    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
        err(1, "userfaultfd");

    uffdio_api.api = UFFD_API;
    uffdio_api.features = 0;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
        err(1, "ioctl(UFFDIO_API)");

    /* Create a private anonymous mapping. The memory is demand-zero
     * paged, that is, not yet allocated. When we actually touch the
     * memory, it will be populated via the userfaultfd.
     */
    addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (addr == MAP_FAILED)
        err(1, "mmap");

    printf("Address returned by mmap() = %p\n", (void *)addr);

    /* Register the memory range of the mapping we just created for
     * handling by the uffd object. In mode, we request to track
     * missing pages (i.e., pages that have not yet been faulted in).
     */
    uffdio_register.range.start = (unsigned long)addr;
    uffdio_register.range.len = len;
    uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
        err(1, "ioctl(UFFDIO_REGISTER)");

    // Create a thread that will process the userfaultfd events.
    s = pthread_create(&thr, NULL, fault_handler_thread, (void*)uffd);
    if (s != 0) {
        // pthread_create() returns the error number instead of setting errno.
        errno = s;
        err(1, "pthread_create");
    }

    /* Touch the mapping every 1024 bytes. The offset is unsigned long,
     * like len, so the comparison is not mixed-sign and cannot overflow
     * for large mappings.
     */
    for (unsigned long off = 0xf; off < len; off += 1024) {
        char c = addr[off];     // first access to each page triggers a fault
        printf("Read address %p in main: ", (void *)(addr + off));
        printf("%c\n", c);
        usleep(100000);
    }
    exit(EXIT_SUCCESS);
}

Usage

userfaultfdはマルチスレッドプログラムに於いて他のスレッドのページングをユーザ空間で行う事が出来る.

プロセスのスレッド内でページフォルトが発生した時,userfaultfdに登録されている場合,faultが発生したスレッドはスリープ状態になり,イベントが生成される.

faultを処理するスレッドは,発生したイベントをioctlで操作する.

userfaultfdでfdを取得

long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

struct uffdio_api を用意し,ioctl の UFFDIO_API で取得したfdに設定する.

今回はページフォルトのイベントのみを取得したい.

ページフォルトの通知機能はデフォルトで有効化されているのでuffdio_api.featuresは0でいい.

struct uffdio_api uffdio_api;
uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
    err(1, "ioctl(UFFDIO_API)");

適切な権限でmapした領域を登録する

/* strtoul() takes (string, end pointer, base); base 0 auto-detects the radix. */
unsigned long len = sysconf(_SC_PAGE_SIZE) * strtoul(argv[1], NULL, 0);
char *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

struct uffdio_register uffdio_register;
uffdio_register.range.start = (unsigned long)addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
    err(1, "ioctl(UFFDIO_REGISTER)");

以下はハンドリングする側のスレッドの操作

uffdからの情報は struct uffd_msg として届くので,poll とかで良い感じのタイミングに読む

static struct uffd_msg msg;
read(uffd, &msg, sizeof(msg));
if (msg.event == UFFD_EVENT_PAGEFAULT)
    printf("PAGEFAULT at %llx\n", msg.arg.pagefault.address);

で,面白いのがここから.

/* A static object cannot be initialized with a function call, so use a plain local. */
char *page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
memset(page , 'A', page_size);

良い感じのデータを用意して,

struct uffdio_copy uffdio_copy;
uffdio_copy.src = (unsigned long)page;
uffdio_copy.dst = (unsigned long)msg.arg.pagefault.address & ~(page_size - 1);
uffdio_copy.len = page_size;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1)
    err(1, "ioctl(UFFDIO_COPY)");

やると,なんとuffdio_copyの名前の通りデータをコピーできる.すごいね.

上の処理をする関数を別スレッドで生やす.

s = pthread_create(&thr, NULL, fault_handler_thread, (void*)uffd);
if (s != 0)
    (errno = s), err(1, "pthread_create");

メインスレッドでは以下の処理でページフォルトを発生させる.

int l;
l = 0xf;

while (l < len) {
    char c = addr[l];
    printf("Read address %p in main: ", addr + l);
    print("%c \n", c);
    l += 1024;
    usleep(100000);
}

mmapしたページに対する初回アクセス時にPage Faultが発生するのを利用して,そのFaultをトリガーに先程作成したスレッドでデータを書き換えている.

Page Faultはページサイズ毎に発生するので,例えばページサイズが4096だった場合,4096/1024で4回のアクセス毎に1回fault_handler_threadの処理が実行される.

Address returned by mmap() = 0x7fa0c4891000

fault_handler_thread():
        poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
        UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fa0c4891000
        (uffdio_copy.copy returned 4096)
Read address 0x7fa0c489100f in main: A
Read address 0x7fa0c489140f in main: A
Read address 0x7fa0c489180f in main: A
Read address 0x7fa0c4891c0f in main: A

fault_handler_thread():
        poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
        UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fa0c4892000
        (uffdio_copy.copy returned 4096)
Read address 0x7fa0c489200f in main: B
Read address 0x7fa0c489240f in main: B
Read address 0x7fa0c489280f in main: B
Read address 0x7fa0c4892c0f in main: B

fault_handler_thread():
        poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
        UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fa0c4893000
        (uffdio_copy.copy returned 4096)
Read address 0x7fa0c489300f in main: C
Read address 0x7fa0c489340f in main: C
Read address 0x7fa0c489380f in main: C
Read address 0x7fa0c4893c0f in main: C

まとめ

実はmmapで確保した領域へのアクセス時にページフォルトが発生してるのを知らずにびっくりした.

デマンドページングとか概念は知ってたけど,まさかページフォルトが発生してるとは思わなんだ.

明日12/4はry0kvnによる「WOW64でのSystemCallをトレースしてみる」です.ご期待下さい.


以下.マジのメモです.ブログ用にまとめたりしてないので,今後書き加えるかもしれません.

PoCに無い部分

event

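userfaultfdで確認,対処できるイベントは以下の通り.

- UFFD_EVENT_PAGEFAULT
- UFFD_EVENT_FORK
- UFFD_EVENT_REMAP
- UFFD_EVENT_REMOVE
- UFFD_EVENT_UNMAP

で,読んできたuffd_msg.eventにUFFD_EVENT_PAGEFAULTみたいに設定されている.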

それぞれ有効化するには,uffdio_api.featuresにフラグを設定してioctlする必要がある.

uffd_msg

eventは名前の通りeventの種類が,

event毎の情報は共用体としてuffd_msg.argにある

/*
 * Message obtained by read()ing a userfaultfd descriptor: an event type
 * followed by an event-specific payload held in the `arg` union.
 */
struct uffd_msg {
        /* Event type; the PoC compares it against UFFD_EVENT_PAGEFAULT. */
        __u8    event;

        __u8    reserved1;
        __u16   reserved2;
        __u32   reserved3;

        union {
                /* Payload the PoC reads for page-fault events. */
                struct {
                        __u64   flags;
                        /* Faulting address; the PoC rounds it down to a page boundary. */
                        __u64   address;
                        union {
                                /* Presumably the faulting thread's id (cf. UFFD_FEATURE_THREAD_ID) -- TODO confirm. */
                                __u32 ptid;
                        } feat;
                } pagefault;

                /* Presumably the payload of the fork event -- TODO confirm. */
                struct {
                        __u32   ufd;
                } fork;

                /* Presumably the payload of the remap event -- TODO confirm. */
                struct {
                        __u64   from;
                        __u64   to;
                        __u64   len;
                } remap;

                /* Presumably the payload of the remove/unmap events -- TODO confirm. */
                struct {
                        __u64   start;
                        __u64   end;
                } remove;

                struct {
                        /* unused reserved fields */
                        __u64   reserved1;
                        __u64   reserved2;
                        __u64   reserved3;
                } reserved;
        } arg;
} __packed;

uffdio_api

/*
 * Argument of the UFFDIO_API ioctl. Userland fills in `api` (the PoC uses
 * UFFD_API) and `features` (the PoC uses 0) before the call; the kernel
 * answers in the remaining fields as described below.
 */
struct uffdio_api {
        /* userland asks for an API number and the features to enable */
        __u64 api;
        /*
         * Kernel answers below with the all available features for
         * the API, this notifies userland of which events and/or
         * which flags for each event are enabled in the current
         * kernel.
         *
         * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
         * are to be considered implicitly always enabled in all kernels as
         * long as the uffdio_api.api requested matches UFFD_API.
         *
         * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER
         * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on
         * hugetlbfs virtual memory ranges. Adding or not adding
         * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has
         * no real functional effect after UFFDIO_API returns, but
         * it's only useful for an initial feature set probe at
         * UFFDIO_API time. There are two ways to use it:
         *
         * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the
         *    uffdio_api.features before calling UFFDIO_API, an error
         *    will be returned by UFFDIO_API on a kernel without
         *    hugetlbfs missing support
         *
         * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in
         *    uffdio_api.features and instead it will be set by the
         *    kernel in the uffdio_api.features if the kernel supports
         *    it, so userland can later check if the feature flag is
         *    present in uffdio_api.features after UFFDIO_API
         *    succeeded.
         *
         * UFFD_FEATURE_MISSING_SHMEM works the same as
         * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
         * (i.e. tmpfs and other shmem based APIs).
         *
         * UFFD_FEATURE_SIGBUS feature means no page-fault
         * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
         * a SIGBUS signal will be sent to the faulting process.
         *
         * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
         * be returned, if feature is not requested 0 will be returned.
         */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP          (1<<0)
#define UFFD_FEATURE_EVENT_FORK                 (1<<1)
#define UFFD_FEATURE_EVENT_REMAP                (1<<2)
#define UFFD_FEATURE_EVENT_REMOVE               (1<<3)
#define UFFD_FEATURE_MISSING_HUGETLBFS          (1<<4)
#define UFFD_FEATURE_MISSING_SHMEM              (1<<5)
#define UFFD_FEATURE_EVENT_UNMAP                (1<<6)
#define UFFD_FEATURE_SIGBUS                     (1<<7)
#define UFFD_FEATURE_THREAD_ID                  (1<<8)
        /* Bitmask built from the UFFD_FEATURE_* values defined above. */
        __u64 features;

        /* Not described in this excerpt; presumably a bitmask of supported ioctls -- TODO confirm. */
        __u64 ioctls;
};

現在のカーネルでサポートしているイベントなどの情報が得られる.

基本的にUFFD_EVENT_PAGEFAULTとUFFD_PAGEFAULT_FLAG_WRITEはデフォルトで有効化されているのでPoCのようなコードは動く.

uffdio_register

/*
 * Argument of the UFFDIO_REGISTER ioctl. The PoC fills `range` with the
 * start address and length of its mapping and sets `mode` to
 * UFFDIO_REGISTER_MODE_MISSING before issuing the ioctl.
 */
struct uffdio_register {
    /* Address range to register. */
    struct uffdio_range range;
#define UFFDIO_REGISTER_MODE_MISSING    ((__u64)1<<0)
#define UFFDIO_REGISTER_MODE_WP     ((__u64)1<<1)
    /* Bitmask of the UFFDIO_REGISTER_MODE_* values above. */
    __u64 mode;
    /*
     * kernel answers which ioctl commands are available for the
     * range, keep at the end as the last 8 bytes aren't read.
     */
    __u64 ioctls;
};

uffdio_range

/*
 * Address range passed to userfaultfd ioctls; the PoC sets `start` to the
 * address returned by mmap() and `len` to the mapping length in bytes.
 */
struct uffdio_range {
    __u64 start;    /* first address of the range */
    __u64 len;      /* length of the range in bytes */
};