| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * memfd GUP test-case |
| * This tests memfd interactions with get_user_pages(). We require the |
| * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This |
| * file-system delays _all_ reads by 1s and forces direct-IO. This means, any |
| * read() on files in that file-system will pin the receive-buffer pages for at |
| * least 1s via get_user_pages(). |
| * |
| * We use this trick to race ADD_SEALS against a write on a memfd object. The |
| * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use |
| * the read() syscall with our memory-mapped memfd object as receive buffer to |
| * force the kernel to write into our memfd object. |
| */ |
| |
| #define _GNU_SOURCE |
| #define __EXPORTED_HEADERS__ |
| |
| #include <errno.h> |
| #include <inttypes.h> |
| #include <limits.h> |
| #include <linux/falloc.h> |
| #include <linux/fcntl.h> |
| #include <linux/memfd.h> |
| #include <sched.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <signal.h> |
| #include <string.h> |
| #include <sys/mman.h> |
| #include <sys/stat.h> |
| #include <sys/syscall.h> |
| #include <sys/wait.h> |
| #include <unistd.h> |
| |
/* Size in bytes of the memfd object and of every mapping created on it. */
#define MFD_DEF_SIZE 8192
/* Size in bytes of the stack allocated for the clone()d sealing thread. */
#define STACK_SIZE 65536

/*
 * Invoke the memfd_create syscall directly through syscall().
 *
 * @name:  name passed through to the syscall
 * @flags: flags passed through to the syscall
 *
 * Returns whatever syscall() returns, narrowed to int.
 */
static int sys_memfd_create(const char *name,
			    unsigned int flags)
{
	long ret = syscall(__NR_memfd_create, name, flags);

	return ret;
}
| |
| static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) |
| { |
| int r, fd; |
| |
| fd = sys_memfd_create(name, flags); |
| if (fd < 0) { |
| printf("memfd_create(\"%s\", %u) failed: %m\n", |
| name, flags); |
| abort(); |
| } |
| |
| r = ftruncate(fd, sz); |
| if (r < 0) { |
| printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz); |
| abort(); |
| } |
| |
| return fd; |
| } |
| |
/*
 * Query the seals currently set on @fd via F_GET_SEALS.
 *
 * Prints a diagnostic and abort()s if the fcntl() fails; otherwise returns
 * the seal bitmask.
 */
static __u64 mfd_assert_get_seals(int fd)
{
	long seals = fcntl(fd, F_GET_SEALS);

	if (seals >= 0)
		return seals;

	printf("GET_SEALS(%d) failed: %m\n", fd);
	abort();
}
| |
/*
 * Verify that the seals set on @fd are exactly @seals.
 *
 * On mismatch, prints the expected and the actual bitmask and abort()s.
 */
static void mfd_assert_has_seals(int fd, __u64 seals)
{
	__u64 actual = mfd_assert_get_seals(fd);

	if (actual == seals)
		return;

	printf("%llu != %llu = GET_SEALS(%d)\n",
	       (unsigned long long)seals, (unsigned long long)actual, fd);
	abort();
}
| |
/*
 * Add @seals to @fd via F_ADD_SEALS and require that this succeeds.
 *
 * The seals present before the call are fetched first so they can be shown
 * in the diagnostic. Any failure of F_ADD_SEALS is fatal (abort()).
 */
static void mfd_assert_add_seals(int fd, __u64 seals)
{
	__u64 before = mfd_assert_get_seals(fd);

	if (fcntl(fd, F_ADD_SEALS, seals) >= 0)
		return;

	printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
	       fd, (unsigned long long)before, (unsigned long long)seals);
	abort();
}
| |
/*
 * Try to add @seals to @fd, tolerating an EBUSY failure.
 *
 * The current seals are read only for the diagnostic message; if that query
 * fails, 0 is reported instead of aborting.
 *
 * Returns the result of the F_ADD_SEALS fcntl(): non-negative on success,
 * negative when it failed with EBUSY. Any other failure abort()s.
 */
static int mfd_busy_add_seals(int fd, __u64 seals)
{
	long ret;
	__u64 before;

	ret = fcntl(fd, F_GET_SEALS);
	before = ret < 0 ? 0 : ret;

	ret = fcntl(fd, F_ADD_SEALS, seals);
	if (ret >= 0 || errno == EBUSY)
		return ret;

	printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n",
	       fd, (unsigned long long)before, (unsigned long long)seals);
	abort();
}
| |
| static void *mfd_assert_mmap_shared(int fd) |
| { |
| void *p; |
| |
| p = mmap(NULL, |
| MFD_DEF_SIZE, |
| PROT_READ | PROT_WRITE, |
| MAP_SHARED, |
| fd, |
| 0); |
| if (p == MAP_FAILED) { |
| printf("mmap() failed: %m\n"); |
| abort(); |
| } |
| |
| return p; |
| } |
| |
| static void *mfd_assert_mmap_private(int fd) |
| { |
| void *p; |
| |
| p = mmap(NULL, |
| MFD_DEF_SIZE, |
| PROT_READ | PROT_WRITE, |
| MAP_PRIVATE, |
| fd, |
| 0); |
| if (p == MAP_FAILED) { |
| printf("mmap() failed: %m\n"); |
| abort(); |
| } |
| |
| return p; |
| } |
| |
| static int global_mfd = -1; |
| static void *global_p = NULL; |
| |
| static int sealing_thread_fn(void *arg) |
| { |
| int sig, r; |
| |
| /* |
| * This thread first waits 200ms so any pending operation in the parent |
| * is correctly started. After that, it tries to seal @global_mfd as |
| * SEAL_WRITE. This _must_ fail as the parent thread has a read() into |
| * that memory mapped object still ongoing. |
| * We then wait one more second and try sealing again. This time it |
| * must succeed as there shouldn't be anyone else pinning the pages. |
| */ |
| |
| /* wait 200ms for FUSE-request to be active */ |
| usleep(200000); |
| |
| /* unmount mapping before sealing to avoid i_mmap_writable failures */ |
| munmap(global_p, MFD_DEF_SIZE); |
| |
| /* Try sealing the global file; expect EBUSY or success. Current |
| * kernels will never succeed, but in the future, kernels might |
| * implement page-replacements or other fancy ways to avoid racing |
| * writes. */ |
| r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE); |
| if (r >= 0) { |
| printf("HURRAY! This kernel fixed GUP races!\n"); |
| } else { |
| /* wait 1s more so the FUSE-request is done */ |
| sleep(1); |
| |
| /* try sealing the global file again */ |
| mfd_assert_add_seals(global_mfd, F_SEAL_WRITE); |
| } |
| |
| return 0; |
| } |
| |
| static pid_t spawn_sealing_thread(void) |
| { |
| uint8_t *stack; |
| pid_t pid; |
| |
| stack = malloc(STACK_SIZE); |
| if (!stack) { |
| printf("malloc(STACK_SIZE) failed: %m\n"); |
| abort(); |
| } |
| |
| pid = clone(sealing_thread_fn, |
| stack + STACK_SIZE, |
| SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, |
| NULL); |
| if (pid < 0) { |
| printf("clone() failed: %m\n"); |
| abort(); |
| } |
| |
| return pid; |
| } |
| |
| static void join_sealing_thread(pid_t pid) |
| { |
| waitpid(pid, NULL, 0); |
| } |
| |
| int main(int argc, char **argv) |
| { |
| static const char zero[MFD_DEF_SIZE]; |
| int fd, mfd, r; |
| void *p; |
| int was_sealed; |
| pid_t pid; |
| |
| if (argc < 2) { |
| printf("error: please pass path to file in fuse_mnt mount-point\n"); |
| abort(); |
| } |
| |
| /* open FUSE memfd file for GUP testing */ |
| printf("opening: %s\n", argv[1]); |
| fd = open(argv[1], O_RDONLY | O_CLOEXEC); |
| if (fd < 0) { |
| printf("cannot open(\"%s\"): %m\n", argv[1]); |
| abort(); |
| } |
| |
| /* create new memfd-object */ |
| mfd = mfd_assert_new("kern_memfd_fuse", |
| MFD_DEF_SIZE, |
| MFD_CLOEXEC | MFD_ALLOW_SEALING); |
| |
| /* mmap memfd-object for writing */ |
| p = mfd_assert_mmap_shared(mfd); |
| |
| /* pass mfd+mapping to a separate sealing-thread which tries to seal |
| * the memfd objects with SEAL_WRITE while we write into it */ |
| global_mfd = mfd; |
| global_p = p; |
| pid = spawn_sealing_thread(); |
| |
| /* Use read() on the FUSE file to read into our memory-mapped memfd |
| * object. This races the other thread which tries to seal the |
| * memfd-object. |
| * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s. |
| * This guarantees that the receive-buffer is pinned for 1s until the |
| * data is written into it. The racing ADD_SEALS should thus fail as |
| * the pages are still pinned. */ |
| r = read(fd, p, MFD_DEF_SIZE); |
| if (r < 0) { |
| printf("read() failed: %m\n"); |
| abort(); |
| } else if (!r) { |
| printf("unexpected EOF on read()\n"); |
| abort(); |
| } |
| |
| was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE; |
| |
| /* Wait for sealing-thread to finish and verify that it |
| * successfully sealed the file after the second try. */ |
| join_sealing_thread(pid); |
| mfd_assert_has_seals(mfd, F_SEAL_WRITE); |
| |
| /* *IF* the memfd-object was sealed at the time our read() returned, |
| * then the kernel did a page-replacement or canceled the read() (or |
| * whatever magic it did..). In that case, the memfd object is still |
| * all zero. |
| * In case the memfd-object was *not* sealed, the read() was successfull |
| * and the memfd object must *not* be all zero. |
| * Note that in real scenarios, there might be a mixture of both, but |
| * in this test-cases, we have explicit 200ms delays which should be |
| * enough to avoid any in-flight writes. */ |
| |
| p = mfd_assert_mmap_private(mfd); |
| if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) { |
| printf("memfd sealed during read() but data not discarded\n"); |
| abort(); |
| } else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) { |
| printf("memfd sealed after read() but data discarded\n"); |
| abort(); |
| } |
| |
| close(mfd); |
| close(fd); |
| |
| printf("fuse: DONE\n"); |
| |
| return 0; |
| } |