// Copyright 2014 Google Inc. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "process-tools.h" #define PRINT_DEBUG(...) \ do { \ if (global_debug) { \ fprintf(stderr, __FILE__ ":" S__LINE__ ": " __VA_ARGS__); \ } \ } while (0) static bool global_debug = false; static double global_kill_delay; static int global_child_pid; static volatile sig_atomic_t global_signal; // The uid and gid of the user and group 'nobody'. static const int kNobodyUid = 65534; static const int kNobodyGid = 65534; // Options parsing result. struct Options { double timeout_secs; // How long to wait before killing the child (-T) double kill_delay_secs; // How long to wait before sending SIGKILL in case of // timeout (-t) const char *stdout_path; // Where to redirect stdout (-l) const char *stderr_path; // Where to redirect stderr (-L) char *const *args; // Command to run (--) const char *sandbox_root; // Sandbox root (-S) const char *working_dir; // Working directory (-W) char **mount_sources; // Map of directories to mount, from char **mount_targets; // sources -> targets (-m) int num_mounts; // How many mounts were specified }; // Child function used by CheckNamespacesSupported() in call to clone(). static int CheckNamespacesSupportedChild(void *arg) { return 0; } // Check whether the required namespaces are supported. static int CheckNamespacesSupported() { const int stackSize = 1024 * 1024; char *stack; char *stackTop; pid_t pid; // Allocate stack for child. stack = malloc(stackSize); if (stack == NULL) { DIE("malloc failed\n"); } // Assume stack grows downward. stackTop = stack + stackSize; // Create child with own namespaces. We use clone() instead of unshare() here // because of the kernel bug (ref. CreateNamespaces) that lets unshare fail // sometimes. As this check has to run as fast as possible, we can't afford to // spend time sleeping and retrying here until it eventually works (or not). CHECK_CALL(pid = clone(CheckNamespacesSupportedChild, stackTop, CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | SIGCHLD, NULL)); CHECK_CALL(waitpid(pid, NULL, 0)); return EXIT_SUCCESS; } // Print out a usage error. argc and argv are the argument counter and vector, // fmt is a format, // string for the error message to print. static void Usage(int argc, char *const *argv, const char *fmt, ...) { int i; va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fprintf(stderr, "\nUsage: %s [-S sandbox-root] [-W working-dir] [-M source -m " "target] -- command arg1\n", argv[0]); fprintf(stderr, " provided:"); for (i = 0; i < argc; i++) { fprintf(stderr, " %s", argv[i]); } fprintf(stderr, "\nMandatory arguments:\n" " -S directory which will become the root of the sandbox\n" " -- command to run inside sandbox, followed by arguments\n" "\n" "Optional arguments:\n" " -W working directory\n" " -t time to give the child to shutdown cleanly before sending it a " "SIGKILL\n" " -T timeout after which sandbox will be terminated\n" " -t in case timeout occurs, how long to wait before killing the " "child with SIGKILL\n" " -M/-m system directory to mount inside the sandbox\n" " Multiple directories can be specified and each of them will\n" " be mounted readonly. The -M option specifies which directory\n" " to mount, the -m option specifies where to mount it in the\n" " sandbox.\n" " -D if set, debug info will be printed\n" " -l redirect stdout to a file\n" " -L redirect stderr to a file\n"); exit(EXIT_FAILURE); } // Parse the command line flags and return the result in an Options structure // passed as argument. static void ParseCommandLine(int argc, char *const *argv, struct Options *opt) { extern char *optarg; extern int optind, optopt; int c; while ((c = getopt(argc, argv, ":CDS:W:t:T:M:m:l:L:")) != -1) { switch (c) { case 'C': // Shortcut for the "does this system support sandboxing" check. exit(CheckNamespacesSupported()); break; case 'S': if (opt->sandbox_root == NULL) { opt->sandbox_root = optarg; } else { Usage(argc, argv, "Multiple sandbox roots (-S) specified, expected one."); } break; case 'W': if (opt->working_dir == NULL) { opt->working_dir = optarg; } else { Usage(argc, argv, "Multiple working directories (-W) specified, expected at most " "one."); } break; case 't': if (sscanf(optarg, "%lf", &opt->kill_delay_secs) != 1 || opt->kill_delay_secs < 0) { Usage(argc, argv, "Invalid kill delay (-t) value: %lf", opt->kill_delay_secs); } break; case 'T': if (sscanf(optarg, "%lf", &opt->timeout_secs) != 1 || opt->timeout_secs < 0) { Usage(argc, argv, "Invalid timeout (-T) value: %lf", opt->timeout_secs); } break; case 'M': if (opt->mount_sources[opt->num_mounts] != NULL) { Usage(argc, argv, "The -M option must be followed by an -m option."); } opt->mount_sources[opt->num_mounts] = optarg; break; case 'm': if (opt->mount_sources[opt->num_mounts] == NULL) { Usage(argc, argv, "The -m option must be preceded by an -M option."); } if (opt->sandbox_root == NULL) { Usage(argc, argv, "The sandbox root must be set via the -S option before " "specifying an" " -m option."); } if (strstr(optarg, opt->sandbox_root) != optarg) { Usage(argc, argv, "A path passed to the -m option must start with the sandbox " "root."); } opt->mount_targets[opt->num_mounts++] = optarg; break; case 'D': global_debug = true; break; case 'l': if (opt->stdout_path == NULL) { opt->stdout_path = optarg; } else { Usage(argc, argv, "Cannot redirect stdout to more than one destination."); } break; case 'L': if (opt->stderr_path == NULL) { opt->stderr_path = optarg; } else { Usage(argc, argv, "Cannot redirect stderr to more than one destination."); } break; case '?': Usage(argc, argv, "Unrecognized argument: -%c (%d)", optopt, optind); break; case ':': Usage(argc, argv, "Flag -%c requires an argument", optopt); break; } } if (opt->sandbox_root == NULL) { Usage(argc, argv, "Sandbox root (-S) must be specified"); } if (opt->mount_sources[opt->num_mounts] != NULL && opt->mount_sources[opt->num_mounts] == NULL) { Usage(argc, argv, "An -m option is missing."); } opt->args = argv + optind; if (argc <= optind) { Usage(argc, argv, "No command specified."); } } static void CreateNamespaces() { // This weird workaround is necessary due to unshare seldomly failing with // EINVAL due to a race condition in the Linux kernel (see // https://lkml.org/lkml/2015/7/28/833). An alternative would be to use // clone/waitpid instead. int delay = 1; int tries = 0; const int max_tries = 100; while (tries++ < max_tries) { if (unshare(CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC) == 0) { PRINT_DEBUG("unshare succeeded after %d tries\n", tries); return; } else { if (errno != EINVAL) { perror("unshare"); exit(EXIT_FAILURE); } } // Exponential back-off, but sleep at most 250ms. usleep(delay); if (delay < 250000) { delay *= 2; } } fprintf(stderr, "unshare failed with EINVAL even after %d tries, giving up.\n", tries); exit(EXIT_FAILURE); } static void CreateFile(const char *path) { int handle; CHECK_CALL(handle = open(path, O_CREAT | O_WRONLY | O_EXCL, 0666)); CHECK_CALL(close(handle)); } static void SetupDevices() { CHECK_CALL(mkdir("dev", 0755)); const char *devs[] = {"/dev/null", "/dev/random", "/dev/urandom", "/dev/zero", NULL}; for (int i = 0; devs[i] != NULL; i++) { CreateFile(devs[i] + 1); CHECK_CALL(mount(devs[i], devs[i] + 1, NULL, MS_BIND, NULL)); } CHECK_CALL(symlink("/proc/self/fd", "dev/fd")); } // Recursively creates the file or directory specified in "path" and its parent // directories. static int CreateTarget(const char *path, bool is_directory) { if (path == NULL) { errno = EINVAL; return -1; } struct stat sb; // If the path already exists... if (stat(path, &sb) == 0) { if (is_directory && S_ISDIR(sb.st_mode)) { // and it's a directory and supposed to be a directory, we're done here. return 0; } else if (!is_directory && S_ISREG(sb.st_mode)) { // and it's a regular file and supposed to be one, we're done here. return 0; } else { // otherwise something is really wrong. errno = is_directory ? ENOTDIR : EEXIST; return -1; } } else { // If stat failed because of any error other than "the path does not exist", // this is an error. if (errno != ENOENT) { return -1; } } // Create the parent directory. CHECK_CALL(CreateTarget(dirname(strdupa(path)), true)); if (is_directory) { CHECK_CALL(mkdir(path, 0755)); } else { CreateFile(path); } return 0; } static void SetupDirectories(struct Options *opt) { // Mount the sandbox and go there. CHECK_CALL(mount(opt->sandbox_root, opt->sandbox_root, NULL, MS_BIND | MS_NOSUID, NULL)); CHECK_CALL(chdir(opt->sandbox_root)); // Setup /dev. SetupDevices(); CHECK_CALL(mkdir("proc", 0755)); CHECK_CALL(mount("/proc", "proc", NULL, MS_REC | MS_BIND, NULL)); CHECK_CALL(mkdir("tmp", 0755)); CHECK_CALL(mount("tmpfs", "tmp", "tmpfs", MS_NOSUID | MS_NODEV, "size=25%,mode=1777")); // Make sure the home directory exists and is writable. const char *homedir; if ((homedir = getenv("HOME")) == NULL) { homedir = getpwuid(getuid())->pw_dir; } if (homedir[0] != '/') { DIE("Home directory of user nobody must be an absolute path, but is %s", homedir); } char *homedir_absolute = malloc(strlen(opt->sandbox_root) + strlen(homedir) + 1); strcpy(homedir_absolute, opt->sandbox_root); strcat(homedir_absolute, homedir); CreateTarget(homedir_absolute, true); CHECK_CALL(mount("tmpfs", homedir_absolute, "tmpfs", MS_NOSUID | MS_NODEV, "size=25%,mode=1777")); // Mount directories passed in argv for (int i = 0; i < opt->num_mounts; i++) { struct stat sb; stat(opt->mount_sources[i], &sb); CHECK_CALL(CreateTarget(opt->mount_targets[i], S_ISDIR(sb.st_mode))); PRINT_DEBUG("mount -o rbind,ro %s %s\n", opt->mount_sources[i], opt->mount_targets[i]); CHECK_CALL(mount(opt->mount_sources[i], opt->mount_targets[i], NULL, MS_REC | MS_BIND | MS_RDONLY, NULL)); } } // Write the file "filename" using a format string specified by "fmt". Returns // -1 on failure. static int WriteFile(const char *filename, const char *fmt, ...) { int r; va_list ap; FILE *stream = fopen(filename, "w"); if (stream == NULL) { return -1; } va_start(ap, fmt); r = vfprintf(stream, fmt, ap); va_end(ap); if (r >= 0) { r = fclose(stream); } return r; } static void SetupUserNamespace(int uid, int gid) { // Disable needs for CAP_SETGID int r = WriteFile("/proc/self/setgroups", "deny"); if (r < 0 && errno != ENOENT) { // Writing to /proc/self/setgroups might fail on earlier // version of linux because setgroups does not exist, ignore. perror("WriteFile(\"/proc/self/setgroups\", \"deny\")"); exit(EXIT_FAILURE); } // Set group and user mapping from outer namespace to inner: // No changes in the parent, be nobody in the child. // // We can't be root in the child, because some code may assume that running as // root grants it certain capabilities that it doesn't in fact have. It's // safer to let the child think that it is just a normal user. CHECK_CALL(WriteFile("/proc/self/uid_map", "%d %d 1\n", kNobodyUid, uid)); CHECK_CALL(WriteFile("/proc/self/gid_map", "%d %d 1\n", kNobodyGid, gid)); CHECK_CALL(setresuid(kNobodyUid, kNobodyUid, kNobodyUid)); CHECK_CALL(setresgid(kNobodyGid, kNobodyGid, kNobodyGid)); } static void ChangeRoot(struct Options *opt) { // move the real root to old_root, then detach it char old_root[16] = "old-root-XXXXXX"; if (mkdtemp(old_root) == NULL) { perror("mkdtemp"); DIE("mkdtemp returned NULL\n"); } // pivot_root has no wrapper in libc, so we need syscall() CHECK_CALL(syscall(SYS_pivot_root, ".", old_root)); CHECK_CALL(chroot(".")); CHECK_CALL(umount2(old_root, MNT_DETACH)); CHECK_CALL(rmdir(old_root)); if (opt->working_dir != NULL) { CHECK_CALL(chdir(opt->working_dir)); } } // Called when timeout or signal occurs. void OnSignal(int sig) { global_signal = sig; // Nothing to do if we received a signal before spawning the child. if (global_child_pid == -1) { return; } if (sig == SIGALRM) { // SIGALRM represents a timeout, so we should give the process a bit of // time to die gracefully if it needs it. KillEverything(global_child_pid, true, global_kill_delay); } else { // Signals should kill the process quickly, as it's typically blocking // the return of the prompt after a user hits "Ctrl-C". KillEverything(global_child_pid, false, global_kill_delay); } } // Run the command specified by the argv array and kill it after timeout // seconds. static void SpawnCommand(char *const *argv, double timeout_secs) { for (int i = 0; argv[i] != NULL; i++) { PRINT_DEBUG("arg: %s\n", argv[i]); } CHECK_CALL(global_child_pid = fork()); if (global_child_pid == 0) { // In child. CHECK_CALL(setsid()); ClearSignalMask(); // Force umask to include read and execute for everyone, to make // output permissions predictable. umask(022); // Does not return unless something went wrong. CHECK_CALL(execvp(argv[0], argv)); } else { // In parent. // Set up a signal handler which kills all subprocesses when the given // signal is triggered. HandleSignal(SIGALRM, OnSignal); HandleSignal(SIGTERM, OnSignal); HandleSignal(SIGINT, OnSignal); SetTimeout(timeout_secs); int status = WaitChild(global_child_pid, argv[0]); // The child is done for, but may have grandchildren that we still have to // kill. kill(-global_child_pid, SIGKILL); if (global_signal > 0) { // Don't trust the exit code if we got a timeout or signal. UnHandle(global_signal); raise(global_signal); } else if (WIFEXITED(status)) { exit(WEXITSTATUS(status)); } else { int sig = WTERMSIG(status); UnHandle(sig); raise(sig); } } } int main(int argc, char *const argv[]) { struct Options opt; memset(&opt, 0, sizeof(opt)); opt.mount_sources = calloc(argc, sizeof(char *)); opt.mount_targets = calloc(argc, sizeof(char *)); ParseCommandLine(argc, argv, &opt); global_kill_delay = opt.kill_delay_secs; int uid = SwitchToEuid(); int gid = SwitchToEgid(); RedirectStdout(opt.stdout_path); RedirectStderr(opt.stderr_path); PRINT_DEBUG("sandbox root is %s\n", opt.sandbox_root); PRINT_DEBUG("working dir is %s\n", (opt.working_dir != NULL) ? opt.working_dir : "/ (default)"); CreateNamespaces(); // Make our mount namespace private, so that further mounts do not affect the // outside environment. CHECK_CALL(mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)); SetupDirectories(&opt); SetupUserNamespace(uid, gid); ChangeRoot(&opt); SpawnCommand(opt.args, opt.timeout_secs); free(opt.mount_sources); free(opt.mount_targets); return 0; }