// Copyright 2016 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * This is PID 1 inside the sandbox environment and runs in a separate user,
 * mount, UTS, IPC and PID namespace.
 */

#include "linux-sandbox-options.h"
#include "linux-sandbox-utils.h"
#include "linux-sandbox.h"

// DIE(fmt, ...) prints `<file>:<line>: "<formatted message>": ` to stderr,
// lets perror() append the description of the current errno, and terminates
// the process with EXIT_FAILURE.
//
// Note that we define DIE() here and not in a shared header, because we want to
// use _exit() in the pid1 child, but exit() in the parent.
//
// Usage notes:
//  - The first argument is pasted onto adjacent string literals, so it must
//    itself be a string literal.
//  - The expansion is a bare braced block (not do { } while (0)); some call
//    sites in this file invoke it without a trailing semicolon, so only use it
//    as a complete statement inside a braced body.
//  - S__LINE__ is not defined in this file; presumably one of the project
//    headers provides it as the stringified __LINE__ -- TODO confirm.
#define DIE(args...)                                     \
  {                                                      \
    fprintf(stderr, __FILE__ ":" S__LINE__ ": \"" args); \
    fprintf(stderr, "\": ");                             \
    perror(NULL);                                        \
    _exit(EXIT_FAILURE);                                 \
  }

#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <math.h>
#include <mntent.h>
#include <net/if.h>
#include <pwd.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

// PID returned by fork() in SpawnChild(). ForwardSignal() signals the process
// group with this ID and WaitForChild() exits once this PID has been reaped.
static int global_child_pid;
// mkdtemp() template, filled in by SetupHelperFiles(). The resulting mode-0
// directory is bind-mounted over directories listed in opt.inaccessible_files.
// The path is relative, so it is resolved against the current working
// directory at the time SetupHelperFiles() runs.
static char global_inaccessible_directory[] = "tmp/empty.XXXXXX";
// mkstemp() template, filled in by SetupHelperFiles(). The resulting mode-0
// file is bind-mounted over non-directory entries of opt.inaccessible_files.
static char global_inaccessible_file[] = "tmp/empty.XXXXXX";

// Asks the kernel to SIGKILL this process when its parent dies, then writes a
// single byte into |sync_pipe| and closes both ends of it.
//
// sync_pipe: two file descriptors, [0] = read end, [1] = write end.
// Any failing call aborts the process via DIE().
static void SetupSelfDestruction(int *sync_pipe) {
  // Alternatives to this prctl would be poll()ing the pipe fd to notice the
  // parent going away (relying on SIGCHLD to interrupt it, which may need a
  // trivial SIGCHLD handler), turning the pipe close into SIGIO via O_ASYNC,
  // or signalfd - which is about as obscure as the prctl itself.
  const int prctl_result = prctl(PR_SET_PDEATHSIG, SIGKILL);
  if (prctl_result < 0) {
    DIE("prctl");
  }

  // Verify that the parent still lives: we only need the write end, so drop
  // the read end first, push one byte through, then drop the write end too.
  const int read_end = sync_pipe[0];
  const int write_end = sync_pipe[1];

  if (close(read_end) < 0) {
    DIE("close");
  }

  const char token = 0;
  if (write(write_end, &token, 1) < 0) {
    DIE("write");
  }

  if (close(write_end) < 0) {
    DIE("close");
  }
}

// Recursively marks every mount below "/" as private so that mount events in
// the outside environment do not propagate into our mount namespace.
// Aborts via DIE() if the mount() call fails.
static void SetupMountNamespace() {
  const unsigned long propagation_flags = MS_REC | MS_PRIVATE;
  const int result = mount(NULL, "/", NULL, propagation_flags, NULL);
  if (result < 0) {
    DIE("mount");
  }
}

// Opens |filename| for writing (truncating it), writes the printf-style
// formatted message described by |fmt| and the variadic arguments, and closes
// the file again. Any failure (open, format/write, close) aborts via DIE().
static void WriteFile(const char *filename, const char *fmt, ...) {
  FILE *out = fopen(filename, "w");
  if (!out) {
    DIE("fopen(%s)", filename);
  }

  va_list va;
  va_start(va, fmt);
  const int chars_written = vfprintf(out, fmt, va);
  va_end(va);

  if (chars_written < 0) {
    DIE("vfprintf");
  }

  // fclose() flushes the stream, so its result tells us whether the data
  // actually reached the file.
  const int close_result = fclose(out);
  if (close_result != 0) {
    DIE("fclose(%s)", filename);
  }
}

// Configures the UID/GID mapping of the user namespace.
//
// Steps:
//  1. Writes "deny" to /proc/self/setgroups when that file exists.
//  2. Picks the inner identity: 0/0 when opt.fake_root is set, otherwise the
//     uid/gid of the passwd entry "nobody".
//  3. Maps that inner uid/gid to global_outer_uid / global_outer_gid via
//     /proc/self/uid_map and /proc/self/gid_map (a single-ID mapping each).
//
// Any failure aborts the process via DIE().
static void SetupUserNamespace() {
  // Disable needs for CAP_SETGID.
  struct stat sb;
  if (stat("/proc/self/setgroups", &sb) == 0) {
    WriteFile("/proc/self/setgroups", "deny");
  } else {
    // Ignore ENOENT, because older Linux versions do not have this file (but
    // also do not require writing to it).
    if (errno != ENOENT) {
      DIE("stat(/proc/self/setgroups)");
    }
  }

  int inner_uid = 0, inner_gid = 0;
  if (!opt.fake_root) {
    struct passwd *pwd = getpwnam("nobody");
    if (pwd == NULL) {
      DIE("unable to find passwd entry for user nobody");
    }

    inner_uid = pwd->pw_uid;
    inner_gid = pwd->pw_gid;
  }

  // Each map line has the form "<id inside> <id outside> <count>".
  WriteFile("/proc/self/uid_map", "%d %d 1\n", inner_uid, global_outer_uid);
  WriteFile("/proc/self/gid_map", "%d %d 1\n", inner_gid, global_outer_gid);
}

// Sets both the host name and the domain name to "sandbox".
// Aborts via DIE() if either call fails.
static void SetupUtsNamespace() {
  static const char kSandboxName[] = "sandbox";
  // Length without the terminating NUL byte.
  const size_t name_length = sizeof(kSandboxName) - 1;

  if (sethostname(kSandboxName, name_length) < 0) {
    DIE("sethostname");
  }

  if (setdomainname(kSandboxName, name_length) < 0) {
    DIE("setdomainname");
  }
}

// Creates the two placeholder objects used to hide paths from the sandbox:
// a fresh directory and a fresh file, both with all permission bits cleared.
// Their final names are written back into global_inaccessible_directory and
// global_inaccessible_file by mkdtemp() / mkstemp().
// Any failure aborts via DIE().
static void SetupHelperFiles() {
  char *const dir_template = global_inaccessible_directory;
  char *const file_template = global_inaccessible_file;

  // Placeholder directory with mode 0.
  if (!mkdtemp(dir_template)) {
    DIE("mkdtemp(%s)", dir_template);
  }
  if (chmod(dir_template, 0) < 0) {
    DIE("chmod(%s, 0)", dir_template);
  }

  // Placeholder file with mode 0; we only need it to exist, so the descriptor
  // is closed again right away.
  const int fd = mkstemp(file_template);
  if (fd < 0) {
    DIE("mkstemp(%s)", file_template);
  }
  if (fchmod(fd, 0) != 0) {
    DIE("fchmod(%s, 0)", file_template);
  }
  if (close(fd) < 0) {
    DIE("close(%s)", file_template);
  }
}

// Returns true if |path| (after following symlinks, as stat() does) refers to
// a directory. Aborts via DIE() when the path cannot be stat()ed.
static bool IsDirectory(const char *path) {
  struct stat info;
  const int result = stat(path, &info);
  if (result < 0) {
    DIE("stat(%s)", path);
  }
  return S_ISDIR(info.st_mode);
}

// Recursively creates the file or directory specified in "path" and its parent
// directories.
//
// path:         path to create; NULL yields -1 with errno = EINVAL.
// is_directory: true to create a directory (mode 0755), false to create an
//               empty regular file (mode 0666, subject to the umask).
//
// Returns 0 if the target already exists with the requested type or was
// created. Returns -1 with errno set if the path exists with a different type
// (ENOTDIR / EEXIST) or stat() failed for a reason other than ENOENT.
// Failures while actually creating something abort the process via DIE().
static int CreateTarget(const char *path, bool is_directory) {
  PRINT_DEBUG("CreateTarget(%s, %s)", path, is_directory ? "true" : "false");
  if (path == NULL) {
    errno = EINVAL;
    return -1;
  }

  struct stat sb;
  // If the path already exists...
  if (stat(path, &sb) == 0) {
    if (is_directory && S_ISDIR(sb.st_mode)) {
      // and it's a directory and supposed to be a directory, we're done here.
      return 0;
    } else if (!is_directory && S_ISREG(sb.st_mode)) {
      // and it's a regular file and supposed to be one, we're done here.
      return 0;
    } else {
      // otherwise something is really wrong.
      errno = is_directory ? ENOTDIR : EEXIST;
      return -1;
    }
  } else {
    // If stat failed because of any error other than "the path does not exist",
    // this is an error.
    if (errno != ENOENT) {
      return -1;
    }
  }

  // Create the parent directory.
  // strdupa() is alloca()-based and must not be expanded inside a function
  // call's argument list, so take the copy in a separate statement first.
  // dirname() may modify its argument, hence the copy.
  char *path_copy = strdupa(path);
  const char *parent = dirname(path_copy);
  if (CreateTarget(parent, true) < 0) {
    DIE("CreateTarget(%s, true)", parent);
  }

  if (is_directory) {
    if (mkdir(path, 0755) < 0) {
      DIE("mkdir(%s, 0755)", path);
    }
  } else {
    // O_EXCL: the stat() above said the file does not exist, so creating it
    // must not silently reuse something that appeared in the meantime.
    int handle;
    if ((handle = open(path, O_CREAT | O_WRONLY | O_EXCL, 0666)) < 0) {
      DIE("open(%s, O_CREAT | O_WRONLY | O_EXCL, 0666)", path);
    }
    if (close(handle) < 0) {
      DIE("close(%d)", handle);
    }
  }

  return 0;
}

// Builds the sandbox filesystem tree below opt.sandbox_root_dir.
//
// In order:
//  1. Recursively bind-mounts "/" onto opt.sandbox_root_dir and chdir()s there.
//  2. Mounts a fresh tmpfs on every entry of opt.tmpfs_dirs.
//  3. Bind-mounts opt.working_dir onto itself so that it is a mount point.
//  4. Bind-mounts every entry of opt.bind_mounts and opt.writable_files.
//  5. Creates the mode-0 placeholder file/directory and bind-mounts them over
//     every entry of opt.inaccessible_files.
//
// All option paths are used twice: as given for the source, and with the first
// character skipped ("path + 1") for the target. Since the working directory
// is the sandbox root at that point, "path + 1" names the same location
// inside the sandbox, assuming the paths are absolute.
//
// Any failing mount/stat/chdir aborts the process via DIE().
static void MountFilesystems() {
  if (mount("/", opt.sandbox_root_dir, NULL, MS_BIND | MS_REC, NULL) < 0) {
    DIE("mount(/, %s, NULL, MS_BIND | MS_REC, NULL)", opt.sandbox_root_dir);
  }

  if (chdir(opt.sandbox_root_dir) < 0) {
    DIE("chdir(%s)", opt.sandbox_root_dir);
  }

  for (const char *tmpfs_dir : opt.tmpfs_dirs) {
    PRINT_DEBUG("tmpfs: %s", tmpfs_dir);
    if (mount("tmpfs", tmpfs_dir + 1, "tmpfs",
              MS_NOSUID | MS_NODEV | MS_NOATIME, NULL) < 0) {
      DIE("mount(tmpfs, %s, tmpfs, MS_NOSUID | MS_NODEV | MS_NOATIME, NULL)",
          tmpfs_dir + 1);
    }
  }

  // Make sure that our working directory is a mount point. The easiest way to
  // do this is by bind-mounting it upon itself.
  // NOTE(review): the result of CreateTarget() is not checked here or in the
  // bind-mount loop below. It reports -1 for existing targets that are neither
  // a directory nor a regular file, which a bind mount may still handle, so a
  // real problem only surfaces through the mount() call that follows.
  PRINT_DEBUG("working dir: %s", opt.working_dir);
  CreateTarget(opt.working_dir + 1, true);
  if (mount(opt.working_dir, opt.working_dir + 1, NULL, MS_BIND, NULL) < 0) {
    DIE("mount(%s, %s, NULL, MS_BIND, NULL)", opt.working_dir,
        opt.working_dir + 1);
  }

  for (const char *bind_mount : opt.bind_mounts) {
    PRINT_DEBUG("bind mount: %s", bind_mount);
    // The target is created with the same kind (directory or file) as the
    // source so that the bind mount has something to attach to.
    CreateTarget(bind_mount + 1, IsDirectory(bind_mount));
    if (mount(bind_mount, bind_mount + 1, NULL, MS_BIND, NULL) < 0) {
      DIE("mount(%s, %s, NULL, MS_BIND, NULL)", bind_mount, bind_mount + 1);
    }
  }

  for (const char *writable_file : opt.writable_files) {
    PRINT_DEBUG("writable: %s", writable_file);
    if (mount(writable_file, writable_file + 1, NULL, MS_BIND, NULL) < 0) {
      DIE("mount(%s, %s, NULL, MS_BIND, NULL)", writable_file,
          writable_file + 1);
    }
  }

  // The placeholders must exist before they can be used as bind sources.
  SetupHelperFiles();

  for (const char *inaccessible_file : opt.inaccessible_files) {
    struct stat sb;
    if (stat(inaccessible_file, &sb) < 0) {
      DIE("stat(%s)", inaccessible_file);
    }

    // Hide directories behind the mode-0 directory and everything else behind
    // the mode-0 file.
    if (S_ISDIR(sb.st_mode)) {
      PRINT_DEBUG("inaccessible dir: %s", inaccessible_file);
      if (mount(global_inaccessible_directory, inaccessible_file + 1, NULL,
                MS_BIND, NULL) < 0) {
        DIE("mount(%s, %s, NULL, MS_BIND, NULL)", global_inaccessible_directory,
            inaccessible_file + 1);
      }
    } else {
      PRINT_DEBUG("inaccessible file: %s", inaccessible_file);
      if (mount(global_inaccessible_file, inaccessible_file + 1, NULL, MS_BIND,
                NULL) < 0) {
        DIE("mount(%s, %s, NULL, MS_BIND, NULL)", global_inaccessible_file,
            inaccessible_file + 1);
      }
    }
  }
}

// Decides whether a mount point keeps write access when the rest of the
// sandbox is remounted read-only.
//
// mnt_dir: mount point path that starts with opt.sandbox_root_dir; that prefix
//          is skipped before comparing.
// Returns true when the remaining path is exactly opt.working_dir, an entry of
// opt.writable_files, or an entry of opt.tmpfs_dirs.
static bool ShouldBeWritable(char *mnt_dir) {
  const char *inner_path = mnt_dir + strlen(opt.sandbox_root_dir);

  if (!strcmp(inner_path, opt.working_dir)) {
    return true;
  }

  for (const char *candidate : opt.writable_files) {
    if (!strcmp(inner_path, candidate)) {
      return true;
    }
  }

  for (const char *candidate : opt.tmpfs_dirs) {
    if (!strcmp(inner_path, candidate)) {
      return true;
    }
  }

  return false;
}

// Returns true if |mnt_dir| is one of opt.tmpfs_dirs or lies below one of
// them.
//
// The match is done on whole path components: with a tmpfs directory "/tmp",
// both "/tmp" and "/tmp/foo" match, but "/tmpfoo" does not. A plain string
// prefix test would wrongly treat such siblings as being under the tmpfs
// directory.
static bool IsUnderTmpDir(const char *mnt_dir) {
  for (const char *tmpfs_dir : opt.tmpfs_dirs) {
    const size_t prefix_len = strlen(tmpfs_dir);
    if (strncmp(mnt_dir, tmpfs_dir, prefix_len) != 0) {
      continue;
    }
    // The prefix matched; accept it only at a path component boundary. An
    // empty prefix or one that already ends in '/' is a boundary by itself.
    // mnt_dir[prefix_len] is in bounds because the first prefix_len bytes
    // compared equal, so mnt_dir is at least that long.
    if (prefix_len == 0 || tmpfs_dir[prefix_len - 1] == '/' ||
        mnt_dir[prefix_len] == '\0' || mnt_dir[prefix_len] == '/') {
      return true;
    }
  }
  return false;
}

// Makes the whole filesystem read-only, except for the paths for which
// ShouldBeWritable returns true.
//
// Walks /proc/self/mounts and remounts every mount point located inside
// opt.sandbox_root_dir, skipping the ones below a tmpfs directory. Remount
// failures with EACCES or EINVAL are tolerated; any other failure aborts via
// DIE().
static void MakeFilesystemMostlyReadOnly() {
  // MS_REMOUNT does not allow us to change certain flags. This means, we have
  // to first read them out and then pass them in back again. There seems to
  // be no better way than this (an API for just getting the mount flags of a
  // mount entry as a bitmask would be great).
  static const struct {
    const char *option;
    int flag;
  } kPreservedOptions[] = {
      {"nodev", MS_NODEV},           {"noexec", MS_NOEXEC},
      {"nosuid", MS_NOSUID},         {"noatime", MS_NOATIME},
      {"nodiratime", MS_NODIRATIME}, {"relatime", MS_RELATIME},
  };

  FILE *mount_table = setmntent("/proc/self/mounts", "r");
  if (!mount_table) {
    DIE("setmntent");
  }

  const size_t root_len = strlen(opt.sandbox_root_dir);

  for (struct mntent *entry = getmntent(mount_table); entry != NULL;
       entry = getmntent(mount_table)) {
    char *const mount_point = entry->mnt_dir;

    // Skip mounts that do not belong to our sandbox.
    if (strstr(mount_point, opt.sandbox_root_dir) != mount_point) {
      continue;
    }
    // Skip mounts that are under tmpfs directories because we've already
    // replaced such directories with new tmpfs instances.
    // mount() would fail with ENOENT if we tried to remount such mount points.
    if (IsUnderTmpDir(mount_point + root_len)) {
      continue;
    }

    int remount_flags = MS_BIND | MS_REMOUNT;
    for (const auto &preserved : kPreservedOptions) {
      if (hasmntopt(entry, preserved.option) != NULL) {
        remount_flags |= preserved.flag;
      }
    }

    if (!ShouldBeWritable(mount_point)) {
      remount_flags |= MS_RDONLY;
    }

    PRINT_DEBUG("remount %s: %s", (remount_flags & MS_RDONLY) ? "ro" : "rw",
                mount_point);

    // If we get EACCES, this might be a mount-point for which we don't have
    // read access. Not much we can do about this, but it also won't do any
    // harm, so let's go on. The same goes for EINVAL, which is fired in case
    // a later mount overlaps an earlier mount, e.g. consider the case of
    // /proc, /proc/sys/fs/binfmt_misc and /proc, with the latter /proc being
    // the one that an outer sandbox has mounted on top of its parent /proc.
    // In that case, we're not allowed to remount /proc/sys/fs/binfmt_misc,
    // because it is hidden.
    if (mount(NULL, mount_point, NULL, remount_flags, NULL) < 0 &&
        errno != EACCES && errno != EINVAL) {
      DIE("remount(NULL, %s, NULL, %d, NULL)", mount_point, remount_flags);
    }
  }

  endmntent(mount_table);
}

// Mounts a fresh procfs on the relative path "proc". The previously visible
// proc still refers to our parent PID namespace, so it has to be covered by a
// new instance. Aborts via DIE() on failure.
static void MountProc() {
  const unsigned long proc_flags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
  const int result = mount("proc", "proc", "proc", proc_flags, NULL);
  if (result < 0) {
    DIE("mount");
  }
}

// Brings up the loopback interface "lo" when opt.create_netns is set; does
// nothing otherwise. Applications running in a separate network namespace may
// still want to use loopback. Any failure aborts via DIE().
static void SetupNetworking() {
  if (!opt.create_netns) {
    return;
  }

  // Any socket will do; it is only used as a handle for the interface ioctl.
  const int sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (sock < 0) {
    DIE("socket");
  }

  struct ifreq request;
  memset(&request, 0, sizeof(request));
  strncpy(request.ifr_name, "lo", IF_NAMESIZE);

  // Verify that name is valid.
  if (if_nametoindex(request.ifr_name) == 0) {
    DIE("if_nametoindex");
  }

  // Enable the interface.
  request.ifr_flags |= IFF_UP;
  if (ioctl(sock, SIOCSIFFLAGS, &request) < 0) {
    DIE("ioctl");
  }

  if (close(sock) < 0) {
    DIE("close");
  }
}

// Makes the current directory the new filesystem root and gets rid of the
// previous one: the old root is pivoted onto a temporary directory, lazily
// unmounted and the temporary directory removed. Finally changes into
// opt.working_dir. Every failing step aborts via DIE().
static void EnterSandbox() {
  // Temporary mount point that receives the real root before it is detached.
  char put_old[] = "tmp/old-root-XXXXXX";
  if (!mkdtemp(put_old)) {
    DIE("mkdtemp(%s)", put_old);
  }

  // pivot_root has no wrapper in libc, so we need syscall()
  const long pivot_result = syscall(SYS_pivot_root, ".", put_old);
  if (pivot_result < 0) {
    DIE("pivot_root(., %s)", put_old);
  }

  if (chroot(".") < 0) {
    DIE("chroot(.)");
  }

  // MNT_DETACH: lazy unmount, so the old root disappears from our namespace
  // even if it is still busy.
  if (umount2(put_old, MNT_DETACH) < 0) {
    DIE("umount2(%s, MNT_DETACH)", put_old);
  }

  if (rmdir(put_old) < 0) {
    DIE("rmdir(%s)", put_old);
  }

  if (chdir(opt.working_dir) < 0) {
    DIE("chdir(%s)", opt.working_dir);
  }
}

// Installs |handler| as the disposition of |signum|.
//
// For custom handlers all signals are blocked while the handler runs; for
// SIG_IGN / SIG_DFL the mask is left empty. A failing sigaction() is only
// reported through PRINT_DEBUG, because it is expected for certain reserved
// signals. Failing to build the signal mask aborts via DIE().
static void InstallSignalHandler(int signum, void (*handler)(int)) {
  struct sigaction action;
  memset(&action, 0, sizeof(action));
  action.sa_handler = handler;

  const bool is_custom_handler = handler != SIG_IGN && handler != SIG_DFL;
  if (is_custom_handler) {
    // When using a custom handler, block all signals from firing while the
    // handler is running.
    if (sigfillset(&action.sa_mask) < 0) {
      DIE("sigfillset");
    }
  } else {
    // No point in blocking signals when using the default handler or ignoring
    // the signal.
    if (sigemptyset(&action.sa_mask) < 0) {
      DIE("sigemptyset");
    }
  }

  // sigaction may fail for certain reserved signals. Ignore failure in this
  // case, but report it in debug mode, just in case.
  const int result = sigaction(signum, &action, NULL);
  if (result < 0) {
    PRINT_DEBUG("sigaction(%d, &sa, NULL) failed", signum);
  }
}

// Sets the disposition of |signum| to "ignore".
static void IgnoreSignal(int signum) {
  InstallSignalHandler(signum, SIG_IGN);
}

// Unblocks every signal for the calling process and resets the disposition of
// all signals in [1, NSIG) to SIG_DFL. Failing to build or apply the empty
// mask aborts via DIE(); failures to reset individual handlers are ignored.
static void RestoreSignalHandlersAndMask() {
  // An empty process signal mask means that no signal is blocked.
  sigset_t no_signals;
  if (sigemptyset(&no_signals) < 0) {
    DIE("sigemptyset");
  }
  if (sigprocmask(SIG_SETMASK, &no_signals, nullptr) < 0) {
    DIE("sigprocmask(SIG_SETMASK, <empty set>, nullptr)");
  }

  // One shared "default handler, empty mask" action for every signal.
  struct sigaction default_action;
  memset(&default_action, 0, sizeof(default_action));
  if (sigemptyset(&default_action.sa_mask) < 0) {
    DIE("sigemptyset");
  }
  default_action.sa_handler = SIG_DFL;

  int signum = 1;
  while (signum < NSIG) {
    // Ignore possible errors, because we might not be allowed to set the
    // handler for certain signals, but we still want to try.
    sigaction(signum, &default_action, nullptr);
    ++signum;
  }
}

// Signal handler: forwards |signum| to the process group whose ID is
// global_child_pid.
//
// errno is saved on entry and restored on exit. The handler can run between a
// failing call in the main flow and the code that inspects errno afterwards
// (for example the EINTR check after waitpid(), or perror() inside DIE()), and
// both the debug output and kill() may overwrite errno.
static void ForwardSignal(int signum) {
  const int saved_errno = errno;
  PRINT_DEBUG("ForwardSignal(%d)", signum);
  // A negative pid makes kill() address the whole process group.
  kill(-global_child_pid, signum);
  errno = saved_errno;
}

// Resets all signal state, then installs this process' signal policy for
// every signal in [1, NSIG):
//  - fatal/synchronous signals, SIGCHLD, SIGKILL and SIGSTOP keep the default
//    disposition,
//  - SIGTTIN and SIGTTOU are ignored,
//  - everything else is forwarded through ForwardSignal().
static void SetupSignalHandlers() {
  RestoreSignalHandlersAndMask();

  for (int signum = 1; signum < NSIG; ++signum) {
    switch (signum) {
      // Some signals should indeed kill us and not be forwarded to the child,
      // thus we can use the default handler.
      case SIGABRT:
      case SIGBUS:
      case SIGFPE:
      case SIGILL:
      case SIGSEGV:
      case SIGSYS:
      case SIGTRAP:
      // It's fine to use the default handler for SIGCHLD, because we use
      // waitpid() in the main loop to wait for children to die anyway.
      case SIGCHLD:
      // One does not simply install a signal handler for these two signals
      case SIGKILL:
      case SIGSTOP:
        // Nothing to install: the default disposition was already restored.
        continue;

      // Ignore SIGTTIN and SIGTTOU, as we hand off the terminal to the child in
      // SpawnChild().
      case SIGTTIN:
      case SIGTTOU:
        IgnoreSignal(signum);
        continue;

      // All other signals should be forwarded to the child.
      default:
        InstallSignalHandler(signum, ForwardSignal);
        continue;
    }
  }
}

// Forks the sandboxed command.
//
// Parent: records the child's PID in global_child_pid and returns.
// Child:  moves into its own process group, tries to take over the terminal,
//         resets signal state and the umask, then execvp()s opt.args. The
//         child never returns from this function.
// A failing fork/setpgid/tcsetpgrp/execvp aborts the affected process via
// DIE().
static void SpawnChild() {
  global_child_pid = fork();

  if (global_child_pid < 0) {
    DIE("fork()");
  }
  if (global_child_pid > 0) {
    // Parent: nothing more to do here.
    return;
  }

  // From here on we are the child.

  // Put the child into its own process group.
  if (setpgid(0, 0) < 0) {
    DIE("setpgid");
  }

  // Try to assign our terminal to the child process. Not having a terminal
  // at all (ENOTTY) is fine.
  const int tty_result = tcsetpgrp(STDIN_FILENO, getpgrp());
  if (tty_result < 0 && errno != ENOTTY) {
    DIE("tcsetpgrp");
  }

  // Unblock all signals, restore default handlers.
  RestoreSignalHandlersAndMask();

  // Force umask to include read and execute for everyone, to make output
  // permissions predictable.
  umask(022);

  // argv[] passed to execve() must be a null-terminated array.
  opt.args.push_back(nullptr);

  // execvp() only returns on failure.
  if (execvp(opt.args[0], opt.args.data()) < 0) {
    DIE("execvp(%s, %p)", opt.args[0], opt.args.data());
  }
}

// Reaps terminated children until the process spawned by SpawnChild() is
// among them, then terminates this process with a matching exit code:
// 128 + signal number if the child was killed by a signal, otherwise the
// child's own exit status. Never returns. A waitpid() failure other than EINTR
// aborts via DIE().
static void WaitForChild() {
  for (;;) {
    // Check for zombies to be reaped and exit, if our own child exited.
    int status;
    const pid_t reaped_pid = waitpid(-1, &status, 0);
    PRINT_DEBUG("waitpid returned %d", reaped_pid);

    if (reaped_pid < 0) {
      // Our PID1 process got a signal that interrupted the waitpid() call and
      // that was either ignored or forwarded to the child. This is expected &
      // fine, just continue waiting.
      if (errno == EINTR) {
        continue;
      }
      DIE("waitpid");
    }

    if (reaped_pid != global_child_pid) {
      // Some other process in our PID namespace was reaped; keep waiting.
      continue;
    }

    // If the child process we spawned earlier terminated, we'll also
    // terminate. We can simply _exit() here, because the Linux kernel will
    // kindly SIGKILL all remaining processes in our PID namespace once we
    // exit.
    if (WIFSIGNALED(status)) {
      PRINT_DEBUG("child died due to signal %d", WTERMSIG(status));
      _exit(128 + WTERMSIG(status));
    }

    PRINT_DEBUG("child exited with code %d", WEXITSTATUS(status));
    _exit(WEXITSTATUS(status));
  }
}

// Entry point of the sandbox' init process.
//
// sync_pipe_param: pointer to the two pipe file descriptors handed on to
//                  SetupSelfDestruction().
//
// Sets up the namespaces and the sandbox filesystem, enters the sandbox,
// spawns the command and waits for it. Does not return normally: the process
// ends through one of the _exit() calls in WaitForChild(), DIE() or the
// fallback at the bottom.
int Pid1Main(void *sync_pipe_param) {
  const pid_t own_pid = getpid();
  if (own_pid != 1) {
    DIE("Using PID namespaces, but we are not PID 1");
  }

  int *const sync_pipe = static_cast<int *>(sync_pipe_param);
  SetupSelfDestruction(sync_pipe);

  // Namespace configuration.
  SetupMountNamespace();
  SetupUserNamespace();
  SetupUtsNamespace();

  // Sandbox filesystem and network.
  MountFilesystems();
  MakeFilesystemMostlyReadOnly();
  MountProc();
  SetupNetworking();
  EnterSandbox();

  // Run the command.
  SetupSignalHandlers();
  SpawnChild();
  WaitForChild();

  // WaitForChild() exits the process itself; reaching this line is an error.
  _exit(EXIT_FAILURE);
}