src/shutdown/shutdown.c - systemd-stable - Rivoreo Source Code Repositories

 /* SPDX-License-Identifier: LGPL-2.1+ */
 /***
   Copyright © 2010 ProFUSION embedded systems
 ***/

 #include <errno.h>
 #include <getopt.h>
 #include <linux/reboot.h>
 #include <stdbool.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/reboot.h>
 #include <sys/stat.h>
 #include <unistd.h>

 #include "alloc-util.h"
 #include "async.h"
 #include "binfmt-util.h"
 #include "cgroup-setup.h"
 #include "cgroup-util.h"
 #include "def.h"
 #include "exec-util.h"
 #include "fd-util.h"
 #include "fileio.h"
 #include "killall.h"
 #include "log.h"
 #include "parse-util.h"
 #include "process-util.h"
 #include "reboot-util.h"
 #include "rlimit-util.h"
 #include "signal-util.h"
 #include "string-util.h"
 #include "switch-root.h"
 #include "sysctl-util.h"
 #include "terminal-util.h"
 #include "umount.h"
 #include "util.h"
 #include "virt.h"
 #include "watchdog.h"

 /* Upper bound for the attempt counter in sync_with_progress(); reaching it means the sync is
  * treated as stalled. */
 #define SYNC_PROGRESS_ATTEMPTS 3
 /* How long sync_with_progress() waits for the sync child on each attempt. */
 #define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)

 /* Command line state, filled in by parse_argv(). */
 static char* arg_verb;          /* first non-option argument */
 static uint8_t arg_exit_code;   /* --exit-code=; returned by main() for the "exit" verb in a container */
 static usec_t arg_timeout = DEFAULT_TIMEOUT_USEC;  /* --timeout=; passed to broadcast_signal() in main() */

 /* Parses the command line handed to the shutdown binary.
  *
  * The long options adjust logging (--log-level, --log-target, --log-color, --log-location), the
  * exit status (--exit-code) and the timeout (--timeout). A value that fails to parse is logged
  * and otherwise ignored. The first non-option argument is stored in arg_verb; any further
  * non-option arguments are logged and dropped.
  *
  * Returns 0 on success, -EINVAL if getopt_long() reports an error ('?'), and the result of
  * log_error_errno() if no verb was given. */
 static int parse_argv(int argc, char *argv[]) {
         enum {
                 ARG_LOG_LEVEL = 0x100,
                 ARG_LOG_TARGET,
                 ARG_LOG_COLOR,
                 ARG_LOG_LOCATION,
                 ARG_EXIT_CODE,
                 ARG_TIMEOUT,
         };

         static const struct option long_opts[] = {
                 { "log-level",     required_argument, NULL, ARG_LOG_LEVEL    },
                 { "log-target",    required_argument, NULL, ARG_LOG_TARGET   },
                 { "log-color",     optional_argument, NULL, ARG_LOG_COLOR    },
                 { "log-location",  optional_argument, NULL, ARG_LOG_LOCATION },
                 { "exit-code",     required_argument, NULL, ARG_EXIT_CODE    },
                 { "timeout",       required_argument, NULL, ARG_TIMEOUT      },
                 {}
         };

         assert(argc >= 1);
         assert(argv);

         for (;;) {
                 int opt, r;

                 /* The "-" option string stops getopt from permuting argv[], so the verb stays
                  * at argv[1], as our interface to the initrd promises. */
                 opt = getopt_long(argc, argv, "-", long_opts, NULL);
                 if (opt < 0)
                         break;

                 switch (opt) {

                 case ARG_LOG_LEVEL:
                         r = log_set_max_level_from_string(optarg);
                         if (r < 0)
                                 log_error_errno(r, "Failed to parse log level %s, ignoring: %m", optarg);
                         break;

                 case ARG_LOG_TARGET:
                         r = log_set_target_from_string(optarg);
                         if (r < 0)
                                 log_error_errno(r, "Failed to parse log target %s, ignoring: %m", optarg);
                         break;

                 case ARG_LOG_COLOR:
                         /* Without a value the option simply switches colors on. */
                         if (!optarg) {
                                 log_show_color(true);
                                 break;
                         }

                         r = log_show_color_from_string(optarg);
                         if (r < 0)
                                 log_error_errno(r, "Failed to parse log color setting %s, ignoring: %m", optarg);
                         break;

                 case ARG_LOG_LOCATION:
                         /* Without a value the option simply switches location logging on. */
                         if (!optarg) {
                                 log_show_location(true);
                                 break;
                         }

                         r = log_show_location_from_string(optarg);
                         if (r < 0)
                                 log_error_errno(r, "Failed to parse log location setting %s, ignoring: %m", optarg);
                         break;

                 case ARG_EXIT_CODE:
                         r = safe_atou8(optarg, &arg_exit_code);
                         if (r < 0)
                                 log_error_errno(r, "Failed to parse exit code %s, ignoring: %m", optarg);
                         break;

                 case ARG_TIMEOUT:
                         r = parse_sec(optarg, &arg_timeout);
                         if (r < 0)
                                 log_error_errno(r, "Failed to parse shutdown timeout %s, ignoring: %m", optarg);
                         break;

                 case '\001':
                         /* Non-option argument: only the first one is kept as the verb. */
                         if (arg_verb)
                                 log_error("Excess arguments, ignoring");
                         else
                                 arg_verb = optarg;
                         break;

                 case '?':
                         return -EINVAL;

                 default:
                         assert_not_reached("Unhandled option code.");
                 }
         }

         if (arg_verb)
                 return 0;

         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                "Verb argument missing.");
 }

 /* Turns /run/initramfs into a mount point of its own and switches the root file system to it.
  *
  * Returns the result of log_error_errno() if one of the two mount() calls fails, otherwise
  * whatever switch_root() returns. */
 static int switch_root_initramfs(void) {
         int r;

         r = mount("/run/initramfs", "/run/initramfs", NULL, MS_BIND, NULL);
         if (r < 0)
                 return log_error_errno(errno, "Failed to mount bind /run/initramfs on /run/initramfs: %m");

         r = mount(NULL, "/run/initramfs", NULL, MS_PRIVATE, NULL);
         if (r < 0)
                 return log_error_errno(errno, "Failed to make /run/initramfs private mount: %m");

         /* MS_BIND is used for the switch because processes with open file descriptors may still
          * be lurking around; /run/initramfs/shutdown will take care of them. The old root is not
          * detached either, since /run/initramfs/shutdown needs to access it. */
         return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
 }

 /* Checks whether the amount of data still waiting to be written out has shrunk.
  *
  * Sums up the following fields of /proc/meminfo:
  *
  *  NFS_Unstable
  *  Writeback
  *  Dirty
  *
  * and compares the sum against *prev_dirty, which is then overwritten with the new sum.
  *
  * Returns a positive value if the new sum is smaller than the previous one, i.e. the sync is
  * making progress, and 0 if it is not. If /proc/meminfo cannot be opened or read, the result of
  * log_warning_errno() is returned; if a field cannot be parsed, a warning is logged and 0 is
  * returned. In those failure cases *prev_dirty is left untouched.
  */
 static int sync_making_progress(unsigned long long *prev_dirty) {
         _cleanup_fclose_ FILE *f = NULL;
         unsigned long long total = 0;
         int shrunk;

         f = fopen("/proc/meminfo", "re");
         if (!f)
                 return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");

         for (;;) {
                 _cleanup_free_ char *line = NULL;
                 unsigned long long field = 0;
                 int n;

                 n = read_line(f, LONG_LINE_MAX, &line);
                 if (n < 0)
                         return log_warning_errno(n, "Failed to parse /proc/meminfo: %m");
                 if (n == 0)
                         break;

                 /* Only the three counters listed above are of interest. */
                 if (!(first_word(line, "NFS_Unstable:") || first_word(line, "Writeback:") || first_word(line, "Dirty:")))
                         continue;

                 /* Skip the field name, read the number, skip the unit. */
                 errno = 0;
                 if (sscanf(line, "%*s %llu %*s", &field) == 1) {
                         total += field;
                         continue;
                 }

                 if (errno != 0)
                         log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
                 else
                         log_warning("Failed to parse /proc/meminfo");

                 return false;
         }

         shrunk = *prev_dirty > total;
         *prev_dirty = total;
         return shrunk;
 }

 /* Syncs filesystems and block devices without risking to hang forever.
  *
  * Because the sync operation may hang, it is run in a child process (asynchronous_sync()) whose
  * termination is awaited in slices of SYNC_TIMEOUT_USEC. Every time a slice times out,
  * sync_making_progress() is consulted and the attempt counter starts over if the amount of dirty
  * data went down. Once the counter reaches SYNC_PROGRESS_ATTEMPTS the sync is assumed to be
  * stalled and the child is sent SIGKILL. */
 static void sync_with_progress(void) {
         unsigned long long dirty = ULLONG_MAX;
         unsigned attempt = 0;
         pid_t pid;
         int r;

         BLOCK_SIGNALS(SIGCHLD);

         r = asynchronous_sync(&pid);
         if (r < 0) {
                 log_error_errno(r, "Failed to fork sync(): %m");
                 return;
         }

         log_info("Syncing filesystems and block devices.");

         while (attempt < SYNC_PROGRESS_ATTEMPTS) {
                 r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
                 if (r == 0)
                         /* Sync finished without error.
                          * (The sync itself does not return an error code) */
                         return;

                 if (r != -ETIMEDOUT) {
                         log_error_errno(r, "Failed to sync filesystems and block devices: %m");
                         return;
                 }

                 /* Timed out: start counting afresh if the "Dirty" value is decreasing. */
                 if (sync_making_progress(&dirty) > 0)
                         attempt = 0;

                 attempt++;
         }

         /* Only reached in the event of a timeout. We should issue a kill
          * to the stray process. */
         log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
         (void) kill(pid, SIGKILL);
 }

 static int read_current_sysctl_printk_log_level(void) {
         _cleanup_free_ char *sysctl_printk_vals = NULL, *sysctl_printk_curr = NULL;
         int current_lvl;
         const char *p;
         int r;

         r = sysctl_read("kernel/printk", &sysctl_printk_vals);
         if (r < 0)
                 return log_debug_errno(r, "Cannot read sysctl kernel.printk: %m");

         p = sysctl_printk_vals;
         r = extract_first_word(&p, &sysctl_printk_curr, NULL, 0);
         if (r < 0)
                 return log_debug_errno(r, "Failed to split out kernel printk priority: %m");
         if (r == 0)
                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Short read while reading kernel.printk sysctl");

         r = safe_atoi(sysctl_printk_curr, &current_lvl);
         if (r < 0)
                 return log_debug_errno(r, "Failed to parse kernel.printk sysctl: %s", sysctl_printk_vals);

         return current_lvl;
 }

 /* Raises kernel.printk, if necessary, so that messages with a log level smaller than or equal to
  * min_level become visible. The sysctl is left alone if its current value cannot be read or is
  * already at least min_level + 1. A failed write is only logged at debug level. */
 static void bump_sysctl_printk_log_level(int min_level) {
         int current, wanted, r;

         current = read_current_sysctl_printk_log_level();
         if (current < 0)
                 return; /* Unknown; the reader has already logged the reason. */

         wanted = min_level + 1;
         if (current >= wanted)
                 return; /* Already verbose enough. */

         r = sysctl_writef("kernel/printk", "%i", wanted);
         if (r < 0)
                 log_debug_errno(r, "Failed to bump kernel.printk to %i: %m", wanted);
 }

 /* Final stage of system shutdown; refuses to run unless it is PID 1.
  *
  * In order, this: parses the command line and maps the verb to a reboot() command, syncs, sends
  * SIGTERM and then SIGKILL to the remaining processes, loops over unmounting file systems and
  * deactivating swaps, loop devices and DM devices until everything is gone or an iteration
  * changes nothing, runs the executables below SYSTEM_SHUTDOWN_PATH, hands over to
  * /run/initramfs/shutdown if that is available, syncs once more and finally calls reboot().
  *
  * Inside a container the sync and the unmount/detach steps are skipped; the "exit" verb returns
  * arg_exit_code there, and a reboot() that fails with EPERM ends the process with EXIT_SUCCESS.
  * On the host "exit" falls back to powering off.
  *
  * Every fatal problem ends up at the error label, which logs at emergency level and freezes. */
 int main(int argc, char *argv[]) {
         bool need_umount, need_swapoff, need_loop_detach, need_dm_detach, in_container, use_watchdog = false, can_initrd;
         _cleanup_free_ char *cgroup = NULL;
         char *arguments[3], *watchdog_device;
         int cmd, r, umount_log_level = LOG_INFO;
         static const char* const dirs[] = {SYSTEM_SHUTDOWN_PATH, NULL};

         /* The log target defaults to console, but the original systemd process will pass its log target in through a
          * command line argument, which will override this default. Also, ensure we'll never log to the journal or
          * syslog, as these logging daemons are either already dead or will die very soon. */

         log_set_target(LOG_TARGET_CONSOLE);
         log_set_prohibit_ipc(true);
         log_parse_environment();

         r = parse_argv(argc, argv);
         if (r < 0)
                 goto error;

         log_open();

         umask(0022);

         if (getpid_cached() != 1) {
                 log_error("Not executed by init (PID 1).");
                 r = -EPERM;
                 goto error;
         }

         /* Translate the verb into the command that is eventually passed to reboot(). */
         if (streq(arg_verb, "reboot"))
                 cmd = RB_AUTOBOOT;
         else if (streq(arg_verb, "poweroff"))
                 cmd = RB_POWER_OFF;
         else if (streq(arg_verb, "halt"))
                 cmd = RB_HALT_SYSTEM;
         else if (streq(arg_verb, "kexec"))
                 cmd = LINUX_REBOOT_CMD_KEXEC;
         else if (streq(arg_verb, "exit"))
                 cmd = 0; /* ignored, just checking that arg_verb is valid */
         else {
                 log_error("Unknown action '%s'.", arg_verb);
                 r = -EINVAL;
                 goto error;
         }

         (void) cg_get_root_path(&cgroup);
         in_container = detect_container() > 0;

         /* If the logging messages are going to KMSG, and if we are not running from a container, then try to
          * update the sysctl kernel.printk current value in order to see "info" messages; This current log
          * level is not updated if already big enough.
          *
          * NOTE(review): the text above speaks of "info" messages, but LOG_WARNING is what is passed
          * below - confirm which one is intended.
          */
         if (!in_container &&
             IN_SET(log_get_target(),
                    LOG_TARGET_AUTO,
                    LOG_TARGET_JOURNAL_OR_KMSG,
                    LOG_TARGET_SYSLOG_OR_KMSG,
                    LOG_TARGET_KMSG))
                 bump_sysctl_printk_log_level(LOG_WARNING);

         /* Only the presence of $WATCHDOG_USEC matters here, its value is not parsed. */
         use_watchdog = getenv("WATCHDOG_USEC");
         watchdog_device = getenv("WATCHDOG_DEVICE");
         if (watchdog_device) {
                 r = watchdog_set_device(watchdog_device);
                 if (r < 0)
                         log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m",
                                           watchdog_device);
         }

         /* Lock us into memory */
         (void) mlockall(MCL_CURRENT|MCL_FUTURE);

         /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
          * slow IO is processed here already and the final process killing spree is not impacted by processes
          * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
          * result. */
         if (!in_container)
                 sync_with_progress();

         disable_coredumps();
         disable_binfmt();

         log_info("Sending SIGTERM to remaining processes...");
         broadcast_signal(SIGTERM, true, true, arg_timeout);

         log_info("Sending SIGKILL to remaining processes...");
         broadcast_signal(SIGKILL, true, false, arg_timeout);

         /* None of the steps below is attempted when running in a container. */
         need_umount = !in_container;
         need_swapoff = !in_container;
         need_loop_detach = !in_container;
         need_dm_detach = !in_container;
         can_initrd = !in_container && !in_initrd() && access("/run/initramfs/shutdown", X_OK) == 0;

         /* Unmount all mountpoints, swaps, and loopback devices */
         for (;;) {
                 bool changed = false;

                 if (use_watchdog)
                         (void) watchdog_ping();

                 /* Let's trim the cgroup tree on each iteration so
                    that we leave an empty cgroup tree around, so that
                    container managers get a nice notify event when we
                    are down */
                 if (cgroup)
                         (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, cgroup, false);

                 if (need_umount) {
                         log_info("Unmounting file systems.");
                         r = umount_all(&changed, umount_log_level);
                         if (r == 0) {
                                 need_umount = false;
                                 log_info("All filesystems unmounted.");
                         } else if (r > 0)
                                 log_info("Not all file systems unmounted, %d left.", r);
                         else
                                 log_error_errno(r, "Failed to unmount file systems: %m");
                 }

                 if (need_swapoff) {
                         log_info("Deactivating swaps.");
                         r = swapoff_all(&changed);
                         if (r == 0) {
                                 need_swapoff = false;
                                 log_info("All swaps deactivated.");
                         } else if (r > 0)
                                 log_info("Not all swaps deactivated, %d left.", r);
                         else
                                 log_error_errno(r, "Failed to deactivate swaps: %m");
                 }

                 if (need_loop_detach) {
                         log_info("Detaching loop devices.");
                         r = loopback_detach_all(&changed, umount_log_level);
                         if (r == 0) {
                                 need_loop_detach = false;
                                 log_info("All loop devices detached.");
                         } else if (r > 0)
                                 log_info("Not all loop devices detached, %d left.", r);
                         else
                                 log_error_errno(r, "Failed to detach loop devices: %m");
                 }

                 if (need_dm_detach) {
                         log_info("Detaching DM devices.");
                         r = dm_detach_all(&changed, umount_log_level);
                         if (r == 0) {
                                 need_dm_detach = false;
                                 log_info("All DM devices detached.");
                         } else if (r > 0)
                                 log_info("Not all DM devices detached, %d left.", r);
                         else
                                 log_error_errno(r, "Failed to detach DM devices: %m");
                 }

                 if (!need_umount && !need_swapoff && !need_loop_detach && !need_dm_detach) {
                         log_info("All filesystems, swaps, loop devices and DM devices detached.");
                         /* Yay, done */
                         break;
                 }

                 if (!changed && umount_log_level == LOG_INFO && !can_initrd) {
                         /* There are things we cannot get rid of. Loop one more time
                          * with LOG_ERR to inform the user. Note that we don't need
                          * to do this if there is a initrd to switch to, because that
                          * one is likely to get rid of the remounting mounts. If not,
                          * it will log about them. */
                         umount_log_level = LOG_ERR;
                         continue;
                 }

                 /* If in this iteration we didn't manage to
                  * unmount/deactivate anything, we simply give up */
                 if (!changed) {
                         log_info("Cannot finalize remaining%s%s%s%s continuing.",
                                  need_umount ? " file systems," : "",
                                  need_swapoff ? " swap devices," : "",
                                  need_loop_detach ? " loop devices," : "",
                                  need_dm_detach ? " DM devices," : "");
                         break;
                 }

                 /* Each fragment brings its own leading space, so none goes after "remaining". */
                 log_debug("Couldn't finalize remaining%s%s%s%s trying again.",
                           need_umount ? " file systems," : "",
                           need_swapoff ? " swap devices," : "",
                           need_loop_detach ? " loop devices," : "",
                           need_dm_detach ? " DM devices," : "");
         }

         /* We're done with the watchdog. */
         watchdog_free_device();

         /* NOTE(review): arguments[0] is left NULL here; presumably execute_directories() fills in
          * the program name itself - confirm against its implementation. */
         arguments[0] = NULL;
         arguments[1] = arg_verb;
         arguments[2] = NULL;
         (void) execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, arguments, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS);

         (void) rlimit_nofile_safe();

         if (can_initrd) {
                 r = switch_root_initramfs();
                 if (r >= 0) {
                         argv[0] = (char*) "/shutdown";

                         (void) setsid();
                         (void) make_console_stdio();

                         log_info("Successfully changed into root pivot.\n"
                                  "Returning to initrd...");

                         /* Only returns on failure; in that case we carry on with our own shutdown logic below. */
                         execv("/shutdown", argv);
                         log_error_errno(errno, "Failed to execute shutdown binary: %m");
                 } else
                         log_error_errno(r, "Failed to switch root to \"/run/initramfs\": %m");
         }

         if (need_umount || need_swapoff || need_loop_detach || need_dm_detach)
                 log_error("Failed to finalize%s%s%s%s ignoring.",
                           need_umount ? " file systems," : "",
                           need_swapoff ? " swap devices," : "",
                           need_loop_detach ? " loop devices," : "",
                           need_dm_detach ? " DM devices," : "");

         /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
          * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
          * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
          * let's do it once more. Do not remove this sync, data corruption will result. */
         if (!in_container)
                 sync_with_progress();

         if (streq(arg_verb, "exit")) {
                 if (in_container)
                         return arg_exit_code;

                 cmd = RB_POWER_OFF; /* We cannot exit() on the host, fallback on another method. */
         }

         switch (cmd) {

         case LINUX_REBOOT_CMD_KEXEC:

                 if (!in_container) {
                         /* We cheat and exec kexec to avoid doing all its work */
                         log_info("Rebooting with kexec.");

                         r = safe_fork("(sd-kexec)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, NULL);
                         if (r == 0) {
                                 const char * const args[] = {
                                         KEXEC, "-e", NULL
                                 };

                                 /* Child */

                                 execv(args[0], (char * const *) args);
                                 _exit(EXIT_FAILURE);
                         }

                         /* If we are still running, then the kexec can't have worked, let's fall through */
                 }

                 cmd = RB_AUTOBOOT;
                 _fallthrough_;

         case RB_AUTOBOOT:
                 (void) reboot_with_parameter(REBOOT_LOG);
                 log_info("Rebooting.");
                 break;

         case RB_POWER_OFF:
                 log_info("Powering off.");
                 break;

         case RB_HALT_SYSTEM:
                 log_info("Halting system.");
                 break;

         default:
                 assert_not_reached("Unknown magic");
         }

         (void) reboot(cmd);
         if (errno == EPERM && in_container) {
                 /* If we are in a container, and we lacked
                  * CAP_SYS_BOOT just exit, this will kill our
                  * container for good. */
                 log_info("Exiting container.");
                 return EXIT_SUCCESS;
         }

         r = log_error_errno(errno, "Failed to invoke reboot(): %m");

   error:
         log_emergency_errno(r, "Critical error while doing system shutdown: %m");
         freeze();
 }
	/* SPDX-License-Identifier: LGPL-2.1+ */
	/***
	Copyright © 2010 ProFUSION embedded systems
	***/

	#include <errno.h>
	#include <getopt.h>
	#include <linux/reboot.h>
	#include <stdbool.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <sys/mount.h>
	#include <sys/reboot.h>
	#include <sys/stat.h>
	#include <unistd.h>

	#include "alloc-util.h"
	#include "async.h"
	#include "binfmt-util.h"
	#include "cgroup-setup.h"
	#include "cgroup-util.h"
	#include "def.h"
	#include "exec-util.h"
	#include "fd-util.h"
	#include "fileio.h"
	#include "killall.h"
	#include "log.h"
	#include "parse-util.h"
	#include "process-util.h"
	#include "reboot-util.h"
	#include "rlimit-util.h"
	#include "signal-util.h"
	#include "string-util.h"
	#include "switch-root.h"
	#include "sysctl-util.h"
	#include "terminal-util.h"
	#include "umount.h"
	#include "util.h"
	#include "virt.h"
	#include "watchdog.h"

	/* Upper bound for the attempt counter in sync_with_progress(); reaching it means the sync is
	 * treated as stalled. */
	#define SYNC_PROGRESS_ATTEMPTS 3
	/* How long sync_with_progress() waits for the sync child on each attempt. */
	#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)

	/* Command line state, filled in by parse_argv(). */
	static char* arg_verb;          /* first non-option argument */
	static uint8_t arg_exit_code;   /* value of --exit-code= */
	static usec_t arg_timeout = DEFAULT_TIMEOUT_USEC;  /* value of --timeout= */

	/* Parses the command line handed to the shutdown binary.
	 *
	 * The long options adjust logging (--log-level, --log-target, --log-color, --log-location), the
	 * exit status (--exit-code) and the timeout (--timeout). A value that fails to parse is logged
	 * and otherwise ignored. The first non-option argument is stored in arg_verb; any further
	 * non-option arguments are logged and dropped.
	 *
	 * Returns 0 on success, -EINVAL if getopt_long() reports an error ('?'), and the result of
	 * log_error_errno() if no verb was given. */
	static int parse_argv(int argc, char *argv[]) {
	        enum {
	                ARG_LOG_LEVEL = 0x100,
	                ARG_LOG_TARGET,
	                ARG_LOG_COLOR,
	                ARG_LOG_LOCATION,
	                ARG_EXIT_CODE,
	                ARG_TIMEOUT,
	        };

	        static const struct option long_opts[] = {
	                { "log-level", required_argument, NULL, ARG_LOG_LEVEL },
	                { "log-target", required_argument, NULL, ARG_LOG_TARGET },
	                { "log-color", optional_argument, NULL, ARG_LOG_COLOR },
	                { "log-location", optional_argument, NULL, ARG_LOG_LOCATION },
	                { "exit-code", required_argument, NULL, ARG_EXIT_CODE },
	                { "timeout", required_argument, NULL, ARG_TIMEOUT },
	                {}
	        };

	        assert(argc >= 1);
	        assert(argv);

	        for (;;) {
	                int opt, r;

	                /* The "-" option string stops getopt from permuting argv[], so the verb stays
	                 * at argv[1], as our interface to the initrd promises. */
	                opt = getopt_long(argc, argv, "-", long_opts, NULL);
	                if (opt < 0)
	                        break;

	                switch (opt) {

	                case ARG_LOG_LEVEL:
	                        r = log_set_max_level_from_string(optarg);
	                        if (r < 0)
	                                log_error_errno(r, "Failed to parse log level %s, ignoring: %m", optarg);
	                        break;

	                case ARG_LOG_TARGET:
	                        r = log_set_target_from_string(optarg);
	                        if (r < 0)
	                                log_error_errno(r, "Failed to parse log target %s, ignoring: %m", optarg);
	                        break;

	                case ARG_LOG_COLOR:
	                        /* Without a value the option simply switches colors on. */
	                        if (!optarg) {
	                                log_show_color(true);
	                                break;
	                        }

	                        r = log_show_color_from_string(optarg);
	                        if (r < 0)
	                                log_error_errno(r, "Failed to parse log color setting %s, ignoring: %m", optarg);
	                        break;

	                case ARG_LOG_LOCATION:
	                        /* Without a value the option simply switches location logging on. */
	                        if (!optarg) {
	                                log_show_location(true);
	                                break;
	                        }

	                        r = log_show_location_from_string(optarg);
	                        if (r < 0)
	                                log_error_errno(r, "Failed to parse log location setting %s, ignoring: %m", optarg);
	                        break;

	                case ARG_EXIT_CODE:
	                        r = safe_atou8(optarg, &arg_exit_code);
	                        if (r < 0)
	                                log_error_errno(r, "Failed to parse exit code %s, ignoring: %m", optarg);
	                        break;

	                case ARG_TIMEOUT:
	                        r = parse_sec(optarg, &arg_timeout);
	                        if (r < 0)
	                                log_error_errno(r, "Failed to parse shutdown timeout %s, ignoring: %m", optarg);
	                        break;

	                case '\001':
	                        /* Non-option argument: only the first one is kept as the verb. */
	                        if (arg_verb)
	                                log_error("Excess arguments, ignoring");
	                        else
	                                arg_verb = optarg;
	                        break;

	                case '?':
	                        return -EINVAL;

	                default:
	                        assert_not_reached("Unhandled option code.");
	                }
	        }

	        if (arg_verb)
	                return 0;

	        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
	                               "Verb argument missing.");
	}

	/* Turns /run/initramfs into a mount point of its own and switches the root file system to it.
	 *
	 * Returns the result of log_error_errno() if one of the two mount() calls fails, otherwise
	 * whatever switch_root() returns. */
	static int switch_root_initramfs(void) {
	        int r;

	        r = mount("/run/initramfs", "/run/initramfs", NULL, MS_BIND, NULL);
	        if (r < 0)
	                return log_error_errno(errno, "Failed to mount bind /run/initramfs on /run/initramfs: %m");

	        r = mount(NULL, "/run/initramfs", NULL, MS_PRIVATE, NULL);
	        if (r < 0)
	                return log_error_errno(errno, "Failed to make /run/initramfs private mount: %m");

	        /* MS_BIND is used for the switch because processes with open file descriptors may still
	         * be lurking around; /run/initramfs/shutdown will take care of them. The old root is not
	         * detached either, since /run/initramfs/shutdown needs to access it. */
	        return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
	}

	/* Checks whether the amount of data still waiting to be written out has shrunk.
	 *
	 * Sums up the following fields of /proc/meminfo:
	 *
	 *  NFS_Unstable
	 *  Writeback
	 *  Dirty
	 *
	 * and compares the sum against *prev_dirty, which is then overwritten with the new sum.
	 *
	 * Returns a positive value if the new sum is smaller than the previous one, i.e. the sync is
	 * making progress, and 0 if it is not. If /proc/meminfo cannot be opened or read, the result of
	 * log_warning_errno() is returned; if a field cannot be parsed, a warning is logged and 0 is
	 * returned. In those failure cases *prev_dirty is left untouched.
	 */
	static int sync_making_progress(unsigned long long *prev_dirty) {
	        _cleanup_fclose_ FILE *f = NULL;
	        unsigned long long val = 0;
	        int ret;

	        f = fopen("/proc/meminfo", "re");
	        if (!f)
	                return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");

	        for (;;) {
	                _cleanup_free_ char *line = NULL;
	                unsigned long long ull = 0;
	                int q;

	                q = read_line(f, LONG_LINE_MAX, &line);
	                if (q < 0)
	                        return log_warning_errno(q, "Failed to parse /proc/meminfo: %m");
	                if (q == 0)
	                        break;

	                /* Only the three counters listed above are of interest. */
	                if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
	                        continue;

	                /* The field name and the unit must be skipped with "%*s": a plain "%s" would
	                 * consume &ull as its string buffer and leave "%llu" without an argument. */
	                errno = 0;
	                if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
	                        if (errno != 0)
	                                log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
	                        else
	                                log_warning("Failed to parse /proc/meminfo");

	                        return false;
	                }

	                val += ull;
	        }

	        ret = *prev_dirty > val;
	        *prev_dirty = val;
	        return ret;
	}

	/* Syncs filesystems and block devices without risking to hang forever.
	 *
	 * Because the sync operation may hang, it is run in a child process (asynchronous_sync()) whose
	 * termination is awaited in slices of SYNC_TIMEOUT_USEC. Every time a slice times out,
	 * sync_making_progress() is consulted and the attempt counter starts over if the amount of dirty
	 * data went down. Once the counter reaches SYNC_PROGRESS_ATTEMPTS the sync is assumed to be
	 * stalled and the child is sent SIGKILL. */
	static void sync_with_progress(void) {
	        unsigned long long dirty = ULLONG_MAX;
	        unsigned attempt = 0;
	        pid_t pid;
	        int r;

	        BLOCK_SIGNALS(SIGCHLD);

	        r = asynchronous_sync(&pid);
	        if (r < 0) {
	                log_error_errno(r, "Failed to fork sync(): %m");
	                return;
	        }

	        log_info("Syncing filesystems and block devices.");

	        while (attempt < SYNC_PROGRESS_ATTEMPTS) {
	                r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
	                if (r == 0)
	                        /* Sync finished without error.
	                         * (The sync itself does not return an error code) */
	                        return;

	                if (r != -ETIMEDOUT) {
	                        log_error_errno(r, "Failed to sync filesystems and block devices: %m");
	                        return;
	                }

	                /* Timed out: start counting afresh if the "Dirty" value is decreasing. */
	                if (sync_making_progress(&dirty) > 0)
	                        attempt = 0;

	                attempt++;
	        }

	        /* Only reached in the event of a timeout. We should issue a kill
	         * to the stray process. */
	        log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
	        (void) kill(pid, SIGKILL);
	}

	/* Reads the kernel.printk sysctl and returns its first word parsed as an integer.
	 *
	 * On any failure a debug message is logged and the result of log_debug_errno() is returned
	 * instead; the caller treats negative values as "unknown". */
	static int read_current_sysctl_printk_log_level(void) {
	        /* Both buffers are pointers filled in by the callees below. */
	        _cleanup_free_ char *sysctl_printk_vals = NULL, *sysctl_printk_curr = NULL;
	        int current_lvl;
	        const char *p;
	        int r;

	        r = sysctl_read("kernel/printk", &sysctl_printk_vals);
	        if (r < 0)
	                return log_debug_errno(r, "Cannot read sysctl kernel.printk: %m");

	        p = sysctl_printk_vals;
	        r = extract_first_word(&p, &sysctl_printk_curr, NULL, 0);
	        if (r < 0)
	                return log_debug_errno(r, "Failed to split out kernel printk priority: %m");
	        if (r == 0)
	                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Short read while reading kernel.printk sysctl");

	        r = safe_atoi(sysctl_printk_curr, &current_lvl);
	        if (r < 0)
	                return log_debug_errno(r, "Failed to parse kernel.printk sysctl: %s", sysctl_printk_vals);

	        return current_lvl;
	}

	/* Raises kernel.printk, if necessary, so that messages with a log level smaller than or equal to
	 * min_level become visible. The sysctl is left alone if its current value cannot be read or is
	 * already at least min_level + 1. A failed write is only logged at debug level. */
	static void bump_sysctl_printk_log_level(int min_level) {
	        int current_lvl, r;

	        current_lvl = read_current_sysctl_printk_log_level();
	        /* Negative: level unknown. Large enough already: nothing to do. */
	        if (current_lvl < 0 || current_lvl >= min_level + 1)
	                return;

	        r = sysctl_writef("kernel/printk", "%i", min_level + 1);
	        if (r < 0)
	                log_debug_errno(r, "Failed to bump kernel.printk to %i: %m", min_level + 1);
	}

	/* Entry point of the final shutdown stage.
	 *
	 * Must run as PID 1 and expects the action ("reboot", "poweroff", "halt", "kexec" or "exit") in
	 * arg_verb, as filled in by parse_argv(). In order, it:
	 *   - syncs filesystems (skipped in containers),
	 *   - sends SIGTERM and then SIGKILL to the remaining processes,
	 *   - repeatedly tries to unmount file systems, deactivate swaps and detach loop/DM devices,
	 *   - runs the executables found in SYSTEM_SHUTDOWN_PATH,
	 *   - if possible, switches root to /run/initramfs and executes /shutdown there,
	 *   - finally invokes reboot() with the command matching the verb.
	 *
	 * It only returns when running in a container: with arg_exit_code for the "exit" verb, or with
	 * EXIT_SUCCESS when reboot() failed with EPERM. On any other failure it logs an emergency
	 * message and calls freeze(). */
	int main(int argc, char *argv[]) {
	        bool need_umount, need_swapoff, need_loop_detach, need_dm_detach, in_container, use_watchdog = false, can_initrd;
	        _cleanup_free_ char *cgroup = NULL;
	        char *arguments[3], *watchdog_device;
	        int cmd, r, umount_log_level = LOG_INFO;
	        static const char* const dirs[] = {SYSTEM_SHUTDOWN_PATH, NULL};

	        /* The log target defaults to console, but the original systemd process will pass its log target in through a
	         * command line argument, which will override this default. Also, ensure we'll never log to the journal or
	         * syslog, as these logging daemons are either already dead or will die very soon. */

	        log_set_target(LOG_TARGET_CONSOLE);
	        log_set_prohibit_ipc(true);
	        log_parse_environment();

	        r = parse_argv(argc, argv);
	        if (r < 0)
	                goto error;

	        log_open();

	        umask(0022);

	        if (getpid_cached() != 1) {
	                log_error("Not executed by init (PID 1).");
	                r = -EPERM;
	                goto error;
	        }

	        /* Translate the verb into the reboot() command used at the very end. */
	        if (streq(arg_verb, "reboot"))
	                cmd = RB_AUTOBOOT;
	        else if (streq(arg_verb, "poweroff"))
	                cmd = RB_POWER_OFF;
	        else if (streq(arg_verb, "halt"))
	                cmd = RB_HALT_SYSTEM;
	        else if (streq(arg_verb, "kexec"))
	                cmd = LINUX_REBOOT_CMD_KEXEC;
	        else if (streq(arg_verb, "exit"))
	                cmd = 0; /* ignored, just checking that arg_verb is valid */
	        else {
	                log_error("Unknown action '%s'.", arg_verb);
	                r = -EINVAL;
	                goto error;
	        }

	        (void) cg_get_root_path(&cgroup);
	        in_container = detect_container() > 0;

	        /* If the logging messages are going to KMSG, and if we are not running from a container, then try to
	         * update the sysctl kernel.printk current value so that messages up to LOG_WARNING are visible. The
	         * current log level is not updated if already big enough. */
	        if (!in_container &&
	            IN_SET(log_get_target(),
	                   LOG_TARGET_AUTO,
	                   LOG_TARGET_JOURNAL_OR_KMSG,
	                   LOG_TARGET_SYSLOG_OR_KMSG,
	                   LOG_TARGET_KMSG))
	                bump_sysctl_printk_log_level(LOG_WARNING);

	        /* The watchdog is pinged in the loop below whenever $WATCHDOG_USEC is set at all. */
	        use_watchdog = getenv("WATCHDOG_USEC") != NULL;
	        watchdog_device = getenv("WATCHDOG_DEVICE");
	        if (watchdog_device) {
	                r = watchdog_set_device(watchdog_device);
	                if (r < 0)
	                        log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m",
	                                          watchdog_device);
	        }

	        /* Lock us into memory */
	        (void) mlockall(MCL_CURRENT|MCL_FUTURE);

	        /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
	         * slow IO is processed here already and the final process killing spree is not impacted by processes
	         * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
	         * result. */
	        if (!in_container)
	                sync_with_progress();

	        disable_coredumps();
	        disable_binfmt();

	        log_info("Sending SIGTERM to remaining processes...");
	        broadcast_signal(SIGTERM, true, true, arg_timeout);

	        log_info("Sending SIGKILL to remaining processes...");
	        broadcast_signal(SIGKILL, true, false, arg_timeout);

	        /* None of the teardown steps below is attempted when running in a container. */
	        need_umount = !in_container;
	        need_swapoff = !in_container;
	        need_loop_detach = !in_container;
	        need_dm_detach = !in_container;
	        can_initrd = !in_container && !in_initrd() && access("/run/initramfs/shutdown", X_OK) == 0;

	        /* Unmount all mountpoints, swaps, and loopback devices */
	        for (;;) {
	                bool changed = false;

	                if (use_watchdog)
	                        (void) watchdog_ping();

	                /* Let's trim the cgroup tree on each iteration so
	                 * that we leave an empty cgroup tree around, so that
	                 * container managers get a nice notify event when we
	                 * are down */
	                if (cgroup)
	                        (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, cgroup, false);

	                if (need_umount) {
	                        log_info("Unmounting file systems.");
	                        r = umount_all(&changed, umount_log_level);
	                        if (r == 0) {
	                                need_umount = false;
	                                log_info("All filesystems unmounted.");
	                        } else if (r > 0)
	                                log_info("Not all file systems unmounted, %d left.", r);
	                        else
	                                log_error_errno(r, "Failed to unmount file systems: %m");
	                }

	                if (need_swapoff) {
	                        log_info("Deactivating swaps.");
	                        r = swapoff_all(&changed);
	                        if (r == 0) {
	                                need_swapoff = false;
	                                log_info("All swaps deactivated.");
	                        } else if (r > 0)
	                                log_info("Not all swaps deactivated, %d left.", r);
	                        else
	                                log_error_errno(r, "Failed to deactivate swaps: %m");
	                }

	                if (need_loop_detach) {
	                        log_info("Detaching loop devices.");
	                        r = loopback_detach_all(&changed, umount_log_level);
	                        if (r == 0) {
	                                need_loop_detach = false;
	                                log_info("All loop devices detached.");
	                        } else if (r > 0)
	                                log_info("Not all loop devices detached, %d left.", r);
	                        else
	                                log_error_errno(r, "Failed to detach loop devices: %m");
	                }

	                if (need_dm_detach) {
	                        log_info("Detaching DM devices.");
	                        r = dm_detach_all(&changed, umount_log_level);
	                        if (r == 0) {
	                                need_dm_detach = false;
	                                log_info("All DM devices detached.");
	                        } else if (r > 0)
	                                log_info("Not all DM devices detached, %d left.", r);
	                        else
	                                log_error_errno(r, "Failed to detach DM devices: %m");
	                }

	                if (!need_umount && !need_swapoff && !need_loop_detach && !need_dm_detach) {
	                        log_info("All filesystems, swaps, loop devices and DM devices detached.");
	                        /* Yay, done */
	                        break;
	                }

	                if (!changed && umount_log_level == LOG_INFO && !can_initrd) {
	                        /* There are things we cannot get rid of. Loop one more time
	                         * with LOG_ERR to inform the user. Note that we don't need
	                         * to do this if there is a initrd to switch to, because that
	                         * one is likely to get rid of the remaining mounts. If not,
	                         * it will log about them. */
	                        umount_log_level = LOG_ERR;
	                        continue;
	                }

	                /* If in this iteration we didn't manage to
	                 * unmount/deactivate anything, we simply give up */
	                if (!changed) {
	                        log_info("Cannot finalize remaining%s%s%s%s continuing.",
	                                 need_umount ? " file systems," : "",
	                                 need_swapoff ? " swap devices," : "",
	                                 need_loop_detach ? " loop devices," : "",
	                                 need_dm_detach ? " DM devices," : "");
	                        break;
	                }

	                log_debug("Couldn't finalize remaining %s%s%s%s trying again.",
	                          need_umount ? " file systems," : "",
	                          need_swapoff ? " swap devices," : "",
	                          need_loop_detach ? " loop devices," : "",
	                          need_dm_detach ? " DM devices," : "");
	        }

	        /* We're done with the watchdog. */
	        watchdog_free_device();

	        /* Run the executables in the shutdown directory, passing the verb as their single argument. */
	        arguments[0] = NULL;
	        arguments[1] = arg_verb;
	        arguments[2] = NULL;
	        (void) execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, arguments, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS);

	        (void) rlimit_nofile_safe();

	        if (can_initrd) {
	                r = switch_root_initramfs();
	                if (r >= 0) {
	                        argv[0] = (char*) "/shutdown";

	                        (void) setsid();
	                        (void) make_console_stdio();

	                        log_info("Successfully changed into root pivot.\n"
	                                 "Returning to initrd...");

	                        /* Only returns on failure, in which case we carry on with the regular shutdown below. */
	                        execv("/shutdown", argv);
	                        log_error_errno(errno, "Failed to execute shutdown binary: %m");
	                } else
	                        log_error_errno(r, "Failed to switch root to \"/run/initramfs\": %m");
	        }

	        if (need_umount || need_swapoff || need_loop_detach || need_dm_detach)
	                log_error("Failed to finalize%s%s%s%s ignoring.",
	                          need_umount ? " file systems," : "",
	                          need_swapoff ? " swap devices," : "",
	                          need_loop_detach ? " loop devices," : "",
	                          need_dm_detach ? " DM devices," : "");

	        /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
	         * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
	         * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
	         * let's do it once more. Do not remove this sync, data corruption will result. */
	        if (!in_container)
	                sync_with_progress();

	        if (streq(arg_verb, "exit")) {
	                if (in_container)
	                        return arg_exit_code;

	                cmd = RB_POWER_OFF; /* We cannot exit() on the host, fallback on another method. */
	        }

	        switch (cmd) {

	        case LINUX_REBOOT_CMD_KEXEC:

	                if (!in_container) {
	                        /* We cheat and exec kexec to avoid doing all its work */
	                        log_info("Rebooting with kexec.");

	                        r = safe_fork("(sd-kexec)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, NULL);
	                        if (r == 0) {
	                                const char * const args[] = {
	                                        KEXEC, "-e", NULL
	                                };

	                                /* Child */

	                                execv(args[0], (char * const *) args);
	                                _exit(EXIT_FAILURE);
	                        }

	                        /* If we are still running, then the kexec can't have worked, let's fall through */
	                }

	                cmd = RB_AUTOBOOT;
	                _fallthrough_;

	        case RB_AUTOBOOT:
	                (void) reboot_with_parameter(REBOOT_LOG);
	                log_info("Rebooting.");
	                break;

	        case RB_POWER_OFF:
	                log_info("Powering off.");
	                break;

	        case RB_HALT_SYSTEM:
	                log_info("Halting system.");
	                break;

	        default:
	                assert_not_reached("Unknown magic");
	        }

	        (void) reboot(cmd);
	        if (errno == EPERM && in_container) {
	                /* If we are in a container, and we lacked
	                 * CAP_SYS_BOOT just exit, this will kill our
	                 * container for good. */
	                log_info("Exiting container.");
	                return EXIT_SUCCESS;
	        }

	        r = log_error_errno(errno, "Failed to invoke reboot(): %m");

	error:
	        log_emergency_errno(r, "Critical error while doing system shutdown: %m");
	        freeze();
	}