|  | #include <sys/types.h> | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <errno.h> | 
|  | #include <string.h> | 
|  | #include <strings.h> | 
|  | #include <signal.h> | 
|  | #include <unistd.h> | 
|  | #include <sys/stat.h> | 
|  | #include <fcntl.h> | 
|  |  | 
|  | #include "types.h" | 
|  | #include "vzerror.h" | 
|  | #include "cgroup.h" | 
|  | #include "cpu.h" | 
|  | #include "bitmap.h" | 
|  | #include "logger.h" | 
|  | #include "util.h" | 
|  | #include "env.h" | 
|  |  | 
/*
 * Names of the "memory" controller control files that
 * container_apply_config() writes for the MEMORY, SWAP, KMEMORY and TCP
 * settings respectively.
 */
|  | #define MEMLIMIT	"memory.limit_in_bytes" | 
|  | #define SWAPLIMIT	"memory.memsw.limit_in_bytes" | 
|  | #define KMEMLIMIT	"memory.kmem.limit_in_bytes" | 
|  | #define TCPLIMIT	"memory.kmem.tcp.limit_in_bytes" | 
|  |  | 
/*
 * Read the string value of @file from the parent controller @pcont and
 * store the same value for @file in @controller.
 *
 * Returns 0 on success, or the non-zero code reported by whichever of the
 * get/set calls failed.
 */
static int copy_string_from_parent(struct cgroup_controller *controller,
				   struct cgroup_controller *pcont,
				   const char *file)
{
	char *value = NULL;
	int status;

	status = cgroup_get_value_string(pcont, file, &value);
	if (status == 0)
		status = cgroup_set_value_string(controller, file, value);

	/* value stays NULL when the read failed; free(NULL) is a no-op */
	free(value);
	return status;
}
|  |  | 
/*
 * Set the initial values a newly added controller needs, selected by the
 * controller's @name:
 *
 *   "cpuset"  - copy cpuset.cpus and cpuset.mems from the same controller of
 *               @parent; silently skipped when @parent has no such controller
 *   "memory"  - set memory.use_hierarchy to "1"
 *   "devices" - set devices.deny to "a"
 *
 * Any other controller is left untouched. @ct is not used.
 *
 * Returns 0 on success or the non-zero code of the failing call.
 */
static int controller_apply_config(struct cgroup *ct, struct cgroup *parent,
				   struct cgroup_controller *controller,
				   const char *name)
{
	(void)ct;

	if (strcmp(name, "memory") == 0)
		return cgroup_set_value_string(controller,
					       "memory.use_hierarchy", "1");

	if (strcmp(name, "devices") == 0)
		return cgroup_set_value_string(controller, "devices.deny", "a");

	if (strcmp(name, "cpuset") == 0) {
		struct cgroup_controller *parent_ctrl;
		int err;

		parent_ctrl = cgroup_get_controller(parent, name);
		if (parent_ctrl == NULL)
			return 0;

		err = copy_string_from_parent(controller, parent_ctrl,
					      "cpuset.cpus");
		if (err == 0)
			err = copy_string_from_parent(controller, parent_ctrl,
						      "cpuset.mems");
		return err;
	}

	return 0;
}
|  |  | 
/*
 * Human-readable names of the settings, indexed by the enum conf_files value
 * handed to container_apply_config(); used only in its error message.
 *
 * NOTE(review): the enum is declared elsewhere; the entries below must stay
 * in the same order as its members - confirm when either side changes.
 */
static const char *const conf_names[] = {
	"Memory",
	"Kernel Memory",
	"Swap",
	"TCPbuffer",
	"CPU limits",
	"CPU mask",
	"CPU shares",
	"Allowed Devices",
	"Denied Devices",
};
|  |  | 
|  | int container_apply_config(envid_t veid, enum conf_files c, void *_val) | 
|  | { | 
|  | struct cgroup *ct; | 
|  | char cgrp[CT_MAX_STR_SIZE]; | 
|  | struct cgroup_controller *mem, *cpu, *cpuset; | 
|  | int ret = -EINVAL; | 
|  | unsigned long *val = _val; | 
|  |  | 
|  | veid_to_name(cgrp, veid); | 
|  |  | 
|  | ct = cgroup_new_cgroup(cgrp); | 
|  | /* | 
|  | * We should really be doing some thing like: | 
|  | * | 
|  | *	ret = cgroup_get_cgroup(ct); | 
|  | * | 
|  | * and then doing cgroup_get_controller. However, libcgroup has | 
|  | * a very nasty bug that make it sometimes fail. adding a controller | 
|  | * to a newly "created" cgroup structure and then setting the value | 
|  | * is a workaround that seems to work on various versions of the | 
|  | * library | 
|  | */ | 
|  | switch (c) { | 
|  | case MEMORY: | 
|  | if ((mem = cgroup_add_controller(ct, "memory"))) | 
|  | ret = cgroup_set_value_uint64(mem, MEMLIMIT, *val); | 
|  | break; | 
|  | case SWAP: | 
|  | /* Unlike kmem, this must always be greater than mem */ | 
|  | if ((mem = cgroup_add_controller(ct, "memory"))) { | 
|  | unsigned long mval; | 
|  | if (!cgroup_get_value_uint64(mem, MEMLIMIT, &mval)) | 
|  | ret = cgroup_set_value_uint64(mem, SWAPLIMIT, | 
|  | mval + *val); | 
|  | } | 
|  | break; | 
|  | case KMEMORY: | 
|  | if ((mem = cgroup_add_controller(ct, "memory"))) | 
|  | ret = cgroup_set_value_uint64(mem, KMEMLIMIT, *val); | 
|  | break; | 
|  | case TCP: | 
|  | if ((mem = cgroup_add_controller(ct, "memory"))) | 
|  | ret = cgroup_set_value_uint64(mem, TCPLIMIT, *val); | 
|  | break; | 
|  | case CPULIMIT: { | 
|  | u_int64_t period; | 
|  | u_int64_t quota; | 
|  | if ((cpu = cgroup_add_controller(ct, "cpu")) == NULL) | 
|  | break; | 
|  |  | 
|  | /* Should be 100000, but be safe. It may fail on some versions | 
|  | * of libcgroup, so if it fails, just assume the default */ | 
|  | ret = cgroup_get_value_uint64(cpu, "cpu.cfs_period_us", &period); | 
|  | if (ret) | 
|  | period = 100000; | 
|  | /* val will contain an integer percentage, like 223% */ | 
|  | quota = (period * (*val)) / 100; | 
|  | ret = cgroup_set_value_uint64(cpu, "cpu.cfs_quota_us", quota); | 
|  | break; | 
|  | } | 
|  | case CPUSHARES: | 
|  | if ((cpu = cgroup_add_controller(ct, "cpu")) == NULL) | 
|  | break; | 
|  | ret = cgroup_set_value_uint64(cpu, "cpu.shares", *val); | 
|  | break; | 
|  | case CPUMASK: { | 
|  | struct cgroup_controller *pcont; | 
|  | struct cgroup *parent; | 
|  | char *ptr = NULL; | 
|  | char cpusetstr[2 * CPUMASK_NBITS]; | 
|  | unsigned int i; | 
|  |  | 
|  | if ((cpuset = cgroup_add_controller(ct, "cpuset")) == NULL) | 
|  | break; | 
|  | /* | 
|  | * Having all bits set is a bit different, bitmap_snprintf will | 
|  | * return a bad string. (From the PoV of the cpuset cgroup). We | 
|  | * actually need to copy the parent's mask in that case. | 
|  | */ | 
|  | for (i = 0; i < CPUMASK_NBYTES; i++) { | 
|  | if (val[i] != (~0UL)) { | 
|  | bitmap_snprintf(cpusetstr, CPUMASK_NBITS * 2, | 
|  | val, CPUMASK_NBITS); | 
|  | goto string_ok; | 
|  | } | 
|  | } | 
|  |  | 
|  | parent = cgroup_new_cgroup(CT_BASE_STRING); | 
|  | cgroup_get_cgroup(parent); | 
|  | pcont = cgroup_get_controller(parent, "cpuset"); | 
|  | ret = cgroup_get_value_string(pcont, "cpuset.cpus", &ptr); | 
|  | if (ptr) { | 
|  | strncpy(cpusetstr, ptr, CPUMASK_NBITS *2); | 
|  | free(ptr); | 
|  | } | 
|  | cgroup_free(&parent); | 
|  | string_ok: | 
|  | ret = cgroup_set_value_string(cpuset, "cpuset.cpus", cpusetstr); | 
|  | break; | 
|  | } | 
|  | case DEVICES_DENY: { | 
|  | struct cgroup_controller *dev; | 
|  |  | 
|  | if ((dev = cgroup_add_controller(ct, "devices")) == NULL) | 
|  | break; | 
|  |  | 
|  | ret = cgroup_set_value_string(dev, "devices.deny", (char *)_val); | 
|  | break; | 
|  | } | 
|  | case DEVICES_ALLOW: { | 
|  | struct cgroup_controller *dev; | 
|  |  | 
|  | if ((dev = cgroup_add_controller(ct, "devices")) == NULL) | 
|  | break; | 
|  |  | 
|  | ret = cgroup_set_value_string(dev, "devices.allow", (char *)_val); | 
|  | break; | 
|  | } | 
|  | default: | 
|  | ret = -EINVAL; | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (ret) | 
|  | goto out; | 
|  |  | 
|  | if ((ret = cgroup_modify_cgroup(ct))) | 
|  | logger(-1, 0, "Failed to set limits for %s (%s)", conf_names[c], | 
|  | cgroup_strerror(ret)); | 
|  | out: | 
|  | cgroup_free(&ct); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static int do_create_container(struct cgroup *ct, struct cgroup *parent) | 
|  | { | 
|  | struct cgroup_mount_point mnt; | 
|  | struct cgroup_controller *controller; | 
|  | void *handle; | 
|  | int ret; | 
|  |  | 
|  | ret = cgroup_get_controller_begin(&handle, &mnt); | 
|  |  | 
|  | cgroup_get_cgroup(parent); | 
|  |  | 
|  | do { | 
|  | controller = cgroup_add_controller(ct, mnt.name); | 
|  | ret = controller_apply_config(ct, parent, controller, mnt.name); | 
|  | if (!ret) | 
|  | ret = cgroup_get_controller_next(&handle, &mnt); | 
|  | } while (!ret); | 
|  |  | 
|  | cgroup_get_controller_end(&handle); | 
|  |  | 
|  | if (ret == ECGEOF) | 
|  | ret = cgroup_create_cgroup(ct, 0); | 
|  |  | 
|  | return ret; | 
|  |  | 
|  | } | 
|  |  | 
|  | int create_container(envid_t veid) | 
|  | { | 
|  | char cgrp[CT_MAX_STR_SIZE]; | 
|  | struct cgroup *ct, *parent; | 
|  | int ret; | 
|  | unsigned int i; | 
|  | const char *devices[] = { "c *:* m", /* everyone can mknod */ | 
|  | "b *:* m", /* block devices too */ | 
|  | "c 1:3 rmw", /* null */ | 
|  | "c 1:5 rmw", /* zero */ | 
|  | "c 1:7 rmw", /* full */ | 
|  | "c 1:8 rmw", /* random */ | 
|  | "c 1:9 rmw", /* urandom */ | 
|  | "c 5:2 rmw", /* ptmx */ | 
|  | "c 136:* rmw", /* various pts */ | 
|  | }; | 
|  |  | 
|  | veid_to_name(cgrp, veid); | 
|  | ct = cgroup_new_cgroup(cgrp); | 
|  | parent = cgroup_new_cgroup("/"); | 
|  |  | 
|  | ret = do_create_container(ct, parent); | 
|  | cgroup_free(&ct); | 
|  | cgroup_free(&parent); | 
|  |  | 
|  |  | 
|  | /* | 
|  | * FIXME: This is yet another hack required by libcgroup. At some point | 
|  | * in time, this MUST go away. | 
|  | * | 
|  | * Problem is that libcgroup works with buffered writes. If we write to | 
|  | * a cgroup file and want it to be seen in the filesystem, we need to | 
|  | * call cgroup_modify_cgroup(). | 
|  | * | 
|  | * However, all versions up to 0.38 will fail that operation for already | 
|  | * existent cgroups, due to a bug in the way they handle modifications | 
|  | * in the presence of read-only files (whether or not that specific file | 
|  | * was being modified). Because of that, we need to come up with a new | 
|  | * cgroup all the time, and free it afterwards. | 
|  | */ | 
|  | for (i = 0; i < ARRAY_SIZE(devices); i++) { | 
|  | struct cgroup_controller *dev; | 
|  |  | 
|  | veid_to_name(cgrp, veid); | 
|  | ct = cgroup_new_cgroup(cgrp); | 
|  |  | 
|  | if ((dev = cgroup_add_controller(ct, "devices"))) { | 
|  | cgroup_set_value_string(dev, "devices.allow", devices[i]); | 
|  | if ((ret = cgroup_modify_cgroup(ct))) { | 
|  | logger(-1, 0, "Failed to set device permissions for %s (%s)", | 
|  | devices[i], cgroup_strerror(ret)); | 
|  | } | 
|  | } else { | 
|  | logger(-1, 0, "Failed to attach device controller (%s)", | 
|  | cgroup_strerror(ret)); | 
|  | } | 
|  | cgroup_free(&ct); | 
|  | } | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | /* libcgroup is lame. This should be done with the cgroup structure, not the | 
|  | * cgroup name | 
|  | */ | 
|  | static int controller_has_tasks(const char *cgrp, const char *name) | 
|  | { | 
|  | int ret; | 
|  | pid_t pid; | 
|  | void *handle; | 
|  |  | 
|  | ret = cgroup_get_task_begin(cgrp, name, &handle, &pid); | 
|  | ret = (ret != ECGEOF); | 
|  | cgroup_get_task_end(&handle); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | int container_add_task(envid_t veid) | 
|  | { | 
|  | char cgrp[CT_MAX_STR_SIZE]; | 
|  | struct cgroup *ct; | 
|  | int ret = -1; | 
|  |  | 
|  | veid_to_name(cgrp, veid); | 
|  | ct = cgroup_new_cgroup(cgrp); | 
|  | if (cgroup_get_cgroup(ct)) | 
|  | goto out; | 
|  |  | 
|  | cgroup_attach_task_pid(ct, getpid()); | 
|  | ret = 0; | 
|  | out: | 
|  | cgroup_free(&ct); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | int destroy_container(envid_t veid) | 
|  | { | 
|  | struct cgroup *ct; | 
|  | char cgrp[CT_MAX_STR_SIZE]; | 
|  | int ret; | 
|  |  | 
|  | veid_to_name(cgrp, veid); | 
|  | ct = cgroup_new_cgroup(cgrp); | 
|  | ret = cgroup_get_cgroup(ct); | 
|  |  | 
|  | /* Since this can also be called from initialization, this is valid */ | 
|  | if (ret == ECGROUPNOTEXIST) { | 
|  | ret = 0; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | ret = cgroup_delete_cgroup_ext(ct, 0); | 
|  | out: | 
|  | cgroup_free(&ct); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | int container_is_running(envid_t veid) | 
|  | { | 
|  | int ret = 0; | 
|  | void *handle; | 
|  | struct cgroup_mount_point mnt; | 
|  | struct cgroup *ct; | 
|  | char cgrp[CT_MAX_STR_SIZE]; | 
|  |  | 
|  | veid_to_name(cgrp, veid); | 
|  |  | 
|  | ct = cgroup_new_cgroup(cgrp); | 
|  | ret = cgroup_get_cgroup(ct); | 
|  | if (ret == ECGROUPNOTEXIST) { | 
|  | ret = 0; | 
|  | goto out_free; | 
|  | } | 
|  |  | 
|  | ret = cgroup_get_controller_begin(&handle, &mnt); | 
|  | do { | 
|  | struct cgroup_controller *controller; | 
|  | controller = cgroup_get_controller(ct, mnt.name); | 
|  | if (!controller) { | 
|  | logger(0, 0, "Controller %s seems to be missing!", mnt.name); | 
|  | continue; | 
|  | } | 
|  | if ((ret = controller_has_tasks(cgrp, mnt.name)) != 0) | 
|  | goto out; | 
|  | } while ((ret = cgroup_get_controller_next(&handle, &mnt)) == 0); | 
|  |  | 
|  | if (ret != ECGEOF) | 
|  | ret = -ret; | 
|  | else | 
|  | ret = 0; | 
|  | out: | 
|  | cgroup_get_controller_end(&handle); | 
|  | out_free: | 
|  | cgroup_free(&ct); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * We send a kill signal to all processes. This is racy in theory, since they | 
|  | * could spawn new processes faster than we kill. But since one of them is the | 
|  | * init process, (we don't really know which), then eventually the init process | 
|  | * will die taking away all the others, so this is fine. | 
|  | * | 
|  | * This is a big hack, and only exists because we have no way to enter a PID | 
|  | * namespace from the outside (yet). From there, we could just issue a normal | 
|  | * reboot. | 
|  | */ | 
|  | int hackish_empty_container(envid_t veid) | 
|  | { | 
|  | char cgrp[CT_MAX_STR_SIZE]; | 
|  | struct cgroup *ct; | 
|  | int ret = 0; | 
|  | void *task_handle; | 
|  | pid_t pid; | 
|  | int i; | 
|  |  | 
|  | veid_to_name(cgrp, veid); | 
|  | ct = cgroup_new_cgroup(cgrp); | 
|  |  | 
|  | ret = cgroup_get_cgroup(ct); | 
|  | if (ret == ECGROUPNOTEXIST) { | 
|  | ret = 0; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | /* Any controller will do */ | 
|  | ret = cgroup_get_task_begin(cgrp, "cpu", &task_handle, &pid); | 
|  | while (!ret) { | 
|  | kill(pid, SIGKILL); | 
|  | ret = cgroup_get_task_next(&task_handle, &pid); | 
|  | } | 
|  | cgroup_get_task_end(&task_handle); | 
|  |  | 
|  | if (ret != ECGEOF) { | 
|  | logger(-1, 0, "Could not finish all tasks: %s", | 
|  | cgroup_strerror(ret)); | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | ret = 0; | 
|  | for (i = 0; i < MAX_SHTD_TM; i++) { | 
|  | if (!container_is_running(veid)) | 
|  | goto out; | 
|  | usleep(500000); | 
|  | } | 
|  | logger(-1, 0, "Failed to wait for CT tasks to die"); | 
|  | ret = VZ_STOP_ERROR; | 
|  | out: | 
|  | cgroup_free(&ct); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * This function assumes that all pids inside a cgroup | 
|  | * belong to the same namespace, that is the container namespace. | 
|  | * Therefore, from the host box, any of them will do. | 
|  | */ | 
|  | pid_t get_pid_from_container(envid_t veid) | 
|  | { | 
|  | char cgrp[CT_MAX_STR_SIZE]; | 
|  | struct cgroup *ct; | 
|  | void *task_handle; | 
|  | void *cont_handle; | 
|  | struct cgroup_mount_point mnt; | 
|  | pid_t pid = -1; | 
|  | int ret; | 
|  |  | 
|  | veid_to_name(cgrp, veid); | 
|  | ct = cgroup_new_cgroup(cgrp); | 
|  | ret = cgroup_get_cgroup(ct); | 
|  | if (ret == ECGROUPNOTEXIST) | 
|  | goto out_free; | 
|  |  | 
|  | ret = cgroup_get_controller_begin(&cont_handle, &mnt); | 
|  | if (ret != 0) /* no controllers, something is wrong */ | 
|  | goto out_free; | 
|  |  | 
|  | ret = cgroup_get_task_begin(cgrp, mnt.name, &task_handle, &pid); | 
|  | if (ret != 0) /* no tasks, something is also wrong */ | 
|  | goto out_end_cont; | 
|  | cgroup_get_task_end(&task_handle); | 
|  |  | 
|  | out_end_cont: | 
|  | cgroup_get_controller_end(&cont_handle); | 
|  | out_free: | 
|  | cgroup_free(&ct); | 
|  | return pid; | 
|  | } | 
/*
 * Initialise the cgroup library for this process.
 *
 * Returns whatever cgroup_init() returns.
 */
int container_init(void)
{
	int status = cgroup_init();

	return status;
}