blob: 5d7df931e4dc173b22dbbeb6f4a519a5389ca70f [file] [log] [blame] [raw]
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <strings.h>
#include <signal.h>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "types.h"
#include "vzerror.h"
#include "cgroup.h"
#include "cpu.h"
#include "bitmap.h"
#include "logger.h"
#include "util.h"
#include "env.h"
#define MEMLIMIT "memory.limit_in_bytes"
#define SWAPLIMIT "memory.memsw.limit_in_bytes"
#define KMEMLIMIT "memory.kmem.limit_in_bytes"
#define TCPLIMIT "memory.kmem.tcp.limit_in_bytes"
static int copy_string_from_parent(struct cgroup_controller *controller,
struct cgroup_controller *pcont, const char *file)
{
char *ptr = NULL;
int ret;
ret = cgroup_get_value_string(pcont, file, &ptr);
if (ret)
goto out;
ret = cgroup_set_value_string(controller, file, ptr);
out:
free(ptr);
return ret;
}
static int controller_apply_config(struct cgroup *ct, struct cgroup *parent,
struct cgroup_controller *controller,
const char *name)
{
int ret;
if (!strcmp("cpuset", name)) {
struct cgroup_controller *pcont = cgroup_get_controller(parent, name);
if (!pcont)
return 0;
if ((ret = copy_string_from_parent(controller, pcont, "cpuset.cpus")))
return ret;
if ((ret = copy_string_from_parent(controller, pcont, "cpuset.mems")))
return ret;
} else if (!strcmp("memory", name)) {
if ((ret = cgroup_set_value_string(controller, "memory.use_hierarchy", "1")))
return ret;
/*
* The kernel memory controller cannot flip states from
* unlimited to limited if there are already tasks in it.
* Therefore, we always have to run with *some* value of kmem
* enabled. If we don't do it, we can't start unlimited and
* then use the set command to set any beancounters. We write
* The maximum amount minus two pages, which should effectively
* mean "accounting turned on, but unlimited". This will fail
* if the kmem controller is not present, but that is okay.
*/
cgroup_set_value_string(controller,
"memory.kmem.limit_in_bytes",
"9223372036854767712");
} else if (!strcmp("devices", name)) {
if ((ret = cgroup_set_value_string(controller, "devices.deny", "a")))
return ret;
}
return 0;
}
static char *conf_names[] = {
"Memory",
"Kernel Memory",
"Swap",
"TCPbuffer",
"CPU limits",
"CPU mask",
"CPU shares",
"Allowed Devices",
"Denied Devices",
};
int container_apply_config(envid_t veid, enum conf_files c, void *_val)
{
struct cgroup *ct;
char cgrp[CT_MAX_STR_SIZE];
struct cgroup_controller *mem, *cpu, *cpuset;
int ret = -EINVAL;
unsigned long *val = _val;
veid_to_name(cgrp, veid);
ct = cgroup_new_cgroup(cgrp);
/*
* We should really be doing some thing like:
*
* ret = cgroup_get_cgroup(ct);
*
* and then doing cgroup_get_controller. However, libcgroup has
* a very nasty bug that make it sometimes fail. adding a controller
* to a newly "created" cgroup structure and then setting the value
* is a workaround that seems to work on various versions of the
* library
*/
switch (c) {
case MEMORY:
if ((mem = cgroup_add_controller(ct, "memory")))
ret = cgroup_set_value_uint64(mem, MEMLIMIT, *val);
break;
case SWAP:
/* Unlike kmem, this must always be greater than mem */
if ((mem = cgroup_add_controller(ct, "memory"))) {
u_int64_t mval;
if (!cgroup_get_value_uint64(mem, MEMLIMIT, &mval))
ret = cgroup_set_value_uint64(mem, SWAPLIMIT,
mval + *val);
}
break;
case KMEMORY:
if ((mem = cgroup_add_controller(ct, "memory")))
ret = cgroup_set_value_uint64(mem, KMEMLIMIT, *val);
break;
case TCP:
if ((mem = cgroup_add_controller(ct, "memory")))
ret = cgroup_set_value_uint64(mem, TCPLIMIT, *val);
break;
case CPULIMIT: {
u_int64_t period;
u_int64_t quota;
if ((cpu = cgroup_add_controller(ct, "cpu")) == NULL)
break;
/* Should be 100000, but be safe. It may fail on some versions
* of libcgroup, so if it fails, just assume the default */
ret = cgroup_get_value_uint64(cpu, "cpu.cfs_period_us", &period);
if (ret)
period = 100000;
/* val will contain an integer percentage, like 223% */
quota = (period * (*val)) / 100;
ret = cgroup_set_value_uint64(cpu, "cpu.cfs_quota_us", quota);
break;
}
case CPUSHARES:
if ((cpu = cgroup_add_controller(ct, "cpu")) == NULL)
break;
ret = cgroup_set_value_uint64(cpu, "cpu.shares", *val);
break;
case CPUMASK: {
struct cgroup_controller *pcont;
struct cgroup *parent;
char *ptr = NULL;
char cpusetstr[2 * CPUMASK_NBITS];
unsigned int i;
if ((cpuset = cgroup_add_controller(ct, "cpuset")) == NULL)
break;
/*
* Having all bits set is a bit different, bitmap_snprintf will
* return a bad string. (From the PoV of the cpuset cgroup). We
* actually need to copy the parent's mask in that case.
*/
for (i = 0; i < CPUMASK_NBYTES; i++) {
if (val[i] != (~0UL)) {
bitmap_snprintf(cpusetstr, CPUMASK_NBITS * 2,
val, CPUMASK_NBITS);
goto string_ok;
}
}
parent = cgroup_new_cgroup(CT_BASE_STRING);
cgroup_get_cgroup(parent);
pcont = cgroup_get_controller(parent, "cpuset");
ret = cgroup_get_value_string(pcont, "cpuset.cpus", &ptr);
if (ptr) {
strncpy(cpusetstr, ptr, CPUMASK_NBITS *2);
free(ptr);
}
cgroup_free(&parent);
string_ok:
ret = cgroup_set_value_string(cpuset, "cpuset.cpus", cpusetstr);
break;
}
case DEVICES_DENY: {
struct cgroup_controller *dev;
if ((dev = cgroup_add_controller(ct, "devices")) == NULL)
break;
ret = cgroup_set_value_string(dev, "devices.deny", (char *)_val);
break;
}
case DEVICES_ALLOW: {
struct cgroup_controller *dev;
if ((dev = cgroup_add_controller(ct, "devices")) == NULL)
break;
ret = cgroup_set_value_string(dev, "devices.allow", (char *)_val);
break;
}
default:
ret = -EINVAL;
break;
}
if (ret)
goto out;
if ((ret = cgroup_modify_cgroup(ct)))
logger(-1, 0, "Failed to set limits for %s (%s)", conf_names[c],
cgroup_strerror(ret));
out:
cgroup_free(&ct);
return ret;
}
static int do_create_container(struct cgroup *ct, struct cgroup *parent)
{
struct cgroup_mount_point mnt;
struct cgroup_controller *controller;
void *handle;
int ret;
ret = cgroup_get_controller_begin(&handle, &mnt);
cgroup_get_cgroup(parent);
do {
controller = cgroup_add_controller(ct, mnt.name);
ret = controller_apply_config(ct, parent, controller, mnt.name);
if (!ret)
ret = cgroup_get_controller_next(&handle, &mnt);
} while (!ret);
cgroup_get_controller_end(&handle);
if (ret == ECGEOF)
ret = cgroup_create_cgroup(ct, 0);
return ret;
}
int create_container(envid_t veid)
{
char cgrp[CT_MAX_STR_SIZE];
struct cgroup *ct, *parent;
int ret;
unsigned int i;
const char *devices[] = { "c *:* m", /* everyone can mknod */
"b *:* m", /* block devices too */
"c 1:3 rmw", /* null */
"c 1:5 rmw", /* zero */
"c 1:7 rmw", /* full */
"c 1:8 rmw", /* random */
"c 1:9 rmw", /* urandom */
"c 5:2 rmw", /* ptmx */
"c 136:* rmw", /* various pts */
};
veid_to_name(cgrp, veid);
ct = cgroup_new_cgroup(cgrp);
parent = cgroup_new_cgroup("/");
ret = do_create_container(ct, parent);
cgroup_free(&ct);
cgroup_free(&parent);
/*
* FIXME: This is yet another hack required by libcgroup. At some point
* in time, this MUST go away.
*
* Problem is that libcgroup works with buffered writes. If we write to
* a cgroup file and want it to be seen in the filesystem, we need to
* call cgroup_modify_cgroup().
*
* However, all versions up to 0.38 will fail that operation for already
* existent cgroups, due to a bug in the way they handle modifications
* in the presence of read-only files (whether or not that specific file
* was being modified). Because of that, we need to come up with a new
* cgroup all the time, and free it afterwards.
*/
for (i = 0; i < ARRAY_SIZE(devices); i++) {
struct cgroup_controller *dev;
veid_to_name(cgrp, veid);
ct = cgroup_new_cgroup(cgrp);
if ((dev = cgroup_add_controller(ct, "devices"))) {
cgroup_set_value_string(dev, "devices.allow", devices[i]);
if ((ret = cgroup_modify_cgroup(ct))) {
logger(-1, 0, "Failed to set device permissions for %s (%s)",
devices[i], cgroup_strerror(ret));
}
} else {
logger(-1, 0, "Failed to attach device controller (%s)",
cgroup_strerror(ret));
}
cgroup_free(&ct);
}
return ret;
}
/* libcgroup is lame. This should be done with the cgroup structure, not the
* cgroup name
*/
static int controller_has_tasks(const char *cgrp, const char *name)
{
int ret;
pid_t pid;
void *handle;
ret = cgroup_get_task_begin(cgrp, name, &handle, &pid);
ret = (ret != ECGEOF);
cgroup_get_task_end(&handle);
return ret;
}
int container_add_task(envid_t veid)
{
char cgrp[CT_MAX_STR_SIZE];
struct cgroup *ct;
int ret;
veid_to_name(cgrp, veid);
ct = cgroup_new_cgroup(cgrp);
ret = cgroup_get_cgroup(ct);
if (ret)
goto out;
ret = cgroup_attach_task_pid(ct, getpid());
out:
cgroup_free(&ct);
return ret;
}
int destroy_container(envid_t veid)
{
struct cgroup *ct;
char cgrp[CT_MAX_STR_SIZE];
int ret;
veid_to_name(cgrp, veid);
ct = cgroup_new_cgroup(cgrp);
ret = cgroup_get_cgroup(ct);
/* Since this can also be called from initialization, this is valid */
if (ret == ECGROUPNOTEXIST) {
ret = 0;
goto out;
}
ret = cgroup_delete_cgroup_ext(ct, 0);
out:
cgroup_free(&ct);
return ret;
}
int container_is_running(envid_t veid)
{
int ret = 0;
void *handle;
struct cgroup_mount_point mnt;
struct cgroup *ct;
char cgrp[CT_MAX_STR_SIZE];
veid_to_name(cgrp, veid);
ct = cgroup_new_cgroup(cgrp);
ret = cgroup_get_cgroup(ct);
if (ret == ECGROUPNOTEXIST) {
ret = 0;
goto out_free;
}
ret = cgroup_get_controller_begin(&handle, &mnt);
do {
struct cgroup_controller *controller;
controller = cgroup_get_controller(ct, mnt.name);
if (!controller) {
logger(0, 0, "Controller %s seems to be missing!", mnt.name);
continue;
}
if ((ret = controller_has_tasks(cgrp, mnt.name)) != 0)
goto out;
} while ((ret = cgroup_get_controller_next(&handle, &mnt)) == 0);
if (ret != ECGEOF)
ret = -ret;
else
ret = 0;
out:
cgroup_get_controller_end(&handle);
out_free:
cgroup_free(&ct);
return ret;
}
/*
* We send a kill signal to all processes. This is racy in theory, since they
* could spawn new processes faster than we kill. But since one of them is the
* init process, (we don't really know which), then eventually the init process
* will die taking away all the others, so this is fine.
*
* This is a big hack, and only exists because we have no way to enter a PID
* namespace from the outside (yet). From there, we could just issue a normal
* reboot.
*/
int hackish_empty_container(envid_t veid)
{
char cgrp[CT_MAX_STR_SIZE];
struct cgroup *ct;
int ret = 0;
void *task_handle;
pid_t pid;
int i;
veid_to_name(cgrp, veid);
ct = cgroup_new_cgroup(cgrp);
ret = cgroup_get_cgroup(ct);
if (ret == ECGROUPNOTEXIST) {
ret = 0;
goto out;
}
/* Any controller will do */
ret = cgroup_get_task_begin(cgrp, "cpu", &task_handle, &pid);
while (!ret) {
kill(pid, SIGKILL);
ret = cgroup_get_task_next(&task_handle, &pid);
}
cgroup_get_task_end(&task_handle);
if (ret != ECGEOF) {
logger(-1, 0, "Could not finish all tasks: %s",
cgroup_strerror(ret));
goto out;
}
ret = 0;
for (i = 0; i < DEF_STOP_TIMEOUT; i++) {
if (!container_is_running(veid))
goto out;
usleep(500000);
}
logger(-1, 0, "Failed to wait for CT tasks to die");
ret = VZ_STOP_ERROR;
out:
cgroup_free(&ct);
return ret;
}
/*
* This function assumes that all pids inside a cgroup
* belong to the same namespace, that is the container namespace.
* Therefore, from the host box, any of them will do.
*/
pid_t get_pid_from_container(envid_t veid)
{
char cgrp[CT_MAX_STR_SIZE];
struct cgroup *ct;
void *task_handle;
void *cont_handle;
struct cgroup_mount_point mnt;
pid_t pid = -1;
int ret;
veid_to_name(cgrp, veid);
ct = cgroup_new_cgroup(cgrp);
ret = cgroup_get_cgroup(ct);
if (ret == ECGROUPNOTEXIST)
goto out_free;
ret = cgroup_get_controller_begin(&cont_handle, &mnt);
if (ret != 0) /* no controllers, something is wrong */
goto out_free;
ret = cgroup_get_task_begin(cgrp, mnt.name, &task_handle, &pid);
if (ret != 0) /* no tasks, something is also wrong */
goto out_end_cont;
cgroup_get_task_end(&task_handle);
out_end_cont:
cgroup_get_controller_end(&cont_handle);
out_free:
cgroup_free(&ct);
return pid;
}
int container_init(void)
{
return cgroup_init();
}