src/lib/cgroup.c - vzctl - Rivoreo Source Code Repositories

 #include <sys/types.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
 #include <strings.h>
 #include <signal.h>
 #include <unistd.h>
 #include <sys/stat.h>
 #include <fcntl.h>

 #include "types.h"
 #include "vzerror.h"
 #include "cgroup.h"
 #include "cpu.h"
 #include "bitmap.h"
 #include "logger.h"
 #include "util.h"
 #include "env.h"

 #define MEMLIMIT	"memory.limit_in_bytes"
 #define SWAPLIMIT	"memory.memsw.limit_in_bytes"
 #define KMEMLIMIT	"memory.kmem.limit_in_bytes"
 #define TCPLIMIT	"memory.kmem.tcp.limit_in_bytes"

 /*
  * Copy the value of control file `file` from the parent's controller
  * `pcont` into `controller`.
  *
  * Returns 0 on success, or the non-zero status of whichever libcgroup
  * call (the read or the write) failed first.
  */
 static int copy_string_from_parent(struct cgroup_controller *controller,
 				   struct cgroup_controller *pcont, const char *file)
 {
 	char *value = NULL;
 	int err = cgroup_get_value_string(pcont, file, &value);

 	if (!err)
 		err = cgroup_set_value_string(controller, file, value);

 	/* value is still NULL if the read never assigned it; free(NULL) is a no-op */
 	free(value);
 	return err;
 }

 /*
  * Apply the per-controller defaults used when a container group is created.
  *
  *   cpuset  - copy cpuset.cpus and cpuset.mems from the parent group's
  *             cpuset controller (nothing is done if the parent has none)
  *   memory  - set memory.use_hierarchy to "1"
  *   devices - write "a" to devices.deny
  *
  * Any other controller name is left untouched. `ct` is not used here.
  *
  * Returns 0 on success or the status of the first call that failed.
  */
 static int controller_apply_config(struct cgroup *ct, struct cgroup *parent,
 				   struct cgroup_controller *controller,
 				   const char *name)
 {
 	int err;

 	if (strcmp(name, "cpuset") == 0) {
 		struct cgroup_controller *parent_cpuset;

 		parent_cpuset = cgroup_get_controller(parent, name);
 		if (parent_cpuset == NULL)
 			return 0;

 		err = copy_string_from_parent(controller, parent_cpuset, "cpuset.cpus");
 		if (err)
 			return err;

 		return copy_string_from_parent(controller, parent_cpuset, "cpuset.mems");
 	}

 	if (strcmp(name, "memory") == 0) {
 		err = cgroup_set_value_string(controller, "memory.use_hierarchy", "1");
 		if (err)
 			return err;
 #if 0		/* BUG: this code assumes page size to be 4096 byte. */
 		/*
 		 * The kernel memory controller cannot flip states from
 		 * unlimited to limited if there are already tasks in it.
 		 * Therefore, we always have to run with *some* value of kmem
 		 * enabled. If we don't do it, we can't start unlimited and
 		 * then use the set command to set any beancounters. We write
 		 * the maximum amount minus two pages, which should effectively
 		 * mean "accounting turned on, but unlimited". This will fail
 		 * if the kmem controller is not present, but that is okay.
 		 */
 		cgroup_set_value_string(controller,
 				"memory.kmem.limit_in_bytes",
 				"9223372036854767712");
 #endif
 		return 0;
 	}

 	if (strcmp(name, "devices") == 0)
 		return cgroup_set_value_string(controller, "devices.deny", "a");

 	return 0;
 }

 /*
  * Human-readable setting names for the error message logged by
  * container_apply_config(), which indexes this table with the
  * enum conf_files value it was given.
  * NOTE(review): the entry order therefore has to match the declaration of
  * enum conf_files, which is not visible in this file -- confirm when editing.
  */
 static char *conf_names[] = {
 	"Memory",
 	"Kernel Memory",
 	"Swap",
 	"TCPbuffer",
 	"CPU limits",
 	"CPU mask",
 	"CPU shares",
 	"Allowed Devices",
 	"Denied Devices",
 };

 /*
  * Apply one resource setting to the cgroup of container `veid`.
  *
  * veid  container id; veid_to_name() turns it into the cgroup name
  * c     which setting to change
  * _val  the new value. It is read as an unsigned long for MEMORY, SWAP,
  *       KMEMORY, TCP, CPULIMIT (a percentage) and CPUSHARES, as a bitmap
  *       of CPUMASK_NBITS bits for CPUMASK, and as a C string for
  *       DEVICES_ALLOW / DEVICES_DENY.
  *
  * Returns 0 on success. Returns -EINVAL when `c` is not handled, when the
  * needed controller could not be added, or when a value that had to be
  * looked up first was not available; otherwise returns the status of the
  * libcgroup call that failed.
  */
 int container_apply_config(envid_t veid, enum conf_files c, void *_val)
 {
 	struct cgroup *ct;
 	char cgrp[CT_MAX_STR_SIZE];
 	struct cgroup_controller *mem, *cpu, *cpuset;
 	int ret = -EINVAL;
 	unsigned long *val = _val;

 	logger(2, 0, "function: container_apply_config(%u, %u, %p)", veid, c, _val);

 	veid_to_name(cgrp, veid);

 	ct = cgroup_new_cgroup(cgrp);
 	/*
 	 * We should really be doing something like:
 	 *
 	 *	ret = cgroup_get_cgroup(ct);
 	 *
 	 * and then doing cgroup_get_controller. However, libcgroup has
 	 * a very nasty bug that makes it sometimes fail. Adding a controller
 	 * to a newly "created" cgroup structure and then setting the value
 	 * is a workaround that seems to work on various versions of the
 	 * library.
 	 */
 	switch (c) {
 	case MEMORY:
 		if ((mem = cgroup_add_controller(ct, "memory")))
 			ret = cgroup_set_value_uint64(mem, MEMLIMIT, *val);
 		break;
 	case SWAP:
 		/* Unlike kmem, this must always be greater than mem */
 		if ((mem = cgroup_add_controller(ct, "memory"))) {
 			uint64_t mval;
 			if (!cgroup_get_value_uint64(mem, MEMLIMIT, &mval))
 				ret = cgroup_set_value_uint64(mem, SWAPLIMIT,
 							      mval + *val);
 		}
 		break;
 	case KMEMORY:
 		if ((mem = cgroup_add_controller(ct, "memory")))
 			ret = cgroup_set_value_uint64(mem, KMEMLIMIT, *val);
 		break;
 	case TCP:
 		if ((mem = cgroup_add_controller(ct, "memory")))
 			ret = cgroup_set_value_uint64(mem, TCPLIMIT, *val);
 		break;
 	case CPULIMIT: {
 		uint64_t period;
 		uint64_t quota;
 		if ((cpu = cgroup_add_controller(ct, "cpu")) == NULL)
 			break;

 		/* Should be 100000, but be safe. It may fail on some versions
 		 * of libcgroup, so if it fails, just assume the default */
 		ret = cgroup_get_value_uint64(cpu, "cpu.cfs_period_us", &period);
 		if (ret)
 			period = 100000;
 		/* val will contain an integer percentage, like 223% */
 		quota = (period * (*val)) / 100;
 		ret = cgroup_set_value_uint64(cpu, "cpu.cfs_quota_us", quota);
 		break;
 	}
 	case CPUSHARES:
 		if ((cpu = cgroup_add_controller(ct, "cpu")) == NULL)
 			break;
 		ret = cgroup_set_value_uint64(cpu, "cpu.shares", *val);
 		break;
 	case CPUMASK: {
 		/*
 		 * Number of unsigned long words that hold the CPUMASK_NBITS
 		 * bits bitmap_snprintf() reads from val. The scan is bounded
 		 * by this word count rather than by a separately defined
 		 * size constant, so it cannot run past the bitmap.
 		 */
 		const size_t bits_per_word = 8 * sizeof(*val);
 		const size_t nwords = (CPUMASK_NBITS + bits_per_word - 1) /
 				      bits_per_word;
 		char cpusetstr[2 * CPUMASK_NBITS];
 		size_t i;

 		if ((cpuset = cgroup_add_controller(ct, "cpuset")) == NULL)
 			break;
 		/*
 		 * Having all bits set is a bit different, bitmap_snprintf will
 		 * return a bad string. (From the PoV of the cpuset cgroup). We
 		 * actually need to copy the parent's mask in that case.
 		 */
 		for (i = 0; i < nwords; i++) {
 			if (val[i] != (~0UL))
 				break;
 		}

 		if (i < nwords) {
 			/* at least one word has a cleared bit: format the mask */
 			bitmap_snprintf(cpusetstr, sizeof(cpusetstr),
 					val, CPUMASK_NBITS);
 		} else {
 			struct cgroup *parent = cgroup_new_cgroup(CT_BASE_STRING);
 			struct cgroup_controller *pcont;
 			char *ptr = NULL;

 			cgroup_get_cgroup(parent);
 			pcont = cgroup_get_controller(parent, "cpuset");
 			ret = cgroup_get_value_string(pcont, "cpuset.cpus", &ptr);
 			if (ptr == NULL) {
 				/* nothing to copy; never write an unset buffer */
 				cgroup_free(&parent);
 				if (!ret)
 					ret = -EINVAL;
 				break;
 			}
 			/* snprintf always terminates, unlike strncpy */
 			snprintf(cpusetstr, sizeof(cpusetstr), "%s", ptr);
 			free(ptr);
 			cgroup_free(&parent);
 		}
 		ret = cgroup_set_value_string(cpuset, "cpuset.cpus", cpusetstr);
 		break;
 	}
 	case DEVICES_DENY: {
 		struct cgroup_controller *dev;

 		if ((dev = cgroup_add_controller(ct, "devices")) == NULL)
 			break;

 		ret = cgroup_set_value_string(dev, "devices.deny", (char *)_val);
 		break;
 	}
 	case DEVICES_ALLOW: {
 		struct cgroup_controller *dev;

 		if ((dev = cgroup_add_controller(ct, "devices")) == NULL)
 			break;

 		ret = cgroup_set_value_string(dev, "devices.allow", (char *)_val);
 		break;
 	}
 	default:
 		ret = -EINVAL;
 		break;
 	}

 	if (ret)
 		goto out;

 	/* values set above are only buffered until the group is modified */
 	if ((ret = cgroup_modify_cgroup(ct)))
 		logger(-1, 0, "Failed to set limits for %s (%s)", conf_names[c],
 		       cgroup_strerror(ret));
 out:
 	cgroup_free(&ct);
 	return ret;
 }

 /*
  * Add every controller reported by the libcgroup controller iterator to
  * `ct`, apply the per-controller defaults, and then create the group.
  *
  * `parent` is loaded with cgroup_get_cgroup() so controller_apply_config()
  * can copy values from it; the result of that load is not checked, and
  * controller_apply_config() does nothing for cpuset when the parent has
  * no cpuset controller.
  *
  * Returns 0 on success, otherwise the status of the call that failed
  * (including a failure to start the controller iteration).
  */
 static int do_create_container(struct cgroup *ct, struct cgroup *parent)
 {
 	struct cgroup_mount_point mnt;
 	struct cgroup_controller *controller;
 	void *handle = NULL;	/* defined value for _end() on every path */
 	int ret;

 	ret = cgroup_get_controller_begin(&handle, &mnt);
 	if (ret) {
 		/* iteration did not start, so mnt must not be read */
 		cgroup_get_controller_end(&handle);
 		return ret;
 	}

 	cgroup_get_cgroup(parent);

 	do {
 		logger(2, 0, "Adding group to controller %s", mnt.name);
 		controller = cgroup_add_controller(ct, mnt.name);
 		ret = controller_apply_config(ct, parent, controller, mnt.name);
 		if (!ret)
 			ret = cgroup_get_controller_next(&handle, &mnt);
 	} while (!ret);

 	cgroup_get_controller_end(&handle);

 	/* ECGEOF means every controller was visited without error */
 	if (ret == ECGEOF)
 		ret = cgroup_create_cgroup(ct, 0);

 	return ret;
 }

 /*
  * Create the cgroup of container `veid` and install the default device
  * permissions.
  *
  * Returns 0 on success. If creating the group fails, that status is
  * returned right away. Otherwise every device rule is attempted and the
  * status of the first one that failed is returned (-EINVAL when the
  * devices controller could not be added).
  */
 int create_container(envid_t veid)
 {
 	char cgrp[CT_MAX_STR_SIZE];
 	struct cgroup *ct, *parent;
 	int ret;
 	unsigned int i;
 	const char *devices[] = { "c *:* m", /* everyone can mknod */
 				  "b *:* m", /* block devices too */
 				  "c 1:3 rmw", /* null */
 				  "c 1:5 rmw", /* zero */
 				  "c 1:7 rmw", /* full */
 				  "c 1:8 rmw", /* random */
 				  "c 1:9 rmw", /* urandom */
 				  "c 5:2 rmw", /* ptmx */
 				  "c 136:* rmw", /* various pts */
 				};

 	veid_to_name(cgrp, veid);
 	ct = cgroup_new_cgroup(cgrp);
 	parent = cgroup_new_cgroup("/");

 	ret = do_create_container(ct, parent);
 	cgroup_free(&ct);
 	cgroup_free(&parent);

 	/* without the group there is nothing to attach device rules to */
 	if (ret)
 		return ret;

 	/*
 	 * FIXME: This is yet another hack required by libcgroup. At some point
 	 * in time, this MUST go away.
 	 *
 	 * Problem is that libcgroup works with buffered writes. If we write to
 	 * a cgroup file and want it to be seen in the filesystem, we need to
 	 * call cgroup_modify_cgroup().
 	 *
 	 * However, all versions up to 0.38 will fail that operation for already
 	 * existent cgroups, due to a bug in the way they handle modifications
 	 * in the presence of read-only files (whether or not that specific file
 	 * was being modified). Because of that, we need to come up with a new
 	 * cgroup all the time, and free it afterwards.
 	 */
 	for (i = 0; i < ARRAY_SIZE(devices); i++) {
 		struct cgroup_controller *dev;
 		int err;

 		ct = cgroup_new_cgroup(cgrp);

 		if ((dev = cgroup_add_controller(ct, "devices"))) {
 			err = cgroup_set_value_string(dev, "devices.allow", devices[i]);
 			if (!err)
 				err = cgroup_modify_cgroup(ct);
 			if (err)
 				logger(-1, 0, "Failed to set device permissions for %s (%s)",
 					devices[i], cgroup_strerror(err));
 		} else {
 			/* cgroup_add_controller() gives no status to report */
 			err = -EINVAL;
 			logger(-1, 0, "Failed to attach device controller");
 		}
 		cgroup_free(&ct);

 		/* keep going, but remember the first failure for the caller */
 		if (err && !ret)
 			ret = err;
 	}

 	return ret;
 }

 /*
  * Report whether cgroup `cgrp` has tasks under controller `name`.
  *
  * libcgroup is lame. This should be done with the cgroup structure, not the
  * cgroup name.
  *
  * Returns 0 only when the task iterator reports ECGEOF straight away.
  * Returns 1 in every other case, i.e. both when a task was found and when
  * the iteration could not be started for some other reason.
  */
 static int controller_has_tasks(const char *cgrp, const char *name)
 {
 	pid_t pid;
 	/* NULL so _end() sees a defined value even if _begin() set nothing */
 	void *handle = NULL;
 	int status;

 	status = cgroup_get_task_begin(cgrp, name, &handle, &pid);
 	cgroup_get_task_end(&handle);

 	return status != ECGEOF;
 }

 /*
  * Attach the calling process (getpid()) to the cgroup of container `veid`.
  *
  * Returns 0 on success, or the status of cgroup_get_cgroup() or
  * cgroup_attach_task_pid() when one of them fails.
  */
 int container_add_task(envid_t veid)
 {
 	char name[CT_MAX_STR_SIZE];
 	struct cgroup *group;
 	int status;

 	veid_to_name(name, veid);
 	group = cgroup_new_cgroup(name);

 	/* only attach once the group has been loaded successfully */
 	status = cgroup_get_cgroup(group);
 	if (status == 0)
 		status = cgroup_attach_task_pid(group, getpid());

 	cgroup_free(&group);
 	return status;
 }

 /*
  * Delete the cgroup of container `veid`.
  *
  * Returns 0 when the group does not exist, otherwise the status of
  * cgroup_delete_cgroup_ext().
  */
 int destroy_container(envid_t veid)
 {
 	char name[CT_MAX_STR_SIZE];
 	struct cgroup *group;
 	int status;

 	veid_to_name(name, veid);
 	group = cgroup_new_cgroup(name);

 	/* Since this can also be called from initialization, a missing
 	 * group is valid and counts as success */
 	if (cgroup_get_cgroup(group) == ECGROUPNOTEXIST)
 		status = 0;
 	else
 		status = cgroup_delete_cgroup_ext(group, 0);

 	cgroup_free(&group);
 	return status;
 }

 /*
  * Tell whether the cgroup of container `veid` still has tasks.
  *
  * Returns 1 as soon as controller_has_tasks() reports tasks for one of the
  * group's controllers, 0 when the group does not exist or no controller
  * reports tasks, and a negated libcgroup status when walking the
  * controller list fails.
  */
 int container_is_running(envid_t veid)
 {
 	int ret = 0;
 	void *handle = NULL;	/* defined value for _end() on every path */
 	struct cgroup_mount_point mnt;
 	struct cgroup *ct;
 	char cgrp[CT_MAX_STR_SIZE];

 	veid_to_name(cgrp, veid);

 	ct = cgroup_new_cgroup(cgrp);
 	ret = cgroup_get_cgroup(ct);
 	if (ret == ECGROUPNOTEXIST) {
 		ret = 0;
 		goto out_free;
 	}

 	/* mnt is only read while the iterator reports success */
 	ret = cgroup_get_controller_begin(&handle, &mnt);
 	while (ret == 0) {
 		if (cgroup_get_controller(ct, mnt.name) == NULL) {
 			logger(0, 0, "Controller %s seems to be missing!", mnt.name);
 		} else if ((ret = controller_has_tasks(cgrp, mnt.name)) != 0) {
 			goto out;
 		}
 		ret = cgroup_get_controller_next(&handle, &mnt);
 	}

 	/* reaching the end of the list is the normal way out of the loop */
 	if (ret != ECGEOF)
 		ret = -ret;
 	else
 		ret = 0;
 out:
 	cgroup_get_controller_end(&handle);
 out_free:
 	cgroup_free(&ct);
 	return ret;
 }

 /*
  * Kill every task of container `veid` and wait for its cgroup to empty.
  *
  * We send a kill signal to all processes. This is racy in theory, since they
  * could spawn new processes faster than we kill. But since one of them is the
  * init process, (we don't really know which), then eventually the init process
  * will die taking away all the others, so this is fine.
  *
  * This is a big hack, and only exists because we have no way to enter a PID
  * namespace from the outside (yet). From there, we could just issue a normal
  * reboot.
  *
  * Returns 0 when the group does not exist or container_is_running()
  * reports 0 in time, the libcgroup status when the task list could not be
  * walked to its end, and VZ_STOP_ERROR when the container is still
  * reported running after DEF_STOP_TIMEOUT polls spaced 500000 us apart.
  */
 int hackish_empty_container(envid_t veid)
 {
 	char cgrp[CT_MAX_STR_SIZE];
 	struct cgroup *ct;
 	int ret = 0;
 	/* NULL so _end() sees a defined value even if _begin() set nothing */
 	void *task_handle = NULL;
 	pid_t pid;
 	int i;

 	veid_to_name(cgrp, veid);
 	ct = cgroup_new_cgroup(cgrp);

 	ret = cgroup_get_cgroup(ct);
 	if (ret == ECGROUPNOTEXIST) {
 		ret = 0;
 		goto out;
 	}

 	/* Any controller will do */
 	ret = cgroup_get_task_begin(cgrp, "cpu", &task_handle, &pid);
 	while (!ret) {
 		/* kill() gives 0 and negative pids a group-wide meaning;
 		 * never signal anything but a single task */
 		if (pid > 0)
 			kill(pid, SIGKILL);
 		ret = cgroup_get_task_next(&task_handle, &pid);
 	}
 	cgroup_get_task_end(&task_handle);

 	if (ret != ECGEOF) {
 		logger(-1, 0, "Could not finish all tasks: %s",
 				cgroup_strerror(ret));
 		goto out;
 	}

 	ret = 0;
 	for (i = 0; i < DEF_STOP_TIMEOUT; i++) {
 		if (!container_is_running(veid))
 			goto out;
 		usleep(500000);
 	}
 	logger(-1, 0, "Failed to wait for CT tasks to die");
 	ret = VZ_STOP_ERROR;
 out:
 	cgroup_free(&ct);
 	return ret;
 }

 /*
  * Return the pid of one task in the cgroup of container `veid`.
  *
  * This function assumes that all pids inside a cgroup
  * belong to the same namespace, that is the container namespace.
  * Therefore, from the host box, any of them will do.
  *
  * The first task listed under the first controller reported by libcgroup
  * is used. Returns -1 when the group does not exist, no controller is
  * reported, or no task could be read.
  */
 pid_t get_pid_from_container(envid_t veid)
 {
 	char cgrp[CT_MAX_STR_SIZE];
 	struct cgroup *ct;
 	/* both start as NULL so the matching _end() calls below always get
 	 * a defined value */
 	void *task_handle = NULL;
 	void *cont_handle = NULL;
 	struct cgroup_mount_point mnt;
 	pid_t pid = -1;
 	int ret;

 	veid_to_name(cgrp, veid);
 	ct = cgroup_new_cgroup(cgrp);
 	ret = cgroup_get_cgroup(ct);
 	if (ret == ECGROUPNOTEXIST)
 		goto out_free;

 	ret = cgroup_get_controller_begin(&cont_handle, &mnt);
 	if (ret != 0) /* no controllers, something is wrong */
 		goto out_end_cont;

 	ret = cgroup_get_task_begin(cgrp, mnt.name, &task_handle, &pid);
 	if (ret != 0) /* no tasks, something is also wrong */
 		pid = -1;
 	/* release the task iterator whether or not a pid was read */
 	cgroup_get_task_end(&task_handle);

 out_end_cont:
 	cgroup_get_controller_end(&cont_handle);
 out_free:
 	cgroup_free(&ct);
 	return pid;
 }
 /*
  * Thin wrapper around cgroup_init(); its status is returned unchanged.
  */
 int container_init(void)
 {
 	const int status = cgroup_init();

 	return status;
 }
	#include <sys/types.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <errno.h>
	#include <string.h>
	#include <strings.h>
	#include <signal.h>
	#include <unistd.h>
	#include <sys/stat.h>
	#include <fcntl.h>

	#include "types.h"
	#include "vzerror.h"
	#include "cgroup.h"
	#include "cpu.h"
	#include "bitmap.h"
	#include "logger.h"
	#include "util.h"
	#include "env.h"

	#define MEMLIMIT "memory.limit_in_bytes"
	#define SWAPLIMIT "memory.memsw.limit_in_bytes"
	#define KMEMLIMIT "memory.kmem.limit_in_bytes"
	#define TCPLIMIT "memory.kmem.tcp.limit_in_bytes"

	/*
	 * Copy the value of control file `file` from the parent's controller
	 * `pcont` into `controller`.
	 *
	 * Returns 0 on success, or the non-zero status of whichever libcgroup
	 * call (the read or the write) failed first.
	 */
	static int copy_string_from_parent(struct cgroup_controller *controller,
					   struct cgroup_controller *pcont, const char *file)
	{
		char *value = NULL;
		int err = cgroup_get_value_string(pcont, file, &value);

		if (!err)
			err = cgroup_set_value_string(controller, file, value);

		/* value is still NULL if the read never assigned it; free(NULL) is a no-op */
		free(value);
		return err;
	}

	/*
	 * Apply the per-controller defaults used when a container group is created.
	 *
	 *   cpuset  - copy cpuset.cpus and cpuset.mems from the parent group's
	 *             cpuset controller (nothing is done if the parent has none)
	 *   memory  - set memory.use_hierarchy to "1"
	 *   devices - write "a" to devices.deny
	 *
	 * Any other controller name is left untouched. `ct` is not used here.
	 *
	 * Returns 0 on success or the status of the first call that failed.
	 */
	static int controller_apply_config(struct cgroup *ct, struct cgroup *parent,
					   struct cgroup_controller *controller,
					   const char *name)
	{
		int err;

		if (strcmp(name, "cpuset") == 0) {
			struct cgroup_controller *parent_cpuset;

			parent_cpuset = cgroup_get_controller(parent, name);
			if (parent_cpuset == NULL)
				return 0;

			err = copy_string_from_parent(controller, parent_cpuset, "cpuset.cpus");
			if (err)
				return err;

			return copy_string_from_parent(controller, parent_cpuset, "cpuset.mems");
		}

		if (strcmp(name, "memory") == 0) {
			err = cgroup_set_value_string(controller, "memory.use_hierarchy", "1");
			if (err)
				return err;
	#if 0		/* BUG: this code assumes page size to be 4096 byte. */
			/*
			 * The kernel memory controller cannot flip states from
			 * unlimited to limited if there are already tasks in it.
			 * Therefore, we always have to run with *some* value of kmem
			 * enabled. If we don't do it, we can't start unlimited and
			 * then use the set command to set any beancounters. We write
			 * the maximum amount minus two pages, which should effectively
			 * mean "accounting turned on, but unlimited". This will fail
			 * if the kmem controller is not present, but that is okay.
			 */
			cgroup_set_value_string(controller,
					"memory.kmem.limit_in_bytes",
					"9223372036854767712");
	#endif
			return 0;
		}

		if (strcmp(name, "devices") == 0)
			return cgroup_set_value_string(controller, "devices.deny", "a");

		return 0;
	}

	/*
	 * Human-readable setting names for the error message logged by
	 * container_apply_config(), which indexes this table with the
	 * enum conf_files value it was given.
	 * NOTE(review): the entry order therefore has to match the declaration of
	 * enum conf_files, which is not visible in this file -- confirm when editing.
	 */
	static char *conf_names[] = {
	"Memory",
	"Kernel Memory",
	"Swap",
	"TCPbuffer",
	"CPU limits",
	"CPU mask",
	"CPU shares",
	"Allowed Devices",
	"Denied Devices",
	};

	/*
	 * Apply one resource setting to the cgroup of container `veid`.
	 *
	 * veid  container id; veid_to_name() turns it into the cgroup name
	 * c     which setting to change
	 * _val  the new value. It is read as an unsigned long for MEMORY, SWAP,
	 *       KMEMORY, TCP, CPULIMIT (a percentage) and CPUSHARES, as a bitmap
	 *       of CPUMASK_NBITS bits for CPUMASK, and as a C string for
	 *       DEVICES_ALLOW / DEVICES_DENY.
	 *
	 * Returns 0 on success. Returns -EINVAL when `c` is not handled, when the
	 * needed controller could not be added, or when a value that had to be
	 * looked up first was not available; otherwise returns the status of the
	 * libcgroup call that failed.
	 */
	int container_apply_config(envid_t veid, enum conf_files c, void *_val)
	{
		struct cgroup *ct;
		char cgrp[CT_MAX_STR_SIZE];
		struct cgroup_controller *mem, *cpu, *cpuset;
		int ret = -EINVAL;
		unsigned long *val = _val;

		logger(2, 0, "function: container_apply_config(%u, %u, %p)", veid, c, _val);

		veid_to_name(cgrp, veid);

		ct = cgroup_new_cgroup(cgrp);
		/*
		 * We should really be doing something like:
		 *
		 *	ret = cgroup_get_cgroup(ct);
		 *
		 * and then doing cgroup_get_controller. However, libcgroup has
		 * a very nasty bug that makes it sometimes fail. Adding a controller
		 * to a newly "created" cgroup structure and then setting the value
		 * is a workaround that seems to work on various versions of the
		 * library.
		 */
		switch (c) {
		case MEMORY:
			if ((mem = cgroup_add_controller(ct, "memory")))
				ret = cgroup_set_value_uint64(mem, MEMLIMIT, *val);
			break;
		case SWAP:
			/* Unlike kmem, this must always be greater than mem */
			if ((mem = cgroup_add_controller(ct, "memory"))) {
				uint64_t mval;
				if (!cgroup_get_value_uint64(mem, MEMLIMIT, &mval))
					ret = cgroup_set_value_uint64(mem, SWAPLIMIT,
								      mval + *val);
			}
			break;
		case KMEMORY:
			if ((mem = cgroup_add_controller(ct, "memory")))
				ret = cgroup_set_value_uint64(mem, KMEMLIMIT, *val);
			break;
		case TCP:
			if ((mem = cgroup_add_controller(ct, "memory")))
				ret = cgroup_set_value_uint64(mem, TCPLIMIT, *val);
			break;
		case CPULIMIT: {
			uint64_t period;
			uint64_t quota;
			if ((cpu = cgroup_add_controller(ct, "cpu")) == NULL)
				break;

			/* Should be 100000, but be safe. It may fail on some versions
			 * of libcgroup, so if it fails, just assume the default */
			ret = cgroup_get_value_uint64(cpu, "cpu.cfs_period_us", &period);
			if (ret)
				period = 100000;
			/* val will contain an integer percentage, like 223% */
			quota = (period * (*val)) / 100;
			ret = cgroup_set_value_uint64(cpu, "cpu.cfs_quota_us", quota);
			break;
		}
		case CPUSHARES:
			if ((cpu = cgroup_add_controller(ct, "cpu")) == NULL)
				break;
			ret = cgroup_set_value_uint64(cpu, "cpu.shares", *val);
			break;
		case CPUMASK: {
			/*
			 * Number of unsigned long words that hold the CPUMASK_NBITS
			 * bits bitmap_snprintf() reads from val. The scan is bounded
			 * by this word count rather than by a separately defined
			 * size constant, so it cannot run past the bitmap.
			 */
			const size_t bits_per_word = 8 * sizeof(*val);
			const size_t nwords = (CPUMASK_NBITS + bits_per_word - 1) /
					      bits_per_word;
			char cpusetstr[2 * CPUMASK_NBITS];
			size_t i;

			if ((cpuset = cgroup_add_controller(ct, "cpuset")) == NULL)
				break;
			/*
			 * Having all bits set is a bit different, bitmap_snprintf will
			 * return a bad string. (From the PoV of the cpuset cgroup). We
			 * actually need to copy the parent's mask in that case.
			 */
			for (i = 0; i < nwords; i++) {
				if (val[i] != (~0UL))
					break;
			}

			if (i < nwords) {
				/* at least one word has a cleared bit: format the mask */
				bitmap_snprintf(cpusetstr, sizeof(cpusetstr),
						val, CPUMASK_NBITS);
			} else {
				struct cgroup *parent = cgroup_new_cgroup(CT_BASE_STRING);
				struct cgroup_controller *pcont;
				char *ptr = NULL;

				cgroup_get_cgroup(parent);
				pcont = cgroup_get_controller(parent, "cpuset");
				ret = cgroup_get_value_string(pcont, "cpuset.cpus", &ptr);
				if (ptr == NULL) {
					/* nothing to copy; never write an unset buffer */
					cgroup_free(&parent);
					if (!ret)
						ret = -EINVAL;
					break;
				}
				/* snprintf always terminates, unlike strncpy */
				snprintf(cpusetstr, sizeof(cpusetstr), "%s", ptr);
				free(ptr);
				cgroup_free(&parent);
			}
			ret = cgroup_set_value_string(cpuset, "cpuset.cpus", cpusetstr);
			break;
		}
		case DEVICES_DENY: {
			struct cgroup_controller *dev;

			if ((dev = cgroup_add_controller(ct, "devices")) == NULL)
				break;

			ret = cgroup_set_value_string(dev, "devices.deny", (char *)_val);
			break;
		}
		case DEVICES_ALLOW: {
			struct cgroup_controller *dev;

			if ((dev = cgroup_add_controller(ct, "devices")) == NULL)
				break;

			ret = cgroup_set_value_string(dev, "devices.allow", (char *)_val);
			break;
		}
		default:
			ret = -EINVAL;
			break;
		}

		if (ret)
			goto out;

		/* values set above are only buffered until the group is modified */
		if ((ret = cgroup_modify_cgroup(ct)))
			logger(-1, 0, "Failed to set limits for %s (%s)", conf_names[c],
			       cgroup_strerror(ret));
	out:
		cgroup_free(&ct);
		return ret;
	}

	/*
	 * Add every controller reported by the libcgroup controller iterator to
	 * `ct`, apply the per-controller defaults, and then create the group.
	 *
	 * `parent` is loaded with cgroup_get_cgroup() so controller_apply_config()
	 * can copy values from it; the result of that load is not checked, and
	 * controller_apply_config() does nothing for cpuset when the parent has
	 * no cpuset controller.
	 *
	 * Returns 0 on success, otherwise the status of the call that failed
	 * (including a failure to start the controller iteration).
	 */
	static int do_create_container(struct cgroup *ct, struct cgroup *parent)
	{
		struct cgroup_mount_point mnt;
		struct cgroup_controller *controller;
		void *handle = NULL;	/* defined value for _end() on every path */
		int ret;

		ret = cgroup_get_controller_begin(&handle, &mnt);
		if (ret) {
			/* iteration did not start, so mnt must not be read */
			cgroup_get_controller_end(&handle);
			return ret;
		}

		cgroup_get_cgroup(parent);

		do {
			logger(2, 0, "Adding group to controller %s", mnt.name);
			controller = cgroup_add_controller(ct, mnt.name);
			ret = controller_apply_config(ct, parent, controller, mnt.name);
			if (!ret)
				ret = cgroup_get_controller_next(&handle, &mnt);
		} while (!ret);

		cgroup_get_controller_end(&handle);

		/* ECGEOF means every controller was visited without error */
		if (ret == ECGEOF)
			ret = cgroup_create_cgroup(ct, 0);

		return ret;
	}

	/*
	 * Create the cgroup of container `veid` and install the default device
	 * permissions.
	 *
	 * Returns 0 on success. If creating the group fails, that status is
	 * returned right away. Otherwise every device rule is attempted and the
	 * status of the first one that failed is returned (-EINVAL when the
	 * devices controller could not be added).
	 */
	int create_container(envid_t veid)
	{
		char cgrp[CT_MAX_STR_SIZE];
		struct cgroup *ct, *parent;
		int ret;
		unsigned int i;
		const char *devices[] = { "c *:* m", /* everyone can mknod */
					  "b *:* m", /* block devices too */
					  "c 1:3 rmw", /* null */
					  "c 1:5 rmw", /* zero */
					  "c 1:7 rmw", /* full */
					  "c 1:8 rmw", /* random */
					  "c 1:9 rmw", /* urandom */
					  "c 5:2 rmw", /* ptmx */
					  "c 136:* rmw", /* various pts */
					};

		veid_to_name(cgrp, veid);
		ct = cgroup_new_cgroup(cgrp);
		parent = cgroup_new_cgroup("/");

		ret = do_create_container(ct, parent);
		cgroup_free(&ct);
		cgroup_free(&parent);

		/* without the group there is nothing to attach device rules to */
		if (ret)
			return ret;

		/*
		 * FIXME: This is yet another hack required by libcgroup. At some point
		 * in time, this MUST go away.
		 *
		 * Problem is that libcgroup works with buffered writes. If we write to
		 * a cgroup file and want it to be seen in the filesystem, we need to
		 * call cgroup_modify_cgroup().
		 *
		 * However, all versions up to 0.38 will fail that operation for already
		 * existent cgroups, due to a bug in the way they handle modifications
		 * in the presence of read-only files (whether or not that specific file
		 * was being modified). Because of that, we need to come up with a new
		 * cgroup all the time, and free it afterwards.
		 */
		for (i = 0; i < ARRAY_SIZE(devices); i++) {
			struct cgroup_controller *dev;
			int err;

			ct = cgroup_new_cgroup(cgrp);

			if ((dev = cgroup_add_controller(ct, "devices"))) {
				err = cgroup_set_value_string(dev, "devices.allow", devices[i]);
				if (!err)
					err = cgroup_modify_cgroup(ct);
				if (err)
					logger(-1, 0, "Failed to set device permissions for %s (%s)",
						devices[i], cgroup_strerror(err));
			} else {
				/* cgroup_add_controller() gives no status to report */
				err = -EINVAL;
				logger(-1, 0, "Failed to attach device controller");
			}
			cgroup_free(&ct);

			/* keep going, but remember the first failure for the caller */
			if (err && !ret)
				ret = err;
		}

		return ret;
	}

	/*
	 * Report whether cgroup `cgrp` has tasks under controller `name`.
	 *
	 * libcgroup is lame. This should be done with the cgroup structure, not the
	 * cgroup name.
	 *
	 * Returns 0 only when the task iterator reports ECGEOF straight away.
	 * Returns 1 in every other case, i.e. both when a task was found and when
	 * the iteration could not be started for some other reason.
	 */
	static int controller_has_tasks(const char *cgrp, const char *name)
	{
		pid_t pid;
		/* NULL so _end() sees a defined value even if _begin() set nothing */
		void *handle = NULL;
		int status;

		status = cgroup_get_task_begin(cgrp, name, &handle, &pid);
		cgroup_get_task_end(&handle);

		return status != ECGEOF;
	}

	/*
	 * Attach the calling process (getpid()) to the cgroup of container `veid`.
	 *
	 * Returns 0 on success, or the status of cgroup_get_cgroup() or
	 * cgroup_attach_task_pid() when one of them fails.
	 */
	int container_add_task(envid_t veid)
	{
		char name[CT_MAX_STR_SIZE];
		struct cgroup *group;
		int status;

		veid_to_name(name, veid);
		group = cgroup_new_cgroup(name);

		/* only attach once the group has been loaded successfully */
		status = cgroup_get_cgroup(group);
		if (status == 0)
			status = cgroup_attach_task_pid(group, getpid());

		cgroup_free(&group);
		return status;
	}

	/*
	 * Delete the cgroup of container `veid`.
	 *
	 * Returns 0 when the group does not exist, otherwise the status of
	 * cgroup_delete_cgroup_ext().
	 */
	int destroy_container(envid_t veid)
	{
		char name[CT_MAX_STR_SIZE];
		struct cgroup *group;
		int status;

		veid_to_name(name, veid);
		group = cgroup_new_cgroup(name);

		/* Since this can also be called from initialization, a missing
		 * group is valid and counts as success */
		if (cgroup_get_cgroup(group) == ECGROUPNOTEXIST)
			status = 0;
		else
			status = cgroup_delete_cgroup_ext(group, 0);

		cgroup_free(&group);
		return status;
	}

	/*
	 * Tell whether the cgroup of container `veid` still has tasks.
	 *
	 * Returns 1 as soon as controller_has_tasks() reports tasks for one of the
	 * group's controllers, 0 when the group does not exist or no controller
	 * reports tasks, and a negated libcgroup status when walking the
	 * controller list fails.
	 */
	int container_is_running(envid_t veid)
	{
		int ret = 0;
		void *handle = NULL;	/* defined value for _end() on every path */
		struct cgroup_mount_point mnt;
		struct cgroup *ct;
		char cgrp[CT_MAX_STR_SIZE];

		veid_to_name(cgrp, veid);

		ct = cgroup_new_cgroup(cgrp);
		ret = cgroup_get_cgroup(ct);
		if (ret == ECGROUPNOTEXIST) {
			ret = 0;
			goto out_free;
		}

		/* mnt is only read while the iterator reports success */
		ret = cgroup_get_controller_begin(&handle, &mnt);
		while (ret == 0) {
			if (cgroup_get_controller(ct, mnt.name) == NULL) {
				logger(0, 0, "Controller %s seems to be missing!", mnt.name);
			} else if ((ret = controller_has_tasks(cgrp, mnt.name)) != 0) {
				goto out;
			}
			ret = cgroup_get_controller_next(&handle, &mnt);
		}

		/* reaching the end of the list is the normal way out of the loop */
		if (ret != ECGEOF)
			ret = -ret;
		else
			ret = 0;
	out:
		cgroup_get_controller_end(&handle);
	out_free:
		cgroup_free(&ct);
		return ret;
	}

	/*
	 * Kill every task of container `veid` and wait for its cgroup to empty.
	 *
	 * We send a kill signal to all processes. This is racy in theory, since they
	 * could spawn new processes faster than we kill. But since one of them is the
	 * init process, (we don't really know which), then eventually the init process
	 * will die taking away all the others, so this is fine.
	 *
	 * This is a big hack, and only exists because we have no way to enter a PID
	 * namespace from the outside (yet). From there, we could just issue a normal
	 * reboot.
	 *
	 * Returns 0 when the group does not exist or container_is_running()
	 * reports 0 in time, the libcgroup status when the task list could not be
	 * walked to its end, and VZ_STOP_ERROR when the container is still
	 * reported running after DEF_STOP_TIMEOUT polls spaced 500000 us apart.
	 */
	int hackish_empty_container(envid_t veid)
	{
		char cgrp[CT_MAX_STR_SIZE];
		struct cgroup *ct;
		int ret = 0;
		/* NULL so _end() sees a defined value even if _begin() set nothing */
		void *task_handle = NULL;
		pid_t pid;
		int i;

		veid_to_name(cgrp, veid);
		ct = cgroup_new_cgroup(cgrp);

		ret = cgroup_get_cgroup(ct);
		if (ret == ECGROUPNOTEXIST) {
			ret = 0;
			goto out;
		}

		/* Any controller will do */
		ret = cgroup_get_task_begin(cgrp, "cpu", &task_handle, &pid);
		while (!ret) {
			/* kill() gives 0 and negative pids a group-wide meaning;
			 * never signal anything but a single task */
			if (pid > 0)
				kill(pid, SIGKILL);
			ret = cgroup_get_task_next(&task_handle, &pid);
		}
		cgroup_get_task_end(&task_handle);

		if (ret != ECGEOF) {
			logger(-1, 0, "Could not finish all tasks: %s",
					cgroup_strerror(ret));
			goto out;
		}

		ret = 0;
		for (i = 0; i < DEF_STOP_TIMEOUT; i++) {
			if (!container_is_running(veid))
				goto out;
			usleep(500000);
		}
		logger(-1, 0, "Failed to wait for CT tasks to die");
		ret = VZ_STOP_ERROR;
	out:
		cgroup_free(&ct);
		return ret;
	}

	/*
	 * Return the pid of one task in the cgroup of container `veid`.
	 *
	 * This function assumes that all pids inside a cgroup
	 * belong to the same namespace, that is the container namespace.
	 * Therefore, from the host box, any of them will do.
	 *
	 * The first task listed under the first controller reported by libcgroup
	 * is used. Returns -1 when the group does not exist, no controller is
	 * reported, or no task could be read.
	 */
	pid_t get_pid_from_container(envid_t veid)
	{
		char cgrp[CT_MAX_STR_SIZE];
		struct cgroup *ct;
		/* both start as NULL so the matching _end() calls below always get
		 * a defined value */
		void *task_handle = NULL;
		void *cont_handle = NULL;
		struct cgroup_mount_point mnt;
		pid_t pid = -1;
		int ret;

		veid_to_name(cgrp, veid);
		ct = cgroup_new_cgroup(cgrp);
		ret = cgroup_get_cgroup(ct);
		if (ret == ECGROUPNOTEXIST)
			goto out_free;

		ret = cgroup_get_controller_begin(&cont_handle, &mnt);
		if (ret != 0) /* no controllers, something is wrong */
			goto out_end_cont;

		ret = cgroup_get_task_begin(cgrp, mnt.name, &task_handle, &pid);
		if (ret != 0) /* no tasks, something is also wrong */
			pid = -1;
		/* release the task iterator whether or not a pid was read */
		cgroup_get_task_end(&task_handle);

	out_end_cont:
		cgroup_get_controller_end(&cont_handle);
	out_free:
		cgroup_free(&ct);
		return pid;
	}
	/*
	 * Thin wrapper around cgroup_init(); its status is returned unchanged.
	 */
	int container_init(void)
	{
		const int status = cgroup_init();

		return status;
	}