Discussion:
[PATCH 2/5] sched: new clone flag CLONE_NEWCGROUP for cgroup namespace
Aditya Kali
2014-07-17 19:52:08 UTC
Permalink
CLONE_NEWCGROUP will be used to create new cgroup namespace.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
include/uapi/linux/sched.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 34f9d73..2f90d00 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -21,8 +21,7 @@
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
-/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
- and is now available for re-use. */
+#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
--
2.0.0.526.g5318336
Aditya Kali
2014-07-17 19:52:10 UTC
Permalink
move cgroup_get() and cgroup_put() into cgroup.h so that
they can be called from other places.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
include/linux/cgroup.h | 17 +++++++++++++++++
kernel/cgroup.c | 18 ------------------
2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 707c302..4ea477f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -530,6 +530,23 @@ static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
return cgrp->root == &cgrp_dfl_root;
}

+/* convenient tests for these bits */
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline void cgroup_get(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
+
+static inline void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
+}
+
/* no synchronization, the result can only be used as a hint */
static inline bool cgroup_has_tasks(struct cgroup *cgrp)
{
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1671345..8552513 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];

-static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -286,12 +285,6 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
return cgroup_css(cgrp, ss);
}

-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -1029,17 +1022,6 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
return mode;
}

-static void cgroup_get(struct cgroup *cgrp)
-{
- WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
/**
* cgroup_refresh_child_subsys_mask - update child_subsys_mask
* @cgrp: the target cgroup
--
2.0.0.526.g5318336
Serge Hallyn
2014-07-24 17:03:16 UTC
Permalink
Post by Aditya Kali
move cgroup_get() and cgroup_put() into cgroup.h so that
they can be called from other places.
---
include/linux/cgroup.h | 17 +++++++++++++++++
kernel/cgroup.c | 18 ------------------
2 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 707c302..4ea477f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -530,6 +530,23 @@ static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
return cgrp->root == &cgrp_dfl_root;
}
+/* convenient tests for these bits */
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline void cgroup_get(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
+
+static inline void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
+}
+
/* no synchronization, the result can only be used as a hint */
static inline bool cgroup_has_tasks(struct cgroup *cgrp)
{
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1671345..8552513 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
-static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -286,12 +285,6 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
return cgroup_css(cgrp, ss);
}
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -1029,17 +1022,6 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
return mode;
}
-static void cgroup_get(struct cgroup *cgrp)
-{
- WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
/**
* cgroup_refresh_child_subsys_mask - update child_subsys_mask
--
2.0.0.526.g5318336
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Aditya Kali
2014-07-17 19:52:06 UTC
Permalink
Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolate the host environment from the processes
running in the container. But since cgroups themselves are not
“virtualized”, the task is always able to see the global cgroups view
through the cgroupfs mount and via the /proc/self/cgroup file.

$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1

This exposure of cgroup names to the processes running inside a
container results in some problems:
(1) The container names are typically host-container-management-agent
(systemd, docker/libcontainer, etc.) data and leaking its name (or
leaking the hierarchy) reveals too much information about the host
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside the
container.

Note that the feature proposed here is completely different than the
“ns cgroup” feature which existed in the linux kernel until recently.
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It did
not solve any of the above mentioned problems and was later dropped
from the kernel.

Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can now
have a much more coherent cgroup view and it's easy to associate a
container with a single cgroup. This also allows us to virtualize the
cgroup view for tasks inside the container.

The new CGroup Namespace allows a process to “unshare” its cgroup
hierarchy starting from the cgroup it's currently in.
For Ex:
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec’s /bin/bash
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183]
# From within new cgroupns, process sees that it's in the root cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/

# From global cgroupns:
$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1

The virtualization of the /proc/self/cgroup file, combined with restricting
the view of the cgroup hierarchy (by bind-mounting the
$CGROUP_MOUNT/batchjobs/c_job_id1/ directory to
$CONTAINER_CHROOT/sys/fs/cgroup/), should provide a completely isolated
cgroup view inside the container.

In its current simplistic form, the cgroup namespaces provide
following behavior:

(1) The “root” cgroup for a cgroup namespace is the cgroup in which
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls unshare,
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (“/”) cgroup
(identified in code as cgrp_dfl_root.cgrp).

(2) The cgroupns-root cgroup does not change even if the namespace
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1

(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able to see
cgroup paths (in /proc/self/cgroup) only inside their root cgroup
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1

(b) From global cgroupns, the real cgroup path will be visible:
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1

(c) From a sibling cgroupns, the real path will be visible:
[ns2]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
(In a correct container setup, though, it should not be possible to
access PIDs in another container in the first place. This behavior can be
changed if desired.)

(4) Processes inside a cgroupns are not allowed to move out of the
cgroupns-root. This is true even if a privileged process in the global
cgroupns tries to move the process out of its cgroupns-root.

# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted

(5) setns() is not supported for cgroup namespace in the initial
version.

(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entire
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. So
all the threads in the process will have the same cgroup. And both
- changing cgroups and unsharing namespaces - are protected under
threadgroup_lock(task).

(7) The cgroup namespace is alive as long as there is at least 1
process inside it. When the last process exits, the cgroup
namespace is destroyed. The cgroupns-root and the actual cgroups
remain though.

Implementation
The current patch-set is based on top of Tejun's cgroup tree (for-next
branch). It's fairly non-intrusive and provides the above-mentioned
features.

Possible extensions of CGROUPNS:
(1) The Documentation/cgroups/unified-hierarchy.txt mentions use of
capabilities to restrict cgroups to administrative users. CGroup
namespaces could be of help here. With cgroup namespaces, it might
be possible to delegate administration of sub-cgroups under a
cgroupns-root to the cgroupns owner.

(2) Provide a cgroupns specific cgroupfs mount. i.e., the following
command when run from inside a cgroupns should only mount the
hierarchy from cgroupns-root cgroup:
$ mount -t cgroup cgroup <cgroup-mountpoint>
# -o __DEVEL__sane_behavior should be implicit

This is similar to how procfs can be mounted for every PIDNS. This
may have some usecases.

---
fs/kernfs/dir.c | 51 +++++++++++++---
fs/proc/namespaces.c | 3 +
include/linux/cgroup.h | 36 ++++++++++-
include/linux/cgroup_namespace.h | 62 +++++++++++++++++++
include/linux/kernfs.h | 3 +
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 4 ++
include/uapi/linux/sched.h | 3 +-
init/Kconfig | 9 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 75 +++++++++++++++++------
kernel/cgroup_namespace.c | 128 +++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 2 +-
kernel/nsproxy.c | 19 +++++-
14 files changed, 364 insertions(+), 34 deletions(-)
create mode 100644 include/linux/cgroup_namespace.h
create mode 100644 kernel/cgroup_namespace.c

[PATCH 1/5] kernfs: Add API to get generate relative kernfs path
[PATCH 2/5] sched: new clone flag CLONE_NEWCGROUP for cgroup
[PATCH 3/5] cgroup: add function to get task's cgroup on default
[PATCH 4/5] cgroup: export cgroup_get() and cgroup_put()
[PATCH 5/5] cgroup: introduce cgroup namespaces
Aditya Kali
2014-07-17 19:52:11 UTC
Permalink
Introduce the ability to create new cgroup namespace. The newly created
cgroup namespace remembers the 'struct cgroup *root_cgrp' at the point
of creation of the cgroup namespace. The task that creates the new
cgroup namespace and all its future children will now be restricted only
to the cgroup hierarchy under this root_cgrp. In the first version,
setns() is not supported for cgroup namespaces.
The main purpose of cgroup namespace is to virtualize the contents
of /proc/self/cgroup file. Processes inside a cgroup namespace
are only able to see paths relative to their namespace root.
This allows container-tools (like libcontainer, lxc, lmctfy, etc.)
to create completely virtualized containers without leaking system
level cgroup hierarchy to the task.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
fs/proc/namespaces.c | 3 +
include/linux/cgroup.h | 18 +++++-
include/linux/cgroup_namespace.h | 62 +++++++++++++++++++
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 4 ++
init/Kconfig | 9 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 32 ++++++++++
kernel/cgroup_namespace.c | 128 +++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 2 +-
kernel/nsproxy.c | 19 +++++-
11 files changed, 276 insertions(+), 4 deletions(-)
create mode 100644 include/linux/cgroup_namespace.h
create mode 100644 kernel/cgroup_namespace.c

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 8902609..e04ed4b 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -32,6 +32,9 @@ static const struct proc_ns_operations *ns_entries[] = {
&userns_operations,
#endif
&mntns_operations,
+#ifdef CONFIG_CGROUP_NS
+ &cgroupns_operations,
+#endif
};

static const struct file_operations ns_file_operations = {
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4ea477f..d3c6070 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,8 @@
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/wait.h>
+#include <linux/nsproxy.h>
+#include <linux/types.h>

#ifdef CONFIG_CGROUPS

@@ -469,6 +471,13 @@ struct cftype {
#endif
};

+struct cgroup_namespace {
+ atomic_t count;
+ unsigned int proc_inum;
+ struct user_namespace *user_ns;
+ struct cgroup *root_cgrp;
+};
+
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;

@@ -591,10 +600,17 @@ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
return kernfs_name(cgrp->kn, buf, buflen);
}

+static inline char * __must_check cgroup_path_ns(struct cgroup_namespace *ns,
+ struct cgroup *cgrp, char *buf,
+ size_t buflen)
+{
+ return kernfs_path_from_node(ns->root_cgrp->kn, cgrp->kn, buf, buflen);
+}
+
static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
size_t buflen)
{
- return kernfs_path(cgrp->kn, buf, buflen);
+ return cgroup_path_ns(current->nsproxy->cgroup_ns, cgrp, buf, buflen);
}

static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h
new file mode 100644
index 0000000..9f637fe
--- /dev/null
+++ b/include/linux/cgroup_namespace.h
@@ -0,0 +1,62 @@
+#ifndef _LINUX_CGROUP_NAMESPACE_H
+#define _LINUX_CGROUP_NAMESPACE_H
+
+#include <linux/nsproxy.h>
+#include <linux/cgroup.h>
+#include <linux/types.h>
+#include <linux/user_namespace.h>
+
+extern struct cgroup_namespace init_cgroup_ns;
+
+static inline struct cgroup *task_cgroupns_root(struct task_struct *tsk)
+{
+ return tsk->nsproxy->cgroup_ns->root_cgrp;
+}
+
+#ifdef CONFIG_CGROUP_NS
+
+extern void free_cgroup_ns(struct cgroup_namespace *ns);
+
+static inline struct cgroup_namespace *get_cgroup_ns(
+ struct cgroup_namespace *ns)
+{
+ if (ns)
+ atomic_inc(&ns->count);
+ return ns;
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+ if (ns && atomic_dec_and_test(&ns->count))
+ free_cgroup_ns(ns);
+}
+
+extern struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns);
+
+#else /* CONFIG_CGROUP_NS */
+
+static inline struct cgroup_namespace *get_cgroup_ns(
+ struct cgroup_namespace *ns)
+{
+ return &init_cgroup_ns;
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+}
+
+static inline struct cgroup_namespace *copy_cgroup_ns(
+ unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns) {
+ if (flags & CLONE_NEWCGROUP)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+
+#endif /* CONFIG_CGROUP_NS */
+
+#endif /* _LINUX_CGROUP_NAMESPACE_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index b4ec59d..44f388c 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -8,6 +8,7 @@ struct mnt_namespace;
struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
+struct cgroup_namespace;
struct fs_struct;

/*
@@ -33,6 +34,7 @@ struct nsproxy {
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns_for_children;
struct net *net_ns;
+ struct cgroup_namespace *cgroup_ns;
};
extern struct nsproxy init_nsproxy;

diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 34a1e10..e56dd73 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -6,6 +6,8 @@

struct pid_namespace;
struct nsproxy;
+struct task_struct;
+struct inode;

struct proc_ns_operations {
const char *name;
@@ -27,6 +29,7 @@ extern const struct proc_ns_operations ipcns_operations;
extern const struct proc_ns_operations pidns_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations cgroupns_operations;

/*
* We always define these enumerators
@@ -37,6 +40,7 @@ enum {
PROC_UTS_INIT_INO = 0xEFFFFFFEU,
PROC_USER_INIT_INO = 0xEFFFFFFDU,
PROC_PID_INIT_INO = 0xEFFFFFFCU,
+ PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
};

#ifdef CONFIG_PROC_FS
diff --git a/init/Kconfig b/init/Kconfig
index 9d76b99..2f43ec9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1101,6 +1101,15 @@ config DEBUG_BLK_CGROUP
Enable some debugging help. Currently it exports additional stat
files in a cgroup which can be useful for debugging.

+config CGROUP_NS
+ bool "CGroup Namespaces"
+ default n
+ help
+ This options enables CGroup Namespaces which can be used to isolate
+ cgroup paths. This feature is only useful when unified cgroup
+ hierarchy is in use (i.e. cgroups are mounted with sane_behavior
+ option).
+
endif # CGROUPS

config CHECKPOINT_RESTORE
diff --git a/kernel/Makefile b/kernel/Makefile
index f2a8b62..61c5791 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
+obj-$(CONFIG_CGROUP_NS) += cgroup_namespace.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8552513..c04e971 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,8 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
+#include <linux/proc_ns.h>
+#include <linux/cgroup_namespace.h>

#include <linux/atomic.h>

@@ -196,6 +198,15 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);

+struct cgroup_namespace init_cgroup_ns = {
+ .count = {
+ .counter = 1,
+ },
+ .proc_inum = PROC_CGROUP_INIT_INO,
+ .user_ns = &init_user_ns,
+ .root_cgrp = &cgrp_dfl_root.cgrp,
+};
+
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
@@ -2333,6 +2344,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *task;
int ret;

+ /* Only allow changing cgroups accessible within task's cgroup
+ * namespace. i.e. 'dst_cgrp' should be a descendant of task's
+ * cgroupns->root_cgrp. */
+ if (!cgroup_is_descendant(dst_cgrp, task_cgroupns_root(leader)))
+ return -EPERM;
+
/* look up all src csets */
down_read(&css_set_rwsem);
rcu_read_lock();
@@ -4551,6 +4568,13 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
parent = cgroup_kn_lock_live(parent_kn);
if (!parent)
return -ENODEV;
+
+ /* Allow mkdir only within process's cgroup namespace root. */
+ if (!cgroup_is_descendant(parent, task_cgroupns_root(current))) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
root = parent->root;

/* allocate the cgroup and its ID, 0 is reserved for the root */
@@ -4819,6 +4843,14 @@ static int cgroup_rmdir(struct kernfs_node *kn)
cgrp = cgroup_kn_lock_live(kn);
if (!cgrp)
return 0;
+
+ /* Allow rmdir only within process's cgroup namespace root.
+ * The process can't delete its own root anyways. */
+ if (!cgroup_is_descendant(cgrp, task_cgroupns_root(current))) {
+ cgroup_kn_unlock(kn);
+ return -EPERM;
+ }
+
cgroup_get(cgrp); /* for @kn->priv clearing */

ret = cgroup_destroy_locked(cgrp);
diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
new file mode 100644
index 0000000..a2e6804
--- /dev/null
+++ b/kernel/cgroup_namespace.c
@@ -0,0 +1,128 @@
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_namespace.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+ struct cgroup_namespace *new_ns;
+
+ new_ns = kmalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ if (new_ns)
+ atomic_set(&new_ns->count, 1);
+ return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+ cgroup_put(ns->root_cgrp);
+ put_user_ns(ns->user_ns);
+ proc_free_inum(ns->proc_inum);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ struct cgroup_namespace *new_ns = NULL;
+ struct cgroup *cgrp = NULL;
+ int err;
+
+ BUG_ON(!old_ns);
+
+ if (!(flags & CLONE_NEWCGROUP))
+ return get_cgroup_ns(old_ns);
+
+ /* Allow only sysadmin to create cgroup namespace. */
+ err = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto err_out;
+
+ /* Prevent cgroup changes for this task. */
+ threadgroup_lock(current);
+
+ cgrp = get_task_cgroup(current);
+
+ /* Creating new CGROUPNS is supported only when unified hierarchy is in
+ * use. */
+ err = -EINVAL;
+ if (!cgroup_on_dfl(cgrp))
+ goto err_out_unlock;
+
+ err = -ENOMEM;
+ new_ns = alloc_cgroup_ns();
+ if (!new_ns)
+ goto err_out_unlock;
+
+ err = proc_alloc_inum(&new_ns->proc_inum);
+ if (err)
+ goto err_out_unlock;
+
+ new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->root_cgrp = cgrp;
+
+ threadgroup_unlock(current);
+
+ return new_ns;
+
+err_out_unlock:
+ threadgroup_unlock(current);
+err_out:
+ if (cgrp)
+ cgroup_put(cgrp);
+ kfree(new_ns);
+ return ERR_PTR(err);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
+{
+ pr_info("setns not supported for cgroup namespace");
+ return -EINVAL;
+}
+
+static void *cgroupns_get(struct task_struct *task)
+{
+ struct cgroup_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task_nsproxy(task);
+ if (nsproxy) {
+ ns = nsproxy->cgroup_ns;
+ get_cgroup_ns(ns);
+ }
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void cgroupns_put(void *ns)
+{
+ put_cgroup_ns(ns);
+}
+
+static unsigned int cgroupns_inum(void *ns)
+{
+ struct cgroup_namespace *cgroup_ns = ns;
+
+ return cgroup_ns->proc_inum;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+ .name = "cgroup",
+ .type = CLONE_NEWCGROUP,
+ .get = cgroupns_get,
+ .put = cgroupns_put,
+ .install = cgroupns_install,
+ .inum = cgroupns_inum,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+ return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index d2799d1..95981a1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1747,7 +1747,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e78110..e20298c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -25,6 +25,7 @@
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
+#include <linux/cgroup_namespace.h>

static struct kmem_cache *nsproxy_cachep;

@@ -39,6 +40,7 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_NET
.net_ns = &init_net,
#endif
+ .cgroup_ns = &init_cgroup_ns,
};

static inline struct nsproxy *create_nsproxy(void)
@@ -92,6 +94,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_pid;
}

+ new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
+ tsk->nsproxy->cgroup_ns);
+ if (IS_ERR(new_nsp->cgroup_ns)) {
+ err = PTR_ERR(new_nsp->cgroup_ns);
+ goto out_cgroup;
+ }
+
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
@@ -101,6 +110,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return new_nsp;

out_net:
+ if (new_nsp->cgroup_ns)
+ put_cgroup_ns(new_nsp->cgroup_ns);
+out_cgroup:
if (new_nsp->pid_ns_for_children)
put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
@@ -128,7 +140,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *new_ns;

if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWPID | CLONE_NEWNET)))) {
+ CLONE_NEWPID | CLONE_NEWNET |
+ CLONE_NEWCGROUP)))) {
get_nsproxy(old_ns);
return 0;
}
@@ -165,6 +178,8 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ if (ns->cgroup_ns)
+ put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
@@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;

if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
return 0;

user_ns = new_cred ? new_cred->user_ns : current_user_ns();
--
2.0.0.526.g5318336
Andy Lutomirski
2014-07-17 19:57:03 UTC
Permalink
Post by Aditya Kali
Introduce the ability to create new cgroup namespace. The newly created
cgroup namespace remembers the 'struct cgroup *root_cgrp' at the point
of creation of the cgroup namespace. The task that creates the new
cgroup namespace and all its future children will now be restricted only
to the cgroup hierarchy under this root_cgrp. In the first version,
setns() is not supported for cgroup namespaces.
The main purpose of cgroup namespace is to virtualize the contents
of /proc/self/cgroup file. Processes inside a cgroup namespace
are only able to see paths relative to their namespace root.
This allows container-tools (like libcontainer, lxc, lmctfy, etc.)
to create completely virtualized containers without leaking system
level cgroup hierarchy to the task.
What happens if someone moves a task in a cgroup namespace outside of
the namespace root cgroup?

--Andy
Aditya Kali
2014-07-17 20:55:43 UTC
Permalink
Post by Andy Lutomirski
Post by Aditya Kali
Introduce the ability to create new cgroup namespace. The newly created
cgroup namespace remembers the 'struct cgroup *root_cgrp' at the point
of creation of the cgroup namespace. The task that creates the new
cgroup namespace and all its future children will now be restricted only
to the cgroup hierarchy under this root_cgrp. In the first version,
setns() is not supported for cgroup namespaces.
The main purpose of cgroup namespace is to virtualize the contents
of /proc/self/cgroup file. Processes inside a cgroup namespace
are only able to see paths relative to their namespace root.
This allows container-tools (like libcontainer, lxc, lmctfy, etc.)
to create completely virtualized containers without leaking system
level cgroup hierarchy to the task.
What happens if someone moves a task in a cgroup namespace outside of
the namespace root cgroup?
Attempt to move a task outside of cgroupns root will fail with EPERM.
This is true irrespective of the privileges of the process attempting
this. Once cgroupns is created, the task will be confined to the
cgroup hierarchy under its cgroupns root until it dies.
Post by Andy Lutomirski
--Andy
--
Aditya
Andy Lutomirski
2014-07-18 16:51:33 UTC
Permalink
Post by Aditya Kali
Post by Andy Lutomirski
Post by Aditya Kali
Introduce the ability to create new cgroup namespace. The newly created
cgroup namespace remembers the 'struct cgroup *root_cgrp' at the point
of creation of the cgroup namespace. The task that creates the new
cgroup namespace and all its future children will now be restricted only
to the cgroup hierarchy under this root_cgrp. In the first version,
setns() is not supported for cgroup namespaces.
The main purpose of cgroup namespace is to virtualize the contents
of /proc/self/cgroup file. Processes inside a cgroup namespace
are only able to see paths relative to their namespace root.
This allows container-tools (like libcontainer, lxc, lmctfy, etc.)
to create completely virtualized containers without leaking system
level cgroup hierarchy to the task.
What happens if someone moves a task in a cgroup namespace outside of
the namespace root cgroup?
Attempt to move a task outside of cgroupns root will fail with EPERM.
This is true irrespective of the privileges of the process attempting
this. Once cgroupns is created, the task will be confined to the
cgroup hierarchy under its cgroupns root until it dies.
Can a task in a non-init userns create a cgroupns? If not, that's
unusual. If so, is it problematic if they can prevent themselves from
being moved?

I hate to say it, but it might be worth requiring explicit permission
from the cgroup manager for this. For example, there could be a new
cgroup attribute may_unshare, and any attempt to unshare the cgroup ns
will fail with -EPERM unless the caller is in a may_unshare=1 cgroup.
may_unshare in a parent cgroup would not give child cgroups the
ability to unshare.

--Andy
Aditya Kali
2014-07-18 18:51:17 UTC
Permalink
Post by Andy Lutomirski
Post by Aditya Kali
Post by Andy Lutomirski
What happens if someone moves a task in a cgroup namespace outside of
the namespace root cgroup?
Attempt to move a task outside of cgroupns root will fail with EPERM.
This is true irrespective of the privileges of the process attempting
this. Once cgroupns is created, the task will be confined to the
cgroup hierarchy under its cgroupns root until it dies.
Can a task in a non-init userns create a cgroupns? If not, that's
unusual. If so, is it problematic if they can prevent themselves from
being moved?
Currently, only a task with CAP_SYS_ADMIN in the init-userns can
create cgroupns. It is stricter than for other namespaces, yes.
Post by Andy Lutomirski
I hate to say it, but it might be worth requiring explicit permission
from the cgroup manager for this. For example, there could be a new
cgroup attribute may_unshare, and any attempt to unshare the cgroup ns
will fail with -EPERM unless the caller is in a may_unshare=1 cgroup.
may_unshare in a parent cgroup would not give child cgroups the
ability to unshare.
What you suggest can be done. The current patch-set punts the problem
of permission checking by only allowing unshare from a
capable(CAP_SYS_ADMIN) process. This can be implemented as a follow-up
improvement to cgroupns feature if we want to open it to non-init
userns.

That being said, I would argue that even if we don't have this
explicit permission and relax the check to non-init userns, it should
be 'OK' to let ns_capable(current_user_ns(), CAP_SYS_ADMIN) tasks
unshare cgroupns (basically, if you can "create" a cgroup hierarchy,
you should probably be allowed to unshare() it). By unsharing
cgroupns, the tasks can only confine themselves further under their
cgroupns-root. As long as they cannot escape that hierarchy, it should
be fine.
In my experience, there is seldom a need to move tasks out of their
cgroup. At most, we create a sub-cgroup and move the task there (which
is allowed in their cgroupns). Even for a cgroup manager, I can't
think of a case where it will be useful to move a task from one cgroup
hierarchy to another. Such move seems overly complicated (even without
cgroup namespaces). The cgroup manager can just modify the settings of
the task's cgroup as needed or simply kill & restart the task in a new
container.
Post by Andy Lutomirski
--Andy
Thanks,
--
Aditya
Andy Lutomirski
2014-07-18 18:57:22 UTC
Permalink
Post by Aditya Kali
Post by Andy Lutomirski
Post by Aditya Kali
Post by Andy Lutomirski
What happens if someone moves a task in a cgroup namespace outside of
the namespace root cgroup?
Attempt to move a task outside of cgroupns root will fail with EPERM.
This is true irrespective of the privileges of the process attempting
this. Once cgroupns is created, the task will be confined to the
cgroup hierarchy under its cgroupns root until it dies.
Can a task in a non-init userns create a cgroupns? If not, that's
unusual. If so, is it problematic if they can prevent themselves from
being moved?
Currently, only a task with CAP_SYS_ADMIN in the init-userns can
create cgroupns. It is stricter than for other namespaces, yes.
I'm slightly hesitant to have unshare(CLONE_NEWUSER |
CLONE_NEWCGROUP | ...) start having weird side effects that are
visible outside the namespace, especially when those side effects
don't happen (because the call fails entirely) if
unshare(CLONE_NEWUSER) happens first. I don't see a real problem with
it, but it's weird.
Post by Aditya Kali
Post by Andy Lutomirski
I hate to say it, but it might be worth requiring explicit permission
from the cgroup manager for this. For example, there could be a new
cgroup attribute may_unshare, and any attempt to unshare the cgroup ns
will fail with -EPERM unless the caller is in a may_unshare=1 cgroup.
may_unshare in a parent cgroup would not give child cgroups the
ability to unshare.
What you suggest can be done. The current patch-set punts the problem
of permission checking by only allowing unshare from a
capable(CAP_SYS_ADMIN) process. This can be implemented as a follow-up
improvement to cgroupns feature if we want to open it to non-init
userns.
That being said, I would argue that even if we don't have this
explicit permission and relax the check to non-init userns, it should
be 'OK' to let ns_capable(current_user_ns(), CAP_SYS_ADMIN) tasks to
unshare cgroupns (basically, if you can "create" a cgroup hierarchy,
you should probably be allowed to unshare() it).
But non-init-userns tasks can't create cgroup hierarchies, unless I
misunderstand the current code. And, if they can, I bet I can find
three or four serious security issues in an hour or two. :)
Post by Aditya Kali
By unsharing
cgroupns, the tasks can only confine themselves further under its
cgroupns-root. As long as it cannot escape that hierarchy, it should
be fine.
But they can also *lock* their hierarchy.
Post by Aditya Kali
In my experience, there is seldom a need to move tasks out of their
cgroup. At most, we create a sub-cgroup and move the task there (which
is allowed in their cgroupns). Even for a cgroup manager, I can't
think of a case where it will be useful to move a task from one cgroup
hierarchy to another. Such move seems overly complicated (even without
cgroup namespaces). The cgroup manager can just modify the settings of
the task's cgroup as needed or simply kill & restart the task in a new
container.
I do this all the time. Maybe my new systemd overlords will make me
stop doing it, at which point my current production setup will blow
up.

--Andy
Aditya Kali
2014-07-21 22:11:25 UTC
Permalink
Post by Andy Lutomirski
Post by Aditya Kali
Post by Andy Lutomirski
Post by Aditya Kali
Post by Andy Lutomirski
What happens if someone moves a task in a cgroup namespace outside of
the namespace root cgroup?
Attempt to move a task outside of cgroupns root will fail with EPERM.
This is true irrespective of the privileges of the process attempting
this. Once cgroupns is created, the task will be confined to the
cgroup hierarchy under its cgroupns root until it dies.
Can a task in a non-init userns create a cgroupns? If not, that's
unusual. If so, is it problematic if they can prevent themselves from
being moved?
Currently, only a task with CAP_SYS_ADMIN in the init-userns can
create cgroupns. It is stricter than for other namespaces, yes.
I'm slightly hesitant to have unshare(CLONE_NEWUSER |
CLONE_NEWCGROUP | ...) start having weird side effects that are
visible outside the namespace, especially when those side effects
don't happen (because the call fails entirely) if
unshare(CLONE_NEWUSER) happens first. I don't see a real problem with
it, but it's weird.
I expect this to be only in the initial version of the patch. We can
make this consistent with other namespaces once we figure out how
cgroupns can be safely enabled for non-init-userns.
Post by Andy Lutomirski
Post by Aditya Kali
Post by Andy Lutomirski
I hate to say it, but it might be worth requiring explicit permission
from the cgroup manager for this. For example, there could be a new
cgroup attribute may_unshare, and any attempt to unshare the cgroup ns
will fail with -EPERM unless the caller is in a may_unshare=1 cgroup.
may_unshare in a parent cgroup would not give child cgroups the
ability to unshare.
What you suggest can be done. The current patch-set punts the problem
of permission checking by only allowing unshare from a
capable(CAP_SYS_ADMIN) process. This can be implemented as a follow-up
improvement to cgroupns feature if we want to open it to non-init
userns.
That being said, I would argue that even if we don't have this
explicit permission and relax the check to non-init userns, it should
be 'OK' to let ns_capable(current_user_ns(), CAP_SYS_ADMIN) tasks to
unshare cgroupns (basically, if you can "create" a cgroup hierarchy,
you should probably be allowed to unshare() it).
But non-init-userns tasks can't create cgroup hierarchies, unless I
misunderstand the current code. And, if they can, I bet I can find
three or four serious security issues in an hour or two. :)
Task running in non-init userns can create cgroup hierarchies if you
chown/chgrp their cgroup root to the task user:

# while running as 'root' (uid=0)
$ cd $CGROUP_MOUNT
$ mkdir -p batchjobs/c_job_id1/

# transfer ownership to the user (in this case 'nobody' (uid=99)).
$ chown nobody batchjobs/c_job_id1/
$ chgrp nobody batchjobs/c_job_id1/
$ ls -ld batchjobs/c_job_id1/
drwxr-xr-x 2 nobody nobody 0 2014-07-21 12:47 batchjobs/c_job_id1/

# enter container cgroup
$ echo 0 > batchjobs/c_job_id1/cgroup.procs

# unshare both userns and cgroupns
$ unshare -u -c
# setup uid_map and gid_map and export user '99' in the userns
# $ cat /proc/<pid>/uid_map
# 0 0 1
# 99 99 1
# $ cat /proc/<pid>/gid_map
# 0 0 1
# 99 99 1
# switch to user 'nobody'
$ su nobody
$ id
uid=99(nobody) gid=99(nobody) groups=99(nobody)

# Now user nobody running under non-init userns can create sub-cgroups
# under "batchjobs/c_job_id1/".
# PWD=$CGROUP_MOUNT/batchjobs/c_job_id1
$ mkdir sub_cgroup1
$ ls -ld sub_cgroup1/
drwxr-xr-x 2 nobody nobody 0 2014-07-21 13:11 sub_cgroup1/
$ echo 0 > sub_cgroup1/cgroup.procs
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgroup1
$ ls -l sub_cgroup1/
total 0
-r--r--r-- 1 nobody nobody 0 2014-07-21 13:11 cgroup.controllers
-r--r--r-- 1 nobody nobody 0 2014-07-21 13:11 cgroup.populated
-rw-r--r-- 1 nobody nobody 0 2014-07-21 13:12 cgroup.procs
-rw-r--r-- 1 nobody nobody 0 2014-07-21 13:11 cgroup.subtree_control


This is a powerful feature as it allows non-root tasks to run
container-management tools and provision their resources properly. But
this makes implementing your suggestion of having 'cgroup.may_unshare'
file tricky as the cgroup owner (task) will be able to set it and
still unshare cgroupns. Instead, maybe we could just check if the
task has appropriate (write?) permissions on the cgroup directory
before allowing nested cgroupns creation.
Post by Andy Lutomirski
Post by Aditya Kali
By unsharing
cgroupns, the tasks can only confine themselves further under its
cgroupns-root. As long as it cannot escape that hierarchy, it should
be fine.
But they can also *lock* their hierarchy.
But locking the tasks inside the hierarchy is really what cgroupns
feature is trying to provide. I understand that this is a change in
expectation, but with unified hierarchy, there are already
restrictions on where tasks can be moved (only to leaf cgroups). With
cgroup namespaces, this becomes: "only to leaf cgroups within task's
cgroupns".
Post by Andy Lutomirski
Post by Aditya Kali
In my experience, there is seldom a need to move tasks out of their
cgroup. At most, we create a sub-cgroup and move the task there (which
is allowed in their cgroupns). Even for a cgroup manager, I can't
think of a case where it will be useful to move a task from one cgroup
hierarchy to another. Such move seems overly complicated (even without
cgroup namespaces). The cgroup manager can just modify the settings of
the task's cgroup as needed or simply kill & restart the task in a new
container.
I do this all the time. Maybe my new systemd overlords will make me
stop doing it, at which point my current production setup will blow
up.
[shudder]
I am surprised that this even works correctly.

Either way, maybe checking cgroup directory permissions will work for
you? i.e., if you "chown" a cgroup directory to the user, it should be
OK if the user's task unshares cgroupns under that cgroup and you
don't care about moving tasks from under that cgroup. Without
ownership of the cgroup directory, creation of cgroupns will be
disallowed. What do you think?
Post by Andy Lutomirski
--Andy
--
Aditya
Andy Lutomirski
2014-07-21 22:16:20 UTC
Permalink
Post by Aditya Kali
Post by Andy Lutomirski
Post by Aditya Kali
Post by Andy Lutomirski
Post by Aditya Kali
Post by Andy Lutomirski
What happens if someone moves a task in a cgroup namespace outside of
the namespace root cgroup?
Attempt to move a task outside of cgroupns root will fail with EPERM.
This is true irrespective of the privileges of the process attempting
this. Once cgroupns is created, the task will be confined to the
cgroup hierarchy under its cgroupns root until it dies.
Can a task in a non-init userns create a cgroupns? If not, that's
unusual. If so, is it problematic if they can prevent themselves from
being moved?
Currently, only a task with CAP_SYS_ADMIN in the init-userns can
create cgroupns. It is stricter than for other namespaces, yes.
I'm slightly hesitant to have unshare(CLONE_NEWUSER |
CLONE_NEWCGROUP | ...) start having weird side effects that are
visible outside the namespace, especially when those side effects
don't happen (because the call fails entirely) if
unshare(CLONE_NEWUSER) happens first. I don't see a real problem with
it, but it's weird.
I expect this to be only in the initial version of the patch. We can
make this consistent with other namespaces once we figure out how
cgroupns can be safely enabled for non-init-userns.
Post by Andy Lutomirski
Post by Aditya Kali
Post by Andy Lutomirski
I hate to say it, but it might be worth requiring explicit permission
from the cgroup manager for this. For example, there could be a new
cgroup attribute may_unshare, and any attempt to unshare the cgroup ns
will fail with -EPERM unless the caller is in a may_unshare=1 cgroup.
may_unshare in a parent cgroup would not give child cgroups the
ability to unshare.
What you suggest can be done. The current patch-set punts the problem
of permission checking by only allowing unshare from a
capable(CAP_SYS_ADMIN) process. This can be implemented as a follow-up
improvement to cgroupns feature if we want to open it to non-init
userns.
That being said, I would argue that even if we don't have this
explicit permission and relax the check to non-init userns, it should
be 'OK' to let ns_capable(current_user_ns(), CAP_SYS_ADMIN) tasks to
unshare cgroupns (basically, if you can "create" a cgroup hierarchy,
you should probably be allowed to unshare() it).
But non-init-userns tasks can't create cgroup hierarchies, unless I
misunderstand the current code. And, if they can, I bet I can find
three or four serious security issues in an hour or two. :)
Task running in non-init userns can create cgroup hierarchies if you
Won't the systemd people hate you forever for this suggestion? (I do
exactly this myself...)
Post by Aditya Kali
This is a powerful feature as it allows non-root tasks to run
container-management tools and provision their resources properly. But
this makes implementing your suggestion of having 'cgroup.may_unshare'
file tricky as the cgroup owner (task) will be able to set it and
still unshare cgroupns. Instead, maybe we could just check if the
task has appropriate (write?) permissions on the cgroup directory
before allowing nested cgroupns creation.
I bet that systemd will want to set may_unshare but not give write
access. Who knows?
Post by Aditya Kali
[shudder]
I am surprised that this even works correctly.
Either way, maybe checking cgroup directory permissions will work for
you? i.e., if you "chown" a cgroup directory to the user, it should be
OK if the user's task unshares cgroupns under that cgroup and you
don't care about moving tasks from under that cgroup. Without
ownership of the cgroup directory, creation of cgroupns will be
disallowed. What do you think?
I think this is *safe* but may not be useful for eventual systemd stuff.
Not really sure.

--Andy
Aditya Kali
2014-07-17 19:52:07 UTC
Permalink
The new function kernfs_path_from_node() generates and returns
kernfs path of a given kernfs_node relative to a given parent
kernfs_node.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
fs/kernfs/dir.c | 51 ++++++++++++++++++++++++++++++++++++++++----------
include/linux/kernfs.h | 3 +++
2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index a693f5b..2224f08 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -44,14 +44,22 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
}

-static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
- size_t buflen)
+static char * __must_check kernfs_path_from_node_locked(
+ struct kernfs_node *kn_root,
+ struct kernfs_node *kn,
+ char *buf,
+ size_t buflen)
{
char *p = buf + buflen;
int len;

*--p = '\0';

+ if (kn == kn_root) {
+ *--p = '/';
+ return p;
+ }
+
do {
len = strlen(kn->name);
if (p - buf < len + 1) {
@@ -63,6 +71,8 @@ static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
memcpy(p, kn->name, len);
*--p = '/';
kn = kn->parent;
+ if (kn == kn_root)
+ break;
} while (kn && kn->parent);

return p;
@@ -92,26 +102,47 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}

/**
- * kernfs_path - build full path of a given node
+ * kernfs_path_from_node - build path of node @kn relative to @kn_root.
+ * @kn_root: parent kernfs_node relative to which we need to build the path
* @kn: kernfs_node of interest
- * @buf: buffer to copy @kn's name into
+ * @buf: buffer to copy @kn's path into
* @buflen: size of @buf
*
- * Builds and returns the full path of @kn in @buf of @buflen bytes. The
- * path is built from the end of @buf so the returned pointer usually
+ * Builds and returns @kn's path relative to @kn_root. @kn_root is expected to
+ * be parent of @kn at some level. If this is not true or if @kn_root is NULL,
+ * then full path of @kn is returned.
+ * The path is built from the end of @buf so the returned pointer usually
* doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
* and %NULL is returned.
*/
-char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+char *kernfs_path_from_node(struct kernfs_node *kn_root, struct kernfs_node *kn,
+ char *buf, size_t buflen)
{
unsigned long flags;
char *p;

spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, buf, buflen);
+ p = kernfs_path_from_node_locked(kn_root, kn, buf, buflen);
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
return p;
}
+EXPORT_SYMBOL_GPL(kernfs_path_from_node);
+
+/**
+ * kernfs_path - build full path of a given node
+ * @kn: kernfs_node of interest
+ * @buf: buffer to copy @kn's name into
+ * @buflen: size of @buf
+ *
+ * Builds and returns the full path of @kn in @buf of @buflen bytes. The
+ * path is built from the end of @buf so the returned pointer usually
+ * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
+ * and %NULL is returned.
+ */
+char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+ return kernfs_path_from_node(NULL, kn, buf, buflen);
+}
EXPORT_SYMBOL_GPL(kernfs_path);

/**
@@ -145,8 +176,8 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)

spin_lock_irqsave(&kernfs_rename_lock, flags);

- p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
+ p = kernfs_path_from_node_locked(NULL, kn, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
if (p)
pr_cont("%s", p);
else
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 20f4935..1627341 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -257,6 +257,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
}

int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+char * __must_check kernfs_path_from_node(struct kernfs_node *root_kn,
+ struct kernfs_node *kn, char *buf,
+ size_t buflen);
char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
--
2.0.0.526.g5318336
Serge Hallyn
2014-07-24 15:10:50 UTC
Permalink
Post by Aditya Kali
The new function kernfs_path_from_node() generates and returns
kernfs path of a given kernfs_node relative to a given parent
kernfs_node.
---
fs/kernfs/dir.c | 51 ++++++++++++++++++++++++++++++++++++++++----------
include/linux/kernfs.h | 3 +++
2 files changed, 44 insertions(+), 10 deletions(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index a693f5b..2224f08 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -44,14 +44,22 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
}
-static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
- size_t buflen)
+static char * __must_check kernfs_path_from_node_locked(
+ struct kernfs_node *kn_root,
+ struct kernfs_node *kn,
+ char *buf,
+ size_t buflen)
{
char *p = buf + buflen;
int len;
*--p = '\0';
I realize this currently couldn't happen (hm, well through the
EXPORT_SYMBOL_GPL(kernfs_path) it actually could), and it's the same in the
current code, but could you add a BUG_ON(!buflen) here?

Otherwise looks good to me.
Post by Aditya Kali
+ if (kn == kn_root) {
+ *--p = '/';
+ return p;
+ }
+
do {
len = strlen(kn->name);
if (p - buf < len + 1) {
@@ -63,6 +71,8 @@ static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
memcpy(p, kn->name, len);
*--p = '/';
kn = kn->parent;
+ if (kn == kn_root)
+ break;
} while (kn && kn->parent);
return p;
@@ -92,26 +102,47 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}
/**
- * kernfs_path - build full path of a given node
*
* and %NULL is returned.
*/
-char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+char *kernfs_path_from_node(struct kernfs_node *kn_root, struct kernfs_node *kn,
+ char *buf, size_t buflen)
{
unsigned long flags;
char *p;
spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, buf, buflen);
+ p = kernfs_path_from_node_locked(kn_root, kn, buf, buflen);
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
return p;
}
+EXPORT_SYMBOL_GPL(kernfs_path_from_node);
+
+/**
+ * kernfs_path - build full path of a given node
+ *
+ * and %NULL is returned.
+ */
+char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+ return kernfs_path_from_node(NULL, kn, buf, buflen);
+}
EXPORT_SYMBOL_GPL(kernfs_path);
/**
@@ -145,8 +176,8 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
+ p = kernfs_path_from_node_locked(NULL, kn, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
if (p)
pr_cont("%s", p);
else
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 20f4935..1627341 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -257,6 +257,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
}
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+char * __must_check kernfs_path_from_node(struct kernfs_node *root_kn,
+ struct kernfs_node *kn, char *buf,
+ size_t buflen);
char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
--
2.0.0.526.g5318336
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Aditya Kali
2014-07-17 19:52:09 UTC
Permalink
get_task_cgroup() returns the (reference counted) cgroup of the
given task on the default hierarchy.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
include/linux/cgroup.h | 1 +
kernel/cgroup.c | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b5223c5..707c302 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -591,6 +591,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
}

char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+struct cgroup *get_task_cgroup(struct task_struct *task);

int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1e94b71..1671345 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1937,6 +1937,31 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);

+/*
+ * get_task_cgroup - returns the cgroup of the task in the default cgroup
+ * hierarchy.
+ *
+ * @task: target task
+ * This function returns the @task's cgroup on the default cgroup hierarchy. The
+ * returned cgroup has its reference incremented (by calling cgroup_get()). So
+ * the caller must cgroup_put() the obtained reference once it is done with it.
+ */
+struct cgroup *get_task_cgroup(struct task_struct *task)
+{
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
+
+ cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ cgroup_get(cgrp);
+
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(get_task_cgroup);
+
/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
/* the src and dst cset list running through cset->mg_node */
--
2.0.0.526.g5318336
Serge Hallyn
2014-07-24 16:59:02 UTC
Permalink
Post by Aditya Kali
get_task_cgroup() returns the (reference counted) cgroup of the
given task on the default hierarchy.
---
include/linux/cgroup.h | 1 +
kernel/cgroup.c | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b5223c5..707c302 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -591,6 +591,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
}
char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+struct cgroup *get_task_cgroup(struct task_struct *task);
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1e94b71..1671345 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1937,6 +1937,31 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
+/*
+ * get_task_cgroup - returns the cgroup of the task in the default cgroup
+ * hierarchy.
+ *
+ * returned cgroup has its reference incremented (by calling cgroup_get()). So
+ * the caller must cgroup_put() the obtained reference once it is done with it.
+ */
+struct cgroup *get_task_cgroup(struct task_struct *task)
+{
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
+
+ cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ cgroup_get(cgrp);
+
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(get_task_cgroup);
+
/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
/* the src and dst cset list running through cset->mg_node */
--
2.0.0.526.g5318336
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Serge Hallyn
2014-07-18 16:00:04 UTC
Permalink
Post by Aditya Kali
Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolate the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
through cgroupfs mount and via /proc/self/cgroup file.

$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1

This exposure of cgroup names to the processes running inside a
(1) The container names are typically host-container-management-agent
(systemd, docker/libcontainer, etc.) data and leaking its name (or
leaking the hierarchy) reveals too much information about the host
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside the
container.

Note that the feature proposed here is completely different than the
“ns cgroup” feature which existed in the linux kernel until recently.
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It did
not solve any of the above mentioned problems and was later dropped
from the kernel.

Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can now
have a much more coherent cgroup view and it's easy to associate a
container with a single cgroup. This also allows us to virtualize the
cgroup view for tasks inside the container.
Hi,

So right now we basically do this in userspace using cgmanager. Each
container/chroot/whatever that has a cgproxy is 'locked' under that
proxy's cgroup. So if root in a container asks the cgproxy for the
cgroup of pid 2000, and cgproxy is in /lxc/u1 while pid 2000 in the
container is in /lxc/u1/service1, then the response will be '/service1'.
Same happens with creating cgroups, moving pids into cgroups, etc.

(Hoping to take a close look at this set early next week)

-serge
Serge Hallyn
2014-07-24 16:10:18 UTC
Permalink
Post by Aditya Kali
Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolate the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
through cgroupfs mount and via /proc/self/cgroup file.
Hi,

A few questions/comments:

1. Based on this description, am I to understand that after doing a
cgroupns unshare, 'mount -t cgroup cgroup /mnt' by default will
still mount the global root cgroup? Any plans on "changing" that?
Will attempts to change settings of a cgroup which is not under
our current ns be rejected? (That should be easy to do given your
patch 1/5). Sorry if it's done in the set, I'm jumping around...

2. What would be the repercussions of allowing cgroupns unshare so
long as you have ns_capable(CAP_SYS_ADMIN) to the user_ns which
created your current ns cgroup? It'd be a shame if that wasn't
on the roadmap.

3. The un-namespaced view of /proc/self/cgroup from a sibling cgroupns
makes me wonder whether it wouldn't be more appropriate to leave
/proc/self/cgroup always un-filtered, and use /proc/self/nscgroup
(or somesuch) to provide the namespaced view. /proc/self/nscgroup
would simply be empty (or say (invalid) or (unreachable)) from a
sibling ns. That will give criu and admin tools like lxc/docker all
they need to do simple cgroup setup.
Post by Aditya Kali

$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1

This exposure of cgroup names to the processes running inside a
(1) The container names are typically host-container-management-agent
(systemd, docker/libcontainer, etc.) data and leaking its name (or
leaking the hierarchy) reveals too much information about the host
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside the
container.

Note that the feature proposed here is completely different than the
“ns cgroup” feature which existed in the linux kernel until recently.
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It did
not solve any of the above mentioned problems and was later dropped
from the kernel.

Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can now
have a much more coherent cgroup view and it's easy to associate a
container with a single cgroup. This also allows us to virtualize the
cgroup view for tasks inside the container.

The new CGroup Namespace allows a process to “unshare” its cgroup
hierarchy starting from the cgroup it's currently in.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec’s /bin/bash
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183]
# From within new cgroupns, process sees that it's in the root cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/

$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1

The virtualization of /proc/self/cgroup file combined with restricting
the view of cgroup hierarchy by bind-mounting for the
$CGROUP_MOUNT/batchjobs/c_job_id1/ directory to
$CONTAINER_CHROOT/sys/fs/cgroup/) should provide a completely isolated
cgroup view inside the container.

In its current simplistic form, the cgroup namespaces provide

(1) The “root” cgroup for a cgroup namespace is the cgroup in which
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls unshare,
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (=E2=80=9C/=E2=80=
=9D) cgroup
Post by Aditya Kali
(identified in code as cgrp_dfl_root.cgrp).
=20
(2) The cgroupns-root cgroup does not change even if the namespace
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup=20
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/=20
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup=20
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
=20
(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able to s=
ee
Post by Aditya Kali
cgroup paths (in /proc/self/cgroup) only inside their root cgro=
up
Post by Aditya Kali
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
=20
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/=
c_job_id1/sub_cgrp_1
Post by Aditya Kali
=20
[ns2]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/=
c_job_id1/sub_cgrp_1
Post by Aditya Kali
(In correct container setup though, it should not be possible t=
o
Post by Aditya Kali
access PIDs in another container in the first place. This can =
be
Post by Aditya Kali
detected changed if desired.)
=20
(4) Processes inside a cgroupns are not allowed to move out of the
cgroupns-root. This is true even if a privileged process in glo=
bal
Post by Aditya Kali
cgroupns tries to move the process out of its cgroupns-root.
=20
# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/=
c_job_id1/sub_cgrp_1
Post by Aditya Kali
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted
=20
(5) setns() is not supported for cgroup namespace in the initial
version.
=20
(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entire
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. S=
o
Post by Aditya Kali
all the threads in the process will have the same cgroup. And b=
oth
Post by Aditya Kali
- changing cgroups and unsharing namespaces - are protected und=
er
Post by Aditya Kali
threadgroup_lock(task).
=20
(7) The cgroup namespace is alive as long as there is atleast 1
process inside it. When the last process exits, the cgroup
namespace is destroyed. The cgroupns-root and the actual cgroup=
s
Post by Aditya Kali
remain though.
=20
Implementation
The current patch-set is based on top of Tejun's cgroup tree (for-n=
ext
Post by Aditya Kali
branch). Its fairly non-intrusive and provides above mentioned
features.
=20
(1) The Documentation/cgroups/unified-hierarchy.txt mentions use of
capabilities to restrict cgroups to administrative users. CGrou=
p
Post by Aditya Kali
namespaces could be of help here. With cgroup namespaces, it mi=
ght
Post by Aditya Kali
be possible to delegate administration of sub-cgroups under a
cgroupns-root to the cgroupns owner.
=20
(2) Provide a cgroupns specific cgroupfs mount. i.e., the following
command when ran from inside a cgroupns should only mount the
$ mount -t cgroup cgroup <cgroup-mountpoint>
# -o __DEVEL__sane_behavior should be implicit
=20
This is similar to how procfs can be mounted for every PIDNS. T=
his
Post by Aditya Kali
may have some usecases.
=20
---
fs/kernfs/dir.c | 51 +++++++++++++---
fs/proc/namespaces.c | 3 +
include/linux/cgroup.h | 36 ++++++++++-
include/linux/cgroup_namespace.h | 62 +++++++++++++++++++
include/linux/kernfs.h | 3 +
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 4 ++
include/uapi/linux/sched.h | 3 +-
init/Kconfig | 9 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 75 +++++++++++++++++------
kernel/cgroup_namespace.c | 128 +++++++++++++++++++++++++++++=
++++++++++
Post by Aditya Kali
kernel/fork.c | 2 +-
kernel/nsproxy.c | 19 +++++-
14 files changed, 364 insertions(+), 34 deletions(-)
create mode 100644 include/linux/cgroup_namespace.h
create mode 100644 kernel/cgroup_namespace.c
=20
[PATCH 1/5] kernfs: Add API to get generate relative kernfs path
[PATCH 2/5] sched: new clone flag CLONE_NEWCGROUP for cgroup
[PATCH 3/5] cgroup: add function to get task's cgroup on default
[PATCH 4/5] cgroup: export cgroup_get() and cgroup_put()
[PATCH 5/5] cgroup: introduce cgroup namespaces
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Serge Hallyn
2014-07-24 16:36:28 UTC
Permalink
Post by Aditya Kali
Background
Cgroups and Namespaces are used together to create “virtual”
Post by Aditya Kali
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
Post by Aditya Kali
through cgroupfs mount and via /proc/self/cgroup file.
=20
$ cat /proc/self/cgroup=20
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_jo=
b_id1
Post by Aditya Kali
=20
This exposure of cgroup names to the processes running inside a
(1) The container names are typically host-container-management-age=
nt
Post by Aditya Kali
(systemd, docker/libcontainer, etc.) data and leaking its name =
(or
Post by Aditya Kali
leaking the hierarchy) reveals too much information about the h=
ost
Post by Aditya Kali
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside t=
he
Post by Aditya Kali
container.
=20
Note that the feature proposed here is completely different than th=
e
Post by Aditya Kali
=E2=80=9Cns cgroup=E2=80=9D feature which existed in the linux kern=
el until recently.
Post by Aditya Kali
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It di=
d
Post by Aditya Kali
not solve any of the above mentioned problems and was later dropped
from the kernel.
=20
Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can n=
ow
Post by Aditya Kali
have a much more coherent cgroup view and its easy to associate a
container with a single cgroup. This also allows us to virtualize t=
he
Post by Aditya Kali
cgroup view for tasks inside the container.
=20
The new CGroup Namespace allows a process to =E2=80=9Cunshare=E2=80=
=9D its cgroup
Post by Aditya Kali
hierarchy starting from the cgroup its currently in.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_jo=
b_id1
Post by Aditya Kali
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> c=
group:[4026531835]
Post by Aditya Kali
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec=E2=80=99s=
/bin/bash
Post by Aditya Kali
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> c=
group:[4026532183]
Post by Aditya Kali
# From within new cgroupns, process sees that its in the root cgrou=
p
Post by Aditya Kali
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
=20
$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_jo=
b_id1
Post by Aditya Kali
=20
The virtualization of /proc/self/cgroup file combined with restrict=
ing
Post by Aditya Kali
the view of cgroup hierarchy by bind-mounting for the
$CGROUP_MOUNT/batchjobs/c_job_id1/ directory to
$CONTAINER_CHROOT/sys/fs/cgroup/) should provide a completely isola=
ted
Post by Aditya Kali
cgroup view inside the container.
=20
In its current simplistic form, the cgroup namespaces provide
=20
(1) The =E2=80=9Croot=E2=80=9D cgroup for a cgroup namespace is the=
cgroup in which
Post by Aditya Kali
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls unsha=
re,
Post by Aditya Kali
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (=E2=80=9C/=E2=80=
=9D) cgroup
Post by Aditya Kali
(identified in code as cgrp_dfl_root.cgrp).
=20
(2) The cgroupns-root cgroup does not change even if the namespace
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup=20
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/=20
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup=20
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
=20
(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able to s=
ee
Post by Aditya Kali
cgroup paths (in /proc/self/cgroup) only inside their root cgro=
up
Post by Aditya Kali
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
=20
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/=
c_job_id1/sub_cgrp_1
Post by Aditya Kali
=20
[ns2]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/=
c_job_id1/sub_cgrp_1
Post by Aditya Kali
(In correct container setup though, it should not be possible t=
o
Post by Aditya Kali
access PIDs in another container in the first place. This can =
be
Post by Aditya Kali
detected changed if desired.)
=20
(4) Processes inside a cgroupns are not allowed to move out of the
cgroupns-root. This is true even if a privileged process in glo=
bal
Post by Aditya Kali
cgroupns tries to move the process out of its cgroupns-root.
=20
# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/=
c_job_id1/sub_cgrp_1
Post by Aditya Kali
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted
=20
(5) setns() is not supported for cgroup namespace in the initial
version.
This combined with the full-path reporting for peer ns cgroups could make
for fun antics when attaching to an existing container (since we'd have
to unshare into a new ns cgroup with the same root as the container).
I understand you are implying this will be fixed soon though.
Post by Aditya Kali
(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entire
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. S=
o
Post by Aditya Kali
all the threads in the process will have the same cgroup. And b=
oth
Post by Aditya Kali
- changing cgroups and unsharing namespaces - are protected und=
er
Post by Aditya Kali
threadgroup_lock(task).
=20
(7) The cgroup namespace is alive as long as there is atleast 1
process inside it. When the last process exits, the cgroup
namespace is destroyed. The cgroupns-root and the actual cgroup=
s
Post by Aditya Kali
remain though.
=20
Implementation
The current patch-set is based on top of Tejun's cgroup tree (for-n=
ext
Post by Aditya Kali
branch). Its fairly non-intrusive and provides above mentioned
features.
=20
(1) The Documentation/cgroups/unified-hierarchy.txt mentions use of
capabilities to restrict cgroups to administrative users. CGrou=
p
Post by Aditya Kali
namespaces could be of help here. With cgroup namespaces, it mi=
ght
Post by Aditya Kali
be possible to delegate administration of sub-cgroups under a
cgroupns-root to the cgroupns owner.
That would be nice.
Post by Aditya Kali
(2) Provide a cgroupns specific cgroupfs mount. i.e., the following
command when ran from inside a cgroupns should only mount the
$ mount -t cgroup cgroup <cgroup-mountpoint>
# -o __DEVEL__sane_behavior should be implicit
=20
This is similar to how procfs can be mounted for every PIDNS. T=
his
Post by Aditya Kali
may have some usecases.
Sorry - I see this answers the first part of a question in my previous email.
However, the question remains of whether changes to limits in cgroups which
are not under our cgroup-ns-root are allowed.

Admittedly the current case with cgmanager is the same - in that it depends
on proper setup of the container - but cgmanager is geared to recommend
not mounting the cgroups in the container at all (and we can reject such
mounts in the container altogether with no loss in functionality) whereas
you are here encouraging such mounts. Which is fine - so long as you then
fully address the potential issues.
Aditya Kali
2014-07-25 19:29:52 UTC
Permalink
Thank you for your review. I have tried to respond to both your emails here.
Post by Serge Hallyn
Post by Aditya Kali
Background
Cgroups and Namespaces are used together to create =E2=80=9Cvirtua=
l=E2=80=9D
Post by Serge Hallyn
Post by Aditya Kali
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
=E2=80=9Cvirtualized=E2=80=9D, the task is always able to see glob=
al cgroups view
Post by Serge Hallyn
Post by Aditya Kali
through cgroupfs mount and via /proc/self/cgroup file.
Hi,
1. Based on this description, am I to understand that after doing a
cgroupns unshare, 'mount -t cgroup cgroup /mnt' by default will
still mount the global root cgroup? Any plans on "changing" that?
This is suggested in the "Possible Extensions of CGROUPNS" section.
More details below.
Post by Serge Hallyn
Will attempts to change settings of a cgroup which is not under
our current ns be rejected? (That should be easy to do given your
patch 1/5). Sorry if it's done in the set, I'm jumping around...
Currently, only 'cgroup_attach_task', 'cgroup_mkdir' and
'cgroup_rmdir' of cgroups outside of cgroupns-root are prevented. The
read/write of actual cgroup properties are not prevented. Usual
permission checks continue to apply for those. I was hoping that
should be enough, but see more comments towards the end.
Post by Serge Hallyn
2. What would be the repercussions of allowing cgroupns unshare so
long as you have ns_capable(CAP_SYS_ADMIN) to the user_ns which
created your current ns cgroup? It'd be a shame if that wasn't
on the roadmap.
It's certainly on the roadmap, just that some logistics were not clear
at this time. As pointed out by Andy Lutomirski on [PATCH 5/5] of this
series, if we allow cgroupns creation to ns_capable(CAP_SYS_ADMIN)
processes, we may need some kind of explicit permission from the
cgroup subsystem to allow this. One approach could be an explicit
cgroup.may_unshare setting. Alternatively, the cgroup directory (which
is going to become the cgroupns-root) ownership could also be used
here. i.e., the process is ns_capable(CAP_SYS_ADMIN) && it owns the
cgroup directory. There seems to be already a function that allows
similar thing and might be sufficient:

/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
*
* Return true if the current task has the given capability targeted at
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)

What do you think? We can enable this for non-init userns once this is
decided on.
Post by Serge Hallyn
3. The un-namespaced view of /proc/self/cgroup from a sibling cgroupn=
s
Post by Serge Hallyn
makes me wonder whether it wouldn't be more appropriate to leave
/proc/self/cgroup always un-filtered, and use /proc/self/nscgroup
(or somesuch) to provide the namespaced view. /proc/self/nscgroup
would simply be empty (or say (invalid) or (unreachable)) from a
sibling ns. That will give criu and admin tools like lxc/docker a=
ll
Post by Serge Hallyn
they need to do simple cgroup setup.
It may work for lxc/docker and new applications that use the new
interface. But it's difficult to change numerous existing user
applications and libraries that depend on /proc/self/cgroup. Moreover,
even with the new interface, /proc/self/cgroup will continue to leak
system level cgroup information. And fixing this leak is critical to
make the container migratable.

It's easy to correctly handle the read of /proc/<pid>/cgroup from a
sibling cgroupns. Instead of showing the unfiltered view, we could just
not show anything (the same behavior as when the cgroup hierarchy is not
mounted). Will that be more acceptable? I can make that change in the
next version of this series.
Post by Serge Hallyn
Post by Aditya Kali
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_j=
ob_id1
Post by Serge Hallyn
Post by Aditya Kali
This exposure of cgroup names to the processes running inside a
(1) The container names are typically host-container-management-ag=
ent
Post by Serge Hallyn
Post by Aditya Kali
(systemd, docker/libcontainer, etc.) data and leaking its name=
(or
Post by Serge Hallyn
Post by Aditya Kali
leaking the hierarchy) reveals too much information about the =
host
Post by Serge Hallyn
Post by Aditya Kali
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside =
the
Post by Serge Hallyn
Post by Aditya Kali
container.
Note that the feature proposed here is completely different than t=
he
Post by Serge Hallyn
Post by Aditya Kali
=E2=80=9Cns cgroup=E2=80=9D feature which existed in the linux ker=
nel until recently.
Post by Serge Hallyn
Post by Aditya Kali
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It d=
id
Post by Serge Hallyn
Post by Aditya Kali
not solve any of the above mentioned problems and was later droppe=
d
Post by Serge Hallyn
Post by Aditya Kali
from the kernel.
Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can =
now
Post by Serge Hallyn
Post by Aditya Kali
have a much more coherent cgroup view and its easy to associate a
container with a single cgroup. This also allows us to virtualize =
the
Post by Serge Hallyn
Post by Aditya Kali
cgroup view for tasks inside the container.
The new CGroup Namespace allows a process to =E2=80=9Cunshare=E2=80=
=9D its cgroup
Post by Serge Hallyn
Post by Aditya Kali
hierarchy starting from the cgroup its currently in.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_j=
ob_id1
Post by Serge Hallyn
Post by Aditya Kali
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> =
cgroup:[4026531835]
Post by Serge Hallyn
Post by Aditya Kali
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec=E2=80=99=
s /bin/bash
Post by Serge Hallyn
Post by Aditya Kali
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> =
cgroup:[4026532183]
Post by Serge Hallyn
Post by Aditya Kali
# From within new cgroupns, process sees that its in the root cgro=
up
Post by Serge Hallyn
Post by Aditya Kali
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_j=
ob_id1
Post by Serge Hallyn
Post by Aditya Kali
The virtualization of /proc/self/cgroup file combined with restric=
ting
Post by Serge Hallyn
Post by Aditya Kali
the view of cgroup hierarchy by bind-mounting for the
$CGROUP_MOUNT/batchjobs/c_job_id1/ directory to
$CONTAINER_CHROOT/sys/fs/cgroup/) should provide a completely isol=
ated
Post by Serge Hallyn
Post by Aditya Kali
cgroup view inside the container.
In its current simplistic form, the cgroup namespaces provide
(1) The =E2=80=9Croot=E2=80=9D cgroup for a cgroup namespace is th=
e cgroup in which
Post by Serge Hallyn
Post by Aditya Kali
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls unsh=
are,
Post by Serge Hallyn
Post by Aditya Kali
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (=E2=80=9C/=E2=80=
=9D) cgroup
Post by Serge Hallyn
Post by Aditya Kali
(identified in code as cgrp_dfl_root.cgrp).
(2) The cgroupns-root cgroup does not change even if the namespace
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_=
1
Post by Serge Hallyn
Post by Aditya Kali
(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able to =
see
Post by Serge Hallyn
Post by Aditya Kali
cgroup paths (in /proc/self/cgroup) only inside their root cgr=
oup
Post by Serge Hallyn
Post by Aditya Kali
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_=
1
Post by Serge Hallyn
Post by Aditya Kali
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs=
/c_job_id1/sub_cgrp_1
Post by Serge Hallyn
Post by Aditya Kali
[ns2]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs=
/c_job_id1/sub_cgrp_1
Post by Serge Hallyn
Post by Aditya Kali
(In correct container setup though, it should not be possible =
to
Post by Serge Hallyn
Post by Aditya Kali
access PIDs in another container in the first place. This can=
be
Post by Serge Hallyn
Post by Aditya Kali
detected changed if desired.)
(4) Processes inside a cgroupns are not allowed to move out of the
cgroupns-root. This is true even if a privileged process in gl=
obal
Post by Serge Hallyn
Post by Aditya Kali
cgroupns tries to move the process out of its cgroupns-root.
# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs=
/c_job_id1/sub_cgrp_1
Post by Serge Hallyn
Post by Aditya Kali
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted
(5) setns() is not supported for cgroup namespace in the initial
version.
This combined with the full-path reporting for peer ns cgroups could =
make
Post by Serge Hallyn
for fun antics when attaching to an existing container (since we'd ha=
ve
Post by Serge Hallyn
to unshare into a new ns cgroup with the same root as the container).
I understand you are implying this will be fixed soon though.
I am thinking the setns() will be only allowed if
target_cgrpns->cgroupns_root is_descendant_of
current_cgrpns->cgroupns_root. i.e., you will only be able to setns to a
cgroup namespace which is rooted deeper in hierarchy than your own (in
addition to checking capable_wrt_inode_uidgid(target_cgrpns_inode)).

In addition to this, we need to decide whether it's OK for setns() to
also change the cgroup of the task. Consider the following example:

[A] ----> [B] ----> C
----> D

[A] and [B] are cgroupns-roots. Now, if a task in Cgroup D (which is
under cgroupns [A]) attempts to setns() to cgroupns [B], then its
cgroup should change from /A/D to /A/B. I am concerned about the
side-effects this might cause. Though otherwise, this is a very useful
feature for containers. One could argue that this is similar to
setns() to a mount-namespace which is pivot_root'd somewhere else (in
which case, the attaching task's root "/" moves implicitly with
setns).

Alternatively, we could only allow setns() if
target_cgrpns->cgroupns_root == current->cgroup. I.e., taking the above
example again, if a process in Cgroup D wants to setns() to cgroupns
[B], then it will first need to move to Cgroup B, and only then will the
setns() succeed. This makes sure that there is no implicit cgroup
move.

WDYT? I haven't prototyped this yet, but will send out a patch after
this series is accepted.
Post by Serge Hallyn
Post by Aditya Kali
(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entire
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. =
So
Post by Serge Hallyn
Post by Aditya Kali
all the threads in the process will have the same cgroup. And =
both
Post by Serge Hallyn
Post by Aditya Kali
- changing cgroups and unsharing namespaces - are protected un=
der
Post by Serge Hallyn
Post by Aditya Kali
threadgroup_lock(task).
(7) The cgroup namespace is alive as long as there is atleast 1
process inside it. When the last process exits, the cgroup
namespace is destroyed. The cgroupns-root and the actual cgrou=
ps
Post by Serge Hallyn
Post by Aditya Kali
remain though.
Implementation
The current patch-set is based on top of Tejun's cgroup tree (for-=
next
Post by Serge Hallyn
Post by Aditya Kali
branch). Its fairly non-intrusive and provides above mentioned
features.
(1) The Documentation/cgroups/unified-hierarchy.txt mentions use o=
f
Post by Serge Hallyn
Post by Aditya Kali
capabilities to restrict cgroups to administrative users. CGro=
up
Post by Serge Hallyn
Post by Aditya Kali
namespaces could be of help here. With cgroup namespaces, it m=
ight
Post by Serge Hallyn
Post by Aditya Kali
be possible to delegate administration of sub-cgroups under a
cgroupns-root to the cgroupns owner.
That would be nice.
Post by Aditya Kali
(2) Provide a cgroupns specific cgroupfs mount. i.e., the followin=
g
Post by Serge Hallyn
Post by Aditya Kali
command when ran from inside a cgroupns should only mount the
$ mount -t cgroup cgroup <cgroup-mountpoint>
# -o __DEVEL__sane_behavior should be implicit
This is similar to how procfs can be mounted for every PIDNS. =
This
Post by Serge Hallyn
Post by Aditya Kali
may have some usecases.
Sorry - I see this answers the first part of a question in my previou=
s email.
Post by Serge Hallyn
However, the question of whether changes to limits in cgroups which a=
re not
Post by Serge Hallyn
under our cgroup-ns-root are allowed.
Admittedly the current case with cgmanager is the same - in that it d=
epends
Post by Serge Hallyn
on proper setup of the container - but cgmanager is geared to recomme=
nd
Post by Serge Hallyn
not mounting the cgroups in the container at all (and we can reject s=
uch
Post by Serge Hallyn
mounts in the container altogether with no loss in functionality) whereas
Post by Serge Hallyn
you are here encouraging such mounts. Which is fine - so long as you=
then
Post by Serge Hallyn
fully address the potential issues.
It would be nice to have this, but frankly, it may add a bit of
complexity in the cgroup/kernfs code (I will have to prototype and
see). Also, the same behavior can be obtained simply by bind-mounting
cgroupns-root inside the container. So I am currently leaning
towards rejecting such mounts in favor of simplicity.

Regarding disallowing writes to cgroup files outside of your
cgroupns-root, I think it should be possible to implement it easily. I will
include it in the next revision of this series.

Thanks,
--=20
Aditya
Andy Lutomirski
2014-07-25 20:27:50 UTC
Permalink
Thank you for your review. I have tried to respond to both your emails here.
Post by Serge Hallyn
2. What would be the repercussions of allowing cgroupns unshare so
long as you have ns_capable(CAP_SYS_ADMIN) to the user_ns which
created your current ns cgroup? It'd be a shame if that wasn't
on the roadmap.
Its certainly on the roadmap, just that some logistics were not clear
at this time. As pointed out by Andy Lutomirski on [PATCH 5/5] of this
series, if we allow cgroupns creation to ns_capable(CAP_SYS_ADMIN)
processes, we may need some kind of explicit permission from the
cgroup subsystem to allow this. One approach could be an explicit
cgroup.may_unshare setting. Alternatively, the cgroup directory (which
is going to become the cgroupns-root) ownership could also be used
here. i.e., the process is ns_capable(CAP_SYS_ADMIN) && it owns the
cgroup directory. There seems to be already a function that allows
/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
*
* Return true if the current task has the given capability targeted at
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
What do you think? We can enable this for non-init userns once this is
decided on.
I think I'd rather it just check that it's owned by the userns owner
if we were going down that route. But maybe there's a good reason to
do it this way.
Post by Serge Hallyn
3. The un-namespaced view of /proc/self/cgroup from a sibling cgroupns
makes me wonder whether it wouldn't be more appropriate to leave
/proc/self/cgroup always un-filtered, and use /proc/self/nscgroup
(or somesuch) to provide the namespaced view. /proc/self/nscgroup
would simply be empty (or say (invalid) or (unreachable)) from a
sibling ns. That will give criu and admin tools like lxc/docker all
they need to do simple cgroup setup.
It may work for lxc/docker and new applications that use the new
interface. But its difficult to change numerous existing user
applications and libraries that depend on /proc/self/cgroup. Moreover,
even with the new interface, /proc/self/cgroup will continue to leak
system level cgroup information. And fixing this leak is critical to
make the container migratable.
Its easy to correctly handle the read of /proc/<pid>/cgroup from a
sibling cgroupns. Instead of showing unfiltered view, we could just
not show anything (same behavior when the cgroup hierarchy is not
mounted). Will that be more acceptable? I can make that change in the
next version of this series.
Post by Serge Hallyn
Post by Aditya Kali
(5) setns() is not supported for cgroup namespace in the initial
version.
This combined with the full-path reporting for peer ns cgroups could make
for fun antics when attaching to an existing container (since we'd have
to unshare into a new ns cgroup with the same root as the container).
I understand you are implying this will be fixed soon though.
I am thinking the setns() will be only allowed if
target_cgrpns->cgroupns_root is_descendant_of
current_cgrpns->cgroupns_root. i.e., you will only be setns to a
cgroup namespace which is rooted deeper in hierarchy than your own (in
addition to checking capable_wrt_inode_uidgid(target_cgrpns_inode)).
I'm not sure why the capable_wrt_inode_uidgid is needed here -- I
imagine that the hierarchy check and the usual CAP_SYS_ADMIN check on
the cgroupns's userns would be sufficient.
In addition to this, we need to decide whether its OK for setns() to
[A] ----> [B] ----> C
----> D
[A] and [B] are cgroupns-roots. Now, if a task in Cgroup D (which is
under cgroupns [A]) attempts to setns() to cgroupns [B], then its
cgroup should change from /A/D to /A/B. I am concerned about the
side-effects this might cause. Though otherwise, this is a very useful
feature for containers. One could argue that this is similar to
setns() to a mount-namespace which is pivot_root'd somewhere else (in
which case, the attaching task's root "/" moves implicitly with
setns).
Off the top of my head, I think that making setns do this would be too
magical. How about just requiring that you already be in (a
descendent of) the requested cgroupns's root cgroup if you try to
setns?
Alternatively, we could only allow setns() if
target_cgrpns->cgroupns_root == current->cgroup . I.e., taking above
example again, if process in Cgroup D wants to setns() to cgroupns
[B], then it will first need to move to Cgroup B, and only then the
setns() will succeed. This makes sure that there is no implicit cgroup
move.
I like this one, but I think that descendant cgroups should probably
be allowed, too.

--Andy
Serge E. Hallyn
2014-07-29 04:51:59 UTC
Permalink
Thank you for your review. I have tried to respond to both your emails here.
=20
Post by Serge Hallyn
Post by Aditya Kali
Background
Cgroups and Namespaces are used together to create “virtual”
Post by Serge Hallyn
Post by Aditya Kali
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
Post by Serge Hallyn
Post by Aditya Kali
through cgroupfs mount and via /proc/self/cgroup file.
Hi,
1. Based on this description, am I to understand that after doing a
cgroupns unshare, 'mount -t cgroup cgroup /mnt' by default will
still mount the global root cgroup? Any plans on "changing" that?
=20
This is suggested in the "Possible Extensions of CGROUPNS" section.
More details below.
=20
Post by Serge Hallyn
Will attempts to change settings of a cgroup which is not under
our current ns be rejected? (That should be easy to do given your
Post by Serge Hallyn
patch 1/5). Sorry if it's done in the set, I'm jumping around...
=20
Currently, only 'cgroup_attach_task', 'cgroup_mkdir' and
'cgroup_rmdir' of cgroups outside of cgroupns-root are prevented. The
read/write of actual cgroup properties are not prevented. Usual
permission checks continue to apply for those. I was hoping that
should be enough, but see more comments towards the end.
=20
Post by Serge Hallyn
2. What would be the repercussions of allowing cgroupns unshare so
long as you have ns_capable(CAP_SYS_ADMIN) to the user_ns which
created your current ns cgroup? It'd be a shame if that wasn't
on the roadmap.
=20
Its certainly on the roadmap, just that some logistics were not clear
at this time. As pointed out by Andy Lutomirski on [PATCH 5/5] of thi=
s
series, if we allow cgroupns creation to ns_capable(CAP_SYS_ADMIN)
processes, we may need some kind of explicit permission from the
cgroup subsystem to allow this. One approach could be an explicit
So long as you do ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN) I think
you're fine.

The only real problem I can think of with unsharing a cgroup_ns is that
you could lock a setuid-root application someplace it wasn't expecting.
The above check guarantees that you were privileged enough that you'd
better be trusted in this user namespace.

(Unless there is some possible interaction I'm overlooking)
cgroup.may_unshare setting. Alternatively, the cgroup directory (whic=
h
is going to become the cgroupns-root) ownership could also be used
here. i.e., the process is ns_capable(CAP_SYS_ADMIN) && it owns the
cgroup directory. There seems to be already a function that allows
=20
/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid map=
ped
*
* Return true if the current task has the given capability targeted =
at
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
=20
What do you think? We can enable this for non-init userns once this i=
s
decided on.
I don't think it's needed... (until you show how wrong I am above :)
Post by Serge Hallyn
3. The un-namespaced view of /proc/self/cgroup from a sibling cgrou=
pns
Post by Serge Hallyn
makes me wonder whether it wouldn't be more appropriate to leave
/proc/self/cgroup always un-filtered, and use /proc/self/nscgrou=
p
Post by Serge Hallyn
(or somesuch) to provide the namespaced view. /proc/self/nscgro=
up
Post by Serge Hallyn
would simply be empty (or say (invalid) or (unreachable)) from a
sibling ns. That will give criu and admin tools like lxc/docker=
all
Post by Serge Hallyn
they need to do simple cgroup setup.
=20
It may work for lxc/docker and new applications that use the new
interface. But its difficult to change numerous existing user
applications and libraries that depend on /proc/self/cgroup. Moreover=
,
even with the new interface, /proc/self/cgroup will continue to leak
system level cgroup information. And fixing this leak is critical to
make the container migratable.
=20
Its easy to correctly handle the read of /proc/<pid>/cgroup from a
sibling cgroupns. Instead of showing unfiltered view, we could just
not show anything (same behavior when the cgroup hierarchy is not
mounted). Will that be more acceptable? I can make that change in the
next version of this series.
It'll be acceptable so long as setns(CLONE_NEWCGROUP) is supported.
Post by Serge Hallyn
Post by Aditya Kali
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c=
_job_id1
Post by Serge Hallyn
Post by Aditya Kali
This exposure of cgroup names to the processes running inside a
(1) The container names are typically host-container-management-=
agent
Post by Serge Hallyn
Post by Aditya Kali
(systemd, docker/libcontainer, etc.) data and leaking its na=
me (or
Post by Serge Hallyn
Post by Aditya Kali
leaking the hierarchy) reveals too much information about th=
e host
Post by Serge Hallyn
Post by Aditya Kali
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across th=
e
Post by Serge Hallyn
Post by Aditya Kali
machines in the migration domain.
(3) It makes it difficult to run container management tools (lik=
e
Post by Serge Hallyn
Post by Aditya Kali
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outsid=
e the
Post by Serge Hallyn
Post by Aditya Kali
container.
Note that the feature proposed here is completely different than=
the
Post by Serge Hallyn
Post by Aditya Kali
“ns cgroup” feature which existed in the linux kernel until recently.
Post by Serge Hallyn
Post by Aditya Kali
The ns cgroup also attempted to connect cgroups and namespaces b=
y
Post by Serge Hallyn
Post by Aditya Kali
creating a new cgroup every time a new namespace was created. It=
did
Post by Serge Hallyn
Post by Aditya Kali
not solve any of the above mentioned problems and was later drop=
ped
Post by Serge Hallyn
Post by Aditya Kali
from the kernel.
Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers ca=
n now
Post by Serge Hallyn
Post by Aditya Kali
have a much more coherent cgroup view and its easy to associate =
a
Post by Serge Hallyn
Post by Aditya Kali
container with a single cgroup. This also allows us to virtualiz=
e the
Post by Serge Hallyn
Post by Aditya Kali
cgroup view for tasks inside the container.
The new CGroup Namespace allows a process to “unshare” its cgroup
Post by Serge Hallyn
Post by Aditya Kali
hierarchy starting from the cgroup its currently in.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c=
_job_id1
Post by Serge Hallyn
Post by Aditya Kali
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -=
cgroup:[4026531835]
Post by Serge Hallyn
Post by Aditya Kali
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec’s /bin/bash
Post by Serge Hallyn
Post by Aditya Kali
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -=
cgroup:[4026532183]
Post by Serge Hallyn
Post by Aditya Kali
# From within new cgroupns, process sees that its in the root cg=
roup
Post by Serge Hallyn
Post by Aditya Kali
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c=
_job_id1
Post by Serge Hallyn
Post by Aditya Kali
The virtualization of /proc/self/cgroup file combined with restr=
icting
Post by Serge Hallyn
Post by Aditya Kali
the view of cgroup hierarchy by bind-mounting for the
$CGROUP_MOUNT/batchjobs/c_job_id1/ directory to
$CONTAINER_CHROOT/sys/fs/cgroup/) should provide a completely is=
olated
Post by Serge Hallyn
Post by Aditya Kali
cgroup view inside the container.
In its current simplistic form, the cgroup namespaces provide
(1) The “root” cgroup for a cgroup namespace is the cgroup in which
Post by Serge Hallyn
Post by Aditya Kali
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls un=
share,
Post by Serge Hallyn
Post by Aditya Kali
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (“/”) cgroup
Post by Serge Hallyn
Post by Aditya Kali
(identified in code as cgrp_dfl_root.cgrp).
(2) The cgroupns-root cgroup does not change even if the namespa=
ce
Post by Serge Hallyn
Post by Aditya Kali
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgr=
p_1
Post by Serge Hallyn
Post by Aditya Kali
(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able t=
o see
Post by Serge Hallyn
Post by Aditya Kali
cgroup paths (in /proc/self/cgroup) only inside their root c=
group
Post by Serge Hallyn
Post by Aditya Kali
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgr=
p_1
Post by Serge Hallyn
Post by Aditya Kali
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjo=
bs/c_job_id1/sub_cgrp_1
Post by Serge Hallyn
Post by Aditya Kali
[ns2]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjo=
bs/c_job_id1/sub_cgrp_1
Post by Serge Hallyn
Post by Aditya Kali
(In correct container setup though, it should not be possibl=
e to
Post by Serge Hallyn
Post by Aditya Kali
access PIDs in another container in the first place. This c=
an be
Post by Serge Hallyn
Post by Aditya Kali
detected changed if desired.)
(4) Processes inside a cgroupns are not allowed to move out of t=
he
Post by Serge Hallyn
Post by Aditya Kali
cgroupns-root. This is true even if a privileged process in =
global
Post by Serge Hallyn
Post by Aditya Kali
cgroupns tries to move the process out of its cgroupns-root.
# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjo=
bs/c_job_id1/sub_cgrp_1
Post by Serge Hallyn
Post by Aditya Kali
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted
(5) setns() is not supported for cgroup namespace in the initial
version.
This combined with the full-path reporting for peer ns cgroups coul=
d make
Post by Serge Hallyn
for fun antics when attaching to an existing container (since we'd =
have
Post by Serge Hallyn
to unshare into a new ns cgroup with the same root as the container).
Post by Serge Hallyn
I understand you are implying this will be fixed soon though.
=20
I am thinking the setns() will be only allowed if
target_cgrpns->cgroupns_root is_descendant_of
current_cgrpns->cgroupns_root. i.e., you will only be setns to a
cgroup namespace which is rooted deeper in hierarchy than your own (i=
n
addition to checking capable_wrt_inode_uidgid(target_cgrpns_inode)).
Certainly.
In addition to this, we need to decide whether its OK for setns() to
=20
[A] ----> [B] ----> C
----> D
=20
[A] and [B] are cgroupns-roots. Now, if a task in Cgroup D (which is
under cgroupns [A]) attempts to setns() to cgroupns [B], then its
cgroup should change from /A/D to /A/B. I am concerned about the
side-effects this might cause. Though otherwise, this is a very usefu=
l
feature for containers. One could argue that this is similar to
setns() to a mount-namespace which is pivot_root'd somewhere else (in
which case, the attaching task's root "/" moves implicitly with
setns).
This is what I'd expect.
Alternatively, we could only allow setns() if
target_cgrpns->cgroupns_root == current->cgroup . I.e., taking above
example again, if process in Cgroup D wants to setns() to cgroupns
[B], then it will first need to move to Cgroup B, and only then the
setns() will succeed. This makes sure that there is no implicit cgrou=
p
move.
I'm ok with the restriction if it makes the patchset easier for you -
i.e. you not having to man-handle me into another cgroup. Though I
wouldn't expect the locking for that to be an obstacle...
WDYT? I haven't prototyped this yet, but will send out a patch after
this series is accepted.
Either one is fine with me.
Post by Serge Hallyn
Post by Aditya Kali
(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entir=
e
Post by Serge Hallyn
Post by Aditya Kali
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. So
Post by Serge Hallyn
Post by Aditya Kali
all the threads in the process will have the same cgroup. An=
d both
Post by Serge Hallyn
Post by Aditya Kali
- changing cgroups and unsharing namespaces - are protected =
under
Post by Serge Hallyn
Post by Aditya Kali
threadgroup_lock(task).
(7) The cgroup namespace is alive as long as there is atleast 1
process inside it. When the last process exits, the cgroup
namespace is destroyed. The cgroupns-root and the actual cgr=
oups
Post by Serge Hallyn
Post by Aditya Kali
remain though.
Implementation
The current patch-set is based on top of Tejun's cgroup tree (fo=
r-next
Post by Serge Hallyn
Post by Aditya Kali
branch). Its fairly non-intrusive and provides above mentioned
features.
(1) The Documentation/cgroups/unified-hierarchy.txt mentions use=
of
Post by Serge Hallyn
Post by Aditya Kali
capabilities to restrict cgroups to administrative users. CG=
roup
Post by Serge Hallyn
Post by Aditya Kali
namespaces could be of help here. With cgroup namespaces, it=
might
Post by Serge Hallyn
Post by Aditya Kali
be possible to delegate administration of sub-cgroups under =
a
Post by Serge Hallyn
Post by Aditya Kali
cgroupns-root to the cgroupns owner.
That would be nice.
Post by Aditya Kali
(2) Provide a cgroupns specific cgroupfs mount. i.e., the follow=
ing
Post by Serge Hallyn
Post by Aditya Kali
command when ran from inside a cgroupns should only mount th=
e
Post by Serge Hallyn
Post by Aditya Kali
$ mount -t cgroup cgroup <cgroup-mountpoint>
# -o __DEVEL__sane_behavior should be implicit
This is similar to how procfs can be mounted for every PIDNS. This
Post by Serge Hallyn
Post by Aditya Kali
may have some usecases.
Sorry - I see this answers the first part of a question in my previ=
ous email.
Post by Serge Hallyn
However, the question of whether changes to limits in cgroups which=
are not
Post by Serge Hallyn
under our cgroup-ns-root are allowed.
Admittedly the current case with cgmanager is the same - in that it=
depends
Post by Serge Hallyn
on proper setup of the container - but cgmanager is geared to recom=
mend
Post by Serge Hallyn
not mounting the cgroups in the container at all (and we can reject=
such
Post by Serge Hallyn
mounts in the container altogether with no loss in functionality) whereas
Post by Serge Hallyn
you are here encouraging such mounts. Which is fine - so long as y=
ou then
Post by Serge Hallyn
fully address the potential issues.
=20
It will be nice to have this, but frankly, it may add a bit of
complexity in the cgroup/kernfs code (I will have to prototype and
see). Also same behavior can be obtained simply by bind-mounting
cgroupns-root inside the container. So I am currently inclining
towards rejecting such mounts in favor of simplicity.
Not having to track what to bind-mount where is a very nice
simplification though. In lxc with cgmanager, we are now able to always
simply bind-mount /sys/fs/cgroup/cgmanager from the host into the
container. Nothing more needed for the container to be able to manage
its own cgroup and start its own containers. Likewise, if mount -t
cgroup were filtered to cgroupns, then lxc could simply not mount
anything into the container at all. If mount -t cgroup is not
filtered wrt cgroupns, then we'd have to go back to, at container start,
finding the mountpoint for every subsystem, calculating the container's
cgroup there, and bind-mounting them into the container.
Regarding disallowing writes to cgroup files outside of your
cgroupns-root, I think it should be possible to implement it easily. I will
include it in the next revision of this series.
Great - thanks.

-serge
Andy Lutomirski
2014-07-29 15:08:40 UTC
Permalink
Post by Serge E. Hallyn
Thank you for your review. I have tried to respond to both your emails here.
Post by Serge Hallyn
Post by Aditya Kali
Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
through cgroupfs mount and via /proc/self/cgroup file.
Hi,
1. Based on this description, am I to understand that after doing a
cgroupns unshare, 'mount -t cgroup cgroup /mnt' by default will
still mount the global root cgroup? Any plans on "changing" that?
This is suggested in the "Possible Extensions of CGROUPNS" section.
More details below.
Post by Serge Hallyn
Will attempts to change settings of a cgroup which is not under
our current ns be rejected? (That should be easy to do given your
patch 1/5). Sorry if it's done in the set, I'm jumping around...
Currently, only 'cgroup_attach_task', 'cgroup_mkdir' and
'cgroup_rmdir' of cgroups outside of cgroupns-root are prevented. The
read/write of actual cgroup properties are not prevented. Usual
permission checks continue to apply for those. I was hoping that
should be enough, but see more comments towards the end.
Post by Serge Hallyn
2. What would be the reprecussions of allowing cgroupns unshare so
long as you have ns_capable(CAP_SYS_ADMIN) to the user_ns which
created your current ns cgroup? It'd be a shame if that wasn't
on the roadmap.
Its certainly on the roadmap, just that some logistics were not clear
at this time. As pointed out by Andy Lutomirski on [PATCH 5/5] of this
series, if we allow cgroupns creation to ns_capable(CAP_SYS_ADMIN)
processes, we may need some kind of explicit permission from the
cgroup subsystem to allow this. One approach could be an explicit
So long as you do ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN) I think
you're fine.
The only real problem I can think of with unsharing a cgroup_ns is that
you could lock a setuid-root application someplace it wasn't expecting.
The above check guarantees that you were privileged enough that you'd
better be trusted in this user namespace.
(Unless there is some possible interaction I'm overlooking)
I think that, if it's done this way, you'd have to unshare cgroupns
before unsharing userns, since you forfeit that capability when you
unshare your userns. That means that the new cgroupns ends up being
associated w/ the root userns, which may not be what you want.

You could unshare both namespaces in one syscall and give that some
magic semantics, but that's kind of weird. It would be nice if you
could unshare your userns and temporarily retain caps in the parent,
but there is no such mechanism right now.

--Andy
Serge E. Hallyn
2014-07-29 16:06:56 UTC
Permalink
Post by Andy Lutomirski
Post by Serge E. Hallyn
Thank you for your review. I have tried to respond to both your emails here.
Post by Serge Hallyn
Post by Aditya Kali
Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
through cgroupfs mount and via /proc/self/cgroup file.
Hi,
1. Based on this description, am I to understand that after doing a
cgroupns unshare, 'mount -t cgroup cgroup /mnt' by default will
still mount the global root cgroup? Any plans on "changing" that?
This is suggested in the "Possible Extensions of CGROUPNS" section.
More details below.
Post by Serge Hallyn
Will attempts to change settings of a cgroup which is not under
our current ns be rejected? (That should be easy to do given your
patch 1/5). Sorry if it's done in the set, I'm jumping around...
Currently, only 'cgroup_attach_task', 'cgroup_mkdir' and
'cgroup_rmdir' of cgroups outside of cgroupns-root are prevented. The
read/write of actual cgroup properties are not prevented. Usual
permission checks continue to apply for those. I was hoping that
should be enough, but see more comments towards the end.
Post by Serge Hallyn
2. What would be the reprecussions of allowing cgroupns unshare so
long as you have ns_capable(CAP_SYS_ADMIN) to the user_ns which
created your current ns cgroup? It'd be a shame if that wasn't
on the roadmap.
Its certainly on the roadmap, just that some logistics were not clear
at this time. As pointed out by Andy Lutomirski on [PATCH 5/5] of this
series, if we allow cgroupns creation to ns_capable(CAP_SYS_ADMIN)
processes, we may need some kind of explicit permission from the
cgroup subsystem to allow this. One approach could be an explicit
So long as you do ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN) I think
you're fine.
The only real problem I can think of with unsharing a cgroup_ns is that
you could lock a setuid-root application someplace it wasn't expecting.
The above check guarantees that you were privileged enough that you'd
better be trusted in this user namespace.
(Unless there is some possible interaction I'm overlooking)
I think that, if it's done this way, you'd have to unshare cgroupns
before unsharing userns, since you forfeit that capability when you
unshare your userns. That means that the new cgroupns ends up being
associated w/ the root userns, which may not be what you want.
You could unshare both namespaces in one syscall and give that some
magic semantics, but that's kind of weird. It would be nice if you
could unshare your userns and temporarily retains caps in the parent,
but there is no such mechanism right now.
Hm, good point.
Serge Hallyn
2014-07-24 17:01:19 UTC
Permalink
Post by Aditya Kali
CLONE_NEWCGROUP will be used to create new cgroup namespace.
This is fine and I'm not looking to bikeshed, but am wondering - did
you consider any other ways beside unshare (i.e. a new mount option
to cgroupfs)? If so, do you have a list of the downsides of those?
(I mainly ask bc clone flags are still a scarce commodity)
Post by Aditya Kali
---
include/uapi/linux/sched.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 34f9d73..2f90d00 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -21,8 +21,7 @@
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
-/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
- and is now available for re-use. */
+#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
--
2.0.0.526.g5318336
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Aditya Kali
2014-07-31 19:48:43 UTC
Permalink
Post by Serge Hallyn
Post by Aditya Kali
CLONE_NEWCGROUP will be used to create new cgroup namespace.
This is fine and I'm not looking to bikeshed, but am wondering - did
you consider any other ways beside unshare (i.e. a new mount option
to cgroupfs)? If so, do you have a list of the downsides of those?
(I mainly ask bc clone flags are still a scarce commodity)
I did consider a couple of other ways:

(1) having a cgroup.ns_root (or something) cgroup file. If this value
is '1', it would mean that all processes in it and its descendant cgroups
will have their cgroup paths in /proc/self/cgroup terminated at this
cgroup.
For ex:
[A] --> [B] --> C
| --> [D] --> E

[A], [B] and [D] have cgroup.ns_root = 1.
* all processes in cgroup C & E will see their cgroup path as /C and
/E respectively
* all processes in cgroup B & D will see their own cgroup path as /

In this model, it's easy to know what to show if a process is looking at
its own cgroup paths (/proc/self/cgroup). It gets tricky when you are
looking at another process's /proc/<pid>/cgroup. We may be able to come
up with some hacky way to read the correct value, but depending on the
cgroupfs mount, it may not make sense.
One other major drawback of this approach is that "every" process in
the cgroup will now get a restricted view. i.e., you cannot change
cgroups without affecting your view. And this is undesirable for
administrative processes.

(2) Another idea that I didn't pursue further (and is a bit hacky as
above) was having cgroup.ns_procs (like cgroup.procs, but all the pids
in cgroup.ns_procs will have their /proc/self/cgroup restricted).
Writing a pid to cgroup.ns_procs implies that you are writing it to
cgroup.procs too. But, not vice versa. So, you could move yourself in
another cgroup by writing your pid in cgroup.procs, but not in
cgroup.ns_procs, thus preventing yourself from getting "rooted". This was to
solve the administrative process issue in the above approach. But I think
this is very clunky too and I find the semantics for this approach to be
non-intuitive. It almost looks like moving towards a separate "ns"
subsystem. But as we already know, it's a path to failure.

I didn't think of using a mount option. I imagine the mount option
(something like -o root=/batchjobs/container_1) could be used to
restrict the visibility of cgroupfs inside the container's mount
namespace. i.e., the value you read from /proc/<pid>/cgroup now
depends on what mount namespace you are in. It's similar to cgroup
namespace, but just that the cgroupns_root is now stored in the
'struct mnt_namespace' instead of a separate 'struct
cgroup_namespace'. But, since mount namespace on creation inherits
mounts from its parent, the first cgroupfs mount in a mount namespace
is now treated specially. Also, it's not possible to restrict cgroups
without mount namespace now. This is interesting and may not be too
bad. I am willing to give this a try. But I feel the cgroup namespace
approach fits well in-line with other namespaces where it does one
thing - virtualize the view of /proc/<pid>/cgroup file for processes
inside the namespace. The semantics are more intuitive as they are
similar to other namespaces.

Thanks,
Post by Serge Hallyn
Post by Aditya Kali
---
include/uapi/linux/sched.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 34f9d73..2f90d00 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -21,8 +21,7 @@
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
-/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
- and is now available for re-use. */
+#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
--
2.0.0.526.g5318336
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
--
Aditya
Serge Hallyn
2014-08-04 23:12:55 UTC
Permalink
Post by Aditya Kali
Post by Serge Hallyn
Post by Aditya Kali
CLONE_NEWCGROUP will be used to create new cgroup namespace.
This is fine and I'm not looking to bikeshed, but am wondering - did
you consider any other ways beside unshare (i.e. a new mount option
to cgroupfs)? If so, do you have a list of the downsides of those?
(I mainly ask bc clone flags are still a scarce commodity)
(1) having a cgroup.ns_root (or something) cgroup file. If this value
is '1', it would mean that all processes it and its descendant cgroups
will have their cgroup paths in /proc/self/cgroup terminated at this
cgroup.
[A] --> [B] --> C
| --> [D] --> E
[A], [B] and [D] has cgroup.ns_root = 1.
* all processes in cgroup C & E will see their cgroup path as /C and
/E respectively
* all processes in cgroup B & D will see their own cgroup path as /
In this model, its easy to know what to show if process is looking at
its own cgroup paths (/proc/self/cgroup). It gets tricky when you are
looking at other process's /proc/<pid>/cgroup. We may be able to come
up with some hacky way read correct value, but depending on the
cgroupfs mount, it may not make sense.
One other major drawback of this approach is that "every" process in
the cgroup will now get a restricted view. i.e., you cannot change
cgroups without affecting your view. And this is undesirable for
administrative processes.
(2) Another idea that I didn't pursue further (and is a bit hacky as
above) was having cgroup.ns_procs (like cgroup.procs, but all the pids
in cgroup.ns_procs will have their /proc/self/cgroup restricted).
Writing a pid to cgroup.ns_procs implies that you are writing it to
cgroup.procs too. But, not vice versa. So, you could move yourself in
another cgroup by writing your pid in cgroup.procs, but not in
cgroup.ns_procs, thus preventing from getting "rooted". This was to
solve administrative process issue in the above approach. But I think
this is very clunky too and I find semantics for this approach to be
non-intuitive. It almost looks like moving towards a separate "ns"
subsystem. But as we already know, its a path to failure.
I didn't think of using a mount option. I imagine the mount option
(something like -o root=/batchjobs/container_1) could be used to
restrict the visibility of cgroupfs inside the container's mount
namespace. i.e., the value you read from /proc/<pid>/cgroup now
depends on what mount namespace you are in. Its similar to cgroup
namespace, but just that the cgroupns_root is now stored in the
'struct mnt_namespace' instead of a separate 'struct
cgroup_namespace'. But, since mount namespace on creation inherits
mounts from its parent, the first cgroupfs mount in a mount namespace
is now treated specially. Also, its not possible to restrict cgroups
without mount namespace now. This is interesting and may not be too
bad. I am willing to give this a try. But I feel the cgroup namespace
approach fits well in-line with other namespaces where it does one
thing - virtualize the view of /proc/<pid>/cgroup file for processes
inside the namespace. The semantics are more intuitive as they are
similar to other namespaces.
Yeah, let's stick with what you have :)

thanks,
-serge
Aditya Kali
2014-10-13 21:23:42 UTC
Permalink
Second take at the Cgroup Namespace patch-set.

Major changes from RFC (V0):
1. setns support for cgroupns
2. 'mount -t cgroup cgroup <mntpt>' from inside a cgroupns now
mounts the cgroup hierarchy with cgroupns-root as the filesystem root.
3. writes to cgroup files outside of cgroupns-root are not allowed
4. visibility of /proc/<pid>/cgroup is further restricted by not showing
anything if the <pid> is in a sibling cgroupns and its cgroup falls outside
your cgroupns-root.

More details in the writeup below.

Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
through cgroupfs mount and via /proc/self/cgroup file.

$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1

This exposure of cgroup names to the processes running inside a
container results in some problems:
(1) The container names are typically host-container-management-agent
(systemd, docker/libcontainer, etc.) data and leaking its name (or
leaking the hierarchy) reveals too much information about the host
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside the
container.

Note that the feature proposed here is completely different than the
“ns cgroup” feature which existed in the linux kernel until recently.
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It did
not solve any of the above mentioned problems and was later dropped
from the kernel. Incidentally though, it used the same config option
name CONFIG_CGROUP_NS as used in my prototype!

Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can now
have a much more coherent cgroup view and it's easy to associate a
container with a single cgroup. This also allows us to virtualize the
cgroup view for tasks inside the container.

The new CGroup Namespace allows a process to “unshare” its cgroup
hierarchy starting from the cgroup it's currently in.
For Ex:
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec’s /bin/bash
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup ->
cgroup:[4026532183]
# From within new cgroupns, process sees that its in the root cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/

# From global cgroupns:
$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1

# Unshare cgroupns along with userns and mountns
# Following calls unshare(CLONE_NEWCGROUP|CLONE_NEWUSER|CLONE_NEWNS), then
# sets up uid/gid map and exec’s /bin/bash
$ ~/unshare -c -u -m

# Originally, we were in /batchjobs/c_job_id1 cgroup. Mount our own cgroup
# hierarchy.
[ns]$ mount -t cgroup cgroup /tmp/cgroup
[ns]$ ls -l /tmp/cgroup
total 0
-r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.controllers
-r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.populated
-rw-r--r-- 1 root root 0 2014-10-13 09:25 cgroup.procs
-rw-r--r-- 1 root root 0 2014-10-13 09:32 cgroup.subtree_control

The cgroupns-root (/batchjobs/c_job_id1 in above example) becomes the
filesystem root for the namespace specific cgroupfs mount.

The virtualization of /proc/self/cgroup file combined with restricting
the view of cgroup hierarchy by namespace-private cgroupfs mount
should provide a completely isolated cgroup view inside the container.

In its current form, the cgroup namespaces patchset provides following
behavior:

(1) The “root” cgroup for a cgroup namespace is the cgroup in which
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls unshare,
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (“/”) cgroup
(identified in code as cgrp_dfl_root.cgrp).

(2) The cgroupns-root cgroup does not change even if the namespace
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1

(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able to see
cgroup paths (in /proc/self/cgroup) only inside their root cgroup
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1

(b) From global cgroupns, the real cgroup path will be visible:
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1

(c) From a sibling cgroupns (cgroupns root-ed at a sibling cgroup), no cgroup
path will be visible:
# ns2's cgroupns-root is at '/batchjobs/c_job_id2'
[ns2]$ cat /proc/7353/cgroup
[ns2]$
This is same as when cgroup hierarchy is not mounted at all.
(In correct container setup though, it should not be possible to
access PIDs in another container in the first place.)

(4) Processes inside a cgroupns are not allowed to move out of the
cgroupns-root. This is true even if a privileged process in global
cgroupns tries to move the process out of its cgroupns-root.

# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted

(5) Setns to another cgroup namespace is allowed only when:
(a) process has CAP_SYS_ADMIN in its current userns
(b) process has CAP_SYS_ADMIN in the target cgroupns' userns
(c) the process's current cgroup is a descendant of the cgroupns-root of the
target namespace.
(d) the target cgroupns-root is descendant of current cgroupns-root.
The last check (d) prevents processes from escaping their cgroupns-root by
attaching to parent cgroupns. Thus, setns is allowed only when the process
is trying to restrict itself to a deeper cgroup hierarchy.

(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entire
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. So
all the threads in the process will have the same cgroup. And both
- changing cgroups and unsharing namespaces - are protected under
threadgroup_lock(task).

(7) The cgroup namespace is alive as long as there is at least 1
process inside it. When the last process exits, the cgroup
namespace is destroyed. The cgroupns-root and the actual cgroups
remain though.

(8) 'mount -t cgroup cgroup <mntpt>' when called from within cgroupns mounts
the unified cgroup hierarchy with cgroupns-root as the filesystem root.
The process needs CAP_SYS_ADMIN in its userns and mntns. This allows the
container management tools to be run inside the containers transparently.

Implementation
The current patch-set is based on top of Tejun Heo's cgroup tree (for-next
branch). It's fairly non-intrusive and provides above mentioned
features.

Possible extensions of CGROUPNS:
(1) The Documentation/cgroups/unified-hierarchy.txt mentions use of
capabilities to restrict cgroups to administrative users. CGroup
namespaces could be of help here. With cgroup namespaces, it might
be possible to delegate administration of sub-cgroups under a
cgroupns-root to the cgroupns owner.


---
fs/kernfs/dir.c | 53 +++++++++---
fs/kernfs/mount.c | 48 +++++++++++
fs/proc/namespaces.c | 3 +
include/linux/cgroup.h | 41 +++++++++-
include/linux/cgroup_namespace.h | 62 +++++++++++++++
include/linux/kernfs.h | 5 ++
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 4 +
include/uapi/linux/sched.h | 3 +-
init/Kconfig | 9 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 139 ++++++++++++++++++++++++++------
kernel/cgroup_namespace.c | 168 +++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 2 +-
kernel/nsproxy.c | 19 ++++-
15 files changed, 518 insertions(+), 41 deletions(-)
create mode 100644 include/linux/cgroup_namespace.h
create mode 100644 kernel/cgroup_namespace.c

[PATCHv1 1/8] kernfs: Add API to generate relative kernfs path
[PATCHv1 2/8] sched: new clone flag CLONE_NEWCGROUP for cgroup
[PATCHv1 3/8] cgroup: add function to get task's cgroup on default
[PATCHv1 4/8] cgroup: export cgroup_get() and cgroup_put()
[PATCHv1 5/8] cgroup: introduce cgroup namespaces
[PATCHv1 6/8] cgroup: restrict cgroup operations within task's cgroupns
[PATCHv1 7/8] cgroup: cgroup namespace setns support
[PATCHv1 8/8] cgroup: mount cgroupns-root when inside non-init cgroupns
Aditya Kali
2014-10-13 21:23:43 UTC
Permalink
The new function kernfs_path_from_node() generates and returns
kernfs path of a given kernfs_node relative to a given parent
kernfs_node.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
fs/kernfs/dir.c | 53 ++++++++++++++++++++++++++++++++++++++++----------
include/linux/kernfs.h | 3 +++
2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index a693f5b..8655485 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -44,14 +44,24 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
}

-static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
- size_t buflen)
+static char * __must_check kernfs_path_from_node_locked(
+ struct kernfs_node *kn_root,
+ struct kernfs_node *kn,
+ char *buf,
+ size_t buflen)
{
char *p = buf + buflen;
int len;

+ BUG_ON(!buflen);
+
*--p = '\0';

+ if (kn == kn_root) {
+ *--p = '/';
+ return p;
+ }
+
do {
len = strlen(kn->name);
if (p - buf < len + 1) {
@@ -63,6 +73,8 @@ static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
memcpy(p, kn->name, len);
*--p = '/';
kn = kn->parent;
+ if (kn == kn_root)
+ break;
} while (kn && kn->parent);

return p;
@@ -92,26 +104,47 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}

/**
- * kernfs_path - build full path of a given node
+ * kernfs_path_from_node - build path of node @kn relative to @kn_root.
+ * @kn_root: parent kernfs_node relative to which we need to build the path
* @kn: kernfs_node of interest
- * @buf: buffer to copy @kn's name into
+ * @buf: buffer to copy @kn's path into
* @buflen: size of @buf
*
- * Builds and returns the full path of @kn in @buf of @buflen bytes. The
- * path is built from the end of @buf so the returned pointer usually
+ * Builds and returns @kn's path relative to @kn_root. @kn_root is expected to
+ * be parent of @kn at some level. If this is not true or if @kn_root is NULL,
+ * then full path of @kn is returned.
+ * The path is built from the end of @buf so the returned pointer usually
* doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
* and %NULL is returned.
*/
-char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+char *kernfs_path_from_node(struct kernfs_node *kn_root, struct kernfs_node *kn,
+ char *buf, size_t buflen)
{
unsigned long flags;
char *p;

spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, buf, buflen);
+ p = kernfs_path_from_node_locked(kn_root, kn, buf, buflen);
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
return p;
}
+EXPORT_SYMBOL_GPL(kernfs_path_from_node);
+
+/**
+ * kernfs_path - build full path of a given node
+ * @kn: kernfs_node of interest
+ * @buf: buffer to copy @kn's name into
+ * @buflen: size of @buf
+ *
+ * Builds and returns the full path of @kn in @buf of @buflen bytes. The
+ * path is built from the end of @buf so the returned pointer usually
+ * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
+ * and %NULL is returned.
+ */
+char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+ return kernfs_path_from_node(NULL, kn, buf, buflen);
+}
EXPORT_SYMBOL_GPL(kernfs_path);

/**
@@ -145,8 +178,8 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)

spin_lock_irqsave(&kernfs_rename_lock, flags);

- p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
+ p = kernfs_path_from_node_locked(NULL, kn, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
if (p)
pr_cont("%s", p);
else
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 30faf79..3c2be75 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -258,6 +258,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
}

int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+char * __must_check kernfs_path_from_node(struct kernfs_node *root_kn,
+ struct kernfs_node *kn, char *buf,
+ size_t buflen);
char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
--
2.1.0.rc2.206.gedb03e5
Serge E. Hallyn
2014-10-16 16:07:36 UTC
Permalink
Post by Aditya Kali
The new function kernfs_path_from_node() generates and returns
kernfs path of a given kernfs_node relative to a given parent
kernfs_node.
Acked-by: Serge Hallyn <serge.hallyn-Z7WLFzj8eWMS+***@public.gmane.org>

(with or without my comment below taken)
Post by Aditya Kali
---
fs/kernfs/dir.c | 53 ++++++++++++++++++++++++++++++++++++++++----------
include/linux/kernfs.h | 3 +++
2 files changed, 46 insertions(+), 10 deletions(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index a693f5b..8655485 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -44,14 +44,24 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
}
-static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
- size_t buflen)
+static char * __must_check kernfs_path_from_node_locked(
+ struct kernfs_node *kn_root,
+ struct kernfs_node *kn,
+ char *buf,
+ size_t buflen)
{
char *p = buf + buflen;
int len;
+ BUG_ON(!buflen);
+
*--p = '\0';
+ if (kn == kn_root) {
+ *--p = '/';
+ return p;
+ }
+
do {
len = strlen(kn->name);
if (p - buf < len + 1) {
@@ -63,6 +73,8 @@ static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
memcpy(p, kn->name, len);
*--p = '/';
kn = kn->parent;
+ if (kn == kn_root)
+ break;
I wonder if it would be clearer if you instead changed the while condition, i.e.

} while (kn && kn != kn_root && kn->parent);

i.e. it's not a special condition, just a part of the expected flow.
Post by Aditya Kali
} while (kn && kn->parent);
return p;
@@ -92,26 +104,47 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}
/**
- * kernfs_path - build full path of a given node
*
* and %NULL is returned.
*/
-char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+char *kernfs_path_from_node(struct kernfs_node *kn_root, struct kernfs_node *kn,
+ char *buf, size_t buflen)
{
unsigned long flags;
char *p;
spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, buf, buflen);
+ p = kernfs_path_from_node_locked(kn_root, kn, buf, buflen);
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
return p;
}
+EXPORT_SYMBOL_GPL(kernfs_path_from_node);
+
+/**
+ * kernfs_path - build full path of a given node
+ *
+ * and %NULL is returned.
+ */
+char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+ return kernfs_path_from_node(NULL, kn, buf, buflen);
+}
EXPORT_SYMBOL_GPL(kernfs_path);
/**
@@ -145,8 +178,8 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
+ p = kernfs_path_from_node_locked(NULL, kn, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
if (p)
pr_cont("%s", p);
else
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 30faf79..3c2be75 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -258,6 +258,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
}
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+char * __must_check kernfs_path_from_node(struct kernfs_node *root_kn,
+ struct kernfs_node *kn, char *buf,
+ size_t buflen);
char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
--
2.1.0.rc2.206.gedb03e5
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Aditya Kali
2014-10-13 21:23:45 UTC
Permalink
get_task_cgroup() returns the (reference counted) cgroup of the
given task on the default hierarchy.

Signed-off-by: Aditya Kali <***@google.com>
---
include/linux/cgroup.h | 1 +
kernel/cgroup.c | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 1d51968..80ed6e0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -579,6 +579,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
}

char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+struct cgroup *get_task_cgroup(struct task_struct *task);

int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cab7dc4..56d507b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1916,6 +1916,31 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);

+/*
+ * get_task_cgroup - returns the cgroup of the task in the default cgroup
+ * hierarchy.
+ *
+ * @task: target task
+ * This function returns the @task's cgroup on the default cgroup hierarchy. The
+ * returned cgroup has its reference incremented (by calling cgroup_get()). So
+ * the caller must cgroup_put() the obtained reference once it is done with it.
+ */
+struct cgroup *get_task_cgroup(struct task_struct *task)
+{
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
+
+ cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ cgroup_get(cgrp);
+
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(get_task_cgroup);
+
/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
/* the src and dst cset list running through cset->mg_node */
--
2.1.0.rc2.206.gedb03e5
Serge E. Hallyn
2014-10-16 16:13:24 UTC
Permalink
Post by Aditya Kali
get_task_cgroup() returns the (reference counted) cgroup of the
given task on the default hierarchy.
---
include/linux/cgroup.h | 1 +
kernel/cgroup.c | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 1d51968..80ed6e0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -579,6 +579,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
}
char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+struct cgroup *get_task_cgroup(struct task_struct *task);
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cab7dc4..56d507b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1916,6 +1916,31 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
+/*
+ * get_task_cgroup - returns the cgroup of the task in the default cgroup
+ * hierarchy.
+ *
+ * returned cgroup has its reference incremented (by calling cgroup_get()). So
+ * the caller must cgroup_put() the obtained reference once it is done with it.
+ */
+struct cgroup *get_task_cgroup(struct task_struct *task)
+{
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
+
+ cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ cgroup_get(cgrp);
+
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(get_task_cgroup);
+
/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
/* the src and dst cset list running through cset->mg_node */
--
2.1.0.rc2.206.gedb03e5
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Aditya Kali
2014-10-13 21:23:44 UTC
Permalink
CLONE_NEWCGROUP will be used to create new cgroup namespace.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
include/uapi/linux/sched.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 34f9d73..2f90d00 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -21,8 +21,7 @@
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
-/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
- and is now available for re-use. */
+#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
--
2.1.0.rc2.206.gedb03e5
Serge E. Hallyn
2014-10-16 16:08:15 UTC
Permalink
Post by Aditya Kali
CLONE_NEWCGROUP will be used to create new cgroup namespace.
---
include/uapi/linux/sched.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 34f9d73..2f90d00 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -21,8 +21,7 @@
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
-/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
- and is now available for re-use. */
+#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
--
2.1.0.rc2.206.gedb03e5
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Aditya Kali
2014-10-13 21:23:49 UTC
Permalink
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target cgroupns-root
cgroup.
* target cgroupns-root is same as or deeper than task's current
cgroupns-root. This is so that the task cannot escape out of its
cgroupns-root. This also ensures that setns() only makes the task
get restricted to a deeper cgroup hierarchy.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
kernel/cgroup_namespace.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
index c16604f..c612946 100644
--- a/kernel/cgroup_namespace.c
+++ b/kernel/cgroup_namespace.c
@@ -80,8 +80,48 @@ err_out:

static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
{
- pr_info("setns not supported for cgroup namespace");
- return -EINVAL;
+ struct cgroup_namespace *cgroup_ns = ns;
+ struct task_struct *task = current;
+ struct cgroup *cgrp = NULL;
+ int err = 0;
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Prevent cgroup changes for this task. */
+ threadgroup_lock(task);
+
+ cgrp = get_task_cgroup(task);
+
+ err = -EINVAL;
+ if (!cgroup_on_dfl(cgrp))
+ goto out_unlock;
+
+ /* Allow switch only if the task's current cgroup is descendant of the
+ * target cgroup_ns->root_cgrp.
+ */
+ if (!cgroup_is_descendant(cgrp, cgroup_ns->root_cgrp))
+ goto out_unlock;
+
+ /* Only allow setns to a cgroupns root-ed deeper than task's current
+ * cgroupns-root. This will make sure that tasks cannot escape their
+ * cgroupns by attaching to parent cgroupns.
+ */
+ if (!cgroup_is_descendant(cgroup_ns->root_cgrp,
+ task_cgroupns_root(task)))
+ goto out_unlock;
+
+ err = 0;
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+out_unlock:
+ threadgroup_unlock(current);
+ if (cgrp)
+ cgroup_put(cgrp);
+ return err;
}

static void *cgroupns_get(struct task_struct *task)
--
2.1.0.rc2.206.gedb03e5
Serge E. Hallyn
2014-10-16 21:12:36 UTC
Permalink
Post by Aditya Kali
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target cgroupns-root
cgroup.
What is the point of this?

If I'm a user logged into
/lxc/c1/user.slice/user-1000.slice/session-c12.scope and I start
a container which is in
/lxc/c1/user.slice/user-1000.slice/session-c12.scope/x1
then I will want to be able to enter the container's cgroup.
The container's cgroup root is under my own (satisfying the
below condition) but my cgroup is not a descendent of the
container's cgroup.
Post by Aditya Kali
* target cgroupns-root is same as or deeper than task's current
cgroupns-root. This is so that the task cannot escape out of its
cgroupns-root. This also ensures that setns() only makes the task
get restricted to a deeper cgroup hierarchy.
---
kernel/cgroup_namespace.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
index c16604f..c612946 100644
--- a/kernel/cgroup_namespace.c
+++ b/kernel/cgroup_namespace.c
static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
{
- pr_info("setns not supported for cgroup namespace");
- return -EINVAL;
+ struct cgroup_namespace *cgroup_ns = ns;
+ struct task_struct *task = current;
+ struct cgroup *cgrp = NULL;
+ int err = 0;
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Prevent cgroup changes for this task. */
+ threadgroup_lock(task);
+
+ cgrp = get_task_cgroup(task);
+
+ err = -EINVAL;
+ if (!cgroup_on_dfl(cgrp))
+ goto out_unlock;
+
+ /* Allow switch only if the task's current cgroup is descendant of the
+ * target cgroup_ns->root_cgrp.
+ */
+ if (!cgroup_is_descendant(cgrp, cgroup_ns->root_cgrp))
+ goto out_unlock;
+
+ /* Only allow setns to a cgroupns root-ed deeper than task's current
+ * cgroupns-root. This will make sure that tasks cannot escape their
+ * cgroupns by attaching to parent cgroupns.
+ */
+ if (!cgroup_is_descendant(cgroup_ns->root_cgrp,
+ task_cgroupns_root(task)))
+ goto out_unlock;
+
+ err = 0;
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ threadgroup_unlock(current);
+ if (cgrp)
+ cgroup_put(cgrp);
+ return err;
}
static void *cgroupns_get(struct task_struct *task)
--
2.1.0.rc2.206.gedb03e5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Andy Lutomirski
2014-10-16 21:17:18 UTC
Permalink
Post by Serge E. Hallyn
Post by Aditya Kali
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target cgroupns-root
cgroup.
What is the point of this?
If I'm a user logged into
/lxc/c1/user.slice/user-1000.slice/session-c12.scope and I start
a container which is in
/lxc/c1/user.slice/user-1000.slice/session-c12.scope/x1
then I will want to be able to enter the container's cgroup.
The container's cgroup root is under my own (satisfying the
below condition) but my cgroup is not a descendent of the
container's cgroup.
Presumably you need to ask your friendly cgroup manager to stick you
in that cgroup first. Or we need to generally allow tasks to move
themselves deeper in the hierarchy, but that seems like a big change.

--Andy
Post by Serge E. Hallyn
Post by Aditya Kali
* target cgroupns-root is same as or deeper than task's current
cgroupns-root. This is so that the task cannot escape out of its
cgroupns-root. This also ensures that setns() only makes the task
get restricted to a deeper cgroup hierarchy.
---
kernel/cgroup_namespace.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
index c16604f..c612946 100644
--- a/kernel/cgroup_namespace.c
+++ b/kernel/cgroup_namespace.c
static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
{
- pr_info("setns not supported for cgroup namespace");
- return -EINVAL;
+ struct cgroup_namespace *cgroup_ns = ns;
+ struct task_struct *task = current;
+ struct cgroup *cgrp = NULL;
+ int err = 0;
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Prevent cgroup changes for this task. */
+ threadgroup_lock(task);
+
+ cgrp = get_task_cgroup(task);
+
+ err = -EINVAL;
+ if (!cgroup_on_dfl(cgrp))
+ goto out_unlock;
+
+ /* Allow switch only if the task's current cgroup is descendant of the
+ * target cgroup_ns->root_cgrp.
+ */
+ if (!cgroup_is_descendant(cgrp, cgroup_ns->root_cgrp))
+ goto out_unlock;
+
+ /* Only allow setns to a cgroupns root-ed deeper than task's current
+ * cgroupns-root. This will make sure that tasks cannot escape their
+ * cgroupns by attaching to parent cgroupns.
+ */
+ if (!cgroup_is_descendant(cgroup_ns->root_cgrp,
+ task_cgroupns_root(task)))
+ goto out_unlock;
+
+ err = 0;
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ threadgroup_unlock(current);
+ if (cgrp)
+ cgroup_put(cgrp);
+ return err;
}
static void *cgroupns_get(struct task_struct *task)
--
2.1.0.rc2.206.gedb03e5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
--
Andy Lutomirski
AMA Capital Management, LLC
Aditya Kali
2014-10-16 21:22:18 UTC
Permalink
Post by Serge E. Hallyn
Post by Aditya Kali
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target cgroupns-root
cgroup.
What is the point of this?
If I'm a user logged into
/lxc/c1/user.slice/user-1000.slice/session-c12.scope and I start
a container which is in
/lxc/c1/user.slice/user-1000.slice/session-c12.scope/x1
then I will want to be able to enter the container's cgroup.
The container's cgroup root is under my own (satisfying the
below condition) but my cgroup is not a descendent of the
container's cgroup.
This condition is there because we don't want to do implicit cgroup
changes when a process attaches to another cgroupns. cgroupns tries to
preserve the invariant that at any point, your current cgroup is
always under the cgroupns-root of your cgroup namespace. But in your
example, if we allow a process in "session-c12.scope" container to
attach to cgroupns root'ed at "session-c12.scope/x1" container
(without implicitly moving its cgroup), then this invariant won't
hold.
Post by Serge E. Hallyn
Post by Aditya Kali
* target cgroupns-root is same as or deeper than task's current
cgroupns-root. This is so that the task cannot escape out of its
cgroupns-root. This also ensures that setns() only makes the task
get restricted to a deeper cgroup hierarchy.
---
kernel/cgroup_namespace.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
index c16604f..c612946 100644
--- a/kernel/cgroup_namespace.c
+++ b/kernel/cgroup_namespace.c
static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
{
- pr_info("setns not supported for cgroup namespace");
- return -EINVAL;
+ struct cgroup_namespace *cgroup_ns = ns;
+ struct task_struct *task = current;
+ struct cgroup *cgrp = NULL;
+ int err = 0;
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Prevent cgroup changes for this task. */
+ threadgroup_lock(task);
+
+ cgrp = get_task_cgroup(task);
+
+ err = -EINVAL;
+ if (!cgroup_on_dfl(cgrp))
+ goto out_unlock;
+
+ /* Allow switch only if the task's current cgroup is descendant of the
+ * target cgroup_ns->root_cgrp.
+ */
+ if (!cgroup_is_descendant(cgrp, cgroup_ns->root_cgrp))
+ goto out_unlock;
+
+ /* Only allow setns to a cgroupns root-ed deeper than task's current
+ * cgroupns-root. This will make sure that tasks cannot escape their
+ * cgroupns by attaching to parent cgroupns.
+ */
+ if (!cgroup_is_descendant(cgroup_ns->root_cgrp,
+ task_cgroupns_root(task)))
+ goto out_unlock;
+
+ err = 0;
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ threadgroup_unlock(current);
+ if (cgrp)
+ cgroup_put(cgrp);
+ return err;
}
static void *cgroupns_get(struct task_struct *task)
--
2.1.0.rc2.206.gedb03e5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
--
Aditya
Serge E. Hallyn
2014-10-16 21:47:10 UTC
Permalink
Post by Aditya Kali
Post by Serge E. Hallyn
Post by Aditya Kali
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target cgroupns-root
cgroup.
What is the point of this?
If I'm a user logged into
/lxc/c1/user.slice/user-1000.slice/session-c12.scope and I start
a container which is in
/lxc/c1/user.slice/user-1000.slice/session-c12.scope/x1
then I will want to be able to enter the container's cgroup.
The container's cgroup root is under my own (satisfying the
below condition) but my cgroup is not a descendent of the
container's cgroup.
This condition is there because we don't want to do implicit cgroup
changes when a process attaches to another cgroupns. cgroupns tries to
preserve the invariant that at any point, your current cgroup is
always under the cgroupns-root of your cgroup namespace. But in your
example, if we allow a process in "session-c12.scope" container to
attach to cgroupns root'ed at "session-c12.scope/x1" container
(without implicitly moving its cgroup), then this invariant won't
hold.
Oh, I see. Guess that should be workable. Thanks.

-serge
ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
2014-10-19 05:23:39 UTC
Permalink
Post by Serge E. Hallyn
Post by Aditya Kali
Post by Serge E. Hallyn
Post by Aditya Kali
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target cgroupns-root
cgroup.
What is the point of this?
If I'm a user logged into
/lxc/c1/user.slice/user-1000.slice/session-c12.scope and I start
a container which is in
/lxc/c1/user.slice/user-1000.slice/session-c12.scope/x1
then I will want to be able to enter the container's cgroup.
The container's cgroup root is under my own (satisfying the
below condition) but my cgroup is not a descendent of the
container's cgroup.
This condition is there because we don't want to do implicit cgroup
changes when a process attaches to another cgroupns. cgroupns tries to
preserve the invariant that at any point, your current cgroup is
always under the cgroupns-root of your cgroup namespace. But in your
example, if we allow a process in "session-c12.scope" container to
attach to cgroupns root'ed at "session-c12.scope/x1" container
(without implicitly moving its cgroup), then this invariant won't
hold.
Oh, I see. Guess that should be workable. Thanks.
Which has me looking at what the rules are for moving through
the cgroup hierarchy.

As long as we have write access to cgroup.procs and are allowed
to open the file for write, we can move any of our own tasks
into the cgroup. So the cgroup namespace rules don't seem
to be a problem.

Andy can you please take a look at the permission checks in
__cgroup_procs_write.

As I read the code I see 3 security gaffes in the permission check.
- Using current->cred instead of file->f_cred.
- Not checking tcred->euid.
- Checking GLOBAL_ROOT_UID instead of having a capable call.

The file permissions on cgroup.procs seem just sufficient to keep
those bugs from being easily exploitable.

Eric
Andy Lutomirski
2014-10-19 18:26:29 UTC
Permalink
On Sat, Oct 18, 2014 at 10:23 PM, Eric W. Biederman
Post by ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
Post by Serge E. Hallyn
Post by Aditya Kali
Post by Serge E. Hallyn
Post by Aditya Kali
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target cgroupns-root
cgroup.
What is the point of this?
If I'm a user logged into
/lxc/c1/user.slice/user-1000.slice/session-c12.scope and I start
a container which is in
/lxc/c1/user.slice/user-1000.slice/session-c12.scope/x1
then I will want to be able to enter the container's cgroup.
The container's cgroup root is under my own (satisfying the
below condition) but my cgroup is not a descendent of the
container's cgroup.
This condition is there because we don't want to do implicit cgroup
changes when a process attaches to another cgroupns. cgroupns tries to
preserve the invariant that at any point, your current cgroup is
always under the cgroupns-root of your cgroup namespace. But in your
example, if we allow a process in "session-c12.scope" container to
attach to cgroupns root'ed at "session-c12.scope/x1" container
(without implicitly moving its cgroup), then this invariant won't
hold.
Oh, I see. Guess that should be workable. Thanks.
Which has me looking at what the rules are for moving through
the cgroup hierarchy.
As long as we have write access to cgroup.procs and are allowed
to open the file for write, we can move any of our own tasks
into the cgroup. So the cgroup namespace rules don't seem
to be a problem.
Andy can you please take a look at the permission checks in
__cgroup_procs_write.
The actual requirements for calling that function haven't changed,
right? IOW, what does this have to do with cgroupns? Is the idea
that you want a privileged user wrt a cgroupns's userns to be able to
use this? If so:

Yes, that current_cred() thing is bogus. (Actually, this is probably
exploitable right now if any cgroup.procs inode anywhere on the system
lets non-root write.) (Can we have some kernel debugging option that
makes any use of current_cred() in write(2) warn?)

We really need a weaker version of may_ptrace for this kind of stuff.
Maybe the existing may_ptrace stuff is okay, actually. But this is
completely missing group checks, cap checks, capabilities wrt the
userns, etc.

Also, I think that, if this version of the patchset allows non-init
userns to unshare cgroupns, then the issue of what permission is
needed to lock the cgroup hierarchy like that needs to be addressed,
because unshare(CLONE_NEWUSER|CLONE_NEWCGROUP) will effectively pin
the calling task with no permission required. Bolting on a fix later
will be a mess.

--Andy
Post by ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
As I read the code I see 3 security gaffaws in the permssion check.
- Using current->cred instead of file->f_cred.
- Not checking tcred->euid.
- Checking GLOBAL_ROOT_UID instead of having a capable call.
The file permission on cgroup.procs seem just sufficient to keep
to keep those bugs from being easily exploitable.
Eric
--
Andy Lutomirski
AMA Capital Management, LLC
Eric W.Biederman
2014-10-20 04:55:50 UTC
Permalink
Post by Andy Lutomirski
On Sat, Oct 18, 2014 at 10:23 PM, Eric W. Biederman
Post by ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
Post by Serge E. Hallyn
Post by Aditya Kali
Post by Serge E. Hallyn
Post by Aditya Kali
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target
cgroupns-root
Post by ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
Post by Serge E. Hallyn
Post by Aditya Kali
Post by Serge E. Hallyn
Post by Aditya Kali
cgroup.
What is the point of this?
If I'm a user logged into
/lxc/c1/user.slice/user-1000.slice/session-c12.scope and I start
a container which is in
/lxc/c1/user.slice/user-1000.slice/session-c12.scope/x1
then I will want to be able to enter the container's cgroup.
The container's cgroup root is under my own (satisfying the
below condition) but my cgroup is not a descendent of the
container's cgroup.
This condition is there because we don't want to do implicit cgroup
changes when a process attaches to another cgroupns. cgroupns tries
to
Post by ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
Post by Serge E. Hallyn
Post by Aditya Kali
preserve the invariant that at any point, your current cgroup is
always under the cgroupns-root of your cgroup namespace. But in
your
Post by ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
Post by Serge E. Hallyn
Post by Aditya Kali
example, if we allow a process in "session-c12.scope" container to
attach to cgroupns root'ed at "session-c12.scope/x1" container
(without implicitly moving its cgroup), then this invariant won't
hold.
Oh, I see. Guess that should be workable. Thanks.
Which has me looking at what the rules are for moving through
the cgroup hierarchy.
As long as we have write access to cgroup.procs and are allowed
to open the file for write, we can move any of our own tasks
into the cgroup. So the cgroup namespace rules don't seem
to be a problem.
Andy can you please take a look at the permission checks in
__cgroup_procs_write.
The actual requirements for calling that function haven't changed,
right? IOW, what does this have to do with cgroupns?
Excluding user namespaces the requirements have not changed.

The immediate correlation is that to enter a cgroupns you must first put your process in one of its cgroups.

So I was examining what it would take to enter the cgroup of cgroupns.
Post by Andy Lutomirski
Is the idea
that you want a privileged user wrt a cgroupns's userns to be able to
Yes, that current_cred() thing is bogus. (Actually, this is probably
exploitable right now if any cgroup.procs inode anywhere on the system
lets non-root write.) (Can we have some kernel debugging option that
makes any use of current_cred() in write(2) warn?)
We really need a weaker version of may_ptrace for this kind of stuff.
Maybe the existing may_ptrace stuff is okay, actually. But this is
completely missing group checks, cap checks, capabilities wrt the
userns, etc.
Also, I think that, if this version of the patchset allows non-init
userns to unshare cgroupns, then the issue of what permission is
needed to lock the cgroup hierarchy like that needs to be addressed,
because unshare(CLONE_NEWUSER|CLONE_NEWCGROUP) will effectively pin
the calling task with no permission required. Bolting on a fix later
will be a mess.
I imagine the pinning would be like the userns.

Ah but there is a potentially serious issue with the pinning.
With pinning we can make it impossible for root to move us to a different cgroup.

I am not certain how serious that is but it bears thinking about.
If we don't implement pinning we should be able to implement everything with just filesystem mount options, and no new namespace required.

Sigh.

I am too tired tonight to see the end game in this.

Eric
Post by Andy Lutomirski
Post by ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
As I read the code I see 3 security gaffaws in the permssion check.
- Using current->cred instead of file->f_cred.
- Not checking tcred->euid.
- Checking GLOBAL_ROOT_UID instead of having a capable call.
The file permission on cgroup.procs seem just sufficient to keep
to keep those bugs from being easily exploitable.
Eric
Serge E. Hallyn
2014-10-17 09:52:20 UTC
Permalink
Post by Aditya Kali
setns on a cgroup namespace is allowed only if
* task has CAP_SYS_ADMIN in its current user-namespace and
over the user-namespace associated with target cgroupns.
* task's current cgroup is descendent of the target cgroupns-root
cgroup.
* target cgroupns-root is same as or deeper than task's current
cgroupns-root. This is so that the task cannot escape out of its
cgroupns-root. This also ensures that setns() only makes the task
get restricted to a deeper cgroup hierarchy.
Acked-by: Serge Hallyn <serge.hallyn-Z7WLFzj8eWMS+***@public.gmane.org>

Below you allow setns to your own cgroupns. I think that's fine,
but since you're not doing an explicit cgroup change anyway should
you just return 0 at top in that case to save some cpu time?
Post by Aditya Kali
---
kernel/cgroup_namespace.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
index c16604f..c612946 100644
--- a/kernel/cgroup_namespace.c
+++ b/kernel/cgroup_namespace.c
static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
{
- pr_info("setns not supported for cgroup namespace");
- return -EINVAL;
+ struct cgroup_namespace *cgroup_ns = ns;
+ struct task_struct *task = current;
+ struct cgroup *cgrp = NULL;
+ int err = 0;
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Prevent cgroup changes for this task. */
+ threadgroup_lock(task);
+
+ cgrp = get_task_cgroup(task);
+
+ err = -EINVAL;
+ if (!cgroup_on_dfl(cgrp))
+ goto out_unlock;
+
+ /* Allow switch only if the task's current cgroup is descendant of the
+ * target cgroup_ns->root_cgrp.
+ */
+ if (!cgroup_is_descendant(cgrp, cgroup_ns->root_cgrp))
+ goto out_unlock;
+
+ /* Only allow setns to a cgroupns root-ed deeper than task's current
+ * cgroupns-root. This will make sure that tasks cannot escape their
+ * cgroupns by attaching to parent cgroupns.
+ */
+ if (!cgroup_is_descendant(cgroup_ns->root_cgrp,
+ task_cgroupns_root(task)))
+ goto out_unlock;
+
+ err = 0;
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ threadgroup_unlock(current);
+ if (cgrp)
+ cgroup_put(cgrp);
+ return err;
}
static void *cgroupns_get(struct task_struct *task)
--
2.1.0.rc2.206.gedb03e5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Aditya Kali
2014-10-13 21:23:50 UTC
Permalink
This patch enables cgroup mounting inside userns when a process
has appropriate privileges. The cgroup filesystem mounted is
rooted at the cgroupns-root. Thus, in a container-setup, only
the hierarchy under the cgroupns-root is exposed inside the container.
This allows container management tools to run inside the containers
without depending on any global state.
In order to support this, a new kernfs api is added to lookup the
dentry for the cgroupns-root.

Signed-off-by: Aditya Kali <***@google.com>
---
fs/kernfs/mount.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/kernfs.h | 2 ++
kernel/cgroup.c | 47 +++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9..e334f45 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -62,6 +62,54 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
return NULL;
}

+/**
+ * kernfs_obtain_root - create new root dentry for the given kernfs_node.
+ * @sb: the kernfs super_block
+ * @kn: kernfs_node for which a dentry is needed
+ *
+ * This can be used by callers which want to mount only a part of the kernfs
+ * as root of the filesystem.
+ */
+struct dentry *kernfs_obtain_root(struct super_block *sb,
+ struct kernfs_node *kn)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ BUG_ON(sb->s_op != &kernfs_sops);
+
+ /* inode for the given kernfs_node should already exist. */
+ inode = ilookup(sb, kn->ino);
+ if (!inode) {
+ pr_debug("kernfs: could not get inode for '");
+ pr_cont_kernfs_path(kn);
+ pr_cont("'.\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* instantiate and link root dentry */
+ dentry = d_obtain_root(inode);
+ if (!dentry) {
+ pr_debug("kernfs: could not get dentry for '");
+ pr_cont_kernfs_path(kn);
+ pr_cont("'.\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* If this is a new dentry, set it up. We need kernfs_mutex because this
+ * may be called by callers other than kernfs_fill_super. */
+ mutex_lock(&kernfs_mutex);
+ if (!dentry->d_fsdata) {
+ kernfs_get(kn);
+ dentry->d_fsdata = kn;
+ } else {
+ WARN_ON(dentry->d_fsdata != kn);
+ }
+ mutex_unlock(&kernfs_mutex);
+
+ return dentry;
+}
+
static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
{
struct kernfs_super_info *info = kernfs_info(sb);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 3c2be75..b9538e0 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -274,6 +274,8 @@ void kernfs_put(struct kernfs_node *kn);
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);

+struct dentry *kernfs_obtain_root(struct super_block *sb,
+ struct kernfs_node *kn);
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
unsigned int flags, void *priv);
void kernfs_destroy_root(struct kernfs_root *root);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2fc0dfa..ef27dc4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1302,6 +1302,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)

memset(opts, 0, sizeof(*opts));

+ /* Implicitly add CGRP_ROOT_SANE_BEHAVIOR if inside a non-init cgroup
+ * namespace.
+ */
+ if (current->nsproxy->cgroup_ns != &init_cgroup_ns) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ }
+
while ((token = strsep(&o, ",")) != NULL) {
nr_opts++;

@@ -1391,7 +1398,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)

if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
+ if (nr_opts > 1) {
pr_err("sane_behavior: no other mount options allowed\n");
return -EINVAL;
}
@@ -1581,6 +1588,15 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}

+struct dentry *cgroupns_get_root(struct super_block *sb,
+ struct cgroup_namespace *ns)
+{
+ struct dentry *nsdentry;
+
+ nsdentry = kernfs_obtain_root(sb, ns->root_cgrp->kn);
+ return nsdentry;
+}
+
static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
LIST_HEAD(tmp_links);
@@ -1684,6 +1700,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int ret;
int i;
bool new_sb;
+ struct cgroup_namespace *ns =
+ get_cgroup_ns(current->nsproxy->cgroup_ns);
+
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EPERM);
+ }

/*
* The first time anyone tries to mount a cgroup, enable the list
@@ -1816,11 +1840,28 @@ out_free:
kfree(opts.release_agent);
kfree(opts.name);

- if (ret)
+ if (ret) {
+ put_cgroup_ns(ns);
return ERR_PTR(ret);
+ }

dentry = kernfs_mount(fs_type, flags, root->kf_root,
CGROUP_SUPER_MAGIC, &new_sb);
+
+ if (!IS_ERR(dentry)) {
+ /* If this mount is for a non-init cgroup namespace, then
+ * Instead of root's dentry, we return the dentry specific to
+ * the cgroupns->root_cgrp.
+ */
+ if (ns != &init_cgroup_ns) {
+ struct dentry *nsdentry;
+
+ nsdentry = cgroupns_get_root(dentry->d_sb, ns);
+ dput(dentry);
+ dentry = nsdentry;
+ }
+ }
+
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);

@@ -1833,6 +1874,7 @@ out_free:
deactivate_super(pinned_sb);
}

+ put_cgroup_ns(ns);
return dentry;
}

@@ -1861,6 +1903,7 @@ static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};

static struct kobject *cgroup_kobj;
--
2.1.0.rc2.206.gedb03e5
Serge E. Hallyn
2014-10-17 12:19:57 UTC
Permalink
Post by Aditya Kali
This patch enables cgroup mounting inside userns when a process
as appropriate privileges. The cgroup filesystem mounted is
rooted at the cgroupns-root. Thus, in a container-setup, only
the hierarchy under the cgroupns-root is exposed inside the container.
This allows container management tools to run inside the containers
without depending on any global state.
In order to support this, a new kernfs api is added to lookup the
dentry for the cgroupns-root.
---
fs/kernfs/mount.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/kernfs.h | 2 ++
kernel/cgroup.c | 47 +++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 95 insertions(+), 2 deletions(-)
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9..e334f45 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -62,6 +62,54 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
return NULL;
}
+/**
+ * kernfs_make_root - create new root dentry for the given kernfs_node.
+ *
+ * This can used used by callers which want to mount only a part of the kernfs
+ * as root of the filesystem.
+ */
+struct dentry *kernfs_obtain_root(struct super_block *sb,
+ struct kernfs_node *kn)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ BUG_ON(sb->s_op != &kernfs_sops);
+
+ /* inode for the given kernfs_node should already exist. */
+ inode = ilookup(sb, kn->ino);
+ if (!inode) {
+ pr_debug("kernfs: could not get inode for '");
+ pr_cont_kernfs_path(kn);
+ pr_cont("'.\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* instantiate and link root dentry */
+ dentry = d_obtain_root(inode);
+ if (!dentry) {
+ pr_debug("kernfs: could not get dentry for '");
+ pr_cont_kernfs_path(kn);
+ pr_cont("'.\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* If this is a new dentry, set it up. We need kernfs_mutex because this
+ * may be called by callers other than kernfs_fill_super. */
+ mutex_lock(&kernfs_mutex);
+ if (!dentry->d_fsdata) {
+ kernfs_get(kn);
+ dentry->d_fsdata = kn;
+ } else {
+ WARN_ON(dentry->d_fsdata != kn);
+ }
+ mutex_unlock(&kernfs_mutex);
+
+ return dentry;
+}
+
static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
{
struct kernfs_super_info *info = kernfs_info(sb);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 3c2be75..b9538e0 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -274,6 +274,8 @@ void kernfs_put(struct kernfs_node *kn);
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
+struct dentry *kernfs_obtain_root(struct super_block *sb,
+ struct kernfs_node *kn);
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
unsigned int flags, void *priv);
void kernfs_destroy_root(struct kernfs_root *root);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2fc0dfa..ef27dc4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1302,6 +1302,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
memset(opts, 0, sizeof(*opts));
+ /* Implicitly add CGRP_ROOT_SANE_BEHAVIOR if inside a non-init cgroup
+ * namespace.
+ */
+ if (current->nsproxy->cgroup_ns != &init_cgroup_ns) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ }
+
while ((token = strsep(&o, ",")) != NULL) {
nr_opts++;
@@ -1391,7 +1398,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
+ if (nr_opts > 1) {
pr_err("sane_behavior: no other mount options allowed\n");
return -EINVAL;
}
@@ -1581,6 +1588,15 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
+struct dentry *cgroupns_get_root(struct super_block *sb,
+ struct cgroup_namespace *ns)
+{
+ struct dentry *nsdentry;
+
+ nsdentry = kernfs_obtain_root(sb, ns->root_cgrp->kn);
+ return nsdentry;
+}
+
static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
LIST_HEAD(tmp_links);
@@ -1684,6 +1700,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int ret;
int i;
bool new_sb;
+ struct cgroup_namespace *ns =
+ get_cgroup_ns(current->nsproxy->cgroup_ns);
+
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EPERM);
+ }
/*
* The first time anyone tries to mount a cgroup, enable the list
kfree(opts.release_agent);
kfree(opts.name);
- if (ret)
+ if (ret) {
+ put_cgroup_ns(ns);
return ERR_PTR(ret);
+ }
dentry = kernfs_mount(fs_type, flags, root->kf_root,
CGROUP_SUPER_MAGIC, &new_sb);
+
+ if (!IS_ERR(dentry)) {
+ /* If this mount is for a non-init cgroup namespace, then
+ * Instead of root's dentry, we return the dentry specific to
+ * the cgroupns->root_cgrp.
+ */
+ if (ns != &init_cgroup_ns) {
+ struct dentry *nsdentry;
+
+ nsdentry = cgroupns_get_root(dentry->d_sb, ns);
+ dput(dentry);
+ dentry = nsdentry;
+ }
+ }
+
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
deactivate_super(pinned_sb);
}
+ put_cgroup_ns(ns);
return dentry;
}
@@ -1861,6 +1903,7 @@ static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
static struct kobject *cgroup_kobj;
--
2.1.0.rc2.206.gedb03e5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Aditya Kali
2014-10-13 21:23:46 UTC
Permalink
move cgroup_get() and cgroup_put() into cgroup.h so that
they can be called from other places.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
include/linux/cgroup.h | 22 ++++++++++++++++++++++
kernel/cgroup.c | 22 ----------------------
2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 80ed6e0..4a0eb2d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -521,6 +521,28 @@ static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
return cgrp->root == &cgrp_dfl_root;
}

+/* convenient tests for these bits */
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline void cgroup_get(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
+
+static inline bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
+static inline void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
+}
+
/* no synchronization, the result can only be used as a hint */
static inline bool cgroup_has_tasks(struct cgroup *cgrp)
{
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 56d507b..2b3e9f9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -284,12 +284,6 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
return cgroup_css(cgrp, ss);
}

-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -1002,22 +996,6 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
return mode;
}

-static void cgroup_get(struct cgroup *cgrp)
-{
- WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
-}
-
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
/**
* cgroup_refresh_child_subsys_mask - update child_subsys_mask
* @cgrp: the target cgroup
--
2.1.0.rc2.206.gedb03e5
Serge E. Hallyn
2014-10-16 16:14:21 UTC
Permalink
Post by Aditya Kali
move cgroup_get() and cgroup_put() into cgroup.h so that
they can be called from other places.
---
include/linux/cgroup.h | 22 ++++++++++++++++++++++
kernel/cgroup.c | 22 ----------------------
2 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 80ed6e0..4a0eb2d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -521,6 +521,28 @@ static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
return cgrp->root == &cgrp_dfl_root;
}
+/* convenient tests for these bits */
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline void cgroup_get(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
+
+static inline bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
+static inline void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
+}
+
/* no synchronization, the result can only be used as a hint */
static inline bool cgroup_has_tasks(struct cgroup *cgrp)
{
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 56d507b..2b3e9f9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -284,12 +284,6 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
return cgroup_css(cgrp, ss);
}
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -1002,22 +996,6 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
return mode;
}
-static void cgroup_get(struct cgroup *cgrp)
-{
- WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
-}
-
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
/**
* cgroup_refresh_child_subsys_mask - update child_subsys_mask
--
2.1.0.rc2.206.gedb03e5
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Aditya Kali
2014-10-13 21:23:47 UTC
Permalink
Introduce the ability to create new cgroup namespace. The newly created
cgroup namespace remembers the 'struct cgroup *root_cgrp' at the point
of creation of the cgroup namespace. The task that creates the new
cgroup namespace and all its future children will now be restricted only
to the cgroup hierarchy under this root_cgrp.
The main purpose of cgroup namespace is to virtualize the contents
of /proc/self/cgroup file. Processes inside a cgroup namespace
are only able to see paths relative to their namespace root.
This allows container-tools (like libcontainer, lxc, lmctfy, etc.)
to create completely virtualized containers without leaking system
level cgroup hierarchy to the task.
This patch only implements the 'unshare' part of the cgroupns.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
fs/proc/namespaces.c | 3 +
include/linux/cgroup.h | 18 +++++-
include/linux/cgroup_namespace.h | 62 +++++++++++++++++++
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 4 ++
init/Kconfig | 9 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 11 ++++
kernel/cgroup_namespace.c | 128 +++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 2 +-
kernel/nsproxy.c | 19 +++++-
11 files changed, 255 insertions(+), 4 deletions(-)

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 8902609..e04ed4b 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -32,6 +32,9 @@ static const struct proc_ns_operations *ns_entries[] = {
&userns_operations,
#endif
&mntns_operations,
+#ifdef CONFIG_CGROUP_NS
+ &cgroupns_operations,
+#endif
};

static const struct file_operations ns_file_operations = {
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4a0eb2d..aa86495 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,8 @@
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/wait.h>
+#include <linux/nsproxy.h>
+#include <linux/types.h>

#ifdef CONFIG_CGROUPS

@@ -460,6 +462,13 @@ struct cftype {
#endif
};

+struct cgroup_namespace {
+ atomic_t count;
+ unsigned int proc_inum;
+ struct user_namespace *user_ns;
+ struct cgroup *root_cgrp;
+};
+
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;

@@ -584,10 +593,17 @@ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
return kernfs_name(cgrp->kn, buf, buflen);
}

+static inline char * __must_check cgroup_path_ns(struct cgroup_namespace *ns,
+ struct cgroup *cgrp, char *buf,
+ size_t buflen)
+{
+ return kernfs_path_from_node(ns->root_cgrp->kn, cgrp->kn, buf, buflen);
+}
+
static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
size_t buflen)
{
- return kernfs_path(cgrp->kn, buf, buflen);
+ return cgroup_path_ns(current->nsproxy->cgroup_ns, cgrp, buf, buflen);
}

static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h
new file mode 100644
index 0000000..9f637fe
--- /dev/null
+++ b/include/linux/cgroup_namespace.h
@@ -0,0 +1,62 @@
+#ifndef _LINUX_CGROUP_NAMESPACE_H
+#define _LINUX_CGROUP_NAMESPACE_H
+
+#include <linux/nsproxy.h>
+#include <linux/cgroup.h>
+#include <linux/types.h>
+#include <linux/user_namespace.h>
+
+extern struct cgroup_namespace init_cgroup_ns;
+
+static inline struct cgroup *task_cgroupns_root(struct task_struct *tsk)
+{
+ return tsk->nsproxy->cgroup_ns->root_cgrp;
+}
+
+#ifdef CONFIG_CGROUP_NS
+
+extern void free_cgroup_ns(struct cgroup_namespace *ns);
+
+static inline struct cgroup_namespace *get_cgroup_ns(
+ struct cgroup_namespace *ns)
+{
+ if (ns)
+ atomic_inc(&ns->count);
+ return ns;
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+ if (ns && atomic_dec_and_test(&ns->count))
+ free_cgroup_ns(ns);
+}
+
+extern struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns);
+
+#else /* CONFIG_CGROUP_NS */
+
+static inline struct cgroup_namespace *get_cgroup_ns(
+ struct cgroup_namespace *ns)
+{
+ return &init_cgroup_ns;
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+}
+
+static inline struct cgroup_namespace *copy_cgroup_ns(
+ unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns) {
+ if (flags & CLONE_NEWCGROUP)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+
+#endif /* CONFIG_CGROUP_NS */
+
+#endif /* _LINUX_CGROUP_NAMESPACE_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 35fa08f..ac0d65b 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -8,6 +8,7 @@ struct mnt_namespace;
struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
+struct cgroup_namespace;
struct fs_struct;

/*
@@ -33,6 +34,7 @@ struct nsproxy {
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns_for_children;
struct net *net_ns;
+ struct cgroup_namespace *cgroup_ns;
};
extern struct nsproxy init_nsproxy;

diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 34a1e10..e56dd73 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -6,6 +6,8 @@

struct pid_namespace;
struct nsproxy;
+struct task_struct;
+struct inode;

struct proc_ns_operations {
const char *name;
@@ -27,6 +29,7 @@ extern const struct proc_ns_operations ipcns_operations;
extern const struct proc_ns_operations pidns_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations cgroupns_operations;

/*
* We always define these enumerators
@@ -37,6 +40,7 @@ enum {
PROC_UTS_INIT_INO = 0xEFFFFFFEU,
PROC_USER_INIT_INO = 0xEFFFFFFDU,
PROC_PID_INIT_INO = 0xEFFFFFFCU,
+ PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
};

#ifdef CONFIG_PROC_FS
diff --git a/init/Kconfig b/init/Kconfig
index e84c642..c3be001 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1144,6 +1144,15 @@ config DEBUG_BLK_CGROUP
Enable some debugging help. Currently it exports additional stat
files in a cgroup which can be useful for debugging.

+config CGROUP_NS
+ bool "CGroup Namespaces"
+ default n
+ help
+ This options enables CGroup Namespaces which can be used to isolate
+ cgroup paths. This feature is only useful when unified cgroup
+ hierarchy is in use (i.e. cgroups are mounted with sane_behavior
+ option).
+
endif # CGROUPS

config CHECKPOINT_RESTORE
diff --git a/kernel/Makefile b/kernel/Makefile
index dc5c775..75334f8 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
+obj-$(CONFIG_CGROUP_NS) += cgroup_namespace.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2b3e9f9..f8099b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,8 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
+#include <linux/proc_ns.h>
+#include <linux/cgroup_namespace.h>

#include <linux/atomic.h>

@@ -195,6 +197,15 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);

+struct cgroup_namespace init_cgroup_ns = {
+ .count = {
+ .counter = 1,
+ },
+ .proc_inum = PROC_CGROUP_INIT_INO,
+ .user_ns = &init_user_ns,
+ .root_cgrp = &cgrp_dfl_root.cgrp,
+};
+
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
new file mode 100644
index 0000000..c16604f
--- /dev/null
+++ b/kernel/cgroup_namespace.c
@@ -0,0 +1,128 @@
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_namespace.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+ struct cgroup_namespace *new_ns;
+
+ new_ns = kmalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ if (new_ns)
+ atomic_set(&new_ns->count, 1);
+ return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+ cgroup_put(ns->root_cgrp);
+ put_user_ns(ns->user_ns);
+ proc_free_inum(ns->proc_inum);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ struct cgroup_namespace *new_ns = NULL;
+ struct cgroup *cgrp = NULL;
+ int err;
+
+ BUG_ON(!old_ns);
+
+ if (!(flags & CLONE_NEWCGROUP))
+ return get_cgroup_ns(old_ns);
+
+ /* Allow only sysadmin to create cgroup namespace. */
+ err = -EPERM;
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ goto err_out;
+
+ /* Prevent cgroup changes for this task. */
+ threadgroup_lock(current);
+
+ cgrp = get_task_cgroup(current);
+
+ /* Creating new CGROUPNS is supported only when unified hierarchy is in
+ * use. */
+ err = -EINVAL;
+ if (!cgroup_on_dfl(cgrp))
+ goto err_out_unlock;
+
+ err = -ENOMEM;
+ new_ns = alloc_cgroup_ns();
+ if (!new_ns)
+ goto err_out_unlock;
+
+ err = proc_alloc_inum(&new_ns->proc_inum);
+ if (err)
+ goto err_out_unlock;
+
+ new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->root_cgrp = cgrp;
+
+ threadgroup_unlock(current);
+
+ return new_ns;
+
+err_out_unlock:
+ threadgroup_unlock(current);
+err_out:
+ if (cgrp)
+ cgroup_put(cgrp);
+ kfree(new_ns);
+ return ERR_PTR(err);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
+{
+ pr_info("setns not supported for cgroup namespace");
+ return -EINVAL;
+}
+
+static void *cgroupns_get(struct task_struct *task)
+{
+ struct cgroup_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->cgroup_ns;
+ get_cgroup_ns(ns);
+ }
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void cgroupns_put(void *ns)
+{
+ put_cgroup_ns(ns);
+}
+
+static unsigned int cgroupns_inum(void *ns)
+{
+ struct cgroup_namespace *cgroup_ns = ns;
+
+ return cgroup_ns->proc_inum;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+ .name = "cgroup",
+ .type = CLONE_NEWCGROUP,
+ .get = cgroupns_get,
+ .put = cgroupns_put,
+ .install = cgroupns_install,
+ .inum = cgroupns_inum,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+ return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0cf9cdb..cc06851 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1790,7 +1790,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0a..a8b1970 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -25,6 +25,7 @@
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
+#include <linux/cgroup_namespace.h>

static struct kmem_cache *nsproxy_cachep;

@@ -39,6 +40,7 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_NET
.net_ns = &init_net,
#endif
+ .cgroup_ns = &init_cgroup_ns,
};

static inline struct nsproxy *create_nsproxy(void)
@@ -92,6 +94,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_pid;
}

+ new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
+ tsk->nsproxy->cgroup_ns);
+ if (IS_ERR(new_nsp->cgroup_ns)) {
+ err = PTR_ERR(new_nsp->cgroup_ns);
+ goto out_cgroup;
+ }
+
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
@@ -101,6 +110,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return new_nsp;

out_net:
+ if (new_nsp->cgroup_ns)
+ put_cgroup_ns(new_nsp->cgroup_ns);
+out_cgroup:
if (new_nsp->pid_ns_for_children)
put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
@@ -128,7 +140,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *new_ns;

if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWPID | CLONE_NEWNET)))) {
+ CLONE_NEWPID | CLONE_NEWNET |
+ CLONE_NEWCGROUP)))) {
get_nsproxy(old_ns);
return 0;
}
@@ -165,6 +178,8 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ if (ns->cgroup_ns)
+ put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
@@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;

if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
return 0;

user_ns = new_cred ? new_cred->user_ns : current_user_ns();
--
2.1.0.rc2.206.gedb03e5
Serge E. Hallyn
2014-10-16 16:37:03 UTC
Permalink
Post by Aditya Kali
Introduce the ability to create new cgroup namespace. The newly created
cgroup namespace remembers the 'struct cgroup *root_cgrp' at the point
of creation of the cgroup namespace. The task that creates the new
cgroup namespace and all its future children will now be restricted only
to the cgroup hierarchy under this root_cgrp.
The main purpose of cgroup namespace is to virtualize the contents
of /proc/self/cgroup file. Processes inside a cgroup namespace
are only able to see paths relative to their namespace root.
This allows container-tools (like libcontainer, lxc, lmctfy, etc.)
to create completely virtualized containers without leaking system
level cgroup hierarchy to the task.
This patch only implements the 'unshare' part of the cgroupns.
I'm not sure that the CONFIG_CGROUP_NS is worthwhile. If you already
have cgroups in the kernel this won't add much in the way of memory
usage, right? And I think the 'experimental' argument has long since
been squashed. So I'd argue for simplifying this patch by removing
CONFIG_CGROUP_NS.

(more below)
Post by Aditya Kali
---
fs/proc/namespaces.c | 3 +
include/linux/cgroup.h | 18 +++++-
include/linux/cgroup_namespace.h | 62 +++++++++++++++++++
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 4 ++
init/Kconfig | 9 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 11 ++++
kernel/cgroup_namespace.c | 128 +++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 2 +-
kernel/nsproxy.c | 19 +++++-
11 files changed, 255 insertions(+), 4 deletions(-)
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 8902609..e04ed4b 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -32,6 +32,9 @@ static const struct proc_ns_operations *ns_entries[] = {
&userns_operations,
#endif
&mntns_operations,
+#ifdef CONFIG_CGROUP_NS
+ &cgroupns_operations,
+#endif
};
static const struct file_operations ns_file_operations = {
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4a0eb2d..aa86495 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,8 @@
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/wait.h>
+#include <linux/nsproxy.h>
+#include <linux/types.h>
#ifdef CONFIG_CGROUPS
@@ -460,6 +462,13 @@ struct cftype {
#endif
};
+struct cgroup_namespace {
+ atomic_t count;
+ unsigned int proc_inum;
+ struct user_namespace *user_ns;
+ struct cgroup *root_cgrp;
+};
+
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
@@ -584,10 +593,17 @@ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
return kernfs_name(cgrp->kn, buf, buflen);
}
+static inline char * __must_check cgroup_path_ns(struct cgroup_namespace *ns,
+ struct cgroup *cgrp, char *buf,
+ size_t buflen)
+{
+ return kernfs_path_from_node(ns->root_cgrp->kn, cgrp->kn, buf, buflen);
+}
+
static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
size_t buflen)
{
- return kernfs_path(cgrp->kn, buf, buflen);
+ return cgroup_path_ns(current->nsproxy->cgroup_ns, cgrp, buf, buflen);
}
static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h
new file mode 100644
index 0000000..9f637fe
--- /dev/null
+++ b/include/linux/cgroup_namespace.h
@@ -0,0 +1,62 @@
+#ifndef _LINUX_CGROUP_NAMESPACE_H
+#define _LINUX_CGROUP_NAMESPACE_H
+
+#include <linux/nsproxy.h>
+#include <linux/cgroup.h>
+#include <linux/types.h>
+#include <linux/user_namespace.h>
+
+extern struct cgroup_namespace init_cgroup_ns;
+
+static inline struct cgroup *task_cgroupns_root(struct task_struct *tsk)
+{
+ return tsk->nsproxy->cgroup_ns->root_cgrp;
Per the rules in nsproxy.h, you should be taking the task_lock here.

(If you are making assumptions about tsk then you need to state them
here - I only looked quickly enough to see that you pass in 'leader')
Post by Aditya Kali
+}
+
+#ifdef CONFIG_CGROUP_NS
+
+extern void free_cgroup_ns(struct cgroup_namespace *ns);
+
+static inline struct cgroup_namespace *get_cgroup_ns(
+ struct cgroup_namespace *ns)
+{
+ if (ns)
+ atomic_inc(&ns->count);
+ return ns;
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+ if (ns && atomic_dec_and_test(&ns->count))
+ free_cgroup_ns(ns);
+}
+
+extern struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns);
+
+#else /* CONFIG_CGROUP_NS */
+
+static inline struct cgroup_namespace *get_cgroup_ns(
+ struct cgroup_namespace *ns)
+{
+ return &init_cgroup_ns;
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+}
+
+static inline struct cgroup_namespace *copy_cgroup_ns(
+ unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns) {
+ if (flags & CLONE_NEWCGROUP)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+
+#endif /* CONFIG_CGROUP_NS */
+
+#endif /* _LINUX_CGROUP_NAMESPACE_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 35fa08f..ac0d65b 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -8,6 +8,7 @@ struct mnt_namespace;
struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
+struct cgroup_namespace;
struct fs_struct;
/*
@@ -33,6 +34,7 @@ struct nsproxy {
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns_for_children;
struct net *net_ns;
+ struct cgroup_namespace *cgroup_ns;
};
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 34a1e10..e56dd73 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -6,6 +6,8 @@
struct pid_namespace;
struct nsproxy;
+struct task_struct;
+struct inode;
struct proc_ns_operations {
const char *name;
@@ -27,6 +29,7 @@ extern const struct proc_ns_operations ipcns_operations;
extern const struct proc_ns_operations pidns_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations cgroupns_operations;
/*
* We always define these enumerators
@@ -37,6 +40,7 @@ enum {
PROC_UTS_INIT_INO = 0xEFFFFFFEU,
PROC_USER_INIT_INO = 0xEFFFFFFDU,
PROC_PID_INIT_INO = 0xEFFFFFFCU,
+ PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
};
#ifdef CONFIG_PROC_FS
diff --git a/init/Kconfig b/init/Kconfig
index e84c642..c3be001 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1144,6 +1144,15 @@ config DEBUG_BLK_CGROUP
Enable some debugging help. Currently it exports additional stat
files in a cgroup which can be useful for debugging.
+config CGROUP_NS
+ bool "CGroup Namespaces"
+ default n
+ help
+ This options enables CGroup Namespaces which can be used to isolate
+ cgroup paths. This feature is only useful when unified cgroup
+ hierarchy is in use (i.e. cgroups are mounted with sane_behavior
+ option).
+
endif # CGROUPS
config CHECKPOINT_RESTORE
diff --git a/kernel/Makefile b/kernel/Makefile
index dc5c775..75334f8 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
+obj-$(CONFIG_CGROUP_NS) += cgroup_namespace.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2b3e9f9..f8099b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,8 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
+#include <linux/proc_ns.h>
+#include <linux/cgroup_namespace.h>
#include <linux/atomic.h>
@@ -195,6 +197,15 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+struct cgroup_namespace init_cgroup_ns = {
+ .count = {
+ .counter = 1,
+ },
+ .proc_inum = PROC_CGROUP_INIT_INO,
+ .user_ns = &init_user_ns,
This might mean that you should bump the init_user_ns refcount.
Post by Aditya Kali
+ .root_cgrp = &cgrp_dfl_root.cgrp,
+};
+
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
new file mode 100644
index 0000000..c16604f
--- /dev/null
+++ b/kernel/cgroup_namespace.c
@@ -0,0 +1,128 @@
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_namespace.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+ struct cgroup_namespace *new_ns;
+
+ new_ns = kmalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ if (new_ns)
+ atomic_set(&new_ns->count, 1);
+ return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+ cgroup_put(ns->root_cgrp);
+ put_user_ns(ns->user_ns);
This is a problem on the error path in copy_cgroup_ns. The
alloc_cgroup_ns() doesn't initialize these values, so if
you should fail in proc_alloc_inum() you'll show up here
with random values in ns->*.
Post by Aditya Kali
+ proc_free_inum(ns->proc_inum);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ struct cgroup_namespace *new_ns = NULL;
+ struct cgroup *cgrp = NULL;
+ int err;
+
+ BUG_ON(!old_ns);
+
+ if (!(flags & CLONE_NEWCGROUP))
+ return get_cgroup_ns(old_ns);
+
+ /* Allow only sysadmin to create cgroup namespace. */
+ err = -EPERM;
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ goto err_out;
+
+ /* Prevent cgroup changes for this task. */
+ threadgroup_lock(current);
+
+ cgrp = get_task_cgroup(current);
+
+ /* Creating new CGROUPNS is supported only when unified hierarchy is in
+ * use. */
Oh, drat. Well, I'll take it, but under protest :)
Post by Aditya Kali
+ err = -EINVAL;
+ if (!cgroup_on_dfl(cgrp))
+ goto err_out_unlock;
+
+ err = -ENOMEM;
+ new_ns = alloc_cgroup_ns();
+ if (!new_ns)
+ goto err_out_unlock;
+
+ err = proc_alloc_inum(&new_ns->proc_inum);
+ if (err)
+ goto err_out_unlock;
+
+ new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->root_cgrp = cgrp;
+
+ threadgroup_unlock(current);
+
+ return new_ns;
+
+ threadgroup_unlock(current);
+ if (cgrp)
+ cgroup_put(cgrp);
+ kfree(new_ns);
+ return ERR_PTR(err);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
+{
+ pr_info("setns not supported for cgroup namespace");
+ return -EINVAL;
+}
+
+static void *cgroupns_get(struct task_struct *task)
+{
+ struct cgroup_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->cgroup_ns;
+ get_cgroup_ns(ns);
+ }
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void cgroupns_put(void *ns)
+{
+ put_cgroup_ns(ns);
+}
+
+static unsigned int cgroupns_inum(void *ns)
+{
+ struct cgroup_namespace *cgroup_ns = ns;
+
+ return cgroup_ns->proc_inum;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+ .name = "cgroup",
+ .type = CLONE_NEWCGROUP,
+ .get = cgroupns_get,
+ .put = cgroupns_put,
+ .install = cgroupns_install,
+ .inum = cgroupns_inum,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+ return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0cf9cdb..cc06851 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1790,7 +1790,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0a..a8b1970 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -25,6 +25,7 @@
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
+#include <linux/cgroup_namespace.h>
static struct kmem_cache *nsproxy_cachep;
@@ -39,6 +40,7 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_NET
.net_ns = &init_net,
#endif
+ .cgroup_ns = &init_cgroup_ns,
};
static inline struct nsproxy *create_nsproxy(void)
@@ -92,6 +94,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_pid;
}
+ new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
+ tsk->nsproxy->cgroup_ns);
+ if (IS_ERR(new_nsp->cgroup_ns)) {
+ err = PTR_ERR(new_nsp->cgroup_ns);
+ goto out_cgroup;
+ }
+
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
@@ -101,6 +110,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return new_nsp;
+ if (new_nsp->cgroup_ns)
+ put_cgroup_ns(new_nsp->cgroup_ns);
if (new_nsp->pid_ns_for_children)
put_pid_ns(new_nsp->pid_ns_for_children);
@@ -128,7 +140,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *new_ns;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWPID | CLONE_NEWNET)))) {
+ CLONE_NEWPID | CLONE_NEWNET |
+ CLONE_NEWCGROUP)))) {
get_nsproxy(old_ns);
return 0;
}
@@ -165,6 +178,8 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ if (ns->cgroup_ns)
+ put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
@@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
--
2.1.0.rc2.206.gedb03e5
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Aditya Kali
2014-10-13 21:23:48 UTC
Permalink
Restrict the following operations for the calling task:
* cgroup_mkdir & cgroup_rmdir
* cgroup_attach_task
* writes to cgroup files outside of task's cgroupns-root

Also, reads of the /proc/<pid>/cgroup file are now restricted
to tasks under the same cgroupns-root. If a task tries to look
at cgroup of another task outside of its cgroupns-root, then
it won't be able to see anything for the default hierarchy.
This is same as if the cgroups are not mounted.

Signed-off-by: Aditya Kali <adityakali-hpIqsD4AKlfQT0dZR+***@public.gmane.org>
---
kernel/cgroup.c | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f8099b4..2fc0dfa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2318,6 +2318,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *task;
int ret;

+ /* Only allow changing cgroups accessible within task's cgroup
+ * namespace. i.e. 'dst_cgrp' should be a descendant of task's
+ * cgroupns->root_cgrp. */
+ if (!cgroup_is_descendant(dst_cgrp, task_cgroupns_root(leader)))
+ return -EPERM;
+
/* look up all src csets */
down_read(&css_set_rwsem);
rcu_read_lock();
@@ -2882,6 +2888,10 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
struct cgroup_subsys_state *css;
int ret;

+ /* Reject writes to cgroup files outside of task's cgroupns-root. */
+ if (!cgroup_is_descendant(cgrp, task_cgroupns_root(current)))
+ return -EINVAL;
+
if (cft->write)
return cft->write(of, buf, nbytes, off);

@@ -4560,6 +4570,13 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
parent = cgroup_kn_lock_live(parent_kn);
if (!parent)
return -ENODEV;
+
+ /* Allow mkdir only within process's cgroup namespace root. */
+ if (!cgroup_is_descendant(parent, task_cgroupns_root(current))) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
root = parent->root;

/* allocate the cgroup and its ID, 0 is reserved for the root */
@@ -4822,6 +4839,13 @@ static int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;

+ /* Allow rmdir only within process's cgroup namespace root.
+ * The process can't delete its own root anyways. */
+ if (!cgroup_is_descendant(cgrp, task_cgroupns_root(current))) {
+ cgroup_kn_unlock(kn);
+ return -EPERM;
+ }
+
ret = cgroup_destroy_locked(cgrp);

cgroup_kn_unlock(kn);
@@ -5051,6 +5075,15 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
continue;

+ cgrp = task_cgroup_from_root(tsk, root);
+
+ /* The cgroup path on default hierarchy is shown only if it
+ * falls under current task's cgroupns-root.
+ */
+ if (root == &cgrp_dfl_root &&
+ !cgroup_is_descendant(cgrp, task_cgroupns_root(current)))
+ continue;
+
seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
@@ -5059,7 +5092,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
- cgrp = task_cgroup_from_root(tsk, root);
path = cgroup_path(cgrp, buf, PATH_MAX);
if (!path) {
retval = -ENAMETOOLONG;
--
2.1.0.rc2.206.gedb03e5
Serge E. Hallyn
2014-10-17 09:28:14 UTC
Permalink
Post by Aditya Kali
* cgroup_mkdir & cgroup_rmdir
* cgroup_attach_task
* writes to cgroup files outside of task's cgroupns-root
Also, read of /proc/<pid>/cgroup file is now restricted only
to tasks under same cgroupns-root. If a task tries to look
at cgroup of another task outside of its cgroupns-root, then
it won't be able to see anything for the default hierarchy.
This is same as if the cgroups are not mounted.
So this is a bit different from some other namespaces - if I
have an open fd to a file, then setns into a mntns where that
file is not addressable, I can still use the file.

I guess not allowing attach to a cgroup outside our ns is a
good failsafe as we'll otherwise risk falling off a cliff in
some code, but I'm not sure the cgroup_file_write/mkdir/rmdir
restrictions are needed. (And really I can fchdir to a
directory not in my ns, so the cgroup-attach restriction isn't
any more justified).

Still I'm not strictly opposed to this, so

Acked-by: Serge Hallyn <serge.hallyn-Z7WLFzj8eWMS+***@public.gmane.org>

just wanted to point this out.
Post by Aditya Kali
---
kernel/cgroup.c | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f8099b4..2fc0dfa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2318,6 +2318,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *task;
int ret;
+ /* Only allow changing cgroups accessible within task's cgroup
+ * namespace. i.e. 'dst_cgrp' should be a descendant of task's
+ * cgroupns->root_cgrp. */
+ if (!cgroup_is_descendant(dst_cgrp, task_cgroupns_root(leader)))
+ return -EPERM;
+
/* look up all src csets */
down_read(&css_set_rwsem);
rcu_read_lock();
@@ -2882,6 +2888,10 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
struct cgroup_subsys_state *css;
int ret;
+ /* Reject writes to cgroup files outside of task's cgroupns-root. */
+ if (!cgroup_is_descendant(cgrp, task_cgroupns_root(current)))
+ return -EINVAL;
+
if (cft->write)
return cft->write(of, buf, nbytes, off);
@@ -4560,6 +4570,13 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
parent = cgroup_kn_lock_live(parent_kn);
if (!parent)
return -ENODEV;
+
+ /* Allow mkdir only within process's cgroup namespace root. */
+ if (!cgroup_is_descendant(parent, task_cgroupns_root(current))) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
root = parent->root;
/* allocate the cgroup and its ID, 0 is reserved for the root */
@@ -4822,6 +4839,13 @@ static int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;
+ /* Allow rmdir only within process's cgroup namespace root.
+ * The process can't delete its own root anyways. */
+ if (!cgroup_is_descendant(cgrp, task_cgroupns_root(current))) {
+ cgroup_kn_unlock(kn);
+ return -EPERM;
+ }
+
ret = cgroup_destroy_locked(cgrp);
cgroup_kn_unlock(kn);
@@ -5051,6 +5075,15 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
continue;
+ cgrp = task_cgroup_from_root(tsk, root);
+
+ /* The cgroup path on default hierarchy is shown only if it
+ * falls under current task's cgroupns-root.
+ */
+ if (root == &cgrp_dfl_root &&
+ !cgroup_is_descendant(cgrp, task_cgroupns_root(current)))
+ continue;
+
seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
@@ -5059,7 +5092,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
- cgrp = task_cgroup_from_root(tsk, root);
path = cgroup_path(cgrp, buf, PATH_MAX);
if (!path) {
retval = -ENAMETOOLONG;
--
2.1.0.rc2.206.gedb03e5
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
2014-10-19 04:57:30 UTC
Permalink
Post by Aditya Kali
* cgroup_mkdir & cgroup_rmdir
* cgroup_attach_task
* writes to cgroup files outside of task's cgroupns-root
Also, read of /proc/<pid>/cgroup file is now restricted only
to tasks under same cgroupns-root. If a task tries to look
at cgroup of another task outside of its cgroupns-root, then
it won't be able to see anything for the default hierarchy.
This is same as if the cgroups are not mounted.
So I think this patch is out of order.

We should add the namespace infrastructure and the restrictions before
we allow creation of the namespace. Otherwise there is a bisection
point where cgroup namespaces are broken or at the very least have a
security hole. Since we can anticipate this let's see if we can figure
out how to avoid it.

Eric
Post by Aditya Kali
---
kernel/cgroup.c | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f8099b4..2fc0dfa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2318,6 +2318,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *task;
int ret;
+ /* Only allow changing cgroups accessible within task's cgroup
+ * namespace. i.e. 'dst_cgrp' should be a descendant of task's
+ * cgroupns->root_cgrp. */
+ if (!cgroup_is_descendant(dst_cgrp, task_cgroupns_root(leader)))
+ return -EPERM;
+
/* look up all src csets */
down_read(&css_set_rwsem);
rcu_read_lock();
@@ -2882,6 +2888,10 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
struct cgroup_subsys_state *css;
int ret;
+ /* Reject writes to cgroup files outside of task's cgroupns-root. */
+ if (!cgroup_is_descendant(cgrp, task_cgroupns_root(current)))
+ return -EINVAL;
+
if (cft->write)
return cft->write(of, buf, nbytes, off);
@@ -4560,6 +4570,13 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
parent = cgroup_kn_lock_live(parent_kn);
if (!parent)
return -ENODEV;
+
+ /* Allow mkdir only within process's cgroup namespace root. */
+ if (!cgroup_is_descendant(parent, task_cgroupns_root(current))) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
root = parent->root;
/* allocate the cgroup and its ID, 0 is reserved for the root */
@@ -4822,6 +4839,13 @@ static int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;
+ /* Allow rmdir only within process's cgroup namespace root.
+ * The process can't delete its own root anyways. */
+ if (!cgroup_is_descendant(cgrp, task_cgroupns_root(current))) {
+ cgroup_kn_unlock(kn);
+ return -EPERM;
+ }
+
ret = cgroup_destroy_locked(cgrp);
cgroup_kn_unlock(kn);
@@ -5051,6 +5075,15 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
continue;
+ cgrp = task_cgroup_from_root(tsk, root);
+
+ /* The cgroup path on default hierarchy is shown only if it
+ * falls under current task's cgroupns-root.
+ */
+ if (root == &cgrp_dfl_root &&
+ !cgroup_is_descendant(cgrp, task_cgroupns_root(current)))
+ continue;
+
seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
@@ -5059,7 +5092,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
- cgrp = task_cgroup_from_root(tsk, root);
path = cgroup_path(cgrp, buf, PATH_MAX);
if (!path) {
retval = -ENAMETOOLONG;
Andy Lutomirski
2014-10-14 22:42:55 UTC
Permalink
Post by Aditya Kali
Second take at the Cgroup Namespace patch-set.
1. setns support for cgroupns
2. 'mount -t cgroup cgroup <mntpt>' from inside a cgroupns now
mounts the cgroup hierarchy with cgroupns-root as the filesystem root.
3. writes to cgroup files outside of cgroupns-root are not allowed
4. visibility of /proc/<pid>/cgroup is further restricted by not showing
anything if the <pid> is in a sibling cgroupns and its cgroup falls outside
your cgroupns-root.
More details in the writeup below.
Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
through cgroupfs mount and via /proc/self/cgroup file.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
This exposure of cgroup names to the processes running inside a
(1) The container names are typically host-container-management-agent
(systemd, docker/libcontainer, etc.) data and leaking its name (or
leaking the hierarchy) reveals too much information about the host
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside the
container.
Note that the feature proposed here is completely different than the
“ns cgroup” feature which existed in the linux kernel until recently.
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It did
not solve any of the above mentioned problems and was later dropped
from the kernel. Incidentally though, it used the same config option
name CONFIG_CGROUP_NS as used in my prototype!
Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can now
have a much more coherent cgroup view and its easy to associate a
container with a single cgroup. This also allows us to virtualize the
cgroup view for tasks inside the container.
The new CGroup Namespace allows a process to “unshare” its cgroup
hierarchy starting from the cgroup its currently in.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec’s /bin/bash
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup ->
cgroup:[4026532183]
# From within new cgroupns, process sees that its in the root cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
# Unshare cgroupns along with userns and mountns
# Following calls unshare(CLONE_NEWCGROUP|CLONE_NEWUSER|CLONE_NEWNS), then
# sets up uid/gid map and exec’s /bin/bash
$ ~/unshare -c -u -m
# Originally, we were in /batchjobs/c_job_id1 cgroup. Mount our own cgroup
# hierarchy.
[ns]$ mount -t cgroup cgroup /tmp/cgroup
[ns]$ ls -l /tmp/cgroup
total 0
-r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.controllers
-r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.populated
-rw-r--r-- 1 root root 0 2014-10-13 09:25 cgroup.procs
-rw-r--r-- 1 root root 0 2014-10-13 09:32 cgroup.subtree_control
The cgroupns-root (/batchjobs/c_job_id1 in above example) becomes the
filesystem root for the namespace specific cgroupfs mount.
The virtualization of /proc/self/cgroup file combined with restricting
the view of cgroup hierarchy by namespace-private cgroupfs mount
should provide a completely isolated cgroup view inside the container.
In its current form, the cgroup namespaces patchset provides the following:
(1) The “root” cgroup for a cgroup namespace is the cgroup in which
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls unshare,
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (“/”) cgroup
(identified in code as cgrp_dfl_root.cgrp).
(2) The cgroupns-root cgroup does not change even if the namespace
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able to see
cgroup paths (in /proc/self/cgroup) only inside their root cgroup
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
This is a little weird. Not sure it's a problem.
Post by Aditya Kali
(c) From a sibling cgroupns (cgroupns root-ed at a sibling cgroup), no cgroup
# ns2's cgroupns-root is at '/batchjobs/c_job_id2'
[ns2]$ cat /proc/7353/cgroup
[ns2]$
This is same as when cgroup hierarchy is not mounted at all.
(In correct container setup though, it should not be possible to
access PIDs in another container in the first place.)
(4) Processes inside a cgroupns are not allowed to move out of the
cgroupns-root. This is true even if a privileged process in global
cgroupns tries to move the process out of its cgroupns-root.
# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted
(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entire
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. So
all the threads in the process will have the same cgroup. And both
- changing cgroups and unsharing namespaces - are protected under
threadgroup_lock(task).
This seems odd to me. Does unsharing the cgroupns unshare for all
tasks in the process? If not, then I think that it shouldn't change
the cgroup either.

What did you end up doing to grant permission to unshare the cgroup ns?

--Andy
Aditya Kali
2014-10-14 23:33:11 UTC
Permalink
Post by Andy Lutomirski
Post by Aditya Kali
Second take at the Cgroup Namespace patch-set.
1. setns support for cgroupns
2. 'mount -t cgroup cgroup <mntpt>' from inside a cgroupns now
mounts the cgroup hierarchy with cgroupns-root as the filesystem root.
3. writes to cgroup files outside of cgroupns-root are not allowed
4. visibility of /proc/<pid>/cgroup is further restricted by not showing
anything if the <pid> is in a sibling cgroupns and its cgroup falls outside
your cgroupns-root.
More details in the writeup below.
Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
through cgroupfs mount and via /proc/self/cgroup file.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
This exposure of cgroup names to the processes running inside a
(1) The container names are typically host-container-management-agent
(systemd, docker/libcontainer, etc.) data and leaking its name (or
leaking the hierarchy) reveals too much information about the host
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside the
container.
Note that the feature proposed here is completely different than the
“ns cgroup” feature which existed in the linux kernel until recently.
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It did
not solve any of the above mentioned problems and was later dropped
from the kernel. Incidentally though, it used the same config option
name CONFIG_CGROUP_NS as used in my prototype!
Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can now
have a much more coherent cgroup view and its easy to associate a
container with a single cgroup. This also allows us to virtualize the
cgroup view for tasks inside the container.
The new CGroup Namespace allows a process to “unshare” its cgroup
hierarchy starting from the cgroup its currently in.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec’s /bin/bash
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup ->
cgroup:[4026532183]
# From within new cgroupns, process sees that its in the root cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
# Unshare cgroupns along with userns and mountns
# Following calls unshare(CLONE_NEWCGROUP|CLONE_NEWUSER|CLONE_NEWNS), then
# sets up uid/gid map and exec’s /bin/bash
$ ~/unshare -c -u -m
# Originally, we were in /batchjobs/c_job_id1 cgroup. Mount our own cgroup
# hierarchy.
[ns]$ mount -t cgroup cgroup /tmp/cgroup
[ns]$ ls -l /tmp/cgroup
total 0
-r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.controllers
-r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.populated
-rw-r--r-- 1 root root 0 2014-10-13 09:25 cgroup.procs
-rw-r--r-- 1 root root 0 2014-10-13 09:32 cgroup.subtree_control
The cgroupns-root (/batchjobs/c_job_id1 in above example) becomes the
filesystem root for the namespace specific cgroupfs mount.
The virtualization of /proc/self/cgroup file combined with restricting
the view of cgroup hierarchy by namespace-private cgroupfs mount
should provide a completely isolated cgroup view inside the container.
In its current form, the cgroup namespaces patchset provides the following:
(1) The “root” cgroup for a cgroup namespace is the cgroup in which
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls unshare,
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (“/”) cgroup
(identified in code as cgrp_dfl_root.cgrp).
(2) The cgroupns-root cgroup does not change even if the namespace
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able to see
cgroup paths (in /proc/self/cgroup) only inside their root cgroup
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
This is a little weird. Not sure it's a problem.
Post by Aditya Kali
(c) From a sibling cgroupns (cgroupns root-ed at a sibling cgroup), no cgroup
# ns2's cgroupns-root is at '/batchjobs/c_job_id2'
[ns2]$ cat /proc/7353/cgroup
[ns2]$
This is same as when cgroup hierarchy is not mounted at all.
(In correct container setup though, it should not be possible to
access PIDs in another container in the first place.)
(4) Processes inside a cgroupns are not allowed to move out of the
cgroupns-root. This is true even if a privileged process in global
cgroupns tries to move the process out of its cgroupns-root.
# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted
(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entire
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. So
all the threads in the process will have the same cgroup. And both
- changing cgroups and unsharing namespaces - are protected under
threadgroup_lock(task).
This seems odd to me. Does unsharing the cgroupns unshare for all
tasks in the process? If not, then I think that it shouldn't change
the cgroup either.
Unsharing cgroupns unshares for all tasks in the process, yes.

The cgroup changes are protected by threadgroup_lock. So it made sense
to protect cgroupns changes (unshare or setns) by the same lock as we
don't want task's cgroup to change underneath while we are changing
its cgroup-namespace. No cgroup change happens during the
unshare/setns call.
Post by Andy Lutomirski
What did you end up doing to grant permission to unshare the cgroup ns?
Currently the only requirement is ns_capable(cgroupns->user_ns,
CAP_SYS_ADMIN). It's possible to refine this further, but for now I
just kept it simpler. I am looking into the explicit permission check
discussed previously (https://lkml.org/lkml/2014/7/29/402), but wanted
to get this out sooner.
Post by Andy Lutomirski
--Andy
Thanks,
--
Aditya
ebiederm-aS9lmoZGLiVWk0Htik3J/ (Eric W. Biederman)
2014-10-19 04:54:32 UTC
Permalink
Post by Aditya Kali
Second take at the Cgroup Namespace patch-set.
1. setns support for cgroupns
2. 'mount -t cgroup cgroup <mntpt>' from inside a cgroupns now
mounts the cgroup hierarchy with cgroupns-root as the filesystem root.
3. writes to cgroup files outside of cgroupns-root are not allowed
4. visibility of /proc/<pid>/cgroup is further restricted by not showing
anything if the <pid> is in a sibling cgroupns and its cgroup falls outside
your cgroupns-root.
More details in the writeup below.
This definitely looks like the right direction to go, and something that
in some form or another I had been asking for since cgroups were merged.
So I am very glad to see this work moving forward.

I had hoped that we might just be able to be clever with remounting
cgroupfs but 2 things stand in the way.
1) /proc/<pid>/cgroups (but proc could capture that).
2) providing a hard guarantee that tasks stay within a subset of the
cgroup hierarchy.

So I think this clearly meets the requirements for a new namespace.

We need to have the discussion on chmod of files on cgroupfs. There is
a notion that has floated around that only systemd or only root (with
the appropriate capabilities) should be allowed to set resource limits
in cgroupfs. In practical reality that is nonsense. If an attribute
is properly bound in its hierarchy it should be safe to change.

Not all attributes are properly bound to hierarchy and some are or at
least were dangerous for anyone except root to set. So I suggest that a
CFTYPE flag perhaps CFTYPE_UNPRIV be added for attributes that are safe
to allow anyone to set, and require CFTYPE_UNPRIV be set before we chmod
a cgroup attribute from root.

That would be complementary work, and not strictly tied to the cgroup
namespaces but unprivileged cgroup namespaces don't make much sense
without that work.

Eric
Post by Aditya Kali
Background
Cgroups and Namespaces are used together to create “virtual”
containers that isolates the host environment from the processes
running in container. But since cgroups themselves are not
“virtualized”, the task is always able to see global cgroups view
through cgroupfs mount and via /proc/self/cgroup file.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
This exposure of cgroup names to the processes running inside a
(1) The container names are typically host-container-management-agent
(systemd, docker/libcontainer, etc.) data and leaking its name (or
leaking the hierarchy) reveals too much information about the host
system.
(2) It makes the container migration across machines (CRIU) more
difficult as the container names need to be unique across the
machines in the migration domain.
(3) It makes it difficult to run container management tools (like
docker/libcontainer, lmctfy, etc.) within virtual containers
without adding dependency on some state/agent present outside the
container.
Note that the feature proposed here is completely different than the
“ns cgroup” feature which existed in the linux kernel until recently.
The ns cgroup also attempted to connect cgroups and namespaces by
creating a new cgroup every time a new namespace was created. It did
not solve any of the above mentioned problems and was later dropped
from the kernel. Incidentally though, it used the same config option
name CONFIG_CGROUP_NS as used in my prototype!
Introducing CGroup Namespaces
With unified cgroup hierarchy
(Documentation/cgroups/unified-hierarchy.txt), the containers can now
have a much more coherent cgroup view and its easy to associate a
container with a single cgroup. This also allows us to virtualize the
cgroup view for tasks inside the container.
The new CGroup Namespace allows a process to “unshare” its cgroup
hierarchy starting from the cgroup its currently in.
$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
$ ~/unshare -c # calls unshare(CLONE_NEWCGROUP) and exec’s /bin/bash
[ns]$ ls -l /proc/self/ns/cgroup
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup ->
cgroup:[4026532183]
# From within new cgroupns, process sees that its in the root cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
$ cat /proc/<pid>/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1
# Unshare cgroupns along with userns and mountns
# Following calls unshare(CLONE_NEWCGROUP|CLONE_NEWUSER|CLONE_NEWNS), then
# sets up uid/gid map and exec’s /bin/bash
$ ~/unshare -c -u -m
# Originally, we were in /batchjobs/c_job_id1 cgroup. Mount our own cgroup
# hierarchy.
[ns]$ mount -t cgroup cgroup /tmp/cgroup
[ns]$ ls -l /tmp/cgroup
total 0
-r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.controllers
-r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.populated
-rw-r--r-- 1 root root 0 2014-10-13 09:25 cgroup.procs
-rw-r--r-- 1 root root 0 2014-10-13 09:32 cgroup.subtree_control
The cgroupns-root (/batchjobs/c_job_id1 in above example) becomes the
filesystem root for the namespace specific cgroupfs mount.
The virtualization of /proc/self/cgroup file combined with restricting
the view of cgroup hierarchy by namespace-private cgroupfs mount
should provide a completely isolated cgroup view inside the container.
In its current form, the cgroup namespaces patchset provides the following:
(1) The “root” cgroup for a cgroup namespace is the cgroup in which
the process calling unshare is running.
For ex. if a process in /batchjobs/c_job_id1 cgroup calls unshare,
cgroup /batchjobs/c_job_id1 becomes the cgroupns-root.
For the init_cgroup_ns, this is the real root (“/”) cgroup
(identified in code as cgrp_dfl_root.cgrp).
(2) The cgroupns-root cgroup does not change even if the namespace
creator process later moves to a different cgroup.
$ ~/unshare -c # unshare cgroupns in some cgroup
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
[ns]$ mkdir sub_cgrp_1
[ns]$ echo 0 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/self/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
(3) Each process gets its CGROUPNS specific view of
/proc/<pid>/cgroup.
(a) Processes running inside the cgroup namespace will be able to see
cgroup paths (in /proc/self/cgroup) only inside their root cgroup
[ns]$ sleep 100000 & # From within unshared cgroupns
[1] 7353
[ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
[ns]$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
(c) From a sibling cgroupns (cgroupns root-ed at a sibling cgroup), no cgroup
# ns2's cgroupns-root is at '/batchjobs/c_job_id2'
[ns2]$ cat /proc/7353/cgroup
[ns2]$
This is same as when cgroup hierarchy is not mounted at all.
(In correct container setup though, it should not be possible to
access PIDs in another container in the first place.)
(4) Processes inside a cgroupns are not allowed to move out of the
cgroupns-root. This is true even if a privileged process in global
cgroupns tries to move the process out of its cgroupns-root.
# From global cgroupns
$ cat /proc/7353/cgroup
0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/c_job_id1/sub_cgrp_1
# cgroupns-root for 7353 is /batchjobs/c_job_id1
$ echo 7353 > batchjobs/c_job_id2/cgroup.procs
-bash: echo: write error: Operation not permitted
(a) process has CAP_SYS_ADMIN in its current userns
(b) process has CAP_SYS_ADMIN in the target cgroupns' userns
(c) the process's current cgroup is a descendant of the cgroupns-root of the
target namespace.
(d) the target cgroupns-root is a descendant of the current cgroupns-root.
The last check (d) prevents processes from escaping their cgroupns-root by
attaching to parent cgroupns. Thus, setns is allowed only when the process
is trying to restrict itself to a deeper cgroup hierarchy.
(6) When some thread from a multi-threaded process unshares its
cgroup-namespace, the new cgroupns gets applied to the entire
process (all the threads). This should be OK since
unified-hierarchy only allows process-level containerization. So
all the threads in the process will have the same cgroup. And both
- changing cgroups and unsharing namespaces - are protected under
threadgroup_lock(task).
(7) The cgroup namespace is alive as long as there is at least 1
process inside it. When the last process exits, the cgroup
namespace is destroyed. The cgroupns-root and the actual cgroups
remain though.
(8) 'mount -t cgroup cgroup <mntpt>' when called from within cgroupns mounts
the unified cgroup hierarchy with cgroupns-root as the filesystem root.
The process needs CAP_SYS_ADMIN in its userns and mntns. This allows the
container management tools to be run inside the containers transparently.
Implementation
The current patch-set is based on top of Tejun Heo's cgroup tree (for-next
branch). It's fairly non-intrusive and provides the above-mentioned
features.
(1) The Documentation/cgroups/unified-hierarchy.txt mentions use of
capabilities to restrict cgroups to administrative users. CGroup
namespaces could be of help here. With cgroup namespaces, it might
be possible to delegate administration of sub-cgroups under a
cgroupns-root to the cgroupns owner.
---
fs/kernfs/dir.c | 53 +++++++++---
fs/kernfs/mount.c | 48 +++++++++++
fs/proc/namespaces.c | 3 +
include/linux/cgroup.h | 41 +++++++++-
include/linux/cgroup_namespace.h | 62 +++++++++++++++
include/linux/kernfs.h | 5 ++
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 4 +
include/uapi/linux/sched.h | 3 +-
init/Kconfig | 9 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 139 ++++++++++++++++++++++++++------
kernel/cgroup_namespace.c | 168 +++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 2 +-
kernel/nsproxy.c | 19 ++++-
15 files changed, 518 insertions(+), 41 deletions(-)
create mode 100644 include/linux/cgroup_namespace.h
create mode 100644 kernel/cgroup_namespace.c
[PATCHv1 1/8] kernfs: Add API to generate relative kernfs path
[PATCHv1 2/8] sched: new clone flag CLONE_NEWCGROUP for cgroup
[PATCHv1 3/8] cgroup: add function to get task's cgroup on default
[PATCHv1 4/8] cgroup: export cgroup_get() and cgroup_put()
[PATCHv1 5/8] cgroup: introduce cgroup namespaces
[PATCHv1 6/8] cgroup: restrict cgroup operations within task's cgroupns
[PATCHv1 7/8] cgroup: cgroup namespace setns support
[PATCHv1 8/8] cgroup: mount cgroupns-root when inside non-init cgroupns
_______________________________________________
Containers mailing list
https://lists.linuxfoundation.org/mailman/listinfo/containers
Loading...