[libvirt] [PATCH v2 0/8] Add user namespace support for libvirt lxc

This patchset try to add userns support for libvirt lxc. Since userns is nearly completed in linux-3.9, the old kernel doesn't support userns, I add some New XML elements to let people decide if enable userns.The userns is enabled only when user configure the XML. The format of user namespace related XML file like below: <idmap> <uid start='0' target='1000' count='10'> <gid start='0' target='1000' count='10'> </idmap> it means the user in container (which uid:gid is 0:0) will be mapped to the user in host (uid:gid is 1000:1000), count is used to form an u/gid range: The users in container which uid in [start, start + count -1] will be mapped. You can have multiple lines to map differnet id ranges, caution, you must make sure the root user of container has been mapped. This patchset also does the below jobs. 1, Because the uninit userns has no right to create devices, we should create devices for container on host. 2, Changes the owner of fuse and tty device. Gao feng (8): LXC: Introduce New XML element for user namespace LXC: enable user namespace when user set the uidmap LXC: sort the uidmap/gidmap of domain LXC: introduce virLXCControllerSetupUserns and lxcContainerSetUserns LXC: Creating devices for container on host side LXC: change the owner of devices created on host LXC: change the owner of tty devices to the root user of container LXC: fuse: Change files owner to the root user of container docs/formatdomain.html.in | 23 +++++ docs/schemas/domaincommon.rng | 28 ++++++ src/conf/domain_conf.c | 111 +++++++++++++++++++++ src/conf/domain_conf.h | 19 ++++ src/lxc/lxc_container.c | 120 +++++++++++------------ src/lxc/lxc_controller.c | 218 +++++++++++++++++++++++++++++++++++++++++- src/lxc/lxc_fuse.c | 6 ++ 7 files changed, 455 insertions(+), 70 deletions(-) -- 1.8.1.4

This patch introduces new element <idmap> for user namespace. for example <idmap> <uid start='0' target='1000' count='10'/> <gid start='0' target='1000' count='10'/> </idmap> this new element is used for setting proc files /proc/<pid>/{uid_map,gid_map}. This patch also supports multiple uid/gid elements setting in XML configuration. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- docs/formatdomain.html.in | 23 +++++++++++ docs/schemas/domaincommon.rng | 28 ++++++++++++++ src/conf/domain_conf.c | 88 +++++++++++++++++++++++++++++++++++++++++++ src/conf/domain_conf.h | 19 ++++++++++ 4 files changed, 158 insertions(+) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 572d7ee..7a5c1ed 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -285,6 +285,29 @@ </pre> + <p> + If you want to enable user namespace,set the <code>idmap</code> element. + the <code>uid</code> and <code>gid</code> elements have three attributes: + </p> + + <dl> + <dt><code>start</code></dt> + <dd>First user id in container.</dd> + <dt><code>target</code></dt> + <dd>The first user id in container will be mapped to this target user + id in host.</dd> + <dt><code>count</code></dt> + <dd>How many users in container being allowed to map to host's user.</dd> + </dl> + + <pre> + <idmap> + <uid start='0' target='1000' count='10'/> + <gid start='0' target='1000' count='10'/> + </idmap> + </pre> + + <h3><a name="elementsSysinfo">SMBIOS System Information</a></h3> <p> diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 10596dc..dffb103 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -465,6 +465,34 @@ </optional> </interleave> </define> + <define name="map"> + <optional> + <element name="idmap"> + <element name="uid"> + <attribute name="start"> + <ref name="unsignedInt"/> + </attribute> + <attribute name="target"> + <ref name="unsignedInt"/> + </attribute> + <attribute name="count"> + <ref name="unsignedInt"/> + </attribute> + </element> + <element name="gid"> + <attribute name="start"> + <ref name="unsignedInt"/> + </attribute> + <attribute name="target"> + <ref name="unsignedInt"/> + </attribute> + <attribute name="count"> + <ref name="unsignedInt"/> + </attribute> + </element> + </element> + </optional> + </define> <!-- Resources usage defines the amount of memory (maximum and possibly current usage) and number of virtual CPUs used by that domain. diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index b7e253e..46be458 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -1944,6 +1944,9 @@ void virDomainDefFree(virDomainDefPtr def) virDomainTPMDefFree(def->tpm); + VIR_FREE(def->idmap.uidmap); + VIR_FREE(def->idmap.gidmap); + VIR_FREE(def->os.type); VIR_FREE(def->os.machine); VIR_FREE(def->os.init); @@ -9798,6 +9801,40 @@ cleanup: return ret; } + +/* Parse the XML definition for user namespace id map. + * + * idmap has the form of + * + * <uid start='0' target='1000' count='10'/> + * <gid start='0' target='1000' count='10'/> + */ +static struct idmap * +virDomainIdmapDefParseXML(const xmlNodePtr *node, + xmlXPathContextPtr ctxt, + ssize_t num) +{ + int i; + struct idmap *idmap = NULL; + xmlNodePtr save_ctxt = ctxt->node; + + if (VIR_ALLOC_N(idmap, num) < 0) { + virReportOOMError(); + goto error; + } + + for (i = 0; i < num ; i++) { + ctxt->node = node[i]; + virXPathUInt("string(./@start)", ctxt, &idmap[i].start); + virXPathUInt("string(./@target)", ctxt, &idmap[i].target); + virXPathUInt("string(./@count)", ctxt, &idmap[i].count); + } + error: + ctxt->node = save_ctxt; + return idmap; +} + + /* Parse the XML definition for a vcpupin or emulatorpin. * * vcpupin has the form of @@ -11544,6 +11581,36 @@ virDomainDefParseXML(xmlDocPtr xml, } VIR_FREE(nodes); + /* analysis of the user namespace mapping */ + def->idmap.nuidmap = 0; + def->idmap.uidmap = NULL; + if ((n = virXPathNodeSet("./idmap/uid", ctxt, &nodes)) < 0) + goto error; + + if (n) { + def->idmap.uidmap = virDomainIdmapDefParseXML(nodes, ctxt, n); + if (!def->idmap.uidmap) + goto error; + + def->idmap.nuidmap = n; + } + VIR_FREE(nodes); + + def->idmap.ngidmap = 0; + def->idmap.gidmap = NULL; + + if ((n = virXPathNodeSet("./idmap/gid", ctxt, &nodes)) < 0) + goto error; + + if (n) { + def->idmap.gidmap = virDomainIdmapDefParseXML(nodes, ctxt, n); + if (!def->idmap.gidmap) + goto error; + + def->idmap.ngidmap = n; + } + VIR_FREE(nodes); + /* analysis of cpu handling */ if ((node = virXPathNode("./cpu[1]", ctxt)) != NULL) { xmlNodePtr oldnode = ctxt->node; @@ -15696,6 +15763,27 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAddLit(buf, " </os>\n"); + + if (def->idmap.uidmap || def->idmap.gidmap) { + virBufferAddLit(buf, " <idmap>\n"); + for (i = 0 ; i < def->idmap.nuidmap; i++) { + virBufferAsprintf(buf, + " <uid start='%u' target='%u' count='%u'/>\n", + def->idmap.uidmap[i].start, + def->idmap.uidmap[i].target, + def->idmap.uidmap[i].count); + } + for (i = 0 ; i < def->idmap.ngidmap; i++) { + virBufferAsprintf(buf, + " <gid start='%u' target='%u' count='%u'/>\n", + def->idmap.gidmap[i].start, + def->idmap.gidmap[i].target, + def->idmap.gidmap[i].count); + } + virBufferAddLit(buf, " </idmap>\n"); + } + + if (def->features) { virBufferAddLit(buf, " <features>\n"); for (i = 0 ; i < VIR_DOMAIN_FEATURE_LAST ; i++) { diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 21f7ce2..b21d567 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -120,6 +120,9 @@ typedef virDomainSnapshotObjList *virDomainSnapshotObjListPtr; typedef struct _virDomainRNGDef virDomainRNGDef; typedef virDomainRNGDef *virDomainRNGDefPtr; +typedef struct _virDomainIdmapDef virDomainIdmapDef; +typedef virDomainIdmapDef *virDomainIdmapDefPtr; + /* Flags for the 'type' field in virDomainDeviceDef */ typedef enum { VIR_DOMAIN_DEVICE_NONE = 0, @@ -1803,6 +1806,21 @@ struct _virDomainRNGDef { virDomainDeviceInfo info; }; +struct idmap { + unsigned int start; + unsigned int target; + unsigned int count; +}; + +struct _virDomainIdmapDef { + size_t nuidmap; + struct idmap *uidmap; + + size_t ngidmap; + struct idmap *gidmap; +}; + + void virBlkioDeviceWeightArrayClear(virBlkioDeviceWeightPtr deviceWeights, int ndevices); @@ -1863,6 +1881,7 @@ struct _virDomainDef { virNumaTuneDef numatune; virDomainResourceDefPtr resource; + virDomainIdmapDef idmap; /* These 3 are based on virDomainLifeCycleAction enum flags */ int onReboot; -- 1.8.1.4

On Fri, May 10, 2013 at 05:58:10PM +0800, Gao feng wrote:
This patch introduces new element <idmap> for user namespace. for example <idmap> <uid start='0' target='1000' count='10'/> <gid start='0' target='1000' count='10'/> </idmap>
this new element is used for setting proc files /proc/<pid>/{uid_map,gid_map}.
This patch also supports multiple uid/gid elements setting in XML configuration.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- docs/formatdomain.html.in | 23 +++++++++++ docs/schemas/domaincommon.rng | 28 ++++++++++++++ src/conf/domain_conf.c | 88 +++++++++++++++++++++++++++++++++++++++++++ src/conf/domain_conf.h | 19 ++++++++++ 4 files changed, 158 insertions(+)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 572d7ee..7a5c1ed 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -285,6 +285,29 @@ </pre>
+ <p> + If you want to enable user namespace,set the <code>idmap</code> element. + the <code>uid</code> and <code>gid</code> elements have three attributes: + </p> + + <dl> + <dt><code>start</code></dt> + <dd>First user id in container.</dd> + <dt><code>target</code></dt> + <dd>The first user id in container will be mapped to this target user + id in host.</dd> + <dt><code>count</code></dt> + <dd>How many users in container being allowed to map to host's user.</dd> + </dl> + + <pre> + <idmap> + <uid start='0' target='1000' count='10'/> + <gid start='0' target='1000' count='10'/> + </idmap> + </pre> + + <h3><a name="elementsSysinfo">SMBIOS System Information</a></h3>
<p> diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 10596dc..dffb103 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -465,6 +465,34 @@ </optional> </interleave> </define> + <define name="map"> + <optional> + <element name="idmap">
<zeroOrMore>
+ <element name="uid"> + <attribute name="start"> + <ref name="unsignedInt"/> + </attribute> + <attribute name="target"> + <ref name="unsignedInt"/> + </attribute> + <attribute name="count"> + <ref name="unsignedInt"/> + </attribute> + </element>
</zeroOrMore> <zeroOrMore>
+ <element name="gid"> + <attribute name="start"> + <ref name="unsignedInt"/> + </attribute> + <attribute name="target"> + <ref name="unsignedInt"/> + </attribute> + <attribute name="count"> + <ref name="unsignedInt"/> + </attribute> + </element>
</zeroOrMore>
+ </element> + </optional> + </define>
You don't seem to have used this new 'map' define anywhere in the schema.
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index b7e253e..46be458 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -1944,6 +1944,9 @@ void virDomainDefFree(virDomainDefPtr def)
virDomainTPMDefFree(def->tpm);
+ VIR_FREE(def->idmap.uidmap); + VIR_FREE(def->idmap.gidmap); + VIR_FREE(def->os.type); VIR_FREE(def->os.machine); VIR_FREE(def->os.init); @@ -9798,6 +9801,40 @@ cleanup: return ret; }
+ +/* Parse the XML definition for user namespace id map. + * + * idmap has the form of + * + * <uid start='0' target='1000' count='10'/> + * <gid start='0' target='1000' count='10'/> + */ +static struct idmap * +virDomainIdmapDefParseXML(const xmlNodePtr *node, + xmlXPathContextPtr ctxt, + ssize_t num) +{ + int i; + struct idmap *idmap = NULL; + xmlNodePtr save_ctxt = ctxt->node; + + if (VIR_ALLOC_N(idmap, num) < 0) { + virReportOOMError(); + goto error; + } + + for (i = 0; i < num ; i++) { + ctxt->node = node[i]; + virXPathUInt("string(./@start)", ctxt, &idmap[i].start); + virXPathUInt("string(./@target)", ctxt, &idmap[i].target); + virXPathUInt("string(./@count)", ctxt, &idmap[i].count); + } + error: + ctxt->node = save_ctxt; + return idmap; +} + + /* Parse the XML definition for a vcpupin or emulatorpin. * * vcpupin has the form of @@ -11544,6 +11581,36 @@ virDomainDefParseXML(xmlDocPtr xml, } VIR_FREE(nodes);
+ /* analysis of the user namespace mapping */ + def->idmap.nuidmap = 0; + def->idmap.uidmap = NULL; + if ((n = virXPathNodeSet("./idmap/uid", ctxt, &nodes)) < 0) + goto error; + + if (n) { + def->idmap.uidmap = virDomainIdmapDefParseXML(nodes, ctxt, n); + if (!def->idmap.uidmap) + goto error; + + def->idmap.nuidmap = n; + } + VIR_FREE(nodes); + + def->idmap.ngidmap = 0; + def->idmap.gidmap = NULL; + + if ((n = virXPathNodeSet("./idmap/gid", ctxt, &nodes)) < 0) + goto error; + + if (n) { + def->idmap.gidmap = virDomainIdmapDefParseXML(nodes, ctxt, n); + if (!def->idmap.gidmap) + goto error; + + def->idmap.ngidmap = n; + } + VIR_FREE(nodes); + /* analysis of cpu handling */ if ((node = virXPathNode("./cpu[1]", ctxt)) != NULL) { xmlNodePtr oldnode = ctxt->node; @@ -15696,6 +15763,27 @@ virDomainDefFormatInternal(virDomainDefPtr def,
virBufferAddLit(buf, " </os>\n");
+ + if (def->idmap.uidmap || def->idmap.gidmap) { + virBufferAddLit(buf, " <idmap>\n"); + for (i = 0 ; i < def->idmap.nuidmap; i++) { + virBufferAsprintf(buf, + " <uid start='%u' target='%u' count='%u'/>\n", + def->idmap.uidmap[i].start, + def->idmap.uidmap[i].target, + def->idmap.uidmap[i].count); + } + for (i = 0 ; i < def->idmap.ngidmap; i++) { + virBufferAsprintf(buf, + " <gid start='%u' target='%u' count='%u'/>\n", + def->idmap.gidmap[i].start, + def->idmap.gidmap[i].target, + def->idmap.gidmap[i].count); + } + virBufferAddLit(buf, " </idmap>\n"); + } + + if (def->features) { virBufferAddLit(buf, " <features>\n"); for (i = 0 ; i < VIR_DOMAIN_FEATURE_LAST ; i++) { diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 21f7ce2..b21d567 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -120,6 +120,9 @@ typedef virDomainSnapshotObjList *virDomainSnapshotObjListPtr; typedef struct _virDomainRNGDef virDomainRNGDef; typedef virDomainRNGDef *virDomainRNGDefPtr;
+typedef struct _virDomainIdmapDef virDomainIdmapDef; +typedef virDomainIdmapDef *virDomainIdmapDefPtr;
Missing typedef struct _virDomainIdMapEntry virDomainIdMapEntry; typedef virDomainIdMapEntry *virDomainIdMapEntryPtr;
+ /* Flags for the 'type' field in virDomainDeviceDef */ typedef enum { VIR_DOMAIN_DEVICE_NONE = 0, @@ -1803,6 +1806,21 @@ struct _virDomainRNGDef { virDomainDeviceInfo info; };
+struct idmap {
s/idmap/_virDomainIdMapEntry/
+ unsigned int start; + unsigned int target; + unsigned int count; +}; + +struct _virDomainIdmapDef {
s/_virDomainIdmapDef/_virDomainIdMapDef/
+ size_t nuidmap; + struct idmap *uidmap; + + size_t ngidmap; + struct idmap *gidmap; +};
Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

If the idmap exist, the user namespace will be enabled automatically. If you want disable user namespace,just remove these elements from XML. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 8e1d10a..094f205 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -2030,16 +2030,10 @@ cleanup: return ret; } -static int userns_supported(void) +static int userns_supported(virDomainDefPtr def) { -#if 1 - /* - * put off using userns until uid mapping is implemented - */ - return 0; -#else - return lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0; -#endif + return (def->idmap.nuidmap && def->idmap.ngidmap && + lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0); } virArch lxcContainerGetAlt32bitArch(virArch arch) @@ -2119,7 +2113,7 @@ int lxcContainerStart(virDomainDefPtr def, cflags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC|SIGCHLD; - if (userns_supported()) { + if (userns_supported(def)) { VIR_DEBUG("Enable user namespaces"); cflags |= CLONE_NEWUSER; } -- 1.8.1.4

On Fri, May 10, 2013 at 05:58:11PM +0800, Gao feng wrote:
If the idmap exist, the user namespace will be enabled automatically. If you want disable user namespace,just remove these elements from XML.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 8e1d10a..094f205 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -2030,16 +2030,10 @@ cleanup: return ret; }
-static int userns_supported(void) +static int userns_supported(virDomainDefPtr def) { -#if 1 - /* - * put off using userns until uid mapping is implemented - */ - return 0; -#else - return lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0; -#endif + return (def->idmap.nuidmap && def->idmap.ngidmap && + lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0);
No you can't do this - it causes the code to silently ignore the reqested idmap if the kernel doesn't support it. If the kernel can't support it we must report a fatal error to the user not ignore it. You should separate these checks really - userns_supported() to check the kernel and 'userns_required(def)' to check fi the config requires it.
virArch lxcContainerGetAlt32bitArch(virArch arch) @@ -2119,7 +2113,7 @@ int lxcContainerStart(virDomainDefPtr def,
cflags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC|SIGCHLD;
- if (userns_supported()) { + if (userns_supported(def)) { VIR_DEBUG("Enable user namespaces"); cflags |= CLONE_NEWUSER; }
Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 05/10/2013 06:26 PM, Daniel P. Berrange wrote:
On Fri, May 10, 2013 at 05:58:11PM +0800, Gao feng wrote:
If the idmap exist, the user namespace will be enabled automatically. If you want disable user namespace,just remove these elements from XML.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 8e1d10a..094f205 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -2030,16 +2030,10 @@ cleanup: return ret; }
-static int userns_supported(void) +static int userns_supported(virDomainDefPtr def) { -#if 1 - /* - * put off using userns until uid mapping is implemented - */ - return 0; -#else - return lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0; -#endif + return (def->idmap.nuidmap && def->idmap.ngidmap && + lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0);
No you can't do this - it causes the code to silently ignore the reqested idmap if the kernel doesn't support it. If the kernel can't support it we must report a fatal error to the user not ignore it.
You should separate these checks really - userns_supported() to check the kernel and 'userns_required(def)' to check fi the config requires it.
Get it, thanks for your comments :) Thanks, Gao

Make sure the mapping line contains the root user of container is the first element of idmap array. So we can get the real user id on host for the container easily. This patch also check the map information, User must map the root user of container to any user of host. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/conf/domain_conf.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 46be458..5bc4b8c 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -9815,7 +9815,8 @@ virDomainIdmapDefParseXML(const xmlNodePtr *node, ssize_t num) { int i; - struct idmap *idmap = NULL; + struct idmap *idmap = NULL, map; + int index = -1; xmlNodePtr save_ctxt = ctxt->node; if (VIR_ALLOC_N(idmap, num) < 0) { @@ -9828,7 +9829,29 @@ virDomainIdmapDefParseXML(const xmlNodePtr *node, virXPathUInt("string(./@start)", ctxt, &idmap[i].start); virXPathUInt("string(./@target)", ctxt, &idmap[i].target); virXPathUInt("string(./@count)", ctxt, &idmap[i].count); + + if (idmap[i].start == 0) { + index = i; + map.start = idmap[i].start; + map.target = idmap[i].target; + map.count = idmap[i].count; + } + } + /* Make sure the mapping line contains the root user of container + * is the first element of idmap array. So we can get the real + * user id on host for the container easily. */ + if (index != -1) { + idmap[index] = idmap[0]; + idmap[0] = map; + } else { + /* Root user of container isn't mapped to any user of host, + * return error. */ + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("You must map the root user of container")); + VIR_FREE(idmap); + idmap = NULL; } + error: ctxt->node = save_ctxt; return idmap; -- 1.8.1.4

On Fri, May 10, 2013 at 05:58:12PM +0800, Gao feng wrote:
Make sure the mapping line contains the root user of container is the first element of idmap array. So we can get the real user id on host for the container easily.
This patch also check the map information, User must map the root user of container to any user of host.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/conf/domain_conf.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 46be458..5bc4b8c 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -9815,7 +9815,8 @@ virDomainIdmapDefParseXML(const xmlNodePtr *node, ssize_t num) { int i; - struct idmap *idmap = NULL; + struct idmap *idmap = NULL, map; + int index = -1; xmlNodePtr save_ctxt = ctxt->node;
if (VIR_ALLOC_N(idmap, num) < 0) { @@ -9828,7 +9829,29 @@ virDomainIdmapDefParseXML(const xmlNodePtr *node, virXPathUInt("string(./@start)", ctxt, &idmap[i].start); virXPathUInt("string(./@target)", ctxt, &idmap[i].target); virXPathUInt("string(./@count)", ctxt, &idmap[i].count); + + if (idmap[i].start == 0) { + index = i; + map.start = idmap[i].start; + map.target = idmap[i].target; + map.count = idmap[i].count; + } + } + /* Make sure the mapping line contains the root user of container + * is the first element of idmap array. So we can get the real + * user id on host for the container easily. */ + if (index != -1) { + idmap[index] = idmap[0]; + idmap[0] = map;
IMHO it would be better to just use qsort() with 'start' as the sort key to ensure the entire array is sorted, not merely the first element.
+ } else { + /* Root user of container isn't mapped to any user of host, + * return error. */ + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("You must map the root user of container")); + VIR_FREE(idmap); + idmap = NULL; } + error: ctxt->node = save_ctxt; return idmap;
Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

This patch introduces new helper function virLXCControllerSetupUserns, in this function, we set the files uid_map and gid_map of the init task of container. lxcContainerSetUserns is used for creating cred for tasks running in container. Since after setuid/setgid, we may be a new user. This patch calls lxcContainerSetUserns at first to make sure the new created files belong to right user. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 59 +++++++++++++++++++++++++++------------- src/lxc/lxc_controller.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 18 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 094f205..eabaa34 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -331,6 +331,26 @@ int lxcContainerWaitForContinue(int control) /** + * lxcContainerSetUserns: + * + * This function calls setuid and setgid to create proper + * cred for tasks running in container. + * + * Returns 0 on success or -1 in case of error + */ +static int lxcContainerSetUserns(virDomainDefPtr def) +{ + if (def->idmap.nuidmap && def->idmap.ngidmap && virSetUIDGID(0, 0) < 0) { + virReportSystemError(errno, "%s", + _("setuid or setgid failed")); + return -1; + } + + return 0; +} + + +/** * lxcContainerRenameAndEnableInterfaces: * @nveths: number of interfaces * @veths: interface names @@ -1936,6 +1956,27 @@ static int lxcContainerChild(void *data) cmd = lxcContainerBuildInitCmd(vmDef); virCommandWriteArgLog(cmd, 1); + if (lxcContainerResolveSymlinks(vmDef) < 0) + goto cleanup; + + if (!virFileExists(vmDef->os.init)) { + virReportSystemError(errno, + _("cannot find init path '%s' relative to container root"), + vmDef->os.init); + goto cleanup; + } + + /* Wait for interface devices to show up */ + if (lxcContainerWaitForContinue(argv->monitor) < 0) { + virReportSystemError(errno, "%s", + _("Failed to read the container continue message")); + goto cleanup; + } + VIR_DEBUG("Received container continue message"); + + if (lxcContainerSetUserns(vmDef) < 0) + goto cleanup; + root = virDomainGetRootFilesystem(vmDef); if (argv->nttyPaths) { @@ -1965,29 +2006,11 @@ static int lxcContainerChild(void *data) goto cleanup; } - if (lxcContainerResolveSymlinks(vmDef) < 0) - goto cleanup; - if (lxcContainerSetupPivotRoot(vmDef, root, argv->ttyPaths, argv->nttyPaths, argv->securityDriver) < 0) goto cleanup; - if (!virFileExists(vmDef->os.init)) { - virReportSystemError(errno, - _("cannot find init path '%s' relative to container root"), - vmDef->os.init); - goto cleanup; - } - - /* Wait for interface devices to show up */ - if (lxcContainerWaitForContinue(argv->monitor) < 0) { - virReportSystemError(errno, "%s", - _("Failed to read the container continue message")); - goto cleanup; - } - VIR_DEBUG("Received container continue message"); - /* rename and enable interfaces */ if (lxcContainerRenameAndEnableInterfaces(!!(vmDef->features & (1 << VIR_DOMAIN_FEATURE_PRIVNET)), diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 8c358c3..e9b90bf 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1036,6 +1036,73 @@ cleanup2: } +static int virLXCControllerSetupUsernsMap(struct idmap *map, + char *path, + int num) +{ + char *map_value = NULL; + int i, ret = -1; + + for (i = 0; i < num; i++) { + if (virAsprintf(&map_value, "%s %u %u %u\n", + map_value ? map_value : "", + map[i].start, map[i].target, map[i].count) < 0) + goto cleanup; + } + + if (virFileWriteStr(path, map_value, 0) < 0) { + if (errno == -ENOENT) + virReportSystemError(errno, + _("%s doesn't exist, please disable userns"), + path); + virReportSystemError(errno, _("unable write to %s"), path); + goto cleanup; + } + ret = 0; + +cleanup: + VIR_FREE(map_value); + return ret; +} + +/** + * virLXCControllerSetupUserns + * + * Set proc files for user namespace + * + * Returns 0 on success or -1 in case of error + */ +static int virLXCControllerSetupUserns(virLXCControllerPtr ctrl) +{ + char *uid_map = NULL; + char *gid_map = NULL; + int ret = -1; + + if (ctrl->def->idmap.nuidmap == 0 || ctrl->def->idmap.ngidmap == 0) + return 0; + + if (virAsprintf(&uid_map, "/proc/%d/uid_map", ctrl->initpid) < 0) + goto cleanup; + + if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.uidmap, uid_map, + ctrl->def->idmap.nuidmap) < 0) + goto cleanup; + + if (virAsprintf(&gid_map, "/proc/%d/gid_map", ctrl->initpid) < 0) + goto cleanup; + + if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.gidmap, gid_map, + ctrl->def->idmap.ngidmap) < 0) + goto cleanup; + + ret = 0; +cleanup: + VIR_FREE(uid_map); + VIR_FREE(gid_map); + return ret; +} + + /** * virLXCControllerMoveInterfaces @@ -1467,6 +1534,9 @@ virLXCControllerRun(virLXCControllerPtr ctrl) VIR_FORCE_CLOSE(control[1]); VIR_FORCE_CLOSE(containerhandshake[1]); + if (virLXCControllerSetupUserns(ctrl) < 0) + goto cleanup; + if (virLXCControllerMoveInterfaces(ctrl) < 0) goto cleanup; -- 1.8.1.4

On Fri, May 10, 2013 at 05:58:13PM +0800, Gao feng wrote:
This patch introduces new helper function virLXCControllerSetupUserns, in this function, we set the files uid_map and gid_map of the init task of container.
lxcContainerSetUserns is used for creating cred for tasks running in container. Since after setuid/setgid, we may be a new user. This patch calls lxcContainerSetUserns at first to make sure the new created files belong to right user.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 59 +++++++++++++++++++++++++++------------- src/lxc/lxc_controller.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 18 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 094f205..eabaa34 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -331,6 +331,26 @@ int lxcContainerWaitForContinue(int control)
/** + * lxcContainerSetUserns: + * + * This function calls setuid and setgid to create proper + * cred for tasks running in container. + * + * Returns 0 on success or -1 in case of error + */ +static int lxcContainerSetUserns(virDomainDefPtr def)
I think it'd be better named as s/lxcContainerSetUserns/lxcContainerSetID/
+{ + if (def->idmap.nuidmap && def->idmap.ngidmap && virSetUIDGID(0, 0) < 0) {
This doesn't do the right thing if "nuidmap > 0 && ngidmap == 0" or vica-verca. Better to write it as uid_t uid = (uid_t)-1; gid_t gid = (gid_t)-1; if (def->idmap.nuidmap) uid = 0; if (def->idmap.ngidmap) gid = 0; if ((def->idmap.nuidmap || def->idmap.ngidmap) && virSetUIDGID(uid, gid) < 0) { ...error... } (virSetUIDGID accepts -1 as meaning don't change that id)
diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 8c358c3..e9b90bf 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1036,6 +1036,73 @@ cleanup2: }
+static int virLXCControllerSetupUsernsMap(struct idmap *map, + char *path, + int num) +{ + char *map_value = NULL; + int i, ret = -1; + + for (i = 0; i < num; i++) { + if (virAsprintf(&map_value, "%s %u %u %u\n", + map_value ? map_value : "", + map[i].start, map[i].target, map[i].count) < 0) + goto cleanup; + }
You're leaking memory on every iteration of this loop by ovewriting the map_value pointer each time. You should use the virBuffer APIs to build up a string incrementally.
+ + if (virFileWriteStr(path, map_value, 0) < 0) { + if (errno == -ENOENT) + virReportSystemError(errno, + _("%s doesn't exist, please disable userns"), + path);
This special errno check shouldn't be required if your earlier check for existance of user ns is correctly called.
+ virReportSystemError(errno, _("unable write to %s"), path); + goto cleanup; + } + ret = 0; + +cleanup: + VIR_FREE(map_value); + return ret; +} + +/** + * virLXCControllerSetupUserns + * + * Set proc files for user namespace + * + * Returns 0 on success or -1 in case of error + */ +static int virLXCControllerSetupUserns(virLXCControllerPtr ctrl) +{ + char *uid_map = NULL; + char *gid_map = NULL; + int ret = -1; + + if (ctrl->def->idmap.nuidmap == 0 || ctrl->def->idmap.ngidmap == 0) + return 0;
This implies that the user must supply both a UID map and a GID map or neither. If it is an error to provide one and not the other, then this is something you should enforce during parsing.
+ + if (virAsprintf(&uid_map, "/proc/%d/uid_map", ctrl->initpid) < 0) + goto cleanup; + + if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.uidmap, uid_map, + ctrl->def->idmap.nuidmap) < 0) + goto cleanup; + + if (virAsprintf(&gid_map, "/proc/%d/gid_map", ctrl->initpid) < 0) + goto cleanup; + + if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.gidmap, gid_map, + ctrl->def->idmap.ngidmap) < 0) + goto cleanup; + + ret = 0; +cleanup: + VIR_FREE(uid_map); + VIR_FREE(gid_map); + return ret; +}
Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

user namespace doesn't allow to create devices in uninit userns. We should create devices on host side. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 47 +++++++---------------------- src/lxc/lxc_controller.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 37 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index eabaa34..540246f 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -833,22 +833,10 @@ cleanup: return ret; } -static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) +static int lxcContainerSetupDevices(char **ttyPaths, size_t nttyPaths) { size_t i; const struct { - int maj; - int min; - mode_t mode; - const char *path; - } devs[] = { - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, - }; - const struct { const char *src; const char *dst; } links[] = { @@ -858,18 +846,6 @@ static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) { "/proc/self/fd", "/dev/fd" }, }; - /* Populate /dev/ with a few important bits */ - for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { - dev_t dev = makedev(devs[i].maj, devs[i].min); - if (mknod(devs[i].path, S_IFCHR, dev) < 0 || - chmod(devs[i].path, devs[i].mode)) { - virReportSystemError(errno, - _("Failed to make device %s"), - devs[i].path); - return -1; - } - } - for (i = 0 ; i < ARRAY_CARDINALITY(links) ; i++) { if (symlink(links[i].src, links[i].dst) < 0) { virReportSystemError(errno, @@ -882,22 +858,13 @@ static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) if (access("/dev/pts/ptmx", W_OK) == 0) { /* We have private devpts capability, so bind that */ if (virFileTouch("/dev/ptmx", 0666) < 0) - return -1; + return -1; if (mount("/dev/pts/ptmx", "/dev/ptmx", "ptmx", MS_BIND, NULL) < 0) { virReportSystemError(errno, "%s", _("Failed to bind /dev/pts/ptmx on to /dev/ptmx")); return -1; } - } else { - /* Legacy devpts, so we need to just use shared one */ - dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX); - if (mknod("/dev/ptmx", S_IFCHR, dev) < 0 || - chmod("/dev/ptmx", 0666)) { - virReportSystemError(errno, "%s", - _("Failed to make device /dev/ptmx")); - return -1; - } } for (i = 0 ; i < nttyPaths ; i++) { @@ -1825,8 +1792,8 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, if (lxcContainerMountFSDevPTS(vmDef, "/.oldroot") < 0) goto cleanup; - /* Populates device nodes in /dev/ */ - if (lxcContainerPopulateDevices(ttyPaths, nttyPaths) < 0) + /* Sets up device nodes in /dev/ */ + if (lxcContainerSetupDevices(ttyPaths, nttyPaths) < 0) goto cleanup; /* Sets up any non-root mounts from guest config */ @@ -2037,6 +2004,12 @@ static int lxcContainerChild(void *data) goto cleanup; } + if (lxcContainerWaitForContinue(argv->monitor) < 0) { + virReportSystemError(errno, "%s", + _("Failed to read the container continue message")); + goto cleanup; + } + ret = 0; cleanup: VIR_FREE(ttyPath); diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index e9b90bf..2072e9a 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1103,6 +1103,73 @@ cleanup: } +static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) +{ + size_t i; + int ret = -1; + char *ptmx = NULL; + char *path = NULL; + const struct { + int maj; + int min; + mode_t mode; + const char *path; + } devs[] = { + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, + }; + + /* Populate /dev/ with a few important bits */ + for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + if (virAsprintf(&path, "/proc/%llu/root/%s", + (unsigned long long)ctrl->initpid, + devs[i].path) < 0) { + virReportOOMError(); + goto out; + } + + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, devs[i].mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + devs[i].path); + goto out; + } + } + + if (virAsprintf(&ptmx, "/proc/%llu/root/dev/pts/ptmx", + (unsigned long long)ctrl->initpid) < 0) { + virReportOOMError(); + goto out; + } + + if (access(ptmx, W_OK)) { + VIR_FREE(path); + + if (virAsprintf(&path, "/proc/%llu/root/dev/ptmx", + (unsigned long long)ctrl->initpid) < 0) { + virReportOOMError(); + goto out; + } + /* Legacy devpts, so we need to just use shared one */ + dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, 0666)) { + virReportSystemError(errno, _("Failed to make device %s"), path); + goto out; + } + } + + ret = 0; +out: + VIR_FREE(ptmx); + VIR_FREE(path); + return ret; +} /** * virLXCControllerMoveInterfaces @@ -1552,6 +1619,16 @@ virLXCControllerRun(virLXCControllerPtr ctrl) goto cleanup; } + /* Populate devices for container */ + if (virLXCControllerPopulateDevices(ctrl) < 0) + goto cleanup; + + if (lxcContainerSendContinue(control[0]) < 0) { + virReportSystemError(errno, "%s", + _("Unable to send container continue message")); + goto cleanup; + } + /* Now the container is fully setup... */ /* ...and reduce our privileges */ -- 1.8.1.4

----- Ursprüngliche Mail -----
user namespace doesn't allow to create devices in uninit userns. We should create devices on host side.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 47 +++++++---------------------- src/lxc/lxc_controller.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 37 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index eabaa34..540246f 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -833,22 +833,10 @@ cleanup: return ret; }
-static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) +static int lxcContainerSetupDevices(char **ttyPaths, size_t nttyPaths) { size_t i; const struct { - int maj; - int min; - mode_t mode; - const char *path; - } devs[] = { - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, - }; - const struct { const char *src; const char *dst; } links[] = { @@ -858,18 +846,6 @@ static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) { "/proc/self/fd", "/dev/fd" }, };
- /* Populate /dev/ with a few important bits */ - for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { - dev_t dev = makedev(devs[i].maj, devs[i].min); - if (mknod(devs[i].path, S_IFCHR, dev) < 0 || - chmod(devs[i].path, devs[i].mode)) { - virReportSystemError(errno, - _("Failed to make device %s"), - devs[i].path); - return -1; - } - } - for (i = 0 ; i < ARRAY_CARDINALITY(links) ; i++) { if (symlink(links[i].src, links[i].dst) < 0) { virReportSystemError(errno, @@ -882,22 +858,13 @@ static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) if (access("/dev/pts/ptmx", W_OK) == 0) { /* We have private devpts capability, so bind that */ if (virFileTouch("/dev/ptmx", 0666) < 0) - return -1; + return -1;
if (mount("/dev/pts/ptmx", "/dev/ptmx", "ptmx", MS_BIND, NULL) < 0) { virReportSystemError(errno, "%s", _("Failed to bind /dev/pts/ptmx on to /dev/ptmx")); return -1; } - } else { - /* Legacy devpts, so we need to just use shared one */ - dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX); - if (mknod("/dev/ptmx", S_IFCHR, dev) < 0 || - chmod("/dev/ptmx", 0666)) { - virReportSystemError(errno, "%s", - _("Failed to make device /dev/ptmx")); - return -1; - } }
for (i = 0 ; i < nttyPaths ; i++) { @@ -1825,8 +1792,8 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, if (lxcContainerMountFSDevPTS(vmDef, "/.oldroot") < 0) goto cleanup;
- /* Populates device nodes in /dev/ */ - if (lxcContainerPopulateDevices(ttyPaths, nttyPaths) < 0) + /* Sets up device nodes in /dev/ */ + if (lxcContainerSetupDevices(ttyPaths, nttyPaths) < 0) goto cleanup;
/* Sets up any non-root mounts from guest config */ @@ -2037,6 +2004,12 @@ static int lxcContainerChild(void *data) goto cleanup; }
+ if (lxcContainerWaitForContinue(argv->monitor) < 0) { + virReportSystemError(errno, "%s", + _("Failed to read the container continue message")); + goto cleanup; + } + ret = 0; cleanup: VIR_FREE(ttyPath); diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index e9b90bf..2072e9a 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1103,6 +1103,73 @@ cleanup: }
+static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) +{ + size_t i; + int ret = -1; + char *ptmx = NULL; + char *path = NULL; + const struct { + int maj; + int min; + mode_t mode; + const char *path; + } devs[] = { + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, + }; + + /* Populate /dev/ with a few important bits */ + for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + if (virAsprintf(&path, "/proc/%llu/root/%s", + (unsigned long long)ctrl->initpid, + devs[i].path) < 0) { + virReportOOMError(); + goto out; + } + + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, devs[i].mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + devs[i].path); + goto out; + } + } + + if (virAsprintf(&ptmx, "/proc/%llu/root/dev/pts/ptmx", + (unsigned long long)ctrl->initpid) < 0) { + virReportOOMError(); + goto out; + } + + if (access(ptmx, W_OK)) { + VIR_FREE(path); + + if (virAsprintf(&path, "/proc/%llu/root/dev/ptmx", + (unsigned long long)ctrl->initpid) < 0) { + virReportOOMError(); + goto out; + } + /* Legacy devpts, so we need to just use shared one */ + dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, 0666)) { + virReportSystemError(errno, _("Failed to make device %s"), path); + goto out; + } + } + + ret = 0; +out: + VIR_FREE(ptmx); + VIR_FREE(path); + return ret; +}
/** * virLXCControllerMoveInterfaces @@ -1552,6 +1619,16 @@ virLXCControllerRun(virLXCControllerPtr ctrl) goto cleanup; }
+ /* Populate devices for container */ + if (virLXCControllerPopulateDevices(ctrl) < 0) + goto cleanup;
systemd will be not happy with this. If /dev/ is not already a mountpoint it will try to mount devtmpfs which will fail an systemd stops booting. It would be nice to have: a) A way to disable virLXCControllerPopulateDevices(), i.e. one bind mounts his own lxc-/dev to /dev b) libvirt mounts a tmpfs onto /dev and then creates the devices nodes. Thanks, //richard

On 05/10/2013 06:28 PM, Richard RW. Weinberger wrote:
----- Ursprüngliche Mail -----
user namespace doesn't allow to create devices in uninit userns. We should create devices on host side.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 47 +++++++---------------------- src/lxc/lxc_controller.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 37 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index eabaa34..540246f 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -833,22 +833,10 @@ cleanup: return ret; }
-static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) +static int lxcContainerSetupDevices(char **ttyPaths, size_t nttyPaths) { size_t i; const struct { - int maj; - int min; - mode_t mode; - const char *path; - } devs[] = { - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, - { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, - }; - const struct { const char *src; const char *dst; } links[] = { @@ -858,18 +846,6 @@ static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) { "/proc/self/fd", "/dev/fd" }, };
- /* Populate /dev/ with a few important bits */ - for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { - dev_t dev = makedev(devs[i].maj, devs[i].min); - if (mknod(devs[i].path, S_IFCHR, dev) < 0 || - chmod(devs[i].path, devs[i].mode)) { - virReportSystemError(errno, - _("Failed to make device %s"), - devs[i].path); - return -1; - } - } - for (i = 0 ; i < ARRAY_CARDINALITY(links) ; i++) { if (symlink(links[i].src, links[i].dst) < 0) { virReportSystemError(errno, @@ -882,22 +858,13 @@ static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths) if (access("/dev/pts/ptmx", W_OK) == 0) { /* We have private devpts capability, so bind that */ if (virFileTouch("/dev/ptmx", 0666) < 0) - return -1; + return -1;
if (mount("/dev/pts/ptmx", "/dev/ptmx", "ptmx", MS_BIND, NULL) < 0) { virReportSystemError(errno, "%s", _("Failed to bind /dev/pts/ptmx on to /dev/ptmx")); return -1; } - } else { - /* Legacy devpts, so we need to just use shared one */ - dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX); - if (mknod("/dev/ptmx", S_IFCHR, dev) < 0 || - chmod("/dev/ptmx", 0666)) { - virReportSystemError(errno, "%s", - _("Failed to make device /dev/ptmx")); - return -1; - } }
for (i = 0 ; i < nttyPaths ; i++) { @@ -1825,8 +1792,8 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, if (lxcContainerMountFSDevPTS(vmDef, "/.oldroot") < 0) goto cleanup;
- /* Populates device nodes in /dev/ */ - if (lxcContainerPopulateDevices(ttyPaths, nttyPaths) < 0) + /* Sets up device nodes in /dev/ */ + if (lxcContainerSetupDevices(ttyPaths, nttyPaths) < 0) goto cleanup;
/* Sets up any non-root mounts from guest config */ @@ -2037,6 +2004,12 @@ static int lxcContainerChild(void *data) goto cleanup; }
+ if (lxcContainerWaitForContinue(argv->monitor) < 0) { + virReportSystemError(errno, "%s", + _("Failed to read the container continue message")); + goto cleanup; + } + ret = 0; cleanup: VIR_FREE(ttyPath); diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index e9b90bf..2072e9a 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1103,6 +1103,73 @@ cleanup: }
+static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) +{ + size_t i; + int ret = -1; + char *ptmx = NULL; + char *path = NULL; + const struct { + int maj; + int min; + mode_t mode; + const char *path; + } devs[] = { + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, + }; + + /* Populate /dev/ with a few important bits */ + for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + if (virAsprintf(&path, "/proc/%llu/root/%s", + (unsigned long long)ctrl->initpid, + devs[i].path) < 0) { + virReportOOMError(); + goto out; + } + + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, devs[i].mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + devs[i].path); + goto out; + } + } + + if (virAsprintf(&ptmx, "/proc/%llu/root/dev/pts/ptmx", + (unsigned long long)ctrl->initpid) < 0) { + virReportOOMError(); + goto out; + } + + if (access(ptmx, W_OK)) { + VIR_FREE(path); + + if (virAsprintf(&path, "/proc/%llu/root/dev/ptmx", + (unsigned long long)ctrl->initpid) < 0) { + virReportOOMError(); + goto out; + } + /* Legacy devpts, so we need to just use shared one */ + dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, 0666)) { + virReportSystemError(errno, _("Failed to make device %s"), path); + goto out; + } + } + + ret = 0; +out: + VIR_FREE(ptmx); + VIR_FREE(path); + return ret; +}
/** * virLXCControllerMoveInterfaces @@ -1552,6 +1619,16 @@ virLXCControllerRun(virLXCControllerPtr ctrl) goto cleanup; }
+ /* Populate devices for container */ + if (virLXCControllerPopulateDevices(ctrl) < 0) + goto cleanup;
systemd will be not happy with this. If /dev/ is not already a mountpoint it will try to mount devtmpfs which will fail an systemd stops booting.
It would be nice to have: a) A way to disable virLXCControllerPopulateDevices(), i.e. one bind mounts his own lxc-/dev to /dev b) libvirt mounts a tmpfs onto /dev and then creates the devices nodes.
Thanks for you pointing it out, I will pay attention to this problem in my next round patchset. Thanks, Gao

On Fri, May 10, 2013 at 05:58:14PM +0800, Gao feng wrote:
user namespace doesn't allow to create devices in uninit userns. We should create devices on host side.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 47 +++++++---------------------- src/lxc/lxc_controller.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 37 deletions(-)
diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index e9b90bf..2072e9a 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1103,6 +1103,73 @@ cleanup: }
+static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) +{ + size_t i; + int ret = -1; + char *ptmx = NULL; + char *path = NULL; + const struct { + int maj; + int min; + mode_t mode; + const char *path; + } devs[] = { + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, + }; + + /* Populate /dev/ with a few important bits */ + for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + if (virAsprintf(&path, "/proc/%llu/root/%s", + (unsigned long long)ctrl->initpid, + devs[i].path) < 0) { + virReportOOMError(); + goto out; + } + + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, devs[i].mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + devs[i].path); + goto out; + } + } + + if (virAsprintf(&ptmx, "/proc/%llu/root/dev/pts/ptmx", + (unsigned long long)ctrl->initpid) < 0) { + virReportOOMError();
It is really non-obvious that this code is not being run until the container has started. IMHO rather than playing games with the /proc/$PID/root/dev link, you should make the lxc_controller.c code responsible for mounting the /dev tmpfs somewhere, and populate it before any of the lxc_container code even runs. Then the lxc_container code can simply MS_MOVE the pre-populate /dev to the right place when it starts. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 05/10/2013 06:42 PM, Daniel P. Berrange wrote:
On Fri, May 10, 2013 at 05:58:14PM +0800, Gao feng wrote:
user namespace doesn't allow to create devices in uninit userns. We should create devices on host side.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_container.c | 47 +++++++---------------------- src/lxc/lxc_controller.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 37 deletions(-)
diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index e9b90bf..2072e9a 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1103,6 +1103,73 @@ cleanup: }
+static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) +{ + size_t i; + int ret = -1; + char *ptmx = NULL; + char *path = NULL; + const struct { + int maj; + int min; + mode_t mode; + const char *path; + } devs[] = { + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, + { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, + }; + + /* Populate /dev/ with a few important bits */ + for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + if (virAsprintf(&path, "/proc/%llu/root/%s", + (unsigned long long)ctrl->initpid, + devs[i].path) < 0) { + virReportOOMError(); + goto out; + } + + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, devs[i].mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + devs[i].path); + goto out; + } + } + + if (virAsprintf(&ptmx, "/proc/%llu/root/dev/pts/ptmx", + (unsigned long long)ctrl->initpid) < 0) { + virReportOOMError();
It is really non-obvious that this code is not being run until the container has started. IMHO rather than playing games with the /proc/$PID/root/dev link, you should make the lxc_controller.c code responsible for mounting the /dev tmpfs somewhere, and populate it before any of the lxc_container code even runs. Then the lxc_container code can simply MS_MOVE the pre-populate /dev to the right place when it starts.
Good idea, I will try it this way. Thanks! Gao

Since this devices are created for the container. the owner should be the root user of the container. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_controller.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 2072e9a..f7bdf54 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1109,6 +1109,9 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) int ret = -1; char *ptmx = NULL; char *path = NULL; + uid_t uid = -1; + gid_t gid = -1; + bool userns_enabled = false; const struct { int maj; int min; @@ -1122,6 +1125,12 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, }; + if (ctrl->def->idmap.uidmap && ctrl->def->idmap.gidmap) { + uid = ctrl->def->idmap.uidmap[0].target; + gid = ctrl->def->idmap.gidmap[0].target; + userns_enabled = true; + } + /* Populate /dev/ with a few important bits */ for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { if (virAsprintf(&path, "/proc/%llu/root/%s", @@ -1139,6 +1148,14 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) devs[i].path); goto out; } + + if (userns_enabled && (chown(path, uid, gid) < 0)) { + virReportSystemError(errno, + _("Failed to change owner of device" + " %s to %u:%u"), + devs[i].path, uid, gid); + goto out; + } } if (virAsprintf(&ptmx, "/proc/%llu/root/dev/pts/ptmx", @@ -1162,6 +1179,14 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) virReportSystemError(errno, _("Failed to make device %s"), path); goto out; } + + if (userns_enabled && (chown(path, uid, gid) < 0)) { + virReportSystemError(errno, + _("Failed to change owner of device" + " %s to %u:%u"), + path, uid, gid); + goto out; + } } ret = 0; -- 1.8.1.4

Since these tty devices will be used by container, the owner of them should be the root user of container. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_controller.c | 46 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index f7bdf54..31c7cd5 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1288,13 +1288,14 @@ static int lxcSetPersonality(virDomainDefPtr def) * *TTYNAME. Heavily borrowed from glibc, but doesn't require that * devpts == "/dev/pts" */ static int -lxcCreateTty(char *ptmx, int *ttymaster, char **ttyName) +lxcCreateTty(virLXCControllerPtr ctrl, int *ttymaster, + char **ttyName, char **ttyHostPath) { int ret = -1; int ptyno; int unlock = 0; - if ((*ttymaster = open(ptmx, O_RDWR|O_NOCTTY|O_NONBLOCK)) < 0) + if ((*ttymaster = open(ctrl->devptmx, O_RDWR|O_NOCTTY|O_NONBLOCK)) < 0) goto cleanup; if (ioctl(*ttymaster, TIOCSPTLCK, &unlock) < 0) @@ -1315,6 +1316,15 @@ lxcCreateTty(char *ptmx, int *ttymaster, char **ttyName) goto cleanup; } + /* Change the owner of this new created tty device to the root + * user of container. */ + if (virAsprintf(ttyHostPath, "%s/%s.devpts/%d", LXC_STATE_DIR, + ctrl->def->name, ptyno) < 0) { + virReportOOMError(); + errno = ENOMEM; + goto cleanup; + } + ret = 0; cleanup: @@ -1452,18 +1462,44 @@ virLXCControllerSetupConsoles(virLXCControllerPtr ctrl, char **containerTTYPaths) { size_t i; + int ret = -1; + uid_t uid = -1; + gid_t gid = -1; + bool userns_enabled = false; + char *ttyHostPath = NULL; + + if (ctrl->def->idmap.uidmap && ctrl->def->idmap.gidmap) { + uid = ctrl->def->idmap.uidmap[0].target; + gid = ctrl->def->idmap.gidmap[0].target; + userns_enabled = true; + } for (i = 0 ; i < ctrl->nconsoles ; i++) { VIR_DEBUG("Opening tty on private %s", ctrl->devptmx); - if (lxcCreateTty(ctrl->devptmx, + if (lxcCreateTty(ctrl, &ctrl->consoles[i].contFd, - &containerTTYPaths[i]) < 0) { + &containerTTYPaths[i], &ttyHostPath) < 0) { virReportSystemError(errno, "%s", _("Failed to allocate tty")); return -1; } + + /* Change the owner of this new created tty device to the root + * user of container. */ + if (userns_enabled && (chown(ttyHostPath, uid, gid) < 0)) { + virReportSystemError(errno, + _("Failed to change owner of tty" + " %s to %u:%u"), + ttyHostPath, uid, gid); + goto out; + } + VIR_FREE(ttyHostPath); } - return 0; + + ret = 0; +out: + VIR_FREE(ttyHostPath); + return ret; } -- 1.8.1.4

Otherwise we will fail to mount the meminfo file. This patch also allows any users to access the fuse mount point. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> --- src/lxc/lxc_fuse.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lxc/lxc_fuse.c b/src/lxc/lxc_fuse.c index b6df99c..ddb737c 100644 --- a/src/lxc/lxc_fuse.c +++ b/src/lxc/lxc_fuse.c @@ -49,6 +49,9 @@ static int lxcProcGetattr(const char *path, struct stat *stbuf) char *mempath = NULL; struct stat sb; + struct fuse_context *context = fuse_get_context(); + virDomainDefPtr def = (virDomainDefPtr)context->private_data; + memset(stbuf, 0, sizeof(struct stat)); if (virAsprintf(&mempath, "/proc/%s", path) < 0) { virReportOOMError(); @@ -66,6 +69,8 @@ static int lxcProcGetattr(const char *path, struct stat *stbuf) goto cleanup; } + stbuf->st_uid = def->idmap.uidmap ? def->idmap.uidmap[0].target : 0; + stbuf->st_gid = def->idmap.gidmap ? def->idmap.gidmap[0].target : 0; stbuf->st_mode = sb.st_mode; stbuf->st_nlink = 1; stbuf->st_blksize = sb.st_blksize; @@ -307,6 +312,7 @@ int lxcSetupFuse(virLXCFusePtr *f, virDomainDefPtr def) /* process name is libvirt_lxc */ if (fuse_opt_add_arg(&args, "libvirt_lxc") == -1 || fuse_opt_add_arg(&args, "-odirect_io") == -1 || + fuse_opt_add_arg(&args, "-oallow_other") == -1 || fuse_opt_add_arg(&args, "-ofsname=libvirt") == -1) goto cleanup1; -- 1.8.1.4
participants (3)
-
Daniel P. Berrange
-
Gao feng
-
Richard RW. Weinberger