Hello Michal,
Thank you for taking the time to review my patch, you made some very good
points and helped me discover features of the code base I didn't know.
To answer you question: systemd can still do whatever it wants within the
container this won't change that. Setting of the rlimits happens before the
<init> of the container is even exec'd. So my use case for writing this
patch was that we were trying to run things that required large numbers of
open files, like apache, but when the containers started they inherited the
RLIMIT_NOFILE limits from the system init which is set conservatively low.
In a privileged container it would be no problem to increase that limit,
because you can run things as root (id=0) and you have the capability.
However we use <idmap> so that we don't have privileged root containers
running and as a unprivileged user you can only decrease your limit, not
increase them. This patch allows me to set the starting RLLIMIT_NOFILE
limits I want at container boot and can be changed per container.
In my next reply you will find my new patch addressing your criticisms.
Thanks
Ryan
On Wed, Dec 10, 2014 at 8:18 AM, Michal Privoznik <mprivozn(a)redhat.com>
wrote:
On 07.12.2014 04:17, Ryan Cleere wrote:
> ---
> src/conf/domain_conf.c | 115
+++++++++++++++++++++++++++++++++++++++++++++++
> src/conf/domain_conf.h | 12 +++++
> src/lxc/lxc_controller.c | 12 +++++
> src/util/virprocess.c | 4 +-
> src/util/virprocess.h | 3 ++
> 5 files changed, 144 insertions(+), 2 deletions(-)
First of all, how would this play with systemd? I mean, what if systemd
within the container wants to set up its own limits?
Then, you are extending the domain XML space without any documentation.
All XML changes must go hand in hand with documentation and RNG schema
change too.
>
> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
> index 2d81c37..a673dc2 100644
> --- a/src/conf/domain_conf.c
> +++ b/src/conf/domain_conf.c
> @@ -26,6 +26,7 @@
> #include <dirent.h>
> #include <fcntl.h>
> #include <strings.h>
> +#include <sys/resource.h>
> #include <sys/stat.h>
> #include <unistd.h>
>
> @@ -1003,7 +1004,86 @@ virDomainBlkioDeviceParseXML(xmlNodePtr root,
> return -1;
> }
>
> +static virDomainRLimitsPtr
> +virDomainRLimitsNew(void)
> +{
> + virDomainRLimitsPtr def = NULL;
> +
> + if (VIR_ALLOC(def) < 0)
> + return NULL;
> +
> + return def;
> +}
What's the reason for this function? I mean, unless it is setting some
defauls, or doing something different than VIR_ALLOC() I don't think we
need that. Esp. if there's no Free() counterpart.
> +
> +static virDomainRLimitsPtr
> +virDomainRLimitParseXML(xmlNodePtr node)
> +{
> + char *c = NULL;
> + long long val;
> + virDomainRLimitsPtr def;
>
> + if (!(def = virDomainRLimitsNew()))
> + return NULL;
> +
> + if (node->type == XML_ELEMENT_NODE) {
> + c = (char *)xmlNodeGetContent(node);
> + if (virStrToLong_ll(c, NULL, 10, &val) < 0) {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
> + _("could not parse rlimit value of %s"),
> + c);
> + goto error;
> + }
> + VIR_FREE(c);
> +
> + def->limit = val;
> + if (VIR_STRDUP(def->name, (char *)node->name) < 0)
> + goto error;
And here we go. If this VIR_STRDUP succeeds but the node->name is not
known to the following huge if-else chunk, the string is leaked. Moreover,
the def->name is just string translation of def->resource. The translation
is static so I think you should use VIR_ENUM_{DECL,IMPL} macros rather than
the following.
> +
> + if (xmlStrEqual(node->name, BAD_CAST "as")) {
> + def->resource = RLIMIT_AS;
> + } else if (xmlStrEqual(node->name, BAD_CAST "core")) {
> + def->resource = RLIMIT_CORE;
> + } else if (xmlStrEqual(node->name, BAD_CAST "cpu")) {
> + def->resource = RLIMIT_CPU;
> + } else if (xmlStrEqual(node->name, BAD_CAST "data")) {
> + def->resource = RLIMIT_DATA;
> + } else if (xmlStrEqual(node->name, BAD_CAST "fsize")) {
> + def->resource = RLIMIT_FSIZE;
> + } else if (xmlStrEqual(node->name, BAD_CAST "locks")) {
> + def->resource = RLIMIT_LOCKS;
> + } else if (xmlStrEqual(node->name, BAD_CAST "memlock")) {
> + def->resource = RLIMIT_MEMLOCK;
> + } else if (xmlStrEqual(node->name, BAD_CAST "msgqueue")) {
> + def->resource = RLIMIT_MSGQUEUE;
> + } else if (xmlStrEqual(node->name, BAD_CAST "nice")) {
> + def->resource = RLIMIT_NICE;
> + } else if (xmlStrEqual(node->name, BAD_CAST "nofile")) {
> + def->resource = RLIMIT_NOFILE;
> + } else if (xmlStrEqual(node->name, BAD_CAST "nproc")) {
> + def->resource = RLIMIT_NPROC;
> + } else if (xmlStrEqual(node->name, BAD_CAST "rss")) {
> + def->resource = RLIMIT_RSS;
> + } else if (xmlStrEqual(node->name, BAD_CAST "rtprio")) {
> + def->resource = RLIMIT_RTPRIO;
> + } else if (xmlStrEqual(node->name, BAD_CAST "sigpending")) {
> + def->resource = RLIMIT_SIGPENDING;
> + } else if (xmlStrEqual(node->name, BAD_CAST "stack")) {
> + def->resource = RLIMIT_STACK;
So, if you go with VIR_ENUM_* that I've mentioned above, this code would
boil down to:
if ((def->resource = virRLimitTypeFromString((const char
*)node->name)) < 0) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
_("could not determine resource type of
'%s'"),
node->name);
goto error;
}
> + } else {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
> + _("could not determine resource type of
'%s'"),
> + node->name);
> + goto error;
The def->name is allocated here...
> + }
> + }
> +
> + return def;
> +
> + error:
> + VIR_FREE(c);
> + VIR_FREE(def);
.. and leaked here.
> + return NULL;
> +}
>
> static void
> virDomainObjListDataFree(void *payload, const void *name
ATTRIBUTE_UNUSED)
> @@ -14180,6 +14260,28 @@ virDomainDefParseXML(xmlDocPtr xml,
>
> virHashFree(bootHash);
>
> + if ((node = virXPathNode("./rlimits[1]", ctxt)) != NULL && (n
=
virXMLChildElementCount(node)) > 0) {
> + xmlNodePtr cur = node->children;
> + if (n && VIR_ALLOC_N(def->rlimits, n) < 0)
> + goto error;
> +
> + for (i = 0; i < n; i++) {
> + if (!(def->rlimits[i] = virDomainRLimitParseXML(cur)))
> + goto error;
> + def->nrlimits++;
> + for (j = 0; j < i; j++) {
> + if (def->rlimits[j]->resource ==
def->rlimits[i]->resource) {
> + virReportError(VIR_ERR_XML_ERROR,
> + _("duplicate rlimit resources
'%s'"),
> + def->rlimits[j]->name);
> + goto error;
> + }
> + }
> + cur = cur->next;
> + }
> + }
> + VIR_FREE(node);
> +
> return def;
>
> error:
> @@ -19759,6 +19861,19 @@ virDomainDefFormatInternal(virDomainDefPtr def,
> goto error;
> }
>
> + if (def->nrlimits > 0) {
> + virBufferAddLit(buf, "<rlimits>\n");
> + virBufferAdjustIndent(buf, 2);
> + for (n = 0; n < def->nrlimits; n++) {
> + virBufferAsprintf(buf, "<%s>%lld</%s>\n",
> + def->rlimits[n]->name,
> + def->rlimits[n]->limit,
> + def->rlimits[n]->name);
Since we will have VIR_ENUM_* we have TypeToString() function too. So the
code can be rewritten to something like this:
virBufferAsprintf(buf, "<%s>%lld</%s>\n",
virRLimitTypeToString(def->rlimits[n]->resource),
def->rlimits[n]->limit,
virRLimitTypeToString(def->rlimits[n]->resource));
That way, the def->rlimits[n]->name is useless and can be dropped.
> + }
> + virBufferAdjustIndent(buf, -2);
> + virBufferAddLit(buf, "</rlimits>\n");
> + }
> +
> virBufferAdjustIndent(buf, -2);
> virBufferAddLit(buf, "</domain>\n");
>
> diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
> index 439f3c0..abad30e 100644
> --- a/src/conf/domain_conf.h
> +++ b/src/conf/domain_conf.h
> @@ -2021,6 +2021,15 @@ struct _virDomainPowerManagement {
> int s4;
> };
>
> +typedef struct _virDomainRLimits virDomainRLimits;
> +typedef virDomainRLimits *virDomainRLimitsPtr;
> +
> +struct _virDomainRLimits {
> + char *name;
> + int resource;
> + long long limit;
> +};
> +
> /*
> * Guest VM main configuration
> *
> @@ -2138,6 +2147,9 @@ struct _virDomainDef {
> size_t nshmems;
> virDomainShmemDefPtr *shmems;
>
> + size_t nrlimits;
> + virDomainRLimitsPtr *rlimits;
> +
> /* Only 1 */
> virDomainWatchdogDefPtr watchdog;
> virDomainMemballoonDefPtr memballoon;
> diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c
> index 53a2c8d..ef5551e 100644
> --- a/src/lxc/lxc_controller.c
> +++ b/src/lxc/lxc_controller.c
> @@ -2490,6 +2490,18 @@ int main(int argc, char *argv[])
> }
> }
>
> + VIR_INFO("nrlimits = %d", (int)ctrl->def->nrlimits);
> + if (ctrl->def->nrlimits > 0) {
> + struct rlimit rlim;
> + int n;
> + for (i = 0; i < ctrl->def->nrlimits; i++) {
> + rlim.rlim_max = rlim.rlim_cur =
ctrl->def->rlimits[i]->limit;
> + n = virProcessPrLimit(0, ctrl->def->rlimits[i]->resource,
&rlim);
> + if (n < 0)
> + goto cleanup;
> + }
> + }
> +
> rc = virLXCControllerRun(ctrl);
>
> cleanup:
> diff --git a/src/util/virprocess.c b/src/util/virprocess.c
> index 0c8a32f..9bb5370 100644
> --- a/src/util/virprocess.c
> +++ b/src/util/virprocess.c
> @@ -675,13 +675,13 @@ int virProcessSetNamespaces(size_t nfdlist,
> }
>
> #if HAVE_PRLIMIT
> -static int
> +int
> virProcessPrLimit(pid_t pid, int resource, struct rlimit *rlim)
> {
> return prlimit(pid, resource, rlim, NULL);
> }
> #elif HAVE_SETRLIMIT
> -static int
> +int
> virProcessPrLimit(pid_t pid ATTRIBUTE_UNUSED,
> int resource ATTRIBUTE_UNUSED,
> struct rlimit *rlim ATTRIBUTE_UNUSED)
> diff --git a/src/util/virprocess.h b/src/util/virprocess.h
> index bcaede5..045f8d4 100644
> --- a/src/util/virprocess.h
> +++ b/src/util/virprocess.h
> @@ -22,6 +22,7 @@
> #ifndef __VIR_PROCESS_H__
> # define __VIR_PROCESS_H__
>
> +# include <sys/resource.h>
> # include <sys/types.h>
>
> # include "internal.h"
> @@ -73,4 +74,6 @@ typedef int (*virProcessNamespaceCallback)(pid_t pid,
void *opaque);
> int virProcessRunInMountNamespace(pid_t pid,
> virProcessNamespaceCallback cb,
> void *opaque);
> +
> +int virProcessPrLimit(pid_t pid, int resource, struct rlimit *rlim);
> #endif /* __VIR_PROCESS_H__ */
>
I'm surprised the linker wasn't unhappy with this. You need to expose the
virProcessPrLimit in src/libvirt_private.syms too.
Looking forward to v2.
Michal