[libvirt] [PATCH 0/3] Optimize mass closing of FDs on child spwaning

Michal Privoznik

3 Jul 2019 3 Jul '19

4:19 p.m.

If the limit for max opened files is way bigger than the default (1024), say 3 orders bigger then spawning a child through virCommand can be expensive because we iterate over ALL FDs within the limit and close them. There's no need to that since we can learn the list of opened FDs from /proc/self/fd/. Michal Prívozník (3): virNetDevOpenvswitchInterfaceStats: Optimize for speed vircommand: Separate mass FD closing into a function virCommand: use procfs to learn opened FDs src/util/vircommand.c | 114 +++++++++++++++++++++++++++----- src/util/virnetdevopenvswitch.c | 111 ++++++++++++++++++++----------- 2 files changed, 170 insertions(+), 55 deletions(-) -- 2.21.0

Show replies by date

Michal Privoznik

3 Jul 3 Jul

4:19 p.m.

New subject: [libvirt] [PATCH 1/3] virNetDevOpenvswitchInterfaceStats: Optimize for speed

We run 'ovs-vsctl' nine times (first to find if interface is there and then eight times = for each stats member separately). This is very inefficient. I've found a way to run it once and with a bit of help from virJSON module we can parse out stats we need. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/virnetdevopenvswitch.c | 111 +++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 37 deletions(-) diff --git a/src/util/virnetdevopenvswitch.c b/src/util/virnetdevopenvswitch.c index c99ecfbf15..0fe64bedab 100644 --- a/src/util/virnetdevopenvswitch.c +++ b/src/util/virnetdevopenvswitch.c @@ -28,6 +28,7 @@ #include "virmacaddr.h" #include "virstring.h" #include "virlog.h" +#include "virjson.h" #define VIR_FROM_THIS VIR_FROM_NONE @@ -311,58 +312,94 @@ int virNetDevOpenvswitchInterfaceStats(const char *ifname, virDomainInterfaceStatsPtr stats) { - char *tmp; - bool gotStats = false; VIR_AUTOPTR(virCommand) cmd = NULL; VIR_AUTOFREE(char *) output = NULL; + VIR_AUTOPTR(virJSONValue) jsonStats = NULL; + virJSONValuePtr jsonMap = NULL; + size_t i; - /* Just ensure the interface exists in ovs */ cmd = virCommandNew(OVSVSCTL); virNetDevOpenvswitchAddTimeout(cmd); - virCommandAddArgList(cmd, "get", "Interface", ifname, "name", NULL); + virCommandAddArgList(cmd, "--if-exists", "--format=list", "--data=json", + "--no-headings", "--columns=statistics", "list", + "Interface", ifname, NULL); virCommandSetOutputBuffer(cmd, &output); - if (virCommandRun(cmd, NULL) < 0) { + /* The above command returns either: + * 1) empty string if @ifname doesn't exist, or + * 2) a JSON array, for instance: + * ["map",[["collisions",0],["rx_bytes",0],["rx_crc_err",0],["rx_dropped",0], + * ["rx_errors",0],["rx_frame_err",0],["rx_over_err",0],["rx_packets",0], + * ["tx_bytes",12406],["tx_dropped",0],["tx_errors",0],["tx_packets",173]]] + */ + + if (virCommandRun(cmd, NULL) < 0 || + STREQ_NULLABLE(output, "")) { /* no ovs-vsctl or interface 'ifname' doesn't exists in ovs */ virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Interface not found")); return -1; } -#define GET_STAT(name, member) \ - do { \ - VIR_FREE(output); \ - virCommandFree(cmd); \ - cmd = virCommandNew(OVSVSCTL); \ - virNetDevOpenvswitchAddTimeout(cmd); \ - virCommandAddArgList(cmd, "--if-exists", "get", "Interface", \ - ifname, "statistics:" name, NULL); \ - virCommandSetOutputBuffer(cmd, &output); \ - if (virCommandRun(cmd, NULL) < 0 || !output || !*output || *output == '\n') { \ - stats->member = -1; \ - } else { \ - if (virStrToLong_ll(output, &tmp, 10, &stats->member) < 0 || \ - *tmp != '\n') { \ - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", \ - _("Fail to parse ovs-vsctl output")); \ - return -1; \ - } \ - gotStats = true; \ - } \ - } while (0) + if (!(jsonStats = virJSONValueFromString(output)) || + !virJSONValueIsArray(jsonStats) || + !(jsonMap = virJSONValueArrayGet(jsonStats, 1))) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Unable to parse ovs-vsctl output")); + return -1; + } - /* The TX/RX fields appear to be swapped here - * because this is the host view. */ - GET_STAT("rx_bytes", tx_bytes); - GET_STAT("rx_packets", tx_packets); - GET_STAT("rx_errors", tx_errs); - GET_STAT("rx_dropped", tx_drop); - GET_STAT("tx_bytes", rx_bytes); - GET_STAT("tx_packets", rx_packets); - GET_STAT("tx_errors", rx_errs); - GET_STAT("tx_dropped", rx_drop); + stats->rx_bytes = stats->rx_packets = stats->rx_errs = stats->rx_drop = -1; + stats->tx_bytes = stats->tx_packets = stats->tx_errs = stats->tx_drop = -1; - if (!gotStats) { + for (i = 0; i < virJSONValueArraySize(jsonMap); i++) { + virJSONValuePtr item = virJSONValueArrayGet(jsonMap, i); + virJSONValuePtr jsonKey; + virJSONValuePtr jsonVal; + const char *key; + long long val; + + if (!item || + (!(jsonKey = virJSONValueArrayGet(item, 0))) || + (!(jsonVal = virJSONValueArrayGet(item, 1))) || + (!(key = virJSONValueGetString(jsonKey))) || + (virJSONValueGetNumberLong(jsonVal, &val) < 0)) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Malformed ovs-vsctl output")); + return -1; + } + + /* The TX/RX fields appear to be swapped here + * because this is the host view. */ + if (STREQ(key, "rx_bytes")) { + stats->tx_bytes = val; + } else if (STREQ(key, "rx_packets")) { + stats->tx_packets = val; + } else if (STREQ(key, "rx_errors")) { + stats->tx_errs = val; + } else if (STREQ(key, "rx_dropped")) { + stats->tx_drop = val; + } else if (STREQ(key, "tx_bytes")) { + stats->rx_bytes = val; + } else if (STREQ(key, "tx_packets")) { + stats->rx_packets = val; + } else if (STREQ(key, "tx_errors")) { + stats->rx_errs = val; + } else if (STREQ(key, "tx_dropped")) { + stats->rx_drop = val; + } else { + VIR_DEBUG("Unused ovs-vsctl stat key=%s val=%lld", key, val); + } + } + + if (stats->rx_bytes == -1 && + stats->rx_packets == -1 && + stats->rx_errs == -1 && + stats->rx_drop == -1 && + stats->tx_bytes == -1 && + stats->tx_packets == -1 && + stats->tx_errs == -1 && + stats->tx_drop == -1) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Interface doesn't have any statistics")); return -1; -- 2.21.0

Ján Tomko

13 Jul 13 Jul

1:28 a.m.

New subject: [libvirt] [PATCH 1/3] virNetDevOpenvswitchInterfaceStats: Optimize for speed

On Wed, Jul 03, 2019 at 09:19:18AM +0200, Michal Privoznik wrote:

...

We run 'ovs-vsctl' nine times (first to find if interface is there and then eight times = for each stats member separately). This is very inefficient. I've found a way to run it once and with a bit of help from virJSON module we can parse out stats we need.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/virnetdevopenvswitch.c | 111 +++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 37 deletions(-)

diff --git a/src/util/virnetdevopenvswitch.c b/src/util/virnetdevopenvswitch.c index c99ecfbf15..0fe64bedab 100644 --- a/src/util/virnetdevopenvswitch.c +++ b/src/util/virnetdevopenvswitch.c @@ -28,6 +28,7 @@ #include "virmacaddr.h" #include "virstring.h" #include "virlog.h" +#include "virjson.h"

#define VIR_FROM_THIS VIR_FROM_NONE

@@ -311,58 +312,94 @@ int virNetDevOpenvswitchInterfaceStats(const char *ifname, virDomainInterfaceStatsPtr stats) { - char *tmp; - bool gotStats = false; VIR_AUTOPTR(virCommand) cmd = NULL; VIR_AUTOFREE(char *) output = NULL; + VIR_AUTOPTR(virJSONValue) jsonStats = NULL; + virJSONValuePtr jsonMap = NULL; + size_t i;

- /* Just ensure the interface exists in ovs */ cmd = virCommandNew(OVSVSCTL); virNetDevOpenvswitchAddTimeout(cmd); - virCommandAddArgList(cmd, "get", "Interface", ifname, "name", NULL); + virCommandAddArgList(cmd, "--if-exists", "--format=list", "--data=json", + "--no-headings", "--columns=statistics", "list", + "Interface", ifname, NULL);

Can we assume these options are present in all the supported versions of ovs-vsctl?

...

virCommandSetOutputBuffer(cmd, &output);

- if (virCommandRun(cmd, NULL) < 0) { + /* The above command returns either: + * 1) empty string if @ifname doesn't exist, or + * 2) a JSON array, for instance: + * ["map",[["collisions",0],["rx_bytes",0],["rx_crc_err",0],["rx_dropped",0], + * ["rx_errors",0],["rx_frame_err",0],["rx_over_err",0],["rx_packets",0], + * ["tx_bytes",12406],["tx_dropped",0],["tx_errors",0],["tx_packets",173]]] + */ +

This example would look better in tests/ where you test this function against actual output O:-)

...

+ if (virCommandRun(cmd, NULL) < 0 || + STREQ_NULLABLE(output, "")) { /* no ovs-vsctl or interface 'ifname' doesn't exists in ovs */ virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Interface not found")); return -1; }

Jano

Michal Prívozník

3:59 p.m.

New subject: [libvirt] [PATCH 1/3] virNetDevOpenvswitchInterfaceStats: Optimize for speed

On 7/12/19 6:28 PM, Ján Tomko wrote:

...

On Wed, Jul 03, 2019 at 09:19:18AM +0200, Michal Privoznik wrote:

...
We run 'ovs-vsctl' nine times (first to find if interface is there and then eight times = for each stats member separately). This is very inefficient. I've found a way to run it once and with a bit of help from virJSON module we can parse out stats we need.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/virnetdevopenvswitch.c | 111 +++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 37 deletions(-)

diff --git a/src/util/virnetdevopenvswitch.c b/src/util/virnetdevopenvswitch.c index c99ecfbf15..0fe64bedab 100644 --- a/src/util/virnetdevopenvswitch.c +++ b/src/util/virnetdevopenvswitch.c @@ -28,6 +28,7 @@ #include "virmacaddr.h" #include "virstring.h" #include "virlog.h" +#include "virjson.h"

#define VIR_FROM_THIS VIR_FROM_NONE

@@ -311,58 +312,94 @@ int virNetDevOpenvswitchInterfaceStats(const char *ifname, virDomainInterfaceStatsPtr stats) { - char *tmp; - bool gotStats = false; VIR_AUTOPTR(virCommand) cmd = NULL; VIR_AUTOFREE(char *) output = NULL; + VIR_AUTOPTR(virJSONValue) jsonStats = NULL; + virJSONValuePtr jsonMap = NULL; + size_t i;

- /* Just ensure the interface exists in ovs */ cmd = virCommandNew(OVSVSCTL); virNetDevOpenvswitchAddTimeout(cmd); - virCommandAddArgList(cmd, "get", "Interface", ifname, "name", NULL); + virCommandAddArgList(cmd, "--if-exists", "--format=list", "--data=json", + "--no-headings", "--columns=statistics", "list", + "Interface", ifname, NULL);

Can we assume these options are present in all the supported versions of ovs-vsctl?

Rough skim through ovs' git log shows that these were added starting from v1.1.0pre1~233 till v1.1.0~294 (mid of 2010 - beginnig of 2011). Therefore, I believe it's safe to use them. And if there's somebody running 10 years old ovs with the latest libvirt we can tell them to update.

...

...
    virCommandSetOutputBuffer(cmd, &output);

-    if (virCommandRun(cmd, NULL) < 0) { +    /* The above command returns either: +     * 1) empty string if @ifname doesn't exist, or +     * 2) a JSON array, for instance: +     *    ["map",[["collisions",0],["rx_bytes",0],["rx_crc_err",0],["rx_dropped",0],

+     *    ["rx_errors",0],["rx_frame_err",0],["rx_over_err",0],["rx_packets",0], +     *    ["tx_bytes",12406],["tx_dropped",0],["tx_errors",0],["tx_packets",173]]] +     */ +

This example would look better in tests/ where you test this function against actual output O:-)

Well, we don't have virNetDevOpenvswitch test or anything. But even if we did, the test case would work over the output we obtained at some point in the time. The input would not change. So effectively, we would be testing only our JSON parsing skills and not if the function fetches the correct data. Sorry. Michal

Michal Privoznik

3 Jul 3 Jul

4:19 p.m.

New subject: [libvirt] [PATCH 2/3] vircommand: Separate mass FD closing into a function

I will optimize this code a bit in the next commit. But for that it is better if the code lives in a separate function. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/vircommand.c | 52 ++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/src/util/vircommand.c b/src/util/vircommand.c index 8695c98d1b..faee36c449 100644 --- a/src/util/vircommand.c +++ b/src/util/vircommand.c @@ -491,6 +491,37 @@ virExecCommon(virCommandPtr cmd, gid_t *groups, int ngroups) return ret; } +static int +virCommandMassClose(virCommandPtr cmd, + int childin, + int childout, + int childerr) +{ + int openmax = sysconf(_SC_OPEN_MAX); + int fd; + int tmpfd; + + if (openmax < 0) { + virReportSystemError(errno, "%s", + _("sysconf(_SC_OPEN_MAX) failed")); + return -1; + } + + for (fd = 3; fd < openmax; fd++) { + if (fd == childin || fd == childout || fd == childerr) + continue; + if (!virCommandFDIsSet(cmd, fd)) { + tmpfd = fd; + VIR_MASS_CLOSE(tmpfd); + } else if (virSetInherit(fd, true) < 0) { + virReportSystemError(errno, _("failed to preserve fd %d"), fd); + return -1; + } + } + + return 0; +} + /* * virExec: * @cmd virCommandPtr containing all information about the program to @@ -500,13 +531,12 @@ static int virExec(virCommandPtr cmd) { pid_t pid; - int null = -1, fd, openmax; + int null = -1; int pipeout[2] = {-1, -1}; int pipeerr[2] = {-1, -1}; int childin = cmd->infd; int childout = -1; int childerr = -1; - int tmpfd; VIR_AUTOFREE(char *) binarystr = NULL; const char *binary = NULL; int ret; @@ -612,23 +642,9 @@ virExec(virCommandPtr cmd) if (cmd->mask) umask(cmd->mask); ret = EXIT_CANCELED; - openmax = sysconf(_SC_OPEN_MAX); - if (openmax < 0) { - virReportSystemError(errno, "%s", - _("sysconf(_SC_OPEN_MAX) failed")); + + if (virCommandMassClose(cmd, childin, childout, childerr) < 0) goto fork_error; - } - for (fd = 3; fd < openmax; fd++) { - if (fd == childin || fd == childout || fd == childerr) - continue; - if (!virCommandFDIsSet(cmd, fd)) { - tmpfd = fd; - VIR_MASS_CLOSE(tmpfd); - } else if (virSetInherit(fd, true) < 0) { - virReportSystemError(errno, _("failed to preserve fd %d"), fd); - goto fork_error; - } - } if (prepareStdFd(childin, STDIN_FILENO) < 0) { virReportSystemError(errno, -- 2.21.0

Ján Tomko

13 Jul 13 Jul

1:32 a.m.

New subject: [libvirt] [PATCH 2/3] vircommand: Separate mass FD closing into a function

On Wed, Jul 03, 2019 at 09:19:19AM +0200, Michal Privoznik wrote:

...

I will optimize this code a bit in the next commit. But for that it is better if the code lives in a separate function.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/vircommand.c | 52 ++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-)

Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

Michal Privoznik

3 Jul 3 Jul

4:19 p.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

When spawning a child process, between fork() and exec() we close all file descriptors and keep only those the caller wants us to pass onto the child. The problem is how we do that. Currently, we get the limit of opened files and then iterate through each one of them and either close() it or make it survive exec(). This approach is suboptimal (although, not that much in default configurations where the limit is pretty low - 1024). We have /proc where we can learn what FDs we hold open and thus we can selectively close only those. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/vircommand.c | 78 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/src/util/vircommand.c b/src/util/vircommand.c index faee36c449..d154bb81d4 100644 --- a/src/util/vircommand.c +++ b/src/util/vircommand.c @@ -491,27 +491,89 @@ virExecCommon(virCommandPtr cmd, gid_t *groups, int ngroups) return ret; } +# ifdef __linux__ +/* On Linux, we can utilize procfs and read the table of opened + * FDs and selectively close only those FDs we don't want to pass + * onto child process (well, the one we will exec soon since this + * is called from the child). */ +static int +virCommandMassCloseGetFDsLinux(virCommandPtr cmd ATTRIBUTE_UNUSED, + virBitmapPtr fds) +{ + DIR *dp = NULL; + struct dirent *entry; + const char *dirName = "/proc/self/fd"; + int rc; + int ret = -1; + + if (virDirOpen(&dp, dirName) < 0) + return -1; + + while ((rc = virDirRead(dp, &entry, dirName)) > 0) { + int fd; + + if (virStrToLong_i(entry->d_name, NULL, 10, &fd) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to parse FD: %s"), + entry->d_name); + goto cleanup; + } + + if (virBitmapSetBit(fds, fd) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to set FD as open: %d"), + fd); + goto cleanup; + } + } + + if (rc < 0) + goto cleanup; + + ret = 0; + cleanup: + VIR_DIR_CLOSE(dp); + return ret; +} + +# else /* !__linux__ */ + +static int +virCommandMassCloseGetFDsGeneric(virCommandPtr cmd ATTRIBUTE_UNUSED, + virBitmapPtr fds) +{ + virBitmapSetAll(fds); + return 0; +} +# endif /* !__linux__ */ + static int virCommandMassClose(virCommandPtr cmd, int childin, int childout, int childerr) { + VIR_AUTOPTR(virBitmap) fds = NULL; int openmax = sysconf(_SC_OPEN_MAX); - int fd; - int tmpfd; + int fd = -1; - if (openmax < 0) { - virReportSystemError(errno, "%s", - _("sysconf(_SC_OPEN_MAX) failed")); + if (!(fds = virBitmapNew(openmax))) return -1; - } - for (fd = 3; fd < openmax; fd++) { +# ifdef __linux__ + if (virCommandMassCloseGetFDsLinux(cmd, fds) < 0) + return -1; +# else + if (virCommandMassCloseGetFDsGeneric(cmd, fds) < 0) + return -1; +# endif + + fd = virBitmapNextSetBit(fds, -1); + for (; fd >= 0; fd = virBitmapNextSetBit(fds, fd)) { if (fd == childin || fd == childout || fd == childerr) continue; if (!virCommandFDIsSet(cmd, fd)) { - tmpfd = fd; + int tmpfd = fd; VIR_MASS_CLOSE(tmpfd); } else if (virSetInherit(fd, true) < 0) { virReportSystemError(errno, _("failed to preserve fd %d"), fd); -- 2.21.0

Eric Blake

13 Jul 13 Jul

1:55 a.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

On 7/3/19 2:19 AM, Michal Privoznik wrote:

...

When spawning a child process, between fork() and exec() we close all file descriptors and keep only those the caller wants us to pass onto the child. The problem is how we do that. Currently, we get the limit of opened files and then iterate through each one of them and either close() it or make it survive exec(). This approach is suboptimal (although, not that much in default configurations where the limit is pretty low - 1024). We have /proc where we can learn what FDs we hold open and thus we can selectively close only those.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/vircommand.c | 78 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 8 deletions(-)

...

+# ifdef __linux__ +/* On Linux, we can utilize procfs and read the table of opened + * FDs and selectively close only those FDs we don't want to pass + * onto child process (well, the one we will exec soon since this + * is called from the child). */ +static int +virCommandMassCloseGetFDsLinux(virCommandPtr cmd ATTRIBUTE_UNUSED, + virBitmapPtr fds) +{ + DIR *dp = NULL; + struct dirent *entry; + const char *dirName = "/proc/self/fd"; + int rc; + int ret = -1; + + if (virDirOpen(&dp, dirName) < 0) + return -1;

Unfortunately, POSIX says that opendir()/readdir() are not async-safe (the list of async-safe functions is https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta...; some implementations of opendir() call malloc(), and malloc() is definitely not async-safe - if you ever fork() in one thread while another thread is locked inside a malloc, your child process is deadlocked if it has to malloc because the forked process no longer has the thread to release the malloc lock). It also says readdir() is not threadafe (https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta...) Therefore, opendir() (hidden in virDirOpen) and readdir() (hidden in virDirRead) are generally unsafe to be used between fork() of a multi-threaded app and the subsequent exec(). But maybe you are safe because this code is only compiled on Linux? Are we absolutely sure this can't deadlock, in spite of violating POSIX constraints on async-safety? http://austingroupbugs.net/view.php?id=696 points out some interesting problems from the POSIX side - readdir_r() is absolutely broken, and readdir() being marked as thread-unsafe may be too strict. But then again, http://austingroupbugs.net/view.php?id=75 states "The application shall not modify the structure to which the return value of readdir() points, nor any storage areas pointed to by pointers within the structure. The returned pointer, and pointers within the structure, might be invalidated or the structure or the storage areas might be overwritten by a subsequent call to readdir() on the same directory stream. They shall not be affected by a call to readdir() on a different directory stream." where it may be the intent that if opendir() doesn't take any locks or other async-unsafe actions, using opendir() after fork() may be safe after all (readdir on a DIR* that was opened in the parent is not, but readdir() on a fresh DIR* opened in the child might be safe, even if not currently strictly portable).

...

+ + while ((rc = virDirRead(dp, &entry, dirName)) > 0) { + int fd; + + if (virStrToLong_i(entry->d_name, NULL, 10, &fd) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to parse FD: %s"), + entry->d_name); + goto cleanup; + } + + if (virBitmapSetBit(fds, fd) < 0) {

Is our use of virBitmap likewise avoiding use of malloc()?

...

+ virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to set FD as open: %d"), + fd); + goto cleanup; + } + } + + if (rc < 0) + goto cleanup; + + ret = 0; + cleanup: + VIR_DIR_CLOSE(dp); + return ret; +} + +# else /* !__linux__ */ + +static int +virCommandMassCloseGetFDsGeneric(virCommandPtr cmd ATTRIBUTE_UNUSED, + virBitmapPtr fds) +{ + virBitmapSetAll(fds); + return 0; +} +# endif /* !__linux__ */ + static int virCommandMassClose(virCommandPtr cmd, int childin, int childout, int childerr) { + VIR_AUTOPTR(virBitmap) fds = NULL; int openmax = sysconf(_SC_OPEN_MAX); - int fd; - int tmpfd; + int fd = -1;

- if (openmax < 0) { - virReportSystemError(errno, "%s", - _("sysconf(_SC_OPEN_MAX) failed")); + if (!(fds = virBitmapNew(openmax)))

Would it be better to pre-alloc the bitmap in the parent, prior to fork()ing, to ensure that we are not using malloc() in between fork and exec?

...

return -1; - }

- for (fd = 3; fd < openmax; fd++) { +# ifdef __linux__ + if (virCommandMassCloseGetFDsLinux(cmd, fds) < 0) + return -1; +# else + if (virCommandMassCloseGetFDsGeneric(cmd, fds) < 0) + return -1; +# endif + + fd = virBitmapNextSetBit(fds, -1); + for (; fd >= 0; fd = virBitmapNextSetBit(fds, fd)) { if (fd == childin || fd == childout || fd == childerr) continue; if (!virCommandFDIsSet(cmd, fd)) { - tmpfd = fd; + int tmpfd = fd; VIR_MASS_CLOSE(tmpfd); } else if (virSetInherit(fd, true) < 0) { virReportSystemError(errno, _("failed to preserve fd %d"), fd);

-- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org

Michal Prívozník

4:41 p.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

On 7/12/19 6:55 PM, Eric Blake wrote:

...

On 7/3/19 2:19 AM, Michal Privoznik wrote:

...
When spawning a child process, between fork() and exec() we close all file descriptors and keep only those the caller wants us to pass onto the child. The problem is how we do that. Currently, we get the limit of opened files and then iterate through each one of them and either close() it or make it survive exec(). This approach is suboptimal (although, not that much in default configurations where the limit is pretty low - 1024). We have /proc where we can learn what FDs we hold open and thus we can selectively close only those.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/vircommand.c | 78 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 8 deletions(-)

...
+# ifdef __linux__ +/* On Linux, we can utilize procfs and read the table of opened + * FDs and selectively close only those FDs we don't want to pass + * onto child process (well, the one we will exec soon since this + * is called from the child). */ +static int +virCommandMassCloseGetFDsLinux(virCommandPtr cmd ATTRIBUTE_UNUSED, + virBitmapPtr fds) +{ + DIR *dp = NULL; + struct dirent *entry; + const char *dirName = "/proc/self/fd"; + int rc; + int ret = -1; + + if (virDirOpen(&dp, dirName) < 0) + return -1;

Unfortunately, POSIX says that opendir()/readdir() are not async-safe (the list of async-safe functions is https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta...; some implementations of opendir() call malloc(), and malloc() is definitely not async-safe - if you ever fork() in one thread while another thread is locked inside a malloc, your child process is deadlocked if it has to malloc because the forked process no longer has the thread to release the malloc lock). It also says readdir() is not threadafe (https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta...)

Therefore, opendir() (hidden in virDirOpen) and readdir() (hidden in virDirRead) are generally unsafe to be used between fork() of a multi-threaded app and the subsequent exec(). But maybe you are safe because this code is only compiled on Linux? Are we absolutely sure this can't deadlock, in spite of violating POSIX constraints on async-safety?

http://austingroupbugs.net/view.php?id=696 points out some interesting problems from the POSIX side - readdir_r() is absolutely broken, and readdir() being marked as thread-unsafe may be too strict. But then again, http://austingroupbugs.net/view.php?id=75 states "The application shall not modify the structure to which the return value of readdir() points, nor any storage areas pointed to by pointers within the structure. The returned pointer, and pointers within the structure, might be invalidated or the structure or the storage areas might be overwritten by a subsequent call to readdir() on the same directory stream. They shall not be affected by a call to readdir() on a different directory stream." where it may be the intent that if opendir() doesn't take any locks or other async-unsafe actions, using opendir() after fork() may be safe after all (readdir on a DIR* that was opened in the parent is not, but readdir() on a fresh DIR* opened in the child might be safe, even if not currently strictly portable).

D'oh. POSIX and its limitations. I could allocate the bitmap in the parent, sure. But avoiding opendir() is not possible (that's the whole point of this patch). While testing this - on Linux - I did not run into any deadlock, but maybe I was just lucky. On the other hand, this is going to run only on Linux, on anything else we'll still iterate over all FDs. I find it rather surprising (in a disturbing way) that there is no better solution to this problem than iterating over all FDs and closing them. One by one. Michal

Eric Blake

14 Jul 14 Jul

12:33 a.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

[adding libc-help in cc] On 7/13/19 2:41 AM, Michal Prívozník wrote:

...

On 7/12/19 6:55 PM, Eric Blake wrote:

...
On 7/3/19 2:19 AM, Michal Privoznik wrote:

...
When spawning a child process, between fork() and exec() we close all file descriptors and keep only those the caller wants us to pass onto the child. The problem is how we do that. Currently, we get the limit of opened files and then iterate through each one of them and either close() it or make it survive exec(). This approach is suboptimal (although, not that much in default configurations where the limit is pretty low - 1024). We have /proc where we can learn what FDs we hold open and thus we can selectively close only those.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/vircommand.c | 78 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 8 deletions(-)

...
+# ifdef __linux__ +/* On Linux, we can utilize procfs and read the table of opened + * FDs and selectively close only those FDs we don't want to pass + * onto child process (well, the one we will exec soon since this + * is called from the child). */ +static int +virCommandMassCloseGetFDsLinux(virCommandPtr cmd ATTRIBUTE_UNUSED, + virBitmapPtr fds) +{ + DIR *dp = NULL; + struct dirent *entry; + const char *dirName = "/proc/self/fd"; + int rc; + int ret = -1; + + if (virDirOpen(&dp, dirName) < 0) + return -1;

Unfortunately, POSIX says that opendir()/readdir() are not async-safe (the list of async-safe functions is https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta...; some implementations of opendir() call malloc(), and malloc() is definitely not async-safe - if you ever fork() in one thread while another thread is locked inside a malloc, your child process is deadlocked if it has to malloc because the forked process no longer has the thread to release the malloc lock). It also says readdir() is not threadafe (https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta...)

Therefore, opendir() (hidden in virDirOpen) and readdir() (hidden in virDirRead) are generally unsafe to be used between fork() of a multi-threaded app and the subsequent exec(). But maybe you are safe because this code is only compiled on Linux? Are we absolutely sure this can't deadlock, in spite of violating POSIX constraints on async-safety?

http://austingroupbugs.net/view.php?id=696 points out some interesting problems from the POSIX side - readdir_r() is absolutely broken, and readdir() being marked as thread-unsafe may be too strict. But then again, http://austingroupbugs.net/view.php?id=75 states "The application shall not modify the structure to which the return value of readdir() points, nor any storage areas pointed to by pointers within the structure. The returned pointer, and pointers within the structure, might be invalidated or the structure or the storage areas might be overwritten by a subsequent call to readdir() on the same directory stream. They shall not be affected by a call to readdir() on a different directory stream." where it may be the intent that if opendir() doesn't take any locks or other async-unsafe actions, using opendir() after fork() may be safe after all (readdir on a DIR* that was opened in the parent is not, but readdir() on a fresh DIR* opened in the child might be safe, even if not currently strictly portable).

D'oh. POSIX and its limitations. I could allocate the bitmap in the parent, sure. But avoiding opendir() is not possible (that's the whole point of this patch). While testing this - on Linux - I did not run into any deadlock, but maybe I was just lucky. On the other hand, this is going to run only on Linux, on anything else we'll still iterate over all FDs.

I find it rather surprising (in a disturbing way) that there is no better solution to this problem than iterating over all FDs and closing them. One by one.

Does anyone know if glibc guarantees that opendir/readdir in between multi-threaded fork() and exec() is safe, even though POSIX does not guarantee that safety in general? I know that one approach to avoid having to close all fds is religiously using O_CLOEXEC everywhere (so that the race window of having an fd that would leak is not possible), but that's also an expensive audit, compared to just ensuring that things are closed after fork(). Are there any other ideas out there that we should be aware of (and no, pthread_atfork is not a sane idea)? (various BSD systems have the closefrom() syscall which is more efficient than lots of close() calls; and Linux may be adding something similar https://lwn.net/Articles/789023/), Is there any saner way to close all unneeded fds that were not properly marked O_CLOEXEC by an application linking against a multithreaded lib that must perform fork/exec? -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org

Michal Prívozník

2:11 a.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

On 7/13/19 5:33 PM, Eric Blake wrote:

...

[adding libc-help in cc]

On 7/13/19 2:41 AM, Michal Prívozník wrote:

...
On 7/12/19 6:55 PM, Eric Blake wrote:

...
On 7/3/19 2:19 AM, Michal Privoznik wrote:

...
When spawning a child process, between fork() and exec() we close all file descriptors and keep only those the caller wants us to pass onto the child. The problem is how we do that. Currently, we get the limit of opened files and then iterate through each one of them and either close() it or make it survive exec(). This approach is suboptimal (although, not that much in default configurations where the limit is pretty low - 1024). We have /proc where we can learn what FDs we hold open and thus we can selectively close only those.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/util/vircommand.c | 78 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 8 deletions(-)

...
+# ifdef __linux__ +/* On Linux, we can utilize procfs and read the table of opened + * FDs and selectively close only those FDs we don't want to pass + * onto child process (well, the one we will exec soon since this + * is called from the child). */ +static int +virCommandMassCloseGetFDsLinux(virCommandPtr cmd ATTRIBUTE_UNUSED, + virBitmapPtr fds) +{ + DIR *dp = NULL; + struct dirent *entry; + const char *dirName = "/proc/self/fd"; + int rc; + int ret = -1; + + if (virDirOpen(&dp, dirName) < 0) + return -1;

Unfortunately, POSIX says that opendir()/readdir() are not async-safe (the list of async-safe functions is https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta...; some implementations of opendir() call malloc(), and malloc() is definitely not async-safe - if you ever fork() in one thread while another thread is locked inside a malloc, your child process is deadlocked if it has to malloc because the forked process no longer has the thread to release the malloc lock). It also says readdir() is not threadafe (https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta...)

Therefore, opendir() (hidden in virDirOpen) and readdir() (hidden in virDirRead) are generally unsafe to be used between fork() of a multi-threaded app and the subsequent exec(). But maybe you are safe because this code is only compiled on Linux? Are we absolutely sure this can't deadlock, in spite of violating POSIX constraints on async-safety?

http://austingroupbugs.net/view.php?id=696 points out some interesting problems from the POSIX side - readdir_r() is absolutely broken, and readdir() being marked as thread-unsafe may be too strict. But then again, http://austingroupbugs.net/view.php?id=75 states "The application shall not modify the structure to which the return value of readdir() points, nor any storage areas pointed to by pointers within the structure. The returned pointer, and pointers within the structure, might be invalidated or the structure or the storage areas might be overwritten by a subsequent call to readdir() on the same directory stream. They shall not be affected by a call to readdir() on a different directory stream." where it may be the intent that if opendir() doesn't take any locks or other async-unsafe actions, using opendir() after fork() may be safe after all (readdir on a DIR* that was opened in the parent is not, but readdir() on a fresh DIR* opened in the child might be safe, even if not currently strictly portable).

D'oh. POSIX and its limitations. I could allocate the bitmap in the parent, sure. But avoiding opendir() is not possible (that's the whole point of this patch). While testing this - on Linux - I did not run into any deadlock, but maybe I was just lucky. On the other hand, this is going to run only on Linux, on anything else we'll still iterate over all FDs.

I find it rather surprising (in a disturbing way) that there is no better solution to this problem than iterating over all FDs and closing them. One by one.

Does anyone know if glibc guarantees that opendir/readdir in between multi-threaded fork() and exec() is safe, even though POSIX does not guarantee that safety in general? I know that one approach to avoid having to close all fds is religiously using O_CLOEXEC everywhere (so that the race window of having an fd that would leak is not possible), but that's also an expensive audit, compared to just ensuring that things are closed after fork().

Problem is not only that our codebase is big and thus not auditable easily, the other part of the problem is that we are linking with variety of libraries that may open/close FDs as we call them (e.g. libiscsi will open a connection to iSCSI target). But from our POV those FDs are out of our reach. And yet, we don't want to leak them into child processes. Michal

Florian Weimer

2:23 p.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

* Eric Blake:

...

Does anyone know if glibc guarantees that opendir/readdir in between multi-threaded fork() and exec() is safe, even though POSIX does not guarantee that safety in general?

glibc supports malloc after multi-threaded fork as an extension (or as a bug, because it makes malloc not async-signal-safe). Lots of programs depend on this. OpenJDK even calls malloc after vfork, which is not officially supported (but of course we can't break OpenJDK).

...

I know that one approach to avoid having to close all fds is religiously using O_CLOEXEC everywhere (so that the race window of having an fd that would leak is not possible), but that's also an expensive audit, compared to just ensuring that things are closed after fork(). Are there any other ideas out there that we should be aware of (and no, pthread_atfork is not a sane idea)? (various BSD systems have the closefrom() syscall which is more efficient than lots of close() calls; and Linux may be adding something similar https://lwn.net/Articles/789023/), Is there any saner way to close all unneeded fds that were not properly marked O_CLOEXEC by an application linking against a multithreaded lib that must perform fork/exec?

I tried to add getdents64 (which got committed, but may yet move from <unistd.h> to <dirent.h>, to match musl) and <sys/direntries.h> (which did not) in glibc 2.30. Those interfaces are async-signal-safe (except on some MIPS variants, where getdents64 has complex emulation). If you do not want to use opendir/readdir, issuing getdents64 directly and parsing the buffer is your best option right now. (Lowering the RLIMIT_NOFILE limit does not enable probing for stray descriptors, unfortunately.) But opendir/readdir after fork should be fine, really.

Eric Blake

15 Jul 15 Jul

11:23 p.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

On 7/14/19 12:23 AM, Florian Weimer wrote:

...

* Eric Blake:

...
Does anyone know if glibc guarantees that opendir/readdir in between multi-threaded fork() and exec() is safe, even though POSIX does not guarantee that safety in general?

glibc supports malloc after multi-threaded fork as an extension (or as a bug, because it makes malloc not async-signal-safe).

It's not a bug for glibc to provide guarantees above what POSIX requires, but IS a bug for applications to depend on those guarantees without realizing they are non-portable.

...

If you do not want to use opendir/readdir, issuing getdents64 directly and parsing the buffer is your best option right now. (Lowering the RLIMIT_NOFILE limit does not enable probing for stray descriptors, unfortunately.) But opendir/readdir after fork should be fine, really.

Thanks for checking; I'm okay with the patch that started this thread going in libvirt if we tweak it to also include a big fat comment stating that use of opendir/readdir is not safe in general, but should be safe in this specific use (because glibc adds async-signal safety to those functions that was not required by POSIX), since the patch is only using opendir on Linux. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org

Florian Weimer

11:26 p.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

* Eric Blake:

...

On 7/14/19 12:23 AM, Florian Weimer wrote:

...
* Eric Blake:

...
Does anyone know if glibc guarantees that opendir/readdir in between multi-threaded fork() and exec() is safe, even though POSIX does not guarantee that safety in general?

glibc supports malloc after multi-threaded fork as an extension (or as a bug, because it makes malloc not async-signal-safe).

It's not a bug for glibc to provide guarantees above what POSIX requires, but IS a bug for applications to depend on those guarantees without realizing they are non-portable.

It's a bug because it makes malloc not async-signal-safe (as required by POSIX) in our current implementation of malloc.

Eric Blake

16 Jul 16 Jul

1:57 a.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

On 7/15/19 9:26 AM, Florian Weimer wrote:

...

* Eric Blake:

...
On 7/14/19 12:23 AM, Florian Weimer wrote:

...
* Eric Blake:

...
Does anyone know if glibc guarantees that opendir/readdir in between multi-threaded fork() and exec() is safe, even though POSIX does not guarantee that safety in general?

glibc supports malloc after multi-threaded fork as an extension (or as a bug, because it makes malloc not async-signal-safe).

It's not a bug for glibc to provide guarantees above what POSIX requires, but IS a bug for applications to depend on those guarantees without realizing they are non-portable.

It's a bug because it makes malloc not async-signal-safe (as required by POSIX) in our current implementation of malloc.

Huh? malloc() is NOT required by POSIX to be async-signal-safe (it is NOT in the list at https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#ta... 2.4.3 Signal Actions). POSIX allows for malloc() to be made async-signal-safe ("Implementations may make other interfaces async-signal-safe.") but does not require it; therefore, a strictly-compliant POSIX application CANNOT call malloc() from inside a signal handler, or after fork()ing from a multi-threaded app but prior to exec(), because the signal or the fork may have interrupted another use of malloc(), where the nested malloc() deadlocks trying to obtain the resource held by the interrupted malloc(). But if glibc has ways to guarantee that one malloc() can be interrupted by a signal or by a multi-threaded fork, and still have the second re-entrant usage from that interrupting context that won't deadlock, then glibc is providing a stronger guarantee than what POSIX requires. Anything that has to obtain a mutex is notoriously difficult to make async-signal-safe, which is why POSIX does not make the requirement of malloc(), or in turn of anything like opendir() that might use malloc() under the hood. I don't get what you say will make malloc() non-async-signal-safe, since POSIX does not require that was in the first place. I am, however, stating that glibc might be willing to provide that additional guarantee (maybe for just opendir(), rather than malloc()) even though relying on that aspect of glibc means your application is no longer strictly compliant, but will only work on glibc. But knowing what glibc guarantees even when POSIX does not determines what we can do under #ifdef __linux__ that we otherwise cannot portably do for risk of deadlock. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org

Florian Weimer

2:01 a.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

* Eric Blake:

...

On 7/15/19 9:26 AM, Florian Weimer wrote:

...
* Eric Blake:

...
On 7/14/19 12:23 AM, Florian Weimer wrote:

...
* Eric Blake:

...
Does anyone know if glibc guarantees that opendir/readdir in between multi-threaded fork() and exec() is safe, even though POSIX does not guarantee that safety in general?

glibc supports malloc after multi-threaded fork as an extension (or as a bug, because it makes malloc not async-signal-safe).

It's not a bug for glibc to provide guarantees above what POSIX requires, but IS a bug for applications to depend on those guarantees without realizing they are non-portable.

It's a bug because it makes malloc not async-signal-safe (as required by POSIX) in our current implementation of malloc.

Huh? malloc() is NOT required by POSIX to be async-signal-safe (it is NOT in the list at

Sorry, I mistyped. I meant to write fork. It's on the list. Thanks, Florian

Eric Blake

3:13 a.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

On 7/15/19 12:01 PM, Florian Weimer wrote:

...

...
...
...
...
glibc supports malloc after multi-threaded fork as an extension (or as a bug, because it makes malloc not async-signal-safe).

It's not a bug for glibc to provide guarantees above what POSIX requires, but IS a bug for applications to depend on those guarantees without realizing they are non-portable.

It's a bug because it makes malloc not async-signal-safe (as required by POSIX) in our current implementation of malloc.

Huh? malloc() is NOT required by POSIX to be async-signal-safe (it is NOT in the list at

Sorry, I mistyped. I meant to write fork. It's on the list.

Ah, that makes much more sense. In other words, the manner in which glibc guarantees that malloc() can be called after fork() is enough to simultaneously break the POSIX requirement that fork() can be called from a signal handler without risking deadlock (because whatever glibc does to make malloc safe after fork is not re-entrant enough if a signal handler tries to fork while glibc was already in the middle of preparing for fork). Is it worth opening a bug against POSIX to try to strike or reduce the requirement that fork() be async-signal safe? For a single-threaded app, fork()ing in a signal handler might have made sense, but as POSIX already says: "When the application calls fork() from a signal handler and any of the fork handlers registered by pthread_atfork() calls a function that is not async-signal-safe, the behavior is undefined." ... "While the fork() function is async-signal-safe, there is no way for an implementation to determine whether the fork handlers established by pthread_atfork() are async-signal-safe. The fork handlers may attempt to execute portions of the implementation that are not async-signal-safe, such as those that are protected by mutexes, leading to a deadlock condition. It is therefore undefined for the fork handlers to execute functions that are not async-signal-safe when fork() is called from a signal handler." it might be nicer if the POSIX wording were changed to require fork() to be async-signal safe only when used in a single-threaded application (where pthread_atfork() is not in use), or even dropped the requirement altogether (relying on implementations to provide that additional guarantee if they'd like, but not requiring it of all implementations). -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org

Florian Weimer

3:27 a.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

* Eric Blake:

...

On 7/15/19 12:01 PM, Florian Weimer wrote:

...
...
...
...
...
glibc supports malloc after multi-threaded fork as an extension (or as a bug, because it makes malloc not async-signal-safe).

It's not a bug for glibc to provide guarantees above what POSIX requires, but IS a bug for applications to depend on those guarantees without realizing they are non-portable.

It's a bug because it makes malloc not async-signal-safe (as required by POSIX) in our current implementation of malloc.

Huh? malloc() is NOT required by POSIX to be async-signal-safe (it is NOT in the list at

Sorry, I mistyped. I meant to write fork. It's on the list.

Ah, that makes much more sense. In other words, the manner in which glibc guarantees that malloc() can be called after fork() is enough to simultaneously break the POSIX requirement that fork() can be called from a signal handler without risking deadlock (because whatever glibc does to make malloc safe after fork is not re-entrant enough if a signal handler tries to fork while glibc was already in the middle of preparing for fork).

It's taking all the malloc locks, so interrupting malloc is sufficient to trigger a deadlock with fork. There are a bunch of other locks as well, but the malloc locks is the most prominent one.

...

Is it worth opening a bug against POSIX to try to strike or reduce the requirement that fork() be async-signal safe? For a single-threaded app, fork()ing in a signal handler might have made sense, but as POSIX already says:

fork is commonly used in in-process crash handlers. I suspect this is in part because it allows one to stop all other threads. I do not think this is the proper way to write crash handlers (at least not on Linux), but it's still very much current practice. I would certainly welcome a different requirement from POSIX, but then maybe POSIX does not want to act as the driver of change on our behalf like this. Thanks, Florian

Daniel P. Berrangé

15 Jul 15 Jul

11:35 p.m.

New subject: [libvirt] [PATCH 3/3] virCommand: use procfs to learn opened FDs

On Sun, Jul 14, 2019 at 07:23:10AM +0200, Florian Weimer wrote:

...

* Eric Blake:

...
Does anyone know if glibc guarantees that opendir/readdir in between multi-threaded fork() and exec() is safe, even though POSIX does not guarantee that safety in general?

glibc supports malloc after multi-threaded fork as an extension (or as a bug, because it makes malloc not async-signal-safe).

Lots of programs depend on this. OpenJDK even calls malloc after vfork, which is not officially supported (but of course we can't break OpenJDK).

Yep, libvirt already relies glibc semantics that let malloc/free work after fork(), for other code we've had since essentially forever.

...

...
I know that one approach to avoid having to close all fds is religiously using O_CLOEXEC everywhere (so that the race window of having an fd that would leak is not possible), but that's also an expensive audit, compared to just ensuring that things are closed after fork(). Are there any other ideas out there that we should be aware of (and no, pthread_atfork is not a sane idea)? (various BSD systems have the closefrom() syscall which is more efficient than lots of close() calls; and Linux may be adding something similar https://lwn.net/Articles/789023/), Is there any saner way to close all unneeded fds that were not properly marked O_CLOEXEC by an application linking against a multithreaded lib that must perform fork/exec?

I tried to add getdents64 (which got committed, but may yet move from <unistd.h> to <dirent.h>, to match musl) and <sys/direntries.h> (which did not) in glibc 2.30. Those interfaces are async-signal-safe (except on some MIPS variants, where getdents64 has complex emulation).

If you do not want to use opendir/readdir, issuing getdents64 directly and parsing the buffer is your best option right now. (Lowering the RLIMIT_NOFILE limit does not enable probing for stray descriptors, unfortunately.) But opendir/readdir after fork should be fine, really.

Ok, lets just keep it simple & use opendif/readdir. If we ever hit problems, we can just disable the code & go back to the slower code we have right now. Hopefully the kernel folks will finally merge one of the recent closefrom()-like proposals and make life much easier. Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Daniel P. Berrangé

3 Jul 3 Jul

5:44 p.m.

On Wed, Jul 03, 2019 at 09:19:17AM +0200, Michal Privoznik wrote:

...

If the limit for max opened files is way bigger than the default (1024), say 3 orders bigger then spawning a child through virCommand can be expensive because we iterate over ALL FDs within the limit and close them. There's no need to that since we can learn the list of opened FDs from /proc/self/fd/.

Yeah I've seen this in docker containers in particular where the nfiles limit is something like 2 million. Hopefully Linux will finally get something equiv to BSD's closefrom() syscall we can use in the future: https://lwn.net/Articles/593778/ meanwhile relying on /proc is reasonable I guess. The extra syscalls to read the dir should be balanced by fewer close syscalls quite easily. Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Daniel P. Berrangé

5:48 p.m.

On Wed, Jul 03, 2019 at 09:44:41AM +0100, Daniel P. Berrangé wrote:

...

On Wed, Jul 03, 2019 at 09:19:17AM +0200, Michal Privoznik wrote:

...
If the limit for max opened files is way bigger than the default (1024), say 3 orders bigger then spawning a child through virCommand can be expensive because we iterate over ALL FDs within the limit and close them. There's no need to that since we can learn the list of opened FDs from /proc/self/fd/.

Yeah I've seen this in docker containers in particular where the nfiles limit is something like 2 million.

Hopefully Linux will finally get something equiv to BSD's closefrom() syscall we can use in the future:

https://lwn.net/Articles/593778/

Opps, wrong LWN link https://lwn.net/Articles/789023/ Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

2370

Age (days ago)

2382

Last active (days ago)

List overview

Download

20 comments

7 participants

participants (7)

Daniel P. Berrangé
Eric Blake
Florian Weimer
Florian Weimer
Ján Tomko
Michal Privoznik
Michal Prívozník

[libvirt] [PATCH 0/3] Optimize mass closing of FDs on child spwaning

tags

participants (7)