[libvirt] [PATCH 0/1] Support domain restore from root-squash NFS

Most of the explanation is in the commit message with the patch proper. When reveiwing please take pains to point out any places where I may not be diligent enough cleaning up loose ends, especially in terms of assuring that the child process always terminates properly no matter what happens. The (unrelated) race condition in qemudDomainRestore() remains - I solved it locally by putting a sleep(10) just before qemuMonitorStartCPUs(), but it needs something with a bit higher comfort factor ;-)

If qemudDomainRestore fails to open the domain save file, create a pipe, then fork a process that does setuid(qemu_user) and opens the file, then reads this file and stuffs it into the pipe. the parnet libvirtd process will use the other end of the pipe as its fd, then reap the child process after its done reading. This makes domain restore work on a root-squash NFS share that is only visible to the qemu user. --- src/qemu/qemu_driver.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 155 insertions(+), 3 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index ac6ef6a..f909a59 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4659,6 +4659,135 @@ cleanup: return ret; } +/* qemudOpenAsUID() - pipe/fork/setuid/open a file, and return the + pipe fd to caller, so that it can read from the file. Also return + the pid of the child process, so the caller can wait for it to exit + after it's finished reading (to avoid a zombie, if nothing + else). */ + +static int qemudOpenAsUID(const char *path, uid_t uid, pid_t *childpid) { + int pipefd[2] = {-1, -1}; + int fd = -1; + + *childpid = -1; + + if (pipe(pipefd) < 0) { + virReportSystemError(errno, + _("failed to create pipe to read '%s'"), + path); + goto parentcleanup; + } + + int forkRet = virFork(childpid); + + if (*childpid < 0) { + virReportSystemError(errno, + _("failed to fork child to read '%s'"), + path); + goto parentcleanup; + } + + if (*childpid > 0) { + + /* parent */ + + /* parent doesn't need the write side of the pipe */ + close(pipefd[1]); + pipefd[1] = -1; + + if (forkRet < 0) { + virReportSystemError(errno, + _("failed in parent after forking child to read '%s'"), + path); + goto parentcleanup; + } + /* caller gets the read side of the pipe */ + fd = pipefd[0]; + pipefd[0] = -1; +parentcleanup: + if (pipefd[0] != -1) + close(pipefd[0]); + if (pipefd[1] != -1) + close(pipefd[1]); + if ((fd < 0) && (*childpid > 0)) { + /* a child process was started and subsequently an error + occurred in the parent, so we need to wait for it to + exit, but its status is inconsequential. */ + while ((waitpid(*childpid, NULL, 0) == -1) + && (errno == EINTR)); + *childpid = -1; + } + return fd; + } + + /* child */ + + /* setuid to the qemu user, then open the file, read it, + and stuff it into the pipe for the parent process to + read */ + int exit_code; + char *buf = NULL; + int bufsize = 1024 * 1024; + int bytesread; + + /* child doesn't need the read side of the pipe */ + close(pipefd[0]); + + if (forkRet < 0) { + exit_code = errno; + virReportSystemError(errno, + _("failed in child after forking to read '%s'"), + path); + goto childcleanup; + } + + if (setuid(uid) != 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot setuid(%d) to read '%s'"), + getuid(), path); + goto childcleanup; + } + if ((fd = open(path, O_RDONLY)) < 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot open '%s' as uid %d"), + path, getuid()); + goto childcleanup; + } + if (VIR_ALLOC_N(buf, bufsize) < 0) { + exit_code = ENOMEM; + virReportOOMError(); + goto childcleanup; + } + + /* read from fd and write to pipefd[1] until EOF */ + do { + if ((bytesread = saferead(fd, buf, bufsize)) < 0) { + exit_code = errno; + virReportSystemError(errno, + _("child failed reading from '%s'"), + path); + goto childcleanup; + } + if (safewrite(pipefd[1], buf, bytesread) != bytesread) { + exit_code = errno; + virReportSystemError(errno, "%s", + _("child failed writing to pipe")); + goto childcleanup; + } + } while (bytesread > 0); + exit_code = 0; + +childcleanup: + VIR_FREE(buf); + if (fd != -1) + close(fd); + if (pipefd[1] != -1) + close(pipefd[1]); + _exit(exit_code); +} + /* TODO: check seclabel restore */ static int qemudDomainRestore(virConnectPtr conn, const char *path) { @@ -4666,6 +4795,7 @@ static int qemudDomainRestore(virConnectPtr conn, virDomainDefPtr def = NULL; virDomainObjPtr vm = NULL; int fd = -1; + pid_t read_pid = -1; int ret = -1; char *xml = NULL; struct qemud_save_header header; @@ -4677,9 +4807,20 @@ static int qemudDomainRestore(virConnectPtr conn, qemuDriverLock(driver); /* Verify the header and read the XML */ if ((fd = open(path, O_RDONLY)) < 0) { - qemuReportError(VIR_ERR_OPERATION_FAILED, - "%s", _("cannot read domain image")); - goto cleanup; + if ((driver->user == 0) || (getuid() != 0)) { + qemuReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("cannot read domain image")); + goto cleanup; + } + + /* Opening as root failed, but qemu runs as a different user + that might have better luck. Create a pipe, then fork a + child process to run as the qemu user, which will hopefully + have the necessary authority to read the file. */ + if ((fd = qemudOpenAsUID(path, driver->user, &read_pid)) < 0) { + /* error already reported */ + goto cleanup; + } } if (saferead(fd, &header, sizeof(header)) != sizeof(header)) { @@ -4769,6 +4910,12 @@ static int qemudDomainRestore(virConnectPtr conn, close(intermediatefd); close(fd); fd = -1; + if (read_pid != -1) { + /* reap the process that read the file */ + while ((waitpid(read_pid, NULL, 0) == -1) + && (errno == EINTR)); + read_pid = -1; + } if (ret < 0) { if (!vm->persistent) { if (qemuDomainObjEndJob(vm) > 0) @@ -4810,6 +4957,11 @@ cleanup: VIR_FREE(xml); if (fd != -1) close(fd); + if (read_pid != 0) { + /* reap the process that read the file */ + while ((waitpid(read_pid, NULL, 0) == -1) + && (errno == EINTR)); + } if (vm) virDomainObjUnlock(vm); if (event) -- 1.6.6.1

According to Laine Stump on 2/23/2010 12:54 PM: Minor nits (I have not validated the overall patch in context; this is just from a quick high-level review)
If qemudDomainRestore fails to open the domain save file, create a pipe, then fork a process that does setuid(qemu_user) and opens the file, then reads this file and stuffs it into the pipe. the parnet
s/. the parnet/. The parent/
libvirtd process will use the other end of the pipe as its fd, then reap the child process after its done reading.
s/its/it's/
This makes domain restore work on a root-squash NFS share that is only visible to the qemu user. --- src/qemu/qemu_driver.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 155 insertions(+), 3 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index ac6ef6a..f909a59 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4659,6 +4659,135 @@ cleanup: return ret; }
+/* qemudOpenAsUID() - pipe/fork/setuid/open a file, and return the + pipe fd to caller, so that it can read from the file. Also return + the pid of the child process, so the caller can wait for it to exit + after it's finished reading (to avoid a zombie, if nothing + else). */ + +static int qemudOpenAsUID(const char *path, uid_t uid, pid_t *childpid) { + int pipefd[2] = {-1, -1};
Why the explicit initialization...
+ int fd = -1; + + *childpid = -1; + + if (pipe(pipefd) < 0) {
...since I see nothing in POSIX that guarantees that pipe() will preserve your initialized values across failure. I think it's better to explicitly set things to -1 on the failure path, rather than initializing that way.
+ virReportSystemError(errno, + _("failed to create pipe to read '%s'"), + path); + goto parentcleanup; + } + + int forkRet = virFork(childpid);
Unrelated to your patch, but we should probably start using pipe2 and atomically marking pipes as CLOEXEC, so that use of libvirt in a multithreaded environment does not have a race window where unrelated threads can leak libvirt's fds across an exec (well, for new enough kernels anyway; at least gnulib has a pipe2 fallback for older kernels). ...
+ /* child */ + + /* setuid to the qemu user, then open the file, read it, + and stuff it into the pipe for the parent process to + read */ + int exit_code; + char *buf = NULL; + int bufsize = 1024 * 1024; + int bytesread; + + /* child doesn't need the read side of the pipe */ + close(pipefd[0]); + + if (forkRet < 0) { + exit_code = errno; + virReportSystemError(errno, + _("failed in child after forking to read '%s'"), + path); + goto childcleanup; + } + + if (setuid(uid) != 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot setuid(%d) to read '%s'"), + getuid(), path);
Is that the right uid to be reporting in the error message? Nothing else jumped out at me as suspicious. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

Thanks for the quick response! On 02/23/2010 03:32 PM, Eric Blake wrote:
According to Laine Stump on 2/23/2010 12:54 PM:
Minor nits (I have not validated the overall patch in context; this is just from a quick high-level review)
If qemudDomainRestore fails to open the domain save file, create a pipe, then fork a process that does setuid(qemu_user) and opens the file, then reads this file and stuffs it into the pipe. the parnet
s/. the parnet/. The parent/
libvirtd process will use the other end of the pipe as its fd, then reap the child process after its done reading.
s/its/it's/
Ah, yes. Those terrible typos that never seem worth the effort to fix after they're already checked in. Thanks for nipping it in the bud!
This makes domain restore work on a root-squash NFS share that is only visible to the qemu user. --- src/qemu/qemu_driver.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 155 insertions(+), 3 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index ac6ef6a..f909a59 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4659,6 +4659,135 @@ cleanup: return ret; }
+/* qemudOpenAsUID() - pipe/fork/setuid/open a file, and return the + pipe fd to caller, so that it can read from the file. Also return + the pid of the child process, so the caller can wait for it to exit + after it's finished reading (to avoid a zombie, if nothing + else). */ + +static int qemudOpenAsUID(const char *path, uid_t uid, pid_t *childpid) { + int pipefd[2] = {-1, -1};
Why the explicit initialization...
Mostly because I was following the example of util.c:__virExec(), which does the same thing. Following examples seems to get me into nearly as much trouble as ignoring them ;-)
+ int fd = -1; + + *childpid = -1; + + if (pipe(pipefd)< 0) {
...since I see nothing in POSIX that guarantees that pipe() will preserve your initialized values across failure. I think it's better to explicitly set things to -1 on the failure path, rather than initializing that way.
Interesting point. If we can't rely on that, then __virExec() should be changed too. Of course this makes maintenance a bit more error prone, since we would have to make sure that *every* jump to the cleanup that happened prior to creating the pipe would set the pipe fds to -1 (or just leave the initialization in, and re-set them to -1 when pipe failed; that's probably the safest thing to do).
+ virReportSystemError(errno, + _("failed to create pipe to read '%s'"), + path); + goto parentcleanup; + } + + int forkRet = virFork(childpid);
Unrelated to your patch, but we should probably start using pipe2 and atomically marking pipes as CLOEXEC, so that use of libvirt in a multithreaded environment does not have a race window where unrelated threads can leak libvirt's fds across an exec (well, for new enough kernels anyway; at least gnulib has a pipe2 fallback for older kernels).
...
I think I just heard someone volunteering! ;-)
+ /* child */ + + /* setuid to the qemu user, then open the file, read it, + and stuff it into the pipe for the parent process to + read */ + int exit_code; + char *buf = NULL; + int bufsize = 1024 * 1024; + int bytesread; + + /* child doesn't need the read side of the pipe */ + close(pipefd[0]); + + if (forkRet< 0) { + exit_code = errno; + virReportSystemError(errno, + _("failed in child after forking to read '%s'"), + path); + goto childcleanup; + } + + if (setuid(uid) != 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot setuid(%d) to read '%s'"), + getuid(), path);
Is that the right uid to be reporting in the error message?
What's the proper emoticon for blushing in embarrassment? Bonehead typo ;-) Thanks for pointing it out.

On 02/24/2010 06:20 AM, Laine Stump wrote:
...since I see nothing in POSIX that guarantees that pipe() will preserve your initialized values across failure. I think it's better to explicitly set things to -1 on the failure path, rather than initializing that way.
Interesting point. If we can't rely on that, then __virExec() should be changed too.
Since libvirtd is Linux-only, we can luckily rely on that at least for now: int do_pipe_flags(int *fd, int flags) { struct file *fw, *fr; int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK)) return -EINVAL; fw = create_write_pipe(flags); if (IS_ERR(fw)) return PTR_ERR(fw); fr = create_read_pipe(fw, flags); error = PTR_ERR(fr); if (IS_ERR(fr)) goto err_write_pipe; error = get_unused_fd_flags(flags); if (error < 0) goto err_read_pipe; fdr = error; error = get_unused_fd_flags(flags); if (error < 0) goto err_fdr; fdw = error; audit_fd_pair(fdr, fdw); fd_install(fdr, fr); fd_install(fdw, fw); fd[0] = fdr; fd[1] = fdw; return 0; err_fdr: put_unused_fd(fdr); err_read_pipe: path_put(&fr->f_path); put_filp(fr); err_write_pipe: free_write_pipe(fw); return error; } which makes it quite low priority. Paolo

Laine Stump wrote:
If qemudDomainRestore fails to open the domain save file, create a pipe, then fork a process that does setuid(qemu_user) and opens the file, then reads this file and stuffs it into the pipe. the parnet libvirtd process will use the other end of the pipe as its fd, then reap the child process after its done reading.
Hi Laine, This looks fine, modulo a few minor stylistic and error-handling nits. Comments below.
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c ... +static int qemudOpenAsUID(const char *path, uid_t uid, pid_t *childpid) { ... + /* caller gets the read side of the pipe */ + fd = pipefd[0]; + pipefd[0] = -1; +parentcleanup:
This_is_more_readable than notusingwordseparators ;-) s/cleanup/_cleanup/
+ if (pipefd[0] != -1) + close(pipefd[0]); + if (pipefd[1] != -1) + close(pipefd[1]); + if ((fd < 0) && (*childpid > 0)) { + /* a child process was started and subsequently an error + occurred in the parent, so we need to wait for it to + exit, but its status is inconsequential. */ + while ((waitpid(*childpid, NULL, 0) == -1) + && (errno == EINTR)); + *childpid = -1; + } + return fd; + } + + /* child */ + + /* setuid to the qemu user, then open the file, read it, + and stuff it into the pipe for the parent process to + read */ + int exit_code; + char *buf = NULL; + int bufsize = 1024 * 1024;
It's slightly better to use size_t in place of int, to match the prototype of VIR_ALLOC_N.
+ int bytesread; + + /* child doesn't need the read side of the pipe */ + close(pipefd[0]); + + if (forkRet < 0) { + exit_code = errno; + virReportSystemError(errno, + _("failed in child after forking to read '%s'"), + path); + goto childcleanup; + } + + if (setuid(uid) != 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot setuid(%d) to read '%s'"), + getuid(), path); + goto childcleanup; + } + if ((fd = open(path, O_RDONLY)) < 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot open '%s' as uid %d"), + path, getuid()); + goto childcleanup; + } + if (VIR_ALLOC_N(buf, bufsize) < 0) { + exit_code = ENOMEM; + virReportOOMError(); + goto childcleanup; + } + + /* read from fd and write to pipefd[1] until EOF */ + do { + if ((bytesread = saferead(fd, buf, bufsize)) < 0) { + exit_code = errno; + virReportSystemError(errno, + _("child failed reading from '%s'"), + path); + goto childcleanup; + } + if (safewrite(pipefd[1], buf, bytesread) != bytesread) { + exit_code = errno; + virReportSystemError(errno, "%s", + _("child failed writing to pipe")); + goto childcleanup; + } + } while (bytesread > 0); + exit_code = 0; + +childcleanup:
s/cleanup/_cleanup/
+ VIR_FREE(buf); + if (fd != -1) + close(fd); + if (pipefd[1] != -1) + close(pipefd[1]); + _exit(exit_code); +} + /* TODO: check seclabel restore */ static int qemudDomainRestore(virConnectPtr conn, const char *path) { @@ -4666,6 +4795,7 @@ static int qemudDomainRestore(virConnectPtr conn, virDomainDefPtr def = NULL; virDomainObjPtr vm = NULL; int fd = -1; + pid_t read_pid = -1; int ret = -1; char *xml = NULL; struct qemud_save_header header; @@ -4677,9 +4807,20 @@ static int qemudDomainRestore(virConnectPtr conn, qemuDriverLock(driver); /* Verify the header and read the XML */ if ((fd = open(path, O_RDONLY)) < 0) { - qemuReportError(VIR_ERR_OPERATION_FAILED, - "%s", _("cannot read domain image")); - goto cleanup; + if ((driver->user == 0) || (getuid() != 0)) { + qemuReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("cannot read domain image")); + goto cleanup; + } + + /* Opening as root failed, but qemu runs as a different user + that might have better luck. Create a pipe, then fork a + child process to run as the qemu user, which will hopefully + have the necessary authority to read the file. */ + if ((fd = qemudOpenAsUID(path, driver->user, &read_pid)) < 0) { + /* error already reported */ + goto cleanup; + } }
if (saferead(fd, &header, sizeof(header)) != sizeof(header)) { @@ -4769,6 +4910,12 @@ static int qemudDomainRestore(virConnectPtr conn, close(intermediatefd); close(fd); fd = -1; + if (read_pid != -1) { + /* reap the process that read the file */ + while ((waitpid(read_pid, NULL, 0) == -1) + && (errno == EINTR));
Two things. In general, terminating an empty loop with a semicolon is risky -- too easy to miss that syntax, and snuggling it up on the end of the "while (cond)" line makes it even easier to miss. If you do this, please at least add a comment on a separate line: while (cond) /* empty */ ; using braces might even be better: while (cond) { /* empty */ } You might want to diagnose a failing waitpid, when errno != EINTR. ECHILD (no such PID) appears to be the only one we'd care about. Obviously, we'd ignore waitpid failure during clean-up, but in the parent, post-read, it might indicate a problem. Also, don't you want to detect child failure in the latter? If so, you'd pass say, &child_status as second argument to waitpid.
+ read_pid = -1; + } if (ret < 0) { if (!vm->persistent) { if (qemuDomainObjEndJob(vm) > 0) @@ -4810,6 +4957,11 @@ cleanup: VIR_FREE(xml); if (fd != -1) close(fd); + if (read_pid != 0) { + /* reap the process that read the file */ + while ((waitpid(read_pid, NULL, 0) == -1) + && (errno == EINTR)); + } if (vm) virDomainObjUnlock(vm); if (event)

Thanks for the review! On 02/23/2010 04:57 PM, Jim Meyering wrote:
Laine Stump wrote:
If qemudDomainRestore fails to open the domain save file, create a pipe, then fork a process that does setuid(qemu_user) and opens the file, then reads this file and stuffs it into the pipe. the parnet libvirtd process will use the other end of the pipe as its fd, then reap the child process after its done reading.
Hi Laine,
This looks fine, modulo a few minor stylistic and error-handling nits. Comments below.
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
...
+static int qemudOpenAsUID(const char *path, uid_t uid, pid_t *childpid) {
...
+ /* caller gets the read side of the pipe */ + fd = pipefd[0]; + pipefd[0] = -1; +parentcleanup:
This_is_more_readable than notusingwordseparators ;-) s/cleanup/_cleanup/
Yeah, okay. I get conflicted working in libvirt, because I find bits with both studlyCaps and underscore_separators, and can't decide which to use, so my brain locks up and I use none.
+ if (pipefd[0] != -1) + close(pipefd[0]); + if (pipefd[1] != -1) + close(pipefd[1]); + if ((fd< 0)&& (*childpid> 0)) { + /* a child process was started and subsequently an error + occurred in the parent, so we need to wait for it to + exit, but its status is inconsequential. */ + while ((waitpid(*childpid, NULL, 0) == -1) +&& (errno == EINTR)); + *childpid = -1; + } + return fd; + } + + /* child */ + + /* setuid to the qemu user, then open the file, read it, + and stuff it into the pipe for the parent process to + read */ + int exit_code; + char *buf = NULL; + int bufsize = 1024 * 1024;
It's slightly better to use size_t in place of int, to match the prototype of VIR_ALLOC_N.
Right. Done.
+ int bytesread; + + /* child doesn't need the read side of the pipe */ + close(pipefd[0]); + + if (forkRet< 0) { + exit_code = errno; + virReportSystemError(errno, + _("failed in child after forking to read '%s'"), + path); + goto childcleanup; + } + + if (setuid(uid) != 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot setuid(%d) to read '%s'"), + getuid(), path); + goto childcleanup; + } + if ((fd = open(path, O_RDONLY))< 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot open '%s' as uid %d"), + path, getuid()); + goto childcleanup; + } + if (VIR_ALLOC_N(buf, bufsize)< 0) { + exit_code = ENOMEM; + virReportOOMError(); + goto childcleanup; + } + + /* read from fd and write to pipefd[1] until EOF */ + do { + if ((bytesread = saferead(fd, buf, bufsize))< 0) { + exit_code = errno; + virReportSystemError(errno, + _("child failed reading from '%s'"), + path); + goto childcleanup; + } + if (safewrite(pipefd[1], buf, bytesread) != bytesread) { + exit_code = errno; + virReportSystemError(errno, "%s", + _("child failed writing to pipe")); + goto childcleanup; + } + } while (bytesread> 0); + exit_code = 0; + +childcleanup:
s/cleanup/_cleanup/
Yup.
+ VIR_FREE(buf); + if (fd != -1) + close(fd); + if (pipefd[1] != -1) + close(pipefd[1]); + _exit(exit_code); +} + /* TODO: check seclabel restore */ static int qemudDomainRestore(virConnectPtr conn, const char *path) { @@ -4666,6 +4795,7 @@ static int qemudDomainRestore(virConnectPtr conn, virDomainDefPtr def = NULL; virDomainObjPtr vm = NULL; int fd = -1; + pid_t read_pid = -1; int ret = -1; char *xml = NULL; struct qemud_save_header header; @@ -4677,9 +4807,20 @@ static int qemudDomainRestore(virConnectPtr conn, qemuDriverLock(driver); /* Verify the header and read the XML */ if ((fd = open(path, O_RDONLY))< 0) { - qemuReportError(VIR_ERR_OPERATION_FAILED, - "%s", _("cannot read domain image")); - goto cleanup; + if ((driver->user == 0) || (getuid() != 0)) { + qemuReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("cannot read domain image")); + goto cleanup; + } + + /* Opening as root failed, but qemu runs as a different user + that might have better luck. Create a pipe, then fork a + child process to run as the qemu user, which will hopefully + have the necessary authority to read the file. */ + if ((fd = qemudOpenAsUID(path, driver->user,&read_pid))< 0) { + /* error already reported */ + goto cleanup; + } }
if (saferead(fd,&header, sizeof(header)) != sizeof(header)) { @@ -4769,6 +4910,12 @@ static int qemudDomainRestore(virConnectPtr conn, close(intermediatefd); close(fd); fd = -1; + if (read_pid != -1) { + /* reap the process that read the file */ + while ((waitpid(read_pid, NULL, 0) == -1) +&& (errno == EINTR));
Two things. In general, terminating an empty loop with a semicolon is risky -- too easy to miss that syntax, and snuggling it up on the end of the "while (cond)" line makes it even easier to miss. If you do this, please at least add a comment on a separate line:
while (cond) /* empty */ ;
using braces might even be better:
while (cond) { /* empty */ }
There I go, letting cut-paste get me into trouble again. I also prefer using braces, and will change it to that.
You might want to diagnose a failing waitpid, when errno != EINTR. ECHILD (no such PID) appears to be the only one we'd care about. Obviously, we'd ignore waitpid failure during clean-up, but in the parent, post-read, it might indicate a problem.
Also, don't you want to detect child failure in the latter? If so, you'd pass say,&child_status as second argument to waitpid.
Yeah, I wondered about that, but wasn't sure what I could do after qemu had already been started up and read data from the file (which presumably would itself have failed if there was some problem in the child that cause bad/incomplete data to show up). So I punted. I'll spend some time in the morning trying to do something suitable if the child sends back a failure status.
+ read_pid = -1; + } if (ret< 0) { if (!vm->persistent) { if (qemuDomainObjEndJob(vm)> 0) @@ -4810,6 +4957,11 @@ cleanup: VIR_FREE(xml); if (fd != -1) close(fd); + if (read_pid != 0) { + /* reap the process that read the file */ + while ((waitpid(read_pid, NULL, 0) == -1) +&& (errno == EINTR)); + } if (vm) virDomainObjUnlock(vm); if (event)

(This version incorporates the suggestions from Jim Meyering and Eric Blake) If qemudDomainRestore fails to open the domain save file, create a pipe, then fork a process that does setuid(qemu_user) and opens the file, then reads this file and stuffs it into the pipe. the parent libvirtd process will use the other end of the pipe as its fd, then reap the child process after it's done reading. This makes domain restore work on a root-squash NFS share that is only visible to the qemu user. --- src/qemu/qemu_driver.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 183 insertions(+), 3 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 5f794fb..dd8bd55 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4659,6 +4659,138 @@ cleanup: return ret; } +/* qemudOpenAsUID() - pipe/fork/setuid/open a file, and return the + pipe fd to caller, so that it can read from the file. Also return + the pid of the child process, so the caller can wait for it to exit + after it's finished reading (to avoid a zombie, if nothing + else). */ + +static int qemudOpenAsUID(const char *path, uid_t uid, pid_t *child_pid) { + int pipefd[2]; + int fd = -1; + + *child_pid = -1; + + if (pipe(pipefd) < 0) { + virReportSystemError(errno, + _("failed to create pipe to read '%s'"), + path); + pipefd[0] = pipefd[1] = -1; + goto parent_cleanup; + } + + int forkRet = virFork(child_pid); + + if (*child_pid < 0) { + virReportSystemError(errno, + _("failed to fork child to read '%s'"), + path); + goto parent_cleanup; + } + + if (*child_pid > 0) { + + /* parent */ + + /* parent doesn't need the write side of the pipe */ + close(pipefd[1]); + pipefd[1] = -1; + + if (forkRet < 0) { + virReportSystemError(errno, + _("failed in parent after forking child to read '%s'"), + path); + goto parent_cleanup; + } + /* caller gets the read side of the pipe */ + fd = pipefd[0]; + pipefd[0] = -1; +parent_cleanup: + if (pipefd[0] != -1) + close(pipefd[0]); + if (pipefd[1] != -1) + close(pipefd[1]); + if ((fd < 0) && (*child_pid > 0)) { + /* a child process was started and subsequently an error + occurred in the parent, so we need to wait for it to + exit, but its status is inconsequential. */ + while ((waitpid(*child_pid, NULL, 0) == -1) + && (errno == EINTR)) { + /* empty */ + } + *child_pid = -1; + } + return fd; + } + + /* child */ + + /* setuid to the qemu user, then open the file, read it, + and stuff it into the pipe for the parent process to + read */ + int exit_code; + char *buf = NULL; + size_t bufsize = 1024 * 1024; + int bytesread; + + /* child doesn't need the read side of the pipe */ + close(pipefd[0]); + + if (forkRet < 0) { + exit_code = errno; + virReportSystemError(errno, + _("failed in child after forking to read '%s'"), + path); + goto child_cleanup; + } + + if (setuid(uid) != 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot setuid(%d) to read '%s'"), + uid, path); + goto child_cleanup; + } + if ((fd = open(path, O_RDONLY)) < 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot open '%s' as uid %d"), + path, uid); + goto child_cleanup; + } + if (VIR_ALLOC_N(buf, bufsize) < 0) { + exit_code = ENOMEM; + virReportOOMError(); + goto child_cleanup; + } + + /* read from fd and write to pipefd[1] until EOF */ + do { + if ((bytesread = saferead(fd, buf, bufsize)) < 0) { + exit_code = errno; + virReportSystemError(errno, + _("child failed reading from '%s'"), + path); + goto child_cleanup; + } + if (safewrite(pipefd[1], buf, bytesread) != bytesread) { + exit_code = errno; + virReportSystemError(errno, "%s", + _("child failed writing to pipe")); + goto child_cleanup; + } + } while (bytesread > 0); + exit_code = 0; + +child_cleanup: + VIR_FREE(buf); + if (fd != -1) + close(fd); + if (pipefd[1] != -1) + close(pipefd[1]); + _exit(exit_code); +} + /* TODO: check seclabel restore */ static int qemudDomainRestore(virConnectPtr conn, const char *path) { @@ -4666,6 +4798,7 @@ static int qemudDomainRestore(virConnectPtr conn, virDomainDefPtr def = NULL; virDomainObjPtr vm = NULL; int fd = -1; + pid_t read_pid = -1; int ret = -1; char *xml = NULL; struct qemud_save_header header; @@ -4677,9 +4810,20 @@ static int qemudDomainRestore(virConnectPtr conn, qemuDriverLock(driver); /* Verify the header and read the XML */ if ((fd = open(path, O_RDONLY)) < 0) { - qemuReportError(VIR_ERR_OPERATION_FAILED, - "%s", _("cannot read domain image")); - goto cleanup; + if ((driver->user == 0) || (getuid() != 0)) { + qemuReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("cannot read domain image")); + goto cleanup; + } + + /* Opening as root failed, but qemu runs as a different user + that might have better luck. Create a pipe, then fork a + child process to run as the qemu user, which will hopefully + have the necessary authority to read the file. */ + if ((fd = qemudOpenAsUID(path, driver->user, &read_pid)) < 0) { + /* error already reported */ + goto cleanup; + } } if (saferead(fd, &header, sizeof(header)) != sizeof(header)) { @@ -4769,6 +4913,35 @@ static int qemudDomainRestore(virConnectPtr conn, close(intermediatefd); close(fd); fd = -1; + if (read_pid != -1) { + int wait_ret; + int status; + /* reap the process that read the file */ + while (((wait_ret = waitpid(read_pid, &status, 0)) == -1) + && (errno == EINTR)) { + /* empty */ + } + read_pid = -1; + if (wait_ret == -1) { + virReportSystemError(errno, + _("failed to wait for process reading '%s'"), + path); + ret = -1; + } else if (!WIFEXITED(status)) { + qemuReportError(VIR_ERR_OPERATION_FAILED, + _("child process exited abnormally reading '%s'"), + path); + ret = -1; + } else { + int exit_status = WEXITSTATUS(status); + if (exit_status != 0) { + virReportSystemError(exit_status, + _("child process returned error reading '%s'"), + path); + ret = -1; + } + } + } if (ret < 0) { if (!vm->persistent) { if (qemuDomainObjEndJob(vm) > 0) @@ -4810,6 +4983,13 @@ cleanup: VIR_FREE(xml); if (fd != -1) close(fd); + if (read_pid != 0) { + /* reap the process that read the file */ + while ((waitpid(read_pid, NULL, 0) == -1) + && (errno == EINTR)) { + /* empty */ + } + } if (vm) virDomainObjUnlock(vm); if (event) -- 1.6.6.1

According to Laine Stump on 2/24/2010 1:58 PM:
(This version incorporates the suggestions from Jim Meyering and Eric Blake)
I guess that line is okay to leave in the commit message. If it had been me, though, I would have left it out of the commit log, and just inserted it between the --- and diffstat during 'git send-email --annotate' - it is relevant to the thread that explains the updated commit and how it differs from the first attempt, but it is not really related to the patch itself. After all, at the end of the day, the original version of this patch will not be in libvirt.git, so the words "This version" no longer have any context for comparison. Likewise, a subject line of "[PATCHv2] Fix..." is better than "[PATCH] Take Two - Fix...", since git am strips "[PATCH...]" but not "Take Two".
If qemudDomainRestore fails to open the domain save file, create a pipe, then fork a process that does setuid(qemu_user) and opens the file, then reads this file and stuffs it into the pipe. the parent libvirtd process will use the other end of the pipe as its fd, then reap the child process after it's done reading.
ACK - you fixed everything I brought up, and the patch itself looks good. -- Don't work too hard, make some time for fun as well! Eric Blake ebb9@byu.net

Thanks once again for the informative review! (I've learned something new about git, which isn't really surprising I supposed, since I currently know so little ;-)) On 02/25/2010 12:34 PM, Eric Blake wrote:
According to Laine Stump on 2/24/2010 1:58 PM:
(This version incorporates the suggestions from Jim Meyering and Eric Blake)
I guess that line is okay to leave in the commit message. If it had been me, though, I would have left it out of the commit log, and just inserted it between the --- and diffstat during 'git send-email --annotate' - it is relevant to the thread that explains the updated commit and how it differs from the first attempt, but it is not really related to the patch itself. After all, at the end of the day, the original version of this patch will not be in libvirt.git, so the words "This version" no longer have any context for comparison.
I agree with that, just didn't know that git send-email could do that! ;-) I contemplated doing --compose and putting all that stuff in the separate introductory email, but that seemed a bit too verbose and complicating. Instead, I decided to rely on DV (or whoever pushed the change) to remove those bits from the message before running git am.
Likewise, a subject line of "[PATCHv2] Fix..." is better than "[PATCH] Take Two - Fix...", since git am strips "[PATCH...]" but not "Take Two".
Can git send-email be made to put that in the subject line rather than just [PATCH] or [PATCH n/m]? That would be really useful!

According to Laine Stump on 2/25/2010 3:19 PM:
Thanks once again for the informative review! (I've learned something new about git, which isn't really surprising I supposed, since I currently know so little ;-))
One of the things I love about open source is that everyone learns from each other.
Likewise, a subject line of "[PATCHv2] Fix..." is better than "[PATCH] Take Two - Fix...", since git am strips "[PATCH...]" but not "Take Two".
Can git send-email be made to put that in the subject line rather than just [PATCH] or [PATCH n/m]? That would be really useful!
'git send-email' takes any argument that 'git format-patch' does; the latter is documented as supporting --subject-prefix. So: git send-email -2 --subject-prefix=PATCHv2 sends the last two commits, using [PATCHv2 1/2] as the subject line leader. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Wed, Feb 24, 2010 at 03:58:09PM -0500, Laine Stump wrote:
(This version incorporates the suggestions from Jim Meyering and Eric Blake)
If qemudDomainRestore fails to open the domain save file, create a pipe, then fork a process that does setuid(qemu_user) and opens the file, then reads this file and stuffs it into the pipe. the parent libvirtd process will use the other end of the pipe as its fd, then reap the child process after it's done reading.
This makes domain restore work on a root-squash NFS share that is only visible to the qemu user. --- src/qemu/qemu_driver.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 183 insertions(+), 3 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 5f794fb..dd8bd55 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4659,6 +4659,138 @@ cleanup: return ret; }
+/* qemudOpenAsUID() - pipe/fork/setuid/open a file, and return the + pipe fd to caller, so that it can read from the file. Also return + the pid of the child process, so the caller can wait for it to exit + after it's finished reading (to avoid a zombie, if nothing + else). */ + +static int qemudOpenAsUID(const char *path, uid_t uid, pid_t *child_pid) { + int pipefd[2]; + int fd = -1; + + *child_pid = -1; + + if (pipe(pipefd) < 0) { + virReportSystemError(errno, + _("failed to create pipe to read '%s'"), + path); + pipefd[0] = pipefd[1] = -1; + goto parent_cleanup; + } + + int forkRet = virFork(child_pid); + + if (*child_pid < 0) { + virReportSystemError(errno, + _("failed to fork child to read '%s'"), + path); + goto parent_cleanup; + } + + if (*child_pid > 0) { + + /* parent */ + + /* parent doesn't need the write side of the pipe */ + close(pipefd[1]); + pipefd[1] = -1; + + if (forkRet < 0) { + virReportSystemError(errno, + _("failed in parent after forking child to read '%s'"), + path); + goto parent_cleanup; + } + /* caller gets the read side of the pipe */ + fd = pipefd[0]; + pipefd[0] = -1; +parent_cleanup: + if (pipefd[0] != -1) + close(pipefd[0]); + if (pipefd[1] != -1) + close(pipefd[1]); + if ((fd < 0) && (*child_pid > 0)) { + /* a child process was started and subsequently an error + occurred in the parent, so we need to wait for it to + exit, but its status is inconsequential. */ + while ((waitpid(*child_pid, NULL, 0) == -1) + && (errno == EINTR)) { + /* empty */ + } + *child_pid = -1; + } + return fd; + } + + /* child */ + + /* setuid to the qemu user, then open the file, read it, + and stuff it into the pipe for the parent process to + read */ + int exit_code; + char *buf = NULL; + size_t bufsize = 1024 * 1024; + int bytesread; + + /* child doesn't need the read side of the pipe */ + close(pipefd[0]); + + if (forkRet < 0) { + exit_code = errno; + virReportSystemError(errno, + _("failed in child after forking to read '%s'"), + path); + goto child_cleanup; + } + + if (setuid(uid) != 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot setuid(%d) to read '%s'"), + uid, path); + goto child_cleanup; + } + if ((fd = open(path, O_RDONLY)) < 0) { + exit_code = errno; + virReportSystemError(errno, + _("cannot open '%s' as uid %d"), + path, uid); + goto child_cleanup; + } + if (VIR_ALLOC_N(buf, bufsize) < 0) { + exit_code = ENOMEM; + virReportOOMError(); + goto child_cleanup; + } + + /* read from fd and write to pipefd[1] until EOF */ + do { + if ((bytesread = saferead(fd, buf, bufsize)) < 0) { + exit_code = errno; + virReportSystemError(errno, + _("child failed reading from '%s'"), + path); + goto child_cleanup; + } + if (safewrite(pipefd[1], buf, bytesread) != bytesread) { + exit_code = errno; + virReportSystemError(errno, "%s", + _("child failed writing to pipe")); + goto child_cleanup; + } + } while (bytesread > 0); + exit_code = 0; + +child_cleanup: + VIR_FREE(buf); + if (fd != -1) + close(fd); + if (pipefd[1] != -1) + close(pipefd[1]); + _exit(exit_code); +} + /* TODO: check seclabel restore */ static int qemudDomainRestore(virConnectPtr conn, const char *path) { @@ -4666,6 +4798,7 @@ static int qemudDomainRestore(virConnectPtr conn, virDomainDefPtr def = NULL; virDomainObjPtr vm = NULL; int fd = -1; + pid_t read_pid = -1; int ret = -1; char *xml = NULL; struct qemud_save_header header; @@ -4677,9 +4810,20 @@ static int qemudDomainRestore(virConnectPtr conn, qemuDriverLock(driver); /* Verify the header and read the XML */ if ((fd = open(path, O_RDONLY)) < 0) { - qemuReportError(VIR_ERR_OPERATION_FAILED, - "%s", _("cannot read domain image")); - goto cleanup; + if ((driver->user == 0) || (getuid() != 0)) { + qemuReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("cannot read domain image")); + goto cleanup; + } + + /* Opening as root failed, but qemu runs as a different user + that might have better luck. Create a pipe, then fork a + child process to run as the qemu user, which will hopefully + have the necessary authority to read the file. */ + if ((fd = qemudOpenAsUID(path, driver->user, &read_pid)) < 0) { + /* error already reported */ + goto cleanup; + } }
if (saferead(fd, &header, sizeof(header)) != sizeof(header)) { @@ -4769,6 +4913,35 @@ static int qemudDomainRestore(virConnectPtr conn, close(intermediatefd); close(fd); fd = -1; + if (read_pid != -1) { + int wait_ret; + int status; + /* reap the process that read the file */ + while (((wait_ret = waitpid(read_pid, &status, 0)) == -1) + && (errno == EINTR)) { + /* empty */ + } + read_pid = -1; + if (wait_ret == -1) { + virReportSystemError(errno, + _("failed to wait for process reading '%s'"), + path); + ret = -1; + } else if (!WIFEXITED(status)) { + qemuReportError(VIR_ERR_OPERATION_FAILED, + _("child process exited abnormally reading '%s'"), + path); + ret = -1; + } else { + int exit_status = WEXITSTATUS(status); + if (exit_status != 0) { + virReportSystemError(exit_status, + _("child process returned error reading '%s'"), + path); + ret = -1; + } + } + } if (ret < 0) { if (!vm->persistent) { if (qemuDomainObjEndJob(vm) > 0) @@ -4810,6 +4983,13 @@ cleanup: VIR_FREE(xml); if (fd != -1) close(fd); + if (read_pid != 0) { + /* reap the process that read the file */ + while ((waitpid(read_pid, NULL, 0) == -1) + && (errno == EINTR)) { + /* empty */ + } + } if (vm) virDomainObjUnlock(vm); if (event)
Okay, that's a bit complex, but unfortunately there is no choice we need it when restoring from root-squashed NFS, ACK, and pushed, thanks ! Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/
participants (5)
-
Daniel Veillard
-
Eric Blake
-
Jim Meyering
-
Laine Stump
-
Paolo Bonzini