[libvirt] [PATCH v2] virsh: add keepalive protocol in virsh

Bugzilla:https://bugzilla.redhat.com/show_bug.cgi?id=822839 add two general virsh options to support keepalive message protocol -i | --keepalive_interval interval time value (default 5 seconds) -n | --keepalive_count number of heartbeats (default 5 times) For non-p2p migration, start keepalive for remote driver to determine the status of network connection, aborting migrating job after defined amount of interval time. --- tools/virsh.c | 88 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 files changed, 70 insertions(+), 18 deletions(-) diff --git a/tools/virsh.c b/tools/virsh.c index 0e41d00..602e5a5 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -251,6 +251,8 @@ typedef struct __vshControl { bool readonly; /* connect readonly (first time only, not * during explicit connect command) */ + int keepalive_interval; /* interval time value */ + int keepalive_count; /* keepalive_count value */ char *logfile; /* log file name */ int log_fd; /* log file descriptor */ char *historydir; /* readline history directory name */ @@ -415,13 +417,14 @@ typedef struct __vshCtrlData { vshControl *ctl; const vshCmd *cmd; int writefd; + virConnectPtr dconn; } vshCtrlData; typedef void (*jobWatchTimeoutFunc) (vshControl *ctl, virDomainPtr dom, void *opaque); static bool -vshWatchJob(vshControl *ctl, +vshWatchJob(vshCtrlData *data, virDomainPtr dom, bool verbose, int pipe_fd, @@ -3334,6 +3337,7 @@ cmdSave(vshControl *ctl, const vshCmd *cmd) data.ctl = ctl; data.cmd = cmd; data.writefd = p[1]; + data.dconn = NULL; if (virThreadCreate(&workerThread, true, @@ -3341,7 +3345,7 @@ cmdSave(vshControl *ctl, const vshCmd *cmd) &data) < 0) goto cleanup; - ret = vshWatchJob(ctl, dom, verbose, p[0], 0, NULL, NULL, _("Save")); + ret = vshWatchJob(&data, dom, verbose, p[0], 0, NULL, NULL, _("Save")); virThreadJoin(&workerThread); @@ -3608,6 +3612,7 @@ cmdManagedSave(vshControl *ctl, const vshCmd *cmd) data.ctl = ctl; data.cmd = cmd; data.writefd = p[1]; + data.dconn = NULL; if (virThreadCreate(&workerThread, true, @@ -3615,7 +3620,7 @@ cmdManagedSave(vshControl *ctl, const vshCmd *cmd) &data) < 0) goto cleanup; - ret = vshWatchJob(ctl, dom, verbose, p[0], 0, + ret = vshWatchJob(&data, dom, verbose, p[0], 0, NULL, NULL, _("Managedsave")); virThreadJoin(&workerThread); @@ -4086,6 +4091,7 @@ cmdDump(vshControl *ctl, const vshCmd *cmd) data.ctl = ctl; data.cmd = cmd; data.writefd = p[1]; + data.dconn = NULL; if (virThreadCreate(&workerThread, true, @@ -4093,7 +4099,7 @@ cmdDump(vshControl *ctl, const vshCmd *cmd) &data) < 0) goto cleanup; - ret = vshWatchJob(ctl, dom, verbose, p[0], 0, NULL, NULL, _("Dump")); + ret = vshWatchJob(&data, dom, verbose, p[0], 0, NULL, NULL, _("Dump")); virThreadJoin(&workerThread); @@ -7213,6 +7219,12 @@ doMigrate (void *opaque) dconn = virConnectOpenAuth (desturi, virConnectAuthPtrDefault, 0); if (!dconn) goto out; + data->dconn = dconn; + if (virConnectSetKeepAlive(dconn, + ctl->keepalive_interval, + ctl->keepalive_count) < 0) + vshDebug(ctl, VSH_ERR_WARNING, "migrate: Failed to start keepalive\n"); + ddom = virDomainMigrate2(dom, dconn, xml, flags, dname, migrateuri, 0); if (ddom) { virDomainFree(ddom); @@ -7268,7 +7280,7 @@ vshMigrationTimeout(vshControl *ctl, } static bool -vshWatchJob(vshControl *ctl, +vshWatchJob(vshCtrlData *data, virDomainPtr dom, bool verbose, int pipe_fd, @@ -7286,6 +7298,7 @@ vshWatchJob(vshControl *ctl, char retchar; bool functionReturn = false; sigset_t sigmask, oldsigmask; + vshControl *ctl = data->ctl; sigemptyset(&sigmask); sigaddset(&sigmask, SIGINT); @@ -7329,6 +7342,13 @@ repoll: goto cleanup; } + if (data->dconn && virConnectIsAlive(data->dconn) <= 0) { + virDomainAbortJob(dom); + vshError(ctl, "%s", + _("Lost connection to destination host")); + goto cleanup; + } + GETTIMEOFDAY(&curr); if (timeout && (((int)(curr.tv_sec - start.tv_sec) * 1000 + (int)(curr.tv_usec - start.tv_usec) / 1000) > @@ -7402,13 +7422,14 @@ cmdMigrate(vshControl *ctl, const vshCmd *cmd) data.ctl = ctl; data.cmd = cmd; data.writefd = p[1]; + data.dconn = NULL; if (virThreadCreate(&workerThread, true, doMigrate, &data) < 0) goto cleanup; - functionReturn = vshWatchJob(ctl, dom, verbose, p[0], timeout, + functionReturn = vshWatchJob(&data, dom, verbose, p[0], timeout, vshMigrationTimeout, NULL, _("Migration")); virThreadJoin(&workerThread); @@ -18673,6 +18694,7 @@ vshCommandRun(vshControl *ctl, const vshCmd *cmd) while (cmd) { struct timeval before, after; bool enable_timing = ctl->timing; + const char *driver = NULL; if ((ctl->conn == NULL || disconnected) && !(cmd->def->flags & VSH_CMD_FLAG_NOCONNECT)) @@ -18681,6 +18703,18 @@ vshCommandRun(vshControl *ctl, const vshCmd *cmd) if (enable_timing) GETTIMEOFDAY(&before); + /* start keepalive for remote driver */ + driver = virConnectGetType(ctl->conn); + if (STREQ_NULLABLE(driver, "QEMU") || + STREQ_NULLABLE(driver, "xenlight") || + STREQ_NULLABLE(driver, "UML") || + STREQ_NULLABLE(driver, "LXC") || + STREQ_NULLABLE(driver, "remote")) + if (virConnectSetKeepAlive(ctl->conn, + ctl->keepalive_interval, + ctl->keepalive_count) < 0) + vshDebug(ctl, VSH_ERR_WARNING, "migrate: Failed to start keepalive\n"); + ret = cmd->def->handler(ctl, cmd); if (enable_timing) @@ -19959,17 +19993,19 @@ vshUsage(void) fprintf(stdout, _("\n%s [options]... [<command_string>]" "\n%s [options]... <command> [args...]\n\n" " options:\n" - " -c | --connect=URI hypervisor connection URI\n" - " -r | --readonly connect readonly\n" - " -d | --debug=NUM debug level [0-4]\n" - " -h | --help this help\n" - " -q | --quiet quiet mode\n" - " -t | --timing print timing information\n" - " -l | --log=FILE output logging to file\n" - " -v short version\n" - " -V long version\n" - " --version[=TYPE] version, TYPE is short or long (default short)\n" - " -e | --escape <char> set escape sequence for console\n\n" + " -c | --connect=URI hypervisor connection URI\n" + " -r | --readonly connect readonly\n" + " -d | --debug=NUM debug level [0-4]\n" + " -h | --help this help\n" + " -q | --quiet quiet mode\n" + " -t | --timing print timing information\n" + " -l | --log=FILE output logging to file\n" + " -v short version\n" + " -V long version\n" + " --version[=TYPE] version, TYPE is short or long (default short)\n" + " -e | --escape <char> set escape sequence for console\n" + " -i | --keepalive_interval interval time value (default 5 seconds)\n" + " -n | --keepalive_count number of heartbeats (default 5 times)\n\n" " commands (non interactive mode):\n\n"), progname, progname); for (grp = cmdGroups; grp->name; grp++) { @@ -20146,13 +20182,15 @@ vshParseArgv(vshControl *ctl, int argc, char **argv) {"readonly", no_argument, NULL, 'r'}, {"log", required_argument, NULL, 'l'}, {"escape", required_argument, NULL, 'e'}, + {"keepalive_interval", required_argument, NULL, 'i'}, + {"keepalive_count", required_argument, NULL, 'n'}, {NULL, 0, NULL, 0} }; /* Standard (non-command) options. The leading + ensures that no * argument reordering takes place, so that command options are * not confused with top-level virsh options. */ - while ((arg = getopt_long(argc, argv, "+d:hqtc:vVrl:e:", opt, NULL)) != -1) { + while ((arg = getopt_long(argc, argv, "+d:hqtc:vVrl:e:i:n:", opt, NULL)) != -1) { switch (arg) { case 'd': if (virStrToLong_i(optarg, NULL, 10, &ctl->debug) < 0) { @@ -20201,6 +20239,18 @@ vshParseArgv(vshControl *ctl, int argc, char **argv) exit(EXIT_FAILURE); } break; + case 'i': + if (virStrToLong_i(optarg, NULL, 10, &ctl->keepalive_interval) < 0) { + vshError(ctl, "%s", _("option -i takes a numeric argument")); + exit(EXIT_FAILURE); + } + break; + case 'n': + if (virStrToLong_i(optarg, NULL, 10, &ctl->keepalive_count) < 0) { + vshError(ctl, "%s", _("option -n takes a numeric argument")); + exit(EXIT_FAILURE); + } + break; default: vshError(ctl, _("unsupported option '-%c'. See --help."), arg); exit(EXIT_FAILURE); @@ -20232,6 +20282,8 @@ main(int argc, char **argv) ctl->log_fd = -1; /* Initialize log file descriptor */ ctl->debug = VSH_DEBUG_DEFAULT; ctl->escapeChar = CTRL_CLOSE_BRACKET; + ctl->keepalive_interval = 5; + ctl->keepalive_count = 5; if (!setlocale(LC_ALL, "")) { -- 1.7.7.5

On 06/05/2012 02:36 AM, Guannan Ren wrote:
Bugzilla:https://bugzilla.redhat.com/show_bug.cgi?id=822839 add two general virsh options to support keepalive message protocol
-i | --keepalive_interval interval time value (default 5 seconds) -n | --keepalive_count number of heartbeats (default 5 times)
For non-p2p migration, start keepalive for remote driver to determine the status of network connection, aborting migrating job after defined amount of interval time. --- tools/virsh.c | 88 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 files changed, 70 insertions(+), 18 deletions(-)
Missing a counterpart documentation in virsh.pod.
@@ -7213,6 +7219,12 @@ doMigrate (void *opaque) dconn = virConnectOpenAuth (desturi, virConnectAuthPtrDefault, 0); if (!dconn) goto out;
+ data->dconn = dconn; + if (virConnectSetKeepAlive(dconn, + ctl->keepalive_interval, + ctl->keepalive_count) < 0) + vshDebug(ctl, VSH_ERR_WARNING, "migrate: Failed to start keepalive\n");
This makes it impossible to migrate to an older server that lacks virConnectSetKeepAlive() support. You need to avoid doing this if keepalive_interval and/or keepalive_count is set to a value that avoids keepalive. I think that also means you need to distinguish between a normal default of 5 seconds with new servers but unspecified by the user (where we just gracefully continue without keepalive), compared to an explicit user request of 5 seconds (must fail if the keepalive request cannot be honored), and compared to an explicit user request of no keepalive.
@@ -7329,6 +7342,13 @@ repoll: goto cleanup; }
+ if (data->dconn && virConnectIsAlive(data->dconn) <= 0) { + virDomainAbortJob(dom); + vshError(ctl, "%s", + _("Lost connection to destination host")); + goto cleanup; + }
Again, need to be careful of older servers that lack virConnectIsAlive() support.
@@ -18681,6 +18703,18 @@ vshCommandRun(vshControl *ctl, const vshCmd *cmd) if (enable_timing) GETTIMEOFDAY(&before);
+ /* start keepalive for remote driver */ + driver = virConnectGetType(ctl->conn); + if (STREQ_NULLABLE(driver, "QEMU") || + STREQ_NULLABLE(driver, "xenlight") || + STREQ_NULLABLE(driver, "UML") || + STREQ_NULLABLE(driver, "LXC") || + STREQ_NULLABLE(driver, "remote"))
Why are you limiting this to particular hypervisors? That feels wrong. Rather, you should blindly attempt it, and gracefully detect when it is unsupported.
@@ -19959,17 +19993,19 @@ vshUsage(void) fprintf(stdout, _("\n%s [options]... [<command_string>]" "\n%s [options]... <command> [args...]\n\n" " options:\n" - " -c | --connect=URI hypervisor connection URI\n"
+ " -c | --connect=URI hypervisor connection URI\n"
Rather than reindenting everything, I would just alter your new lines to use multiple lines:
+ " -i | --keepalive_interval interval time value (default 5 seconds)\n" + " -n | --keepalive_count number of heartbeats (default 5 times)\n\n"
-c | --connect=URI hypervisor connection URI -i | --keepalive_interval interval time value (default 5 seconds)
/* Standard (non-command) options. The leading + ensures that no * argument reordering takes place, so that command options are * not confused with top-level virsh options. */ - while ((arg = getopt_long(argc, argv, "+d:hqtc:vVrl:e:", opt, NULL)) != -1) { + while ((arg = getopt_long(argc, argv, "+d:hqtc:vVrl:e:i:n:", opt, NULL)) != -1) {
Pre-existing, but I would welcome an independent patch that cleaned this up into alphabetic ordering to make it easier to find code for a given option now that we have more options to deal with. Looking forward to v3. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Thu, Jun 07, 2012 at 12:03:02 -0600, Eric Blake wrote:
On 06/05/2012 02:36 AM, Guannan Ren wrote:
Bugzilla:https://bugzilla.redhat.com/show_bug.cgi?id=822839 add two general virsh options to support keepalive message protocol
-i | --keepalive_interval interval time value (default 5 seconds) -n | --keepalive_count number of heartbeats (default 5 times)
For non-p2p migration, start keepalive for remote driver to determine the status of network connection, aborting migrating job after defined amount of interval time. --- tools/virsh.c | 88 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 files changed, 70 insertions(+), 18 deletions(-)
Missing a counterpart documentation in virsh.pod.
@@ -7213,6 +7219,12 @@ doMigrate (void *opaque) dconn = virConnectOpenAuth (desturi, virConnectAuthPtrDefault, 0); if (!dconn) goto out;
+ data->dconn = dconn; + if (virConnectSetKeepAlive(dconn, + ctl->keepalive_interval, + ctl->keepalive_count) < 0) + vshDebug(ctl, VSH_ERR_WARNING, "migrate: Failed to start keepalive\n");
This makes it impossible to migrate to an older server that lacks virConnectSetKeepAlive() support.
virConnectSetKeepAlive returns 1 when remote party does not support keepalive, which means the above code is perfectly compatible with older servers.
You need to avoid doing this if keepalive_interval and/or keepalive_count is set to a value that avoids keepalive. I think that also means you need to distinguish between a normal default of 5 seconds with new servers but unspecified by the user (where we just gracefully continue without keepalive), compared to an explicit user request of 5 seconds (must fail if the keepalive request cannot be honored), and compared to an explicit user request of no keepalive.
But I agree that we need to distinguish between default keepalive settings and those explicitly requested.
@@ -7329,6 +7342,13 @@ repoll: goto cleanup; }
+ if (data->dconn && virConnectIsAlive(data->dconn) <= 0) { + virDomainAbortJob(dom); + vshError(ctl, "%s", + _("Lost connection to destination host")); + goto cleanup; + }
Again, need to be careful of older servers that lack virConnectIsAlive() support.
This doesn't call to the server at all. However, virConnectIsAlive returning -1 doesn't mean the connection is closed, especially if it sets the error to VIR_ERR_NO_SUPPORT.
@@ -18681,6 +18703,18 @@ vshCommandRun(vshControl *ctl, const vshCmd *cmd) if (enable_timing) GETTIMEOFDAY(&before);
+ /* start keepalive for remote driver */ + driver = virConnectGetType(ctl->conn); + if (STREQ_NULLABLE(driver, "QEMU") || + STREQ_NULLABLE(driver, "xenlight") || + STREQ_NULLABLE(driver, "UML") || + STREQ_NULLABLE(driver, "LXC") || + STREQ_NULLABLE(driver, "remote"))
Why are you limiting this to particular hypervisors? That feels wrong. Rather, you should blindly attempt it, and gracefully detect when it is unsupported.
Yes, and another issue is that vshCommandRun is a bad place to do this. Keepalive should be enabled just after opening a connection, i.e., in vshReconnect, cmdConnect, and vshInit (in addition to doMigrate, where you correctly do so). I think virsh would benefit from a new wrapper for virConnectOpenAuth that would also deal with keepalive and which would be called from all four places. Jirka

On 06/08/2012 03:46 PM, Jiri Denemark wrote:
On Thu, Jun 07, 2012 at 12:03:02 -0600, Eric Blake wrote:
On 06/05/2012 02:36 AM, Guannan Ren wrote:
Bugzilla:https://bugzilla.redhat.com/show_bug.cgi?id=822839 add two general virsh options to support keepalive message protocol
-i | --keepalive_interval interval time value (default 5 seconds) -n | --keepalive_count number of heartbeats (default 5 times)
For non-p2p migration, start keepalive for remote driver to determine the status of network connection, aborting migrating job after defined amount of interval time. --- tools/virsh.c | 88 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 files changed, 70 insertions(+), 18 deletions(-) Missing a counterpart documentation in virsh.pod.
@@ -7213,6 +7219,12 @@ doMigrate (void *opaque) dconn = virConnectOpenAuth (desturi, virConnectAuthPtrDefault, 0); if (!dconn) goto out;
+ data->dconn = dconn; + if (virConnectSetKeepAlive(dconn, + ctl->keepalive_interval, + ctl->keepalive_count)< 0) + vshDebug(ctl, VSH_ERR_WARNING, "migrate: Failed to start keepalive\n"); This makes it impossible to migrate to an older server that lacks virConnectSetKeepAlive() support. virConnectSetKeepAlive returns 1 when remote party does not support keepalive, which means the above code is perfectly compatible with older servers.
You need to avoid doing this if keepalive_interval and/or keepalive_count is set to a value that avoids keepalive. I think that also means you need to distinguish between a normal default of 5 seconds with new servers but unspecified by the user (where we just gracefully continue without keepalive), compared to an explicit user request of 5 seconds (must fail if the keepalive request cannot be honored), and compared to an explicit user request of no keepalive. But I agree that we need to distinguish between default keepalive settings and those explicitly requested.
If the server supports keepalive, virConnectSetKeepAlive will success, otherwise failed. I think, on success the default value 5 seconds or the values from options --keepalive_interval and --keepalive_count will be used. On failure, all value are not invalid even given by user explicitly. That means the keepalive is set up or not automatically by client. What the user will do is to override the default values or leave them untouched.
@@ -7329,6 +7342,13 @@ repoll: goto cleanup; }
+ if (data->dconn&& virConnectIsAlive(data->dconn)<= 0) { + virDomainAbortJob(dom); + vshError(ctl, "%s", + _("Lost connection to destination host")); + goto cleanup; + } Again, need to be careful of older servers that lack virConnectIsAlive() support. This doesn't call to the server at all. However, virConnectIsAlive returning -1 doesn't mean the connection is closed, especially if it sets the error to VIR_ERR_NO_SUPPORT.
virConnectIsAlive from remote driver is for determining the working of client, after certain time, the keepalive will close the client, we use the function to decide when to abort the migration job.
@@ -18681,6 +18703,18 @@ vshCommandRun(vshControl *ctl, const vshCmd *cmd) if (enable_timing) GETTIMEOFDAY(&before);
+ /* start keepalive for remote driver */ + driver = virConnectGetType(ctl->conn); + if (STREQ_NULLABLE(driver, "QEMU") || + STREQ_NULLABLE(driver, "xenlight") || + STREQ_NULLABLE(driver, "UML") || + STREQ_NULLABLE(driver, "LXC") || + STREQ_NULLABLE(driver, "remote")) Why are you limiting this to particular hypervisors? That feels wrong. Rather, you should blindly attempt it, and gracefully detect when it is unsupported. Yes, and another issue is that vshCommandRun is a bad place to do this. Keepalive should be enabled just after opening a connection, i.e., in vshReconnect, cmdConnect, and vshInit (in addition to doMigrate, where you correctly do so). I think virsh would benefit from a new wrapper for virConnectOpenAuth that would also deal with keepalive and which would be called from all four places.
Jirka
I agree with the common function called from these four places to set keepalive up.
participants (3)
-
Eric Blake
-
Guannan Ren
-
Jiri Denemark