[libvirt] [PATCH] libvirt-guests: Enable parallel operations and improve error handling

This patch modifies the libvirt-guests init script to enable parallel shiutdown on guest machines and gets rid of error messages if the client is unable connect to the URI specified in the config file. --- Simultaneous shutdown of machines may speed up the shutdown process as the shutdown sequence of guests often consists of timeouts and storage un-intensive tasks. Simultaneous resume of machines was already supported, although not documented well enough. This patch also checks if connection to the URI can be done, and prints a error message if it's not the case. This get's rid of unrelevant and repeated error messages if the URI is unreachable. The last improvement is while using managed-save. Transient domains are excluded from the save sequence to get rid of error messages and a list of domains that are left behind is printed. Please tell me your suggestions how to improve this, as I'm not a bash "native speaker". tools/libvirt-guests.init.sh | 214 ++++++++++++++++++++++++++++++++++++++---- tools/libvirt-guests.sysconf | 10 ++- 2 files changed, 204 insertions(+), 20 deletions(-) diff --git a/tools/libvirt-guests.init.sh b/tools/libvirt-guests.init.sh index 367177e..c858747 100644 --- a/tools/libvirt-guests.init.sh +++ b/tools/libvirt-guests.init.sh @@ -78,8 +78,25 @@ run_virsh_c() { ( export LC_ALL=C; run_virsh "$@" ) } +test_connect() +{ + uri=$1 + + run_virsh "$uri" connect 2>/dev/null + if [ $? -ne 0 ]; then + eval_gettext "Can't connect to \$uri. Skipping." + echo + return 1 + fi +} + +# "persistent" argument options: +# yes: list only persistent guests +# no: list only transient guests +# all: list both persistent and transient guests list_guests() { uri=$1 + persistent=$2 list=$(run_virsh_c "$uri" list) if [ $? -ne 0 ]; then @@ -89,12 +106,19 @@ list_guests() { uuids= for id in $(echo "$list" | awk 'NR > 2 {print $1}'); do - uuid=$(run_virsh_c "$uri" dominfo "$id" | awk '/^UUID:/{print $2}') - if [ -z "$uuid" ]; then + dominfo=$(run_virsh_c "$uri" dominfo "$id") + uuid=$(echo "$dominfo" | awk '/^UUID:/{print $2}') + dompersist=$(echo "$dominfo" | awk '/^Persistent:/{print $2}') + + if [ -z "$uuid" ] || [ -z "$dompersist" ]; then RETVAL=1 return 1 fi - uuids="$uuids $uuid" + + if [ "$persistent" == "$dompersist" ] || + [ "$persistent" == "all" ]; then + uuids="$uuids $uuid" + fi done echo $uuids @@ -162,6 +186,8 @@ start() { continue fi + test_connect "$uri" || continue + eval_gettext "Resuming guests on \$uri URI..."; echo for guest in $list; do name=$(guest_name "$uri" "$guest") @@ -241,6 +267,102 @@ shutdown_guest() fi } +shutdown_guest_async() +{ + uri=$1 + guest=$2 + + name=$(guest_name "$uri" "$guest") + label=$(eval_gettext "Starting shutdown on guest: \$name") + echo $label + retval run_virsh "$uri" shutdown "$guest" >/dev/null || return +} + +set_add() +{ + item=$1 + items=$2 + + echo "$items $item" +} + +set_remove() +{ + item=$1 + items=$2 + + newitems= + for nit in $items; do + if [ "$nit" != "$domain" ]; then + newitems="$newitems $nit" + fi + done + + echo "$newitems" +} + +set_count() +{ + items=$1 + + count="0" + for item in $items; do + count=$((count+1)) + done + + echo $count +} + +set_head() +{ + items=$1 + + for item in $items; do + echo $item + return 0 + done +} + +remove_shutdown_domains() +{ + uri=$1 + domains=$2 + + newlist= + for dom in $domains; do + guest_is_on "$uri" "$dom" 2>&1 > /dev/null || return + if "$guest_running"; then + newlist="$newlist $dom" + fi + done + + echo "$newlist" +} + +notify_shutdown_domains() +{ + uri=$1 + all=$2 + running=$3 + + for dom in $all; do + found=false + for run in $running; do + if [ $dom = $run ]; then + found=true + break + fi + done + + if ! "$found"; then + name=$(guest_name "$uri" "$dom") + eval_gettext "Shutdown of guest \$name complete." + echo + fi + done +} + + stop() { # last stop was not followed by start [ -f "$LISTFILE" ] && return 0 @@ -260,14 +382,12 @@ stop() { set -f for uri in $URIS; do set +f - eval_gettext "Running guests on \$uri URI: " - if [ "x$uri" = xdefault ] && [ ! -x "$libvirtd" ]; then - gettext "libvirtd not installed; skipping this URI."; echo - continue - fi + test_connect "$uri" || continue + + eval_gettext "Running guests on \$uri URI: " - list=$(list_guests "$uri") + list=$(list_guests "$uri" "all") if [ $? -eq 0 ]; then empty=true for uuid in $list; do @@ -275,13 +395,45 @@ stop() { printf %s "$(guest_name "$uri" "$uuid")" empty=false done + if "$empty"; then - gettext "no running guests."; echo + gettext "no running guests." + fi + echo + fi + + if "$suspending"; then + transient=$(list_guests "$uri" "no") + if [ $? -eq 0 ]; then + empty=true + for uuid in $transient; do + if "$empty"; then + eval_gettext "Not suspending transient domains on URI: \$uri: " + empty=false + else + printf ", " + fi + printf %s "$(guest_name "$uri" "$uuid")" + done + # reload domain list to contain only persistent domains + list=$(list_guests "$uri" "yes") + if [ $? -ne 0 ]; then + eval_gettext "Failed to list persistent domains on \$uri" + echo + RETVAL=1 + return + fi else + gettext "Failed to list transient domains" echo - echo "$uri" "$list" >>"$LISTFILE" + RETVAL=1 + return fi fi + + if [ -n "$list" ]; then + echo "$uri" "$list" >>"$LISTFILE" + fi done set +f @@ -292,13 +444,39 @@ stop() { eval_gettext "Shutting down guests on \$uri URI..."; echo fi - for guest in $list; do - if "$suspending"; then - suspend_guest "$uri" "$guest" - else - shutdown_guest "$uri" "$guest" - fi - done + if [ "$PARALLEL_SHUTDOWN" -gt 1 ] && + ! "$suspending"; then + on_shutdown= + timeout=$SHUTDOWN_TIMEOUT + while [ $(set_count "$on_shutdown") -gt "0" ] || + [ $(set_count "$list") -gt "0" ]; do + while [ $(set_count "$on_shutdown") -lt "$PARALLEL_SHUTDOWN" ] && + [ $(set_count "$list") -gt "0" ]; do + domain=$(set_head "$list") + shutdown_guest_async "$uri" "$domain" + on_shutdown=$(set_add "$domain" "$on_shutdown"); + list=$(set_remove "$domain" "$list"); + done + sleep 1 + timeout=$((timeout - 1)) + if [ $timeout -le 0 ]; then + eval_gettext "Timeout expired while shutting down domains"; echo + RETVAL=1 + return + fi + on_shutdown_old=$on_shutdown + on_shutdown=$(remove_shutdown_domains "$uri" "$on_shutdown" || return) + notify_shutdown_domains "$uri" "$on_shutdown_old" "$on_shutdown" + done + else + for guest in $list; do + if "$suspending"; then + suspend_guest "$uri" "$guest" + else + shutdown_guest "$uri" "$guest" + fi + done + fi done <"$LISTFILE" rm -f "$VAR_SUBSYS_LIBVIRT_GUESTS" diff --git a/tools/libvirt-guests.sysconf b/tools/libvirt-guests.sysconf index 9b8b64f..e16af4f 100644 --- a/tools/libvirt-guests.sysconf +++ b/tools/libvirt-guests.sysconf @@ -10,7 +10,8 @@ # libvirtd #ON_BOOT=start -# number of seconds to wait between each guest start +# number of seconds to wait between each guest start. Set to 0 to allow parallel +# startup. #START_DELAY=0 # action taken on host shutdown @@ -23,7 +24,12 @@ # value suitable for your guests. #ON_SHUTDOWN=suspend -# number of seconds we're willing to wait for a guest to shut down +# If set to non-zero, shutdown will suspend domains concurently. Number of domains +# on shutdown at any time will not exceed number set in this variable. +#PARALLEL_SHUTDOWN=0 + +# number of seconds we're willing to wait for a guest to shut down. If parallel +# shutdown is enabled, this timeout applies as a timeout for shutting down all guests. #SHUTDOWN_TIMEOUT=0 # If non-zero, try to bypass the file system cache when saving and -- 1.7.3.4

On 02/20/2012 06:32 AM, Peter Krempa wrote:
This patch modifies the libvirt-guests init script to enable parallel shiutdown on guest machines and gets rid of error messages if the client
s/shiutdown/shutdown/
is unable connect to the URI specified in the config file. --- Simultaneous shutdown of machines may speed up the shutdown process as the shutdown sequence of guests often consists of timeouts and storage un-intensive tasks. Simultaneous resume of machines was already supported, although not documented well enough.
This patch also checks if connection to the URI can be done, and prints a error message if it's not the case. This get's rid of unrelevant and repeated error messages
s/unrelevant/irrelevant/ (Stupid English, for being inconsistent on whether un- or ir- negates a word :)
if the URI is unreachable.
The last improvement is while using managed-save. Transient domains are excluded from the save sequence to get rid of error messages and a list of domains that are left behind is printed.
Please tell me your suggestions how to improve this, as I'm not a bash "native speaker".
Actually, we want this to be stricter than bash, since people are using libvirt-guests on debian-based systems where /bin/sh is dash. But no fears - I'm fluent in shell.
+test_connect()
I find it helpful to document ALL shell functions; it makes maintenance easier down the road (it doesn't help that we haven't been doing this in the rest of the file). Here, it looks like you are doing: # test_connect URI # check if URI is reachable test_connect() {
+{ + uri=$1 + + run_virsh "$uri" connect 2>/dev/null
Any reason for the two spaces?
+ if [ $? -ne 0 ]; then + eval_gettext "Can't connect to \$uri. Skipping." + echo + return 1 + fi +} + +# "persistent" argument options: +# yes: list only persistent guests +# no: list only transient guests +# all: list both persistent and transient guests list_guests() {
Here, I would use: # list_guests URI PERSISTENT prior to the documentation of valid PERSISTENT values.
uri=$1 + persistent=$2
list=$(run_virsh_c "$uri" list) if [ $? -ne 0 ]; then @@ -89,12 +106,19 @@ list_guests() {
uuids= for id in $(echo "$list" | awk 'NR > 2 {print $1}'); do - uuid=$(run_virsh_c "$uri" dominfo "$id" | awk '/^UUID:/{print $2}')
<aside> Remind me again why we are doing an inefficient shell loop? In fact, why were we using 'dominfo | awk' to convert id to uuid? It's faster to use 'run_virsh_c "$uri" virsh domuuid $id" to get the same information. Taking it one step further, I'd rather see us enhance 'virsh list' to give us what we want up front. That is, I think it would be nice to have 'virsh list { [--table] | --uuid | --name | --id } ...', where --table is default (for back-compat), but listing --uuid, --name, or --id drops the header line, and lists just one identifier per line rather than a full table. Also, I'd also like to see 'virsh list { [--running] | --all | --inactive | --persistent | --transient } ...'; that is, we ought to have virsh list fine-tune which domains it lists, by adding new options such as --persistent and --transient. By improving virsh, we could skip out on the awk loop altogether, and have the initial virsh list give us what we want in the first place. </aside>
- if [ -z "$uuid" ]; then + dominfo=$(run_virsh_c "$uri" dominfo "$id") + uuid=$(echo "$dominfo" | awk '/^UUID:/{print $2}') + dompersist=$(echo "$dominfo" | awk '/^Persistent:/{print $2}')
This sets $dompersist to yes or no,...
+ + if [ -z "$uuid" ] || [ -z "$dompersist" ]; then RETVAL=1 return 1 fi - uuids="$uuids $uuid" + + if [ "$persistent" == "$dompersist" ] ||
For portability to dash, you must use '=', not '==', inside [].
+ [ "$persistent" == "all" ]; then + uuids="$uuids $uuid"
...if persistent is 'all', then you wasted an awk call above for a result you don't care about, since you plan on using the uuid anyway. Again, more reason to have virsh list give us what we want in the first place.
+shutdown_guest_async() +{ + uri=$1 + guest=$2 + + name=$(guest_name "$uri" "$guest") + label=$(eval_gettext "Starting shutdown on guest: \$name") + echo $label
Missing quotes around $label. Probably simpler to just: name=$(guest_name "$uri" "$guest") eval_gettext "Starting shutdown on guest: \$name" echo
+ retval run_virsh "$uri" shutdown "$guest" >/dev/null || return
No need for the ||return - since this is the last statement in the function, you will return anyway, with the same status.
+} + +set_add() +{ + item=$1 + items=$2 + + echo "$items $item"
Should you be checking for duplicates before adding into a set? I'm not sure without reviewing further to see how this is used. Also, this particular usage requires that the user capture the output of the shell function using $(), which wastes a subshell; there are tricks for passing the name of a variable which holds a set, where you can then modify the variable in place with fewer forks, but I don't think we're at the point where optimizing a few forks will help matters.
+} + +set_remove() +{ + item=$1 + items=$2 + + newitems= + for nit in $items; do + if [ "$nit" != "$domain" ]; then
Where did "$domain" come from? Don't you instead mean to be comparing against "$item"?
+ newitems="$newitems $nit"
The resulting $newitems will always have a leading space, even if $items on entry did not. Is that intentional?
+ fi + done + + echo "$newitems" +} + +set_count() +{ + items=$1 + + count="0" + for item in $items; do + count=$((count+1))
It's slightly more portable to do $(($count+1)). But even that is slow; why not let the shell do it for you: set_count() { set -- $1 echo $# }
+ done + + echo $count +} + +set_head() +{ + items=$1 + + for item in $items; do + echo $item + return 0 + done +}
I sure hope that the sets you are using have no whitespace or glob characters (that is, a set of domain ids or uuids is safe, a set of domain names or of URIs is not).
+ +remove_shutdown_domains() +{ + uri=$1 + domains=$2 + + newlist= + for dom in $domains; do + guest_is_on "$uri" "$dom" 2>&1 > /dev/null || return
Do you really want to break out of the entire for loop on the first guest where you encounter failure? Also, this redirection looks fishy - it says 'make the error output of guest_is_on go to my stdout, and discard the normal output of guest_is_on'. Did you mean '>/dev/null 2>&1' which says to discard both output and error messages from guest_is_on?
+ if "$guest_running"; then + newlist="$newlist $dom" + fi + done + + echo "$newlist" +}
I ran out of time to finish my review today, but hope this gives you some things to think about. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 02/20/2012 06:32 AM, Peter Krempa wrote:
This patch modifies the libvirt-guests init script to enable parallel shiutdown on guest machines and gets rid of error messages if the client is unable connect to the URI specified in the config file.
Resuming where I left off (and seeing as how you took my suggestions for improving 'virsh list', we'll need a v2 anyway),
+ eval_gettext "Running guests on \$uri URI: "
- list=$(list_guests "$uri") + list=$(list_guests "$uri" "all")
Yeah, I guess it makes sense to list all running guests, even the transient ones that we can't save, if only for log purposes to show that we knew we were losing the transient ones. Hmm, I just remembered, a while ago there was a PoC patch to allow migration on shutdown, as an alternative to managed-save - and that would work for transient guests! We need to look at reviving that patch, and figuring out how it would interact with this patch. https://www.redhat.com/archives/libvir-list/2011-November/msg00229.html
@@ -292,13 +444,39 @@ stop() { eval_gettext "Shutting down guests on \$uri URI..."; echo fi
- for guest in $list; do - if "$suspending"; then - suspend_guest "$uri" "$guest" - else - shutdown_guest "$uri" "$guest" - fi - done + if [ "$PARALLEL_SHUTDOWN" -gt 1 ] && + ! "$suspending"; then + on_shutdown= + timeout=$SHUTDOWN_TIMEOUT + while [ $(set_count "$on_shutdown") -gt "0" ] || + [ $(set_count "$list") -gt "0" ]; do
$(set_count ...) forks, but if all you are doing is checking for a non-empty set (count -gt 0), then it is faster to do: while [ -n "$on_shutdown" ] || [ -n "$list" ]; do
+ while [ $(set_count "$on_shutdown") -lt "$PARALLEL_SHUTDOWN" ] &&
whereas this use of set_count really is needed.
+ [ $(set_count "$list") -gt "0" ]; do + domain=$(set_head "$list") + shutdown_guest_async "$uri" "$domain" + on_shutdown=$(set_add "$domain" "$on_shutdown"); + list=$(set_remove "$domain" "$list"); + done + sleep 1 + timeout=$((timeout - 1))
My earlier patch mentioned that it might be slightly more portable to use $(($timeout - 1)) (that is, $timeout instead of timeout); but in looking further, I see that this is just copy-paste from existing code, and no one has complained about it not working on dash, so no worries about it after all. The parallel shutdown looks reasonable, but it's a lot of code to be embedding into the stop function; maybe it's worth factoring into a shutdown_guest_parallel function?
+++ b/tools/libvirt-guests.sysconf @@ -10,7 +10,8 @@ # libvirtd #ON_BOOT=start
-# number of seconds to wait between each guest start +# number of seconds to wait between each guest start. Set to 0 to allow parallel +# startup. #START_DELAY=0
# action taken on host shutdown @@ -23,7 +24,12 @@ # value suitable for your guests. #ON_SHUTDOWN=suspend
-# number of seconds we're willing to wait for a guest to shut down +# If set to non-zero, shutdown will suspend domains concurently. Number of domains
s/concurently/concurrently/
+# on shutdown at any time will not exceed number set in this variable. +#PARALLEL_SHUTDOWN=0 + +# number of seconds we're willing to wait for a guest to shut down. If parallel +# shutdown is enabled, this timeout applies as a timeout for shutting down all guests. #SHUTDOWN_TIMEOUT=0
# If non-zero, try to bypass the file system cache when saving and
Looking forward to v2. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org
participants (2)
-
Eric Blake
-
Peter Krempa