[libvirt] [PATCH] Add a virStrSplitQuoted for splitting quoted strings

From: "Daniel P. Berrange" <berrange@redhat.com> To facilitate parsing of argv[] style strings, provide a virStrSplitQuoted API which will split a string on the listed separators, but also allow for quoting with ' or ". * src/libvirt_private.syms, src/util/util.c, src/util/util.h: Implement virStrSplitQuoted * tests/utiltest.c: Some tests for virStrSplitQuoted --- src/libvirt_private.syms | 1 + src/util/util.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 2 + tests/utiltest.c | 89 ++++++++++++++++++++++++++++++++-- 4 files changed, 203 insertions(+), 6 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 1f55f5d..b6fbafc 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1148,6 +1148,7 @@ virSetUIDGID; virSkipSpaces; virSkipSpacesAndBackslash; virSkipSpacesBackwards; +virStrSplitQuoted; virStrToDouble; virStrToLong_i; virStrToLong_l; diff --git a/src/util/util.c b/src/util/util.c index 15e6cfa..acfb033 100644 --- a/src/util/util.c +++ b/src/util/util.c @@ -1951,6 +1951,123 @@ virStrcpy(char *dest, const char *src, size_t destbytes) return virStrncpy(dest, src, strlen(src), destbytes); } + +static char * +virStrDupUnescape(const char *src, size_t len) +{ + char *ret; + size_t i, j; + bool escape = false; + + if (VIR_ALLOC_N(ret, len + 1) < 0) + return NULL; + + for (i = 0, j = 0 ; i < len ; i++) { + if (escape) { + escape = false; + ret[j++] = src[i]; + } else if (src[i] == '\\') { + escape = true; + } else { + ret[j++] = src[i]; + } + } + ret[j++] = '\0'; + + return ret; +} + +/** + * virStrSplitQuoted: + * @src: string to split + * @sep: list of separator characters + * + * Split the string 'src' into chunks, separated by one or more of + * the characters in 'sep'. Allow ' or " to quote strings even if + * they contain 'sep' + * + * Returns NULL terminated array of strings + */ +char ** +virStrSplitQuoted(const char *src, const char *sep) +{ + size_t alloc = 0; + size_t count = 0; + char **ret = NULL; + const char *start = src; + const char *end; + bool escape; + char match; + char *value = NULL; + size_t i; + + while (start && *start) { + start += strspn(start, sep); + + if (!*start) + break; + + if (*start == '\'' || *start == '\"') { + match = *start; + start++; + + for (end = start, escape = false ; *end ; end++) { + if (escape) + escape = false; + else if (*end == '\\') + escape = true; + else if (*end == match) + break; + } + + if (VIR_RESIZE_N(ret, alloc, count, 1) < 0) + goto no_memory; + + if (!(ret[count] = virStrDupUnescape(start, end-start))) + goto no_memory; + count++; + + start = end; + if (*start) + start++; + } else { + for (end = start, escape = false ; *end ; end++) { + if (escape) + escape = false; + else if (*end == '\\') + escape = true; + else if (strspn(end, sep)) + break; + } + + if (VIR_RESIZE_N(ret, alloc, count, 1) < 0) + goto no_memory; + + if (!(ret[count] = virStrDupUnescape(start, end-start))) + goto no_memory; + count++; + + start = end; + if (*start) + start++; + } + } + + if (VIR_RESIZE_N(ret, alloc, count, 1) < 0) + goto no_memory; + ret[count] = NULL; + + return ret; + +no_memory: + VIR_FREE(value); + for (i = 0 ; i < count ; i++) + VIR_FREE(ret[i]); + VIR_FREE(ret); + return NULL; +} + + int virEnumFromString(const char *const*types, unsigned int ntypes, const char *type) diff --git a/src/util/util.h b/src/util/util.h index 85e8bd6..404003d 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -182,6 +182,8 @@ char *virStrcpy(char *dest, const char *src, size_t destbytes) ATTRIBUTE_RETURN_CHECK; # define virStrcpyStatic(dest, src) virStrcpy((dest), (src), sizeof(dest)) +char **virStrSplitQuoted(const char *src, const char *sep); + int virDiskNameToIndex(const char* str); char *virIndexToDiskName(int idx, const char *prefix); diff --git a/tests/utiltest.c b/tests/utiltest.c index 774a2f7..b8c6a8e 100644 --- a/tests/utiltest.c +++ b/tests/utiltest.c @@ -151,6 +151,55 @@ testParseVersionString(const void *data ATTRIBUTE_UNUSED) } +struct StringSplitData { + const char *src; + const char *sep; + const char **bits; +}; + +static int +testStringSplit(const void *opaque) +{ + int ret = -1; + const struct StringSplitData *data = opaque; + char **actual; + char **tmp1; + const char **tmp2 = data->bits; + size_t i; + + tmp1 = actual = virStrSplitQuoted(data->src, data->sep); + + if (!tmp1) + return -1; + + while (*tmp1 && *tmp2) { + if (STRNEQ(*tmp1, *tmp2)) { + fprintf(stderr, "Expected '%s' got '%s'\n", *tmp2, *tmp1); + goto cleanup; + } + + tmp1++; + tmp2++; + } + + if (*tmp1) { + fprintf(stderr, "Unexpected extra value '%s'\n", *tmp1); + goto cleanup; + } + + if (*tmp2) { + fprintf(stderr, "Unexpected missing value '%s'\n", *tmp2); + goto cleanup; + } + + + ret = 0; +cleanup: + for (i = 0 ; actual[i] ; i++) + VIR_FREE(actual[i]); + VIR_FREE(actual); + return ret; +} static int @@ -161,17 +210,45 @@ mymain(void) virSetErrorFunc(NULL, testQuietError); #define DO_TEST(_name) \ - do { \ - if (virtTestRun("Util "#_name, 1, test##_name, \ - NULL) < 0) { \ - result = -1; \ - } \ - } while (0) + do { \ + if (virtTestRun("Util "#_name, 1, test##_name, \ + NULL) < 0) { \ + result = -1; \ + } \ + } while (0) + +#define DO_TEST_STRING(str, sep, bits) \ + do { \ + struct StringSplitData data = { \ + str, sep, bits \ + }; \ + if (virtTestRun("Util split " str, 1, testStringSplit, \ + &data) < 0) { \ + result = -1; \ + } \ + } while (0) DO_TEST(IndexToDiskName); DO_TEST(DiskNameToIndex); DO_TEST(ParseVersionString); + const char *bits1[] = { "foo", "bar", NULL }; + DO_TEST_STRING("foo bar", " ", bits1); + DO_TEST_STRING("foo 'bar'", " ", bits1); + DO_TEST_STRING("foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"\n ", " \t\n\r", bits1); + + const char *bits2[] = { "foo", "bar wizz", "eek", NULL }; + DO_TEST_STRING("foo 'bar wizz' eek", " ", bits2); + DO_TEST_STRING("foo \"bar wizz\" eek", " ", bits2); + + const char *bits3[] = { "foo", "'bar' \"wizz\"", "eek", NULL }; + DO_TEST_STRING("foo '\\'bar\\' \\\"wizz\\\"' eek", " ", bits3); + DO_TEST_STRING("foo \"\\'bar\\' \\\"wizz\\\"\" eek", " ", bits3); + DO_TEST_STRING("foo \"\\'bar\\' \\\"wizz\\\"\"\r\n eek", " \r\n", bits3); + return result == 0 ? EXIT_SUCCESS : EXIT_FAILURE; } -- 1.7.7.6

On 03/14/2012 09:19 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange" <berrange@redhat.com>
To facilitate parsing of argv[] style strings, provide a virStrSplitQuoted API which will split a string on the listed separators, but also allow for quoting with ' or ".
* src/libvirt_private.syms, src/util/util.c, src/util/util.h: Implement virStrSplitQuoted * tests/utiltest.c: Some tests for virStrSplitQuoted --- src/libvirt_private.syms | 1 + src/util/util.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 2 + tests/utiltest.c | 89 ++++++++++++++++++++++++++++++++-- 4 files changed, 203 insertions(+), 6 deletions(-)
This looks like it is repeating some of the code in virsh.c:vshCommandStringGetArg; any chance we can combine them? In particular, the ability to mimic shell handling of \ escapes, as well as the difference in behavior of \ inside "" vs. '', seems like it will come in handy.
+ +static char * +virStrDupUnescape(const char *src, size_t len) +{ + char *ret; + size_t i, j; + bool escape = false; + + if (VIR_ALLOC_N(ret, len + 1) < 0) + return NULL; + + for (i = 0, j = 0 ; i < len ; i++) { + if (escape) { + escape = false; + ret[j++] = src[i]; + } else if (src[i] == '\\') { + escape = true; + } else { + ret[j++] = src[i]; + }
So this version only strips backslash. Looks okay in isolation.
+/** + * virStrSplitQuoted: + * @src: string to split + * @sep: list of separator characters + * + * Split the string 'src' into chunks, separated by one or more of + * the characters in 'sep'. Allow ' or " to quote strings even if + * they contain 'sep'
No documentation of backslash handling. Are we trying to emulate shell parsing here (where depending on outer, "", or '', \ behaves differently)? Or are we trying to emulate C string parsing, where \n is translated to newline? What you have here does neither; although I didn't spot any flaw in the code, I don't know if it's the algorithm we want to be using.
+struct StringSplitData { + const char *src; + const char *sep; + const char **bits; +};
A point in your favor, for at least testing what you parse! If we change our mind to mimic shell or C parsing, then we'd have to update these tests.
+ const char *bits1[] = { "foo", "bar", NULL }; + DO_TEST_STRING("foo bar", " ", bits1); + DO_TEST_STRING("foo 'bar'", " ", bits1); + DO_TEST_STRING("foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"\n ", " \t\n\r", bits1); + + const char *bits2[] = { "foo", "bar wizz", "eek", NULL }; + DO_TEST_STRING("foo 'bar wizz' eek", " ", bits2); + DO_TEST_STRING("foo \"bar wizz\" eek", " ", bits2);
What about "foo bar\ wizz eek", if \ can escape outside of quotes?
+ + const char *bits3[] = { "foo", "'bar' \"wizz\"", "eek", NULL };
It would also be nice to have a literal backslash in the expected results, to prove that we can properly escape them. Overall, I like the idea of the new function, but I'm worried that introducing yet another parser could hurt us (users will be asking "now which escape style is in effect here, and how does it differ from standardized escape styles that I'm used to?"). -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Wed, Mar 14, 2012 at 10:40:41AM -0600, Eric Blake wrote:
On 03/14/2012 09:19 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange" <berrange@redhat.com>
To facilitate parsing of argv[] style strings, provide a virStrSplitQuoted API which will split a string on the listed separators, but also allow for quoting with ' or ".
* src/libvirt_private.syms, src/util/util.c, src/util/util.h: Implement virStrSplitQuoted * tests/utiltest.c: Some tests for virStrSplitQuoted --- src/libvirt_private.syms | 1 + src/util/util.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 2 + tests/utiltest.c | 89 ++++++++++++++++++++++++++++++++-- 4 files changed, 203 insertions(+), 6 deletions(-)
This looks like it is repeating some of the code in virsh.c:vshCommandStringGetArg; any chance we can combine them? In particular, the ability to mimic shell handling of \ escapes, as well as the difference in behavior of \ inside "" vs. '', seems like it will come in handy.
Ah I had forgotten about that code. Can you clarify the difference in \ handling. I guess you mean that inside '', \ can only be used for \\ and \', while inside "", it can do all the standard shell escapes like \t, \n, etc ?
+ +static char * +virStrDupUnescape(const char *src, size_t len) +{ + char *ret; + size_t i, j; + bool escape = false; + + if (VIR_ALLOC_N(ret, len + 1) < 0) + return NULL; + + for (i = 0, j = 0 ; i < len ; i++) { + if (escape) { + escape = false; + ret[j++] = src[i]; + } else if (src[i] == '\\') { + escape = true; + } else { + ret[j++] = src[i]; + }
So this version only strips backslash. Looks okay in isolation.
+/** + * virStrSplitQuoted: + * @src: string to split + * @sep: list of separator characters + * + * Split the string 'src' into chunks, separated by one or more of + * the characters in 'sep'. Allow ' or " to quote strings even if + * they contain 'sep'
No documentation of backslash handling. Are we trying to emulate shell parsing here (where depending on outer, "", or '', \ behaves differently)?
Or are we trying to emulate C string parsing, where \n is translated to newline?
What you have here does neither; although I didn't spot any flaw in the code, I don't know if it's the algorithm we want to be using.
I should have sent this paired with my other patch for <cmdline> handling in LXC. That is the intended use case for this function. I'm not sure that anyone has ever clearly defined what escaping syntax is used for /proc/cmdline (which is what <cmdline> is representing. SystemD's parser is what I modelled my code on, though it in fact does not unescape anything. eg. if parsing foo "bar \" wizz" eek systemd seems to return foo bar \" wizz eek while I return foo bar " wizz eek
+struct StringSplitData { + const char *src; + const char *sep; + const char **bits; +};
A point in your favor, for at least testing what you parse! If we change our mind to mimic shell or C parsing, then we'd have to update these tests.
Yes, escaping rules blow my mind unless I can test them :-)
+ const char *bits1[] = { "foo", "bar", NULL }; + DO_TEST_STRING("foo bar", " ", bits1); + DO_TEST_STRING("foo 'bar'", " ", bits1); + DO_TEST_STRING("foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"", " ", bits1); + DO_TEST_STRING(" foo \"bar\"\n ", " \t\n\r", bits1); + + const char *bits2[] = { "foo", "bar wizz", "eek", NULL }; + DO_TEST_STRING("foo 'bar wizz' eek", " ", bits2); + DO_TEST_STRING("foo \"bar wizz\" eek", " ", bits2);
What about "foo bar\ wizz eek", if \ can escape outside of quotes?
Hmm, possibly
+ + const char *bits3[] = { "foo", "'bar' \"wizz\"", "eek", NULL };
It would also be nice to have a literal backslash in the expected results, to prove that we can properly escape them.
Good point.
Overall, I like the idea of the new function, but I'm worried that introducing yet another parser could hurt us (users will be asking "now which escape style is in effect here, and how does it differ from standardized escape styles that I'm used to?").
I think that perhaps we should have virStrSplitQuoted just return the split pieces, with *no* unescaping. And then have separate functions to escape/unescape individual string pieces after the fact. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 03/14/2012 10:50 AM, Daniel P. Berrange wrote:
This looks like it is repeating some of the code in virsh.c:vshCommandStringGetArg; any chance we can combine them? In particular, the ability to mimic shell handling of \ escapes, as well as the difference in behavior of \ inside "" vs. '', seems like it will come in handy.
Ah I had forgotten about that code. Can you clarify the difference in \ handling. I guess you mean that inside '', \ can only be used for \\ and \', while inside "", it can do all the standard shell escapes like \t, \n, etc ?
Inside '', \ has no special meaning. (There's no way to escape ' inside of '). Inside "", \ escapes the next byte (important for " and \, but works on any byte. Outside quotes, \ escapes the next byte. And most importantly: concatenating two forms of quoted materials is permitted. Thus: foo"b'ar\" "\'' blah\' on input becomes this on output: foob'ar" ' blah\
What you have here does neither; although I didn't spot any flaw in the code, I don't know if it's the algorithm we want to be using.
I should have sent this paired with my other patch for <cmdline> handling in LXC. That is the intended use case for this function.
I'm not sure that anyone has ever clearly defined what escaping syntax is used for /proc/cmdline (which is what <cmdline> is representing.
Hmm, good point. If it is /bin/sh doing the parsing, then we know the rules (and virsh matches them); but it if is a custom parser in the kernel, the we have to match that custom parser.
A point in your favor, for at least testing what you parse! If we change our mind to mimic shell or C parsing, then we'd have to update these tests.
Yes, escaping rules blow my mind unless I can test them :-)
You're not the only one - I added loads of tests when I added shell parsing to virsh.
Overall, I like the idea of the new function, but I'm worried that introducing yet another parser could hurt us (users will be asking "now which escape style is in effect here, and how does it differ from standardized escape styles that I'm used to?").
I think that perhaps we should have virStrSplitQuoted just return the split pieces, with *no* unescaping. And then have separate functions to escape/unescape individual string pieces after the fact.
Except that you then have to know what quoting styles were in use to know whether \ was a literal or an escape. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Wed, Mar 14, 2012 at 11:10:24AM -0600, Eric Blake wrote:
On 03/14/2012 10:50 AM, Daniel P. Berrange wrote:
This looks like it is repeating some of the code in virsh.c:vshCommandStringGetArg; any chance we can combine them? In particular, the ability to mimic shell handling of \ escapes, as well as the difference in behavior of \ inside "" vs. '', seems like it will come in handy.
Ah I had forgotten about that code. Can you clarify the difference in \ handling. I guess you mean that inside '', \ can only be used for \\ and \', while inside "", it can do all the standard shell escapes like \t, \n, etc ?
Inside '', \ has no special meaning. (There's no way to escape ' inside of ').
Inside "", \ escapes the next byte (important for " and \, but works on any byte.
Outside quotes, \ escapes the next byte.
And most importantly: concatenating two forms of quoted materials is permitted. Thus:
foo"b'ar\" "\'' blah\'
on input becomes this on output:
foob'ar" ' blah\
What you have here does neither; although I didn't spot any flaw in the code, I don't know if it's the algorithm we want to be using.
I should have sent this paired with my other patch for <cmdline> handling in LXC. That is the intended use case for this function.
I'm not sure that anyone has ever clearly defined what escaping syntax is used for /proc/cmdline (which is what <cmdline> is representing.
Hmm, good point. If it is /bin/sh doing the parsing, then we know the rules (and virsh matches them); but it if is a custom parser in the kernel, the we have to match that custom parser.
Hmm, well actually there are multiple consumers of /proc/cmdline data. - kernel - init - userspace (eg anaconda) What is the betting that everyone has their own rules :-) For LXC's purposes though, we don't care about the kernel, only init. So I guess I should look at what upstart/sysvinit/systemd do for parsing rules Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
participants (2)
-
Daniel P. Berrange
-
Eric Blake