aboutsummaryrefslogtreecommitdiffhomepage
path: root/common.cpp
diff options
context:
space:
mode:
authorGravatar ridiculousfish <corydoras@ridiculousfish.com>2013-11-24 22:57:49 -0800
committerGravatar ridiculousfish <corydoras@ridiculousfish.com>2013-11-24 22:58:39 -0800
commit9f6223311e7ae6a9d6d21e33bf0fa67822da6fb6 (patch)
tree401878fd6bfdb121ec41e977af1defe6a001f6f0 /common.cpp
parent90b78326d393d2490e43491a93d8db4b449b972e (diff)
Large cleanup and refactoring of unescape() function.
Diffstat (limited to 'common.cpp')
-rw-r--r--common.cpp536
1 files changed, 529 insertions, 7 deletions
diff --git a/common.cpp b/common.cpp
index 7a9f7a51..c98792fc 100644
--- a/common.cpp
+++ b/common.cpp
@@ -72,6 +72,7 @@ parts of fish.
#include "util.cpp"
#include "fallback.cpp"
+#define NOT_A_WCHAR WEOF /* Sentinel meaning "no character": returned for an empty string and used by the unescape code to suppress appending */
struct termios shell_modes;
@@ -1125,6 +1126,513 @@ wcstring escape_string(const wcstring &in, escape_flags_t flags)
return result;
}
+/* Returns the final character of str, or NOT_A_WCHAR when str is empty */
+static wint_t string_last_char(const wcstring &str)
+{
+    if (str.empty())
+    {
+        return NOT_A_WCHAR;
+    }
+    return *str.rbegin();
+}
+
+/*
+   Given a null terminated string starting with a backslash, read the escape as if it is unquoted, appending the decoded character (if any) to result.
+
+   input: null terminated text whose first character must be a backslash.
+   result: string that receives the decoded character; when unescape_special is set and the escaped character has no special meaning, an INTERNAL_SEPARATOR is appended before it.
+   allow_incomplete: if true, a backslash followed directly by the terminating null is not an error (only the backslash is consumed and nothing is appended).
+   unescape_special: see above.
+
+   Returns the number of characters consumed (including the backslash), or 0 on error or if input does not start with a backslash.
+*/
+static size_t read_unquoted_escape(const wchar_t *input, wcstring *result, bool allow_incomplete, bool unescape_special)
+{
+    if (input[0] != L'\\')
+    {
+        // not an escape
+        return 0;
+    }
+
+    /* Here's the character we'll ultimately append, or NOT_A_WCHAR for none. Note that L'\0' is a valid thing to append.
+       This is a wint_t rather than a wchar_t because NOT_A_WCHAR is WEOF, which is only guaranteed to be representable in wint_t; storing it in a narrower wchar_t would make the "nothing to append" comparison below fail. */
+    wint_t result_char_or_none = NOT_A_WCHAR;
+
+    bool errored = false;
+    size_t in_pos = 1; //in_pos always tracks the next character to read (and therefore the number of characters read so far)
+    const wchar_t c = input[in_pos++];
+    switch (c)
+    {
+
+            /* A null character after a backslash is an error */
+        case L'\0':
+        {
+            /* Adjust in_pos to only include the backslash */
+            assert(in_pos > 0);
+            in_pos--;
+
+            /* It's an error, unless we're allowing incomplete escapes */
+            if (! allow_incomplete)
+                errored = true;
+            break;
+        }
+
+        /* Numeric escape sequences. No prefix means octal escape, otherwise hexadecimal. */
+        case L'0':
+        case L'1':
+        case L'2':
+        case L'3':
+        case L'4':
+        case L'5':
+        case L'6':
+        case L'7':
+        case L'u':
+        case L'U':
+        case L'x':
+        case L'X':
+        {
+            long long res=0;
+            size_t chars=2;
+            int base=16;
+
+            bool byte_literal = false;
+            wchar_t max_val = ASCII_MAX;
+
+            switch (c)
+            {
+                case L'u':
+                {
+                    chars=4;
+                    max_val = UCS2_MAX;
+                    break;
+                }
+
+                case L'U':
+                {
+                    chars=8;
+                    max_val = WCHAR_MAX;
+                    break;
+                }
+
+                case L'x':
+                {
+                    chars = 2;
+                    max_val = ASCII_MAX;
+                    break;
+                }
+
+                case L'X':
+                {
+                    /* Byte literal: the value is offset by ENCODE_DIRECT_BASE below */
+                    byte_literal = true;
+                    max_val = BYTE_MAX;
+                    break;
+                }
+
+                default:
+                {
+                    base=8;
+                    chars=3;
+                    // note that in_pos currently is just after the first post-backslash character; we want to start our escape from there
+                    assert(in_pos > 0);
+                    in_pos--;
+                    break;
+                }
+            }
+
+            /* Read up to 'chars' digits, stopping at the first non-digit.
+               NOTE(review): if no digit at all follows \x, \X, \u or \U the loop consumes nothing and the value 0 is used - confirm this is intended. */
+            for (size_t i=0; i<chars; i++)
+            {
+                long d = convert_digit(input[in_pos],base);
+                if (d < 0)
+                {
+                    break;
+                }
+
+                res=(res*base)+d;
+                in_pos++;
+            }
+
+            if (res <= max_val)
+            {
+                result_char_or_none = (wchar_t)((byte_literal ? ENCODE_DIRECT_BASE : 0)+res);
+            }
+            else
+            {
+                errored = true;
+            }
+
+            break;
+        }
+
+        /* \a means bell (alert) */
+        case L'a':
+        {
+            result_char_or_none = L'\a';
+            break;
+        }
+
+        /* \b means backspace */
+        case L'b':
+        {
+            result_char_or_none = L'\b';
+            break;
+        }
+
+        /* \cX means control sequence X */
+        case L'c':
+        {
+            const wchar_t sequence_char = input[in_pos++];
+            if (sequence_char >= L'a' && sequence_char <= (L'a'+32))
+            {
+                result_char_or_none = sequence_char-L'a'+1;
+            }
+            else if (sequence_char >= L'A' && sequence_char <= (L'A'+32))
+            {
+                result_char_or_none = sequence_char-L'A'+1;
+            }
+            else
+            {
+                errored = true;
+            }
+            break;
+        }
+
+        /* \e means escape (\x1b) */
+        case L'e':
+        {
+            result_char_or_none = L'\x1b';
+            break;
+        }
+
+        /* \f means form feed */
+        case L'f':
+        {
+            result_char_or_none = L'\f';
+            break;
+        }
+
+        /* \n means newline */
+        case L'n':
+        {
+            result_char_or_none = L'\n';
+            break;
+        }
+
+        /* \r means carriage return */
+        case L'r':
+        {
+            result_char_or_none = L'\r';
+            break;
+        }
+
+        /* \t means tab */
+        case L't':
+        {
+            result_char_or_none = L'\t';
+            break;
+        }
+
+        /* \v means vertical tab */
+        case L'v':
+        {
+            result_char_or_none = L'\v';
+            break;
+        }
+
+        /* If a backslash is followed by an actual newline, swallow them both */
+        case L'\n':
+        {
+            result_char_or_none = NOT_A_WCHAR;
+            break;
+        }
+
+        /* Any other character is taken literally, preceded by INTERNAL_SEPARATOR when unescaping specials */
+        default:
+        {
+            if (unescape_special)
+                result->push_back(INTERNAL_SEPARATOR);
+            result_char_or_none = c;
+            break;
+        }
+    }
+
+    if (! errored && result_char_or_none != NOT_A_WCHAR)
+    {
+        result->push_back((wchar_t)result_char_or_none);
+    }
+    return errored ? 0 : in_pos;
+}
+
+/*
+   Returns the unescaped version of input into output_str (by reference).
+
+   input: the text to unescape; it must be null terminated, because the quoted modes look one character past the current position.
+   input_len: number of characters of input to process.
+   output_str: receives the result on success (via swap); it is not modified on failure.
+   flags: UNESCAPE_SPECIAL turns wildcards, variables, brackets, leading ~ and % and quote boundaries into their internal marker characters; UNESCAPE_INCOMPLETE tolerates a trailing backslash instead of reporting an error.
+
+   Returns true if successful, false if an invalid escape was encountered.
+*/
+static bool unescape_string_internal(const wchar_t * const input, const size_t input_len, wcstring *output_str, unescape_flags_t flags)
+{
+    /* Set up result string, which we'll swap with the output on success */
+    wcstring result;
+    result.reserve(input_len);
+
+    const bool unescape_special = !!(flags & UNESCAPE_SPECIAL);
+    const bool allow_incomplete = !!(flags & UNESCAPE_INCOMPLETE);
+
+    /* Nesting depth of { } seen so far; commas only act as separators while this is positive */
+    int bracket_count = 0;
+
+    bool errored = false;
+    enum
+    {
+        mode_unquoted,
+        mode_single_quotes,
+        mode_double_quotes
+    } mode = mode_unquoted;
+
+    for (size_t input_position = 0; input_position < input_len && ! errored; input_position++)
+    {
+        const wchar_t c = input[input_position];
+        /* Here's the character we'll append to result, or NOT_A_WCHAR to suppress it.
+           It is a wint_t because NOT_A_WCHAR is WEOF, which is only guaranteed to be representable in wint_t. */
+        wint_t to_append_or_none = c;
+        if (mode == mode_unquoted)
+        {
+
+            switch (c)
+            {
+                case L'\\':
+                {
+                    /* Backslashes (escapes) are complicated and may result in errors, or appending INTERNAL_SEPARATORs, so we have to handle them specially */
+                    size_t escape_chars = read_unquoted_escape(input + input_position, &result, allow_incomplete, unescape_special);
+                    if (escape_chars == 0)
+                    {
+                        /* A 0 return indicates an error */
+                        errored = true;
+                    }
+                    else
+                    {
+                        /* Skip over the characters we read, minus one because the outer loop will increment it */
+                        assert(escape_chars > 0);
+                        input_position += escape_chars - 1;
+                    }
+                    /* We've already appended, don't append anything else */
+                    to_append_or_none = NOT_A_WCHAR;
+                    break;
+                }
+
+                case L'~':
+                {
+                    /* Only special as the very first character */
+                    if (unescape_special && (input_position == 0))
+                    {
+                        to_append_or_none = HOME_DIRECTORY;
+                    }
+                    break;
+                }
+
+                case L'%':
+                {
+                    /* Only special as the very first character */
+                    if (unescape_special && (input_position == 0))
+                    {
+                        to_append_or_none = PROCESS_EXPAND;
+                    }
+                    break;
+                }
+
+                case L'*':
+                {
+                    if (unescape_special)
+                    {
+                        /* In general, this is ANY_STRING. But as a hack, if the last appended char is ANY_STRING, delete the last char and store ANY_STRING_RECURSIVE to reflect the fact that ** is the recursive wildcard. */
+                        if (string_last_char(result) == ANY_STRING)
+                        {
+                            assert(result.size() > 0);
+                            result.resize(result.size() - 1);
+                            to_append_or_none = ANY_STRING_RECURSIVE;
+                        }
+                        else
+                        {
+                            to_append_or_none = ANY_STRING;
+                        }
+                    }
+                    break;
+                }
+
+                case L'?':
+                {
+                    if (unescape_special)
+                    {
+                        to_append_or_none = ANY_CHAR;
+                    }
+                    break;
+                }
+
+                case L'$':
+                {
+                    if (unescape_special)
+                    {
+                        to_append_or_none = VARIABLE_EXPAND;
+                    }
+                    break;
+                }
+
+                case L'{':
+                {
+                    if (unescape_special)
+                    {
+                        bracket_count++;
+                        to_append_or_none = BRACKET_BEGIN;
+                    }
+                    break;
+                }
+
+                case L'}':
+                {
+                    if (unescape_special)
+                    {
+                        bracket_count--;
+                        to_append_or_none = BRACKET_END;
+                    }
+                    break;
+                }
+
+                case L',':
+                {
+                    /* If the last character was a separator, then treat this as a literal comma */
+                    if (unescape_special && bracket_count > 0 && string_last_char(result) != BRACKET_SEP)
+                    {
+                        to_append_or_none = BRACKET_SEP;
+                    }
+                    break;
+                }
+
+                case L'\'':
+                {
+                    mode = mode_single_quotes;
+                    to_append_or_none = unescape_special ? INTERNAL_SEPARATOR : NOT_A_WCHAR;
+                    break;
+                }
+
+                case L'\"':
+                {
+                    mode = mode_double_quotes;
+                    to_append_or_none = unescape_special ? INTERNAL_SEPARATOR : NOT_A_WCHAR;
+                    break;
+                }
+            }
+        }
+        else if (mode == mode_single_quotes)
+        {
+            if (c == L'\\')
+            {
+                /* A backslash may or may not escape something in single quotes */
+                switch (input[input_position + 1])
+                {
+                    case '\\':
+                    case L'\'':
+                    {
+                        /* Append the escaped character and skip over it; the backslash itself is dropped */
+                        to_append_or_none = input[input_position + 1];
+                        input_position += 1;
+                        break;
+                    }
+
+                    case L'\0':
+                    {
+                        if (!allow_incomplete)
+                        {
+                            errored = true;
+                        }
+                        else
+                        {
+                            /* Trailing backslash in an incomplete string: keep the backslash and step onto the terminator, which ends the loop */
+                            input_position += 1;
+                            to_append_or_none = L'\\';
+                        }
+                    }
+                    break;
+
+                    default:
+                    {
+                        /* Literal backslash that doesn't escape anything! Leave things alone; we'll append the backslash itself */
+                        break;
+                    }
+                }
+            }
+            else if (c == L'\'')
+            {
+                to_append_or_none = unescape_special ? INTERNAL_SEPARATOR : NOT_A_WCHAR;
+                mode = mode_unquoted;
+            }
+        }
+        else if (mode == mode_double_quotes)
+        {
+            switch (c)
+            {
+                case L'"':
+                {
+                    mode = mode_unquoted;
+                    to_append_or_none = unescape_special ? INTERNAL_SEPARATOR : NOT_A_WCHAR;
+                    break;
+                }
+
+                case '\\':
+                {
+                    switch (input[input_position + 1])
+                    {
+                        case L'\0':
+                        {
+                            if (!allow_incomplete)
+                            {
+                                errored = true;
+                            }
+                            else
+                            {
+                                /* NOTE(review): this appends a null character, whereas the single quote case above appends the backslash - confirm which is intended */
+                                to_append_or_none = L'\0';
+                            }
+                        }
+                        break;
+
+                        case '\\':
+                        case L'$':
+                        case '"':
+                        {
+                            /* Append the escaped character and skip over it; the backslash itself is dropped */
+                            to_append_or_none = input[input_position + 1];
+                            input_position += 1;
+                            break;
+                        }
+
+                        case '\n':
+                        {
+                            /* Swallow both the backslash and the newline. The newline must be skipped explicitly, otherwise the next iteration would append it. */
+                            to_append_or_none = NOT_A_WCHAR;
+                            input_position += 1;
+                            break;
+                        }
+
+                        default:
+                        {
+                            /* Literal backslash that doesn't escape anything! Leave things alone; we'll append the backslash itself */
+                            break;
+                        }
+                    }
+                    break;
+                }
+
+                case '$':
+                {
+                    if (unescape_special)
+                    {
+                        to_append_or_none = VARIABLE_EXPAND_SINGLE;
+                    }
+                    break;
+                }
+
+            }
+        }
+
+        /* Now maybe append the char */
+        if (to_append_or_none != NOT_A_WCHAR)
+        {
+            result.push_back((wchar_t)to_append_or_none);
+        }
+    }
+
+    /* Return the string by reference, and then success */
+    if (! errored)
+    {
+        output_str->swap(result);
+    }
+    return ! errored;
+}
+
wchar_t *unescape(const wchar_t * orig, int flags)
{
int out_pos;
@@ -1681,19 +2189,33 @@ wchar_t *unescape(const wchar_t * orig, int flags)
return in;
}
-bool unescape_string(wcstring &str, int escape_special)
+bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special) /* Unescapes *str in place. str must not be NULL. Returns true on success; on failure returns false and leaves *str unchanged. */
{
- bool success = false;
- wchar_t *result = unescape(str.c_str(), escape_special);
- if (result)
+ assert(str != NULL);
+ wcstring output;
+ bool success = unescape_string_internal(str->c_str(), str->size(), &output, escape_special); /* unescape into a temporary so *str is only touched on success */
+ if (success)
{
- str.replace(str.begin(), str.end(), result);
- free(result);
- success = true;
+ str->swap(output); /* adopt the unescaped text */
}
return success;
}
+/* Unescapes the null terminated string input into *output. Returns true on success; on failure returns false and *output is emptied. */
+bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special)
+{
+    if (unescape_string_internal(input, wcslen(input), output, escape_special))
+    {
+        return true;
+    }
+
+    /* Don't hand back stale contents on failure */
+    output->clear();
+    return false;
+}
+
+/* Unescapes input into *output. Returns true on success; on failure returns false and *output is emptied. */
+bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special)
+{
+    const bool unescaped_ok = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
+    if (unescaped_ok)
+    {
+        return true;
+    }
+
+    /* Don't hand back stale contents on failure */
+    output->clear();
+    return false;
+}
void common_handle_winch(int signal)