Rewrite str2wcstring to properly handle embedded nulls, and be simpler

author: ridiculousfish <corydoras@ridiculousfish.com> 2012-12-20 12:25:35 -0800
committer: ridiculousfish <corydoras@ridiculousfish.com> 2012-12-20 12:25:35 -0800
commit: ce15abd577248ced70536d7091b8838b3b765f56 (patch)
tree: c539ea52a6cff0c532cdc8e507c9f4377a71731b
parent: d5af389d2eb16aca2fa27c2b30747a5d73a5eac3 (diff)
3 files changed, 71 insertions, 89 deletions
diff --git a/common.cpp b/common.cpp
index d1db95ed..83167605 100644
--- a/common.cpp
+++ b/common.cpp
@@ -81,8 +81,6 @@ parts of fish.
 #include "fallback.cpp"
 
 
-static wchar_t *str2wcs_internal(const char *in, const size_t in_len, wchar_t *out);
-
 struct termios shell_modes;
 
 // Note we foolishly assume that pthread_t is just a primitive. But it might be a struct.
@@ -164,45 +162,6 @@ int fgetws2(wcstring *s, FILE *f)
     }
 }
 
-static wchar_t *str2wcs(const char *in)
-{
-    size_t len = strlen(in);
-    wchar_t *out = (wchar_t *)malloc(sizeof(wchar_t)*(len+1));
-    if (!out)
-    {
-        DIE_MEM();
-    }
-
-    return str2wcs_internal(in, strlen(in), out);
-}
-
-wcstring str2wcstring(const char *in, size_t len)
-{
-    assert(in != NULL);
-    std::string tmp_str(in, len);
-    wchar_t *tmp = str2wcs(tmp_str.c_str());
-    wcstring result = tmp;
-    free(tmp);
-    return result;
-}
-
-wcstring str2wcstring(const char *in)
-{
-    assert(in != NULL);
-    wchar_t *tmp = str2wcs(in);
-    wcstring result = tmp;
-    free(tmp);
-    return result;
-}
-
-wcstring str2wcstring(const std::string &in)
-{
-    wchar_t *tmp = str2wcs(in.c_str());
-    wcstring result = tmp;
-    free(tmp);
-    return result;
-}
-
 /**
    Converts the narrow character string \c in into it's wide
    equivalent, stored in \c out. \c out must have enough space to fit
@@ -213,63 +172,87 @@ wcstring str2wcstring(const std::string &in)
    This function encodes illegal character sequences in a reversible
    way using the private use area.
 */
-static wchar_t *str2wcs_internal(const char *in, const size_t in_len, wchar_t *out)
-{
-    size_t res=0;
-    size_t in_pos=0;
-    size_t out_pos = 0;
-    mbstate_t state;
 
-    CHECK(in, 0);
-    CHECK(out, 0);
+static wcstring str2wcs_internal(const char *in, const size_t in_len)
+{
+    if (in_len == 0)
+        return wcstring();
 
-    memset(&state, 0, sizeof(state));
+    assert(in != NULL);
 
-    while (in[in_pos])
+    wcstring result;
+    result.reserve(in_len);
+    mbstate_t state = {};
+    size_t in_pos = 0;
+    while (in_pos < in_len)
     {
-        res = mbrtowc(&out[out_pos], &in[in_pos], in_len-in_pos, &state);
+        wchar_t wc = 0;
+        size_t ret = mbrtowc(&wc, &in[in_pos], in_len-in_pos, &state);
 
-        if (((out[out_pos] >= ENCODE_DIRECT_BASE) &&
-                (out[out_pos] < ENCODE_DIRECT_BASE+256)) ||
-                (out[out_pos] == INTERNAL_SEPARATOR))
+        /* Determine whether to encode this character with our crazy scheme */
+        bool use_encode_direct = false;
+        if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE+256)
         {
-            out[out_pos] = ENCODE_DIRECT_BASE + (unsigned char)in[in_pos];
+            use_encode_direct = true;
+        }
+        else if (wc == INTERNAL_SEPARATOR)
+        {
+            use_encode_direct = true;
+        }
+        else if (ret == (size_t)(-2))
+        {
+            /* Incomplete sequence */
+            use_encode_direct = true;
+        }
+        else if (ret == (size_t)(-1))
+        {
+            /* Invalid data */
+            use_encode_direct = true;
+        }
+        else if (ret > in_len - in_pos)
+        {
+            /* Other error codes? Terrifying, should never happen */
+            use_encode_direct = true;
+        }
+
+        if (use_encode_direct)
+        {
+            wc = ENCODE_DIRECT_BASE + (unsigned char)in[in_pos];
+            result.push_back(wc);
             in_pos++;
-            memset(&state, 0, sizeof(state));
-            out_pos++;
+            bzero(&state, sizeof state);
+        }
+        else if (ret == 0)
+        {
+            /* Embedded null byte! */
+            result.push_back(L'\0');
+            in_pos++;
+            bzero(&state, sizeof state);
         }
         else
         {
-
-            switch (res)
-            {
-                case (size_t)(-2):
-                case (size_t)(-1):
-                {
-                    out[out_pos] = ENCODE_DIRECT_BASE + (unsigned char)in[in_pos];
-                    in_pos++;
-                    memset(&state, 0, sizeof(state));
-                    break;
-                }
-
-                case 0:
-                {
-                    return out;
-                }
-
-                default:
-                {
-                    in_pos += res;
-                    break;
-                }
-            }
-            out_pos++;
+            /* Normal case */
+            result.push_back(wc);
+            in_pos += ret;
         }
-
     }
-    out[out_pos] = 0;
+    return result;
+}
 
-    return out;
+wcstring str2wcstring(const char *in, size_t len)
+{
+    return str2wcs_internal(in, len);
+}
+
+wcstring str2wcstring(const char *in)
+{
+    return str2wcs_internal(in, strlen(in));
+}
+
+wcstring str2wcstring(const std::string &in)
+{
+    /* Handles embedded nulls! */
+    return str2wcs_internal(in.data(), in.size());
 }
 
 char *wcs2str(const wchar_t *in)
diff --git a/exec.cpp b/exec.cpp
index 788e1fc7..cdae00af 100644
--- a/exec.cpp
+++ b/exec.cpp
@@ -1459,7 +1459,7 @@ static int exec_subshell_internal(const wcstring &cmd, wcstring_list_t *lst)
     proc_set_last_status(prev_status);
 
     is_subshell = prev_subshell;
-    
+
     if (lst != NULL)
     {
         const char *begin = io_buffer->out_buffer_ptr();
@@ -1478,12 +1478,12 @@ static int exec_subshell_internal(const wcstring &cmd, wcstring_list_t *lst)
             // Stop now points at the first character we do not want to copy)
             const wcstring wc = str2wcstring(cursor, stop - cursor);
             lst->push_back(wc);
-            
+
             // If we hit a separator, skip over it; otherwise we're at the end
             cursor = stop + (hit_separator ? 1 : 0);
         }
     }
-    
+
     io_buffer_destroy(io_buffer);
     return status;
 }
diff --git a/fish_tests.cpp b/fish_tests.cpp
index 9cc9e8f6..4a2280dd 100644
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@@ -284,7 +284,6 @@ static void test_convert()
 /* Verify correct behavior with embedded nulls */
 static void test_convert_nulls(void)
 {
-    return;
     say(L"Testing embedded nulls in string conversion");
     const wchar_t in[] = L"AAA\0BBB";
     const size_t in_len = (sizeof in / sizeof *in) - 1;
author	ridiculousfish <corydoras@ridiculousfish.com>	2012-12-20 12:25:35 -0800
committer	ridiculousfish <corydoras@ridiculousfish.com>	2012-12-20 12:25:35 -0800
commit	ce15abd577248ced70536d7091b8838b3b765f56 (patch)
tree	c539ea52a6cff0c532cdc8e507c9f4377a71731b
parent	d5af389d2eb16aca2fa27c2b30747a5d73a5eac3 (diff)