fix handling of non-ASCII chars in C locale

The relevant standards allow the mbtowc/mbrtowc functions to reject non-ASCII characters (i.e., chars with the high bit set) when the locale is C or POSIX. The BSD libraries (e.g., on OS X) don't do this but the GNU libraries (e.g., on Linux) do. Like most programs we need the C/POSIX locales to allow arbitrary bytes. So we explicitly check whether we're in a single-byte locale (which would also include ISO-8859 variants) and, if so, simply pass the chars through without encoding or decoding. Fixes #2802.
author: Kurtis Rader <krader@skepticism.us> 2016-03-10 18:17:39 -0800
committer: Kurtis Rader <krader@skepticism.us> 2016-03-20 18:47:38 -0700
commit: c2f1df1d4af0c7e633528cb4c8caa79ef04b0b5a (patch)
tree: 0776e975779488cb842c09a5d79d193cb7cf9fdc /src/common.cpp
parent: fb0921249f4584e68699e336be249a655b9c8ede (diff)
1 files changed, 49 insertions, 22 deletions
diff --git a/src/common.cpp b/src/common.cpp
index 2aa76cc5..a796baca 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f)
     {
         errno=0;
 
-        c = getwc(f);
-
+        c = fgetwc(f);
         if (errno == EILSEQ || errno == EINTR)
         {
             continue;
@@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
 
     wcstring result;
     result.reserve(in_len);
-    mbstate_t state = {};
     size_t in_pos = 0;
+
+    if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
+    {
+        while (in_pos < in_len)
+        {
+            result.push_back((unsigned char)in[in_pos]);
+            in_pos++;
+        }
+        return result;
+    }
+
+    mbstate_t state = {};
     while (in_pos < in_len)
     {
         wchar_t wc = 0;
@@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
         {
             use_encode_direct = true;
         }
-        else if (ret == (size_t)(-2))
+        else if (ret == (size_t)-2)
         {
             /* Incomplete sequence */
             use_encode_direct = true;
         }
-        else if (ret == (size_t)(-1))
+        else if (ret == (size_t)-1)
         {
             /* Invalid data */
             use_encode_direct = true;
@@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input)
     std::string result;
     result.reserve(input.size());
 
-    mbstate_t state;
-    memset(&state, 0, sizeof(state));
-
+    mbstate_t state = {};
     char converted[MB_LEN_MAX + 1];
 
     for (size_t i=0; i < input.size(); i++)
@@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input)
         wchar_t wc = input[i];
         if (wc == INTERNAL_SEPARATOR)
         {
+            // Do nothing.
         }
-        else if ((wc >= ENCODE_DIRECT_BASE) &&
-                 (wc < ENCODE_DIRECT_BASE+256))
+        else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256)
         {
             result.push_back(wc - ENCODE_DIRECT_BASE);
         }
+        else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+        {
+            // If `wc` contains a wide character we emit a question-mark.
+            if (wc & ~0xFF)
+            {
+                wc = '?';
+            }
+            converted[0] = wc;
+            result.append(converted, 1);
+        }
         else
         {
             memset(converted, 0, sizeof converted);
@@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input)
 */
 static char *wcs2str_internal(const wchar_t *in, char *out)
 {
-    size_t res=0;
-    size_t in_pos=0;
-    size_t out_pos = 0;
-    mbstate_t state;
-
     CHECK(in, 0);
     CHECK(out, 0);
 
-    memset(&state, 0, sizeof(state));
+    size_t in_pos = 0;
+    size_t out_pos = 0;
+    mbstate_t state = {};
 
     while (in[in_pos])
     {
         if (in[in_pos] == INTERNAL_SEPARATOR)
         {
+            // Do nothing.
         }
-        else if ((in[in_pos] >= ENCODE_DIRECT_BASE) &&
-                 (in[in_pos] < ENCODE_DIRECT_BASE+256))
+        else if (in[in_pos] >= ENCODE_DIRECT_BASE &&
+                 in[in_pos] < ENCODE_DIRECT_BASE + 256)
         {
             out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE;
         }
+        else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+        {
+            // If `in[in_pos]` does not fit in a single byte we emit a question-mark.
+            if (in[in_pos] & ~0xFF)
+            {
+                out[out_pos++] = '?';
+            }
+            else
+            {
+                out[out_pos++] = (unsigned char)in[in_pos];
+            }
+        }
         else
         {
-            res = wcrtomb(&out[out_pos], in[in_pos], &state);
-
-            if (res == (size_t)(-1))
+            size_t len = wcrtomb(&out[out_pos], in[in_pos], &state);
+            if (len == (size_t)-1)
             {
                 debug(1, L"Wide character %d has no narrow representation", in[in_pos]);
                 memset(&state, 0, sizeof(state));
             }
             else
             {
-                out_pos += res;
+                out_pos += len;
             }
         }
         in_pos++;
author	Kurtis Rader <krader@skepticism.us>	2016-03-10 18:17:39 -0800
committer	Kurtis Rader <krader@skepticism.us>	2016-03-20 18:47:38 -0700
commit	c2f1df1d4af0c7e633528cb4c8caa79ef04b0b5a (patch)
tree	0776e975779488cb842c09a5d79d193cb7cf9fdc /src/common.cpp
parent	fb0921249f4584e68699e336be249a655b9c8ede (diff)