diff options
author | Kurtis Rader <krader@skepticism.us> | 2016-03-10 18:17:39 -0800 |
---|---|---|
committer | Kurtis Rader <krader@skepticism.us> | 2016-03-20 18:47:38 -0700 |
commit | c2f1df1d4af0c7e633528cb4c8caa79ef04b0b5a (patch) | |
tree | 0776e975779488cb842c09a5d79d193cb7cf9fdc /src/common.cpp | |
parent | fb0921249f4584e68699e336be249a655b9c8ede (diff) |
fix handling of non-ASCII chars in C locale
The relevant standards allow the mbtowc/mbrtowc functions to reject
non-ASCII characters (i.e., chars with the high bit set) when the locale
is C or POSIX. The BSD libraries (e.g., on OS X) don't do this, but
the GNU libraries (e.g., on Linux) do. Like most programs, we need the
C/POSIX locales to allow arbitrary bytes. So explicitly check whether we're
in a single-byte locale (which also includes the ISO-8859 variants)
and simply pass the chars through without encoding or decoding.
Fixes #2802.
Diffstat (limited to 'src/common.cpp')
-rw-r--r-- | src/common.cpp | 71 |
1 files changed, 49 insertions, 22 deletions
diff --git a/src/common.cpp b/src/common.cpp index 2aa76cc5..a796baca 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f) { errno=0; - c = getwc(f); - + c = fgetwc(f); if (errno == EILSEQ || errno == EINTR) { continue; @@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) wcstring result; result.reserve(in_len); - mbstate_t state = {}; size_t in_pos = 0; + + if (MB_CUR_MAX == 1) // single-byte locale, all values are legal + { + while (in_pos < in_len) + { + result.push_back((unsigned char)in[in_pos]); + in_pos++; + } + return result; + } + + mbstate_t state = {}; while (in_pos < in_len) { wchar_t wc = 0; @@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) { use_encode_direct = true; } - else if (ret == (size_t)(-2)) + else if (ret == (size_t)-2) { /* Incomplete sequence */ use_encode_direct = true; } - else if (ret == (size_t)(-1)) + else if (ret == (size_t)-1) { /* Invalid data */ use_encode_direct = true; @@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input) std::string result; result.reserve(input.size()); - mbstate_t state; - memset(&state, 0, sizeof(state)); - + mbstate_t state = {}; char converted[MB_LEN_MAX + 1]; for (size_t i=0; i < input.size(); i++) @@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input) wchar_t wc = input[i]; if (wc == INTERNAL_SEPARATOR) { + // Do nothing. } - else if ((wc >= ENCODE_DIRECT_BASE) && - (wc < ENCODE_DIRECT_BASE+256)) + else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) { result.push_back(wc - ENCODE_DIRECT_BASE); } + else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. 
+ if (wc & ~0xFF) + { + wc = '?'; + } + converted[0] = wc; + result.append(converted, 1); + } else { memset(converted, 0, sizeof converted); @@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input) */ static char *wcs2str_internal(const wchar_t *in, char *out) { - size_t res=0; - size_t in_pos=0; - size_t out_pos = 0; - mbstate_t state; - CHECK(in, 0); CHECK(out, 0); - memset(&state, 0, sizeof(state)); + size_t in_pos = 0; + size_t out_pos = 0; + mbstate_t state = {}; while (in[in_pos]) { if (in[in_pos] == INTERNAL_SEPARATOR) { + // Do nothing. } - else if ((in[in_pos] >= ENCODE_DIRECT_BASE) && - (in[in_pos] < ENCODE_DIRECT_BASE+256)) + else if (in[in_pos] >= ENCODE_DIRECT_BASE && + in[in_pos] < ENCODE_DIRECT_BASE + 256) { out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE; } + else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (in[in_pos] & ~0xFF) + { + out[out_pos++] = '?'; + } + else + { + out[out_pos++] = (unsigned char)in[in_pos]; + } + } else { - res = wcrtomb(&out[out_pos], in[in_pos], &state); - - if (res == (size_t)(-1)) + size_t len = wcrtomb(&out[out_pos], in[in_pos], &state); + if (len == (size_t)-1) { debug(1, L"Wide character %d has no narrow representation", in[in_pos]); memset(&state, 0, sizeof(state)); } else { - out_pos += res; + out_pos += len; } } in_pos++; |