diff options
author | Kurtis Rader <krader@skepticism.us> | 2016-03-10 18:17:39 -0800 |
---|---|---|
committer | Kurtis Rader <krader@skepticism.us> | 2016-03-20 18:47:38 -0700 |
commit | c2f1df1d4af0c7e633528cb4c8caa79ef04b0b5a (patch) | |
tree | 0776e975779488cb842c09a5d79d193cb7cf9fdc | |
parent | fb0921249f4584e68699e336be249a655b9c8ede (diff) |
fix handling of non-ASCII chars in C locale
The relevant standards allow the mbtowc/mbrtowc functions to reject
non-ASCII characters (i.e., chars with the high bit set) when the locale
is C or POSIX. The BSD libraries (e.g., on OS X) don't do this but
the GNU libraries (e.g., on Linux) do. Like most programs we need the
C/POSIX locales to allow arbitrary bytes. So explicitly check if we're
in a single-byte locale (which would also include ISO-8859 variants)
and simply pass-thru the chars without encoding or decoding.
Fixes #2802.
-rw-r--r-- | src/builtin.cpp | 47 | ||||
-rw-r--r-- | src/common.cpp | 71 | ||||
-rw-r--r-- | src/env.cpp | 7 | ||||
-rw-r--r-- | src/fallback.cpp | 72 | ||||
-rw-r--r-- | src/fallback.h | 20 | ||||
-rw-r--r-- | src/history.cpp | 19 | ||||
-rw-r--r-- | src/input_common.cpp | 11 | ||||
-rw-r--r-- | src/output.cpp | 58 | ||||
-rw-r--r-- | src/wutil.cpp | 27 | ||||
-rw-r--r-- | src/wutil.h | 6 | ||||
-rw-r--r-- | tests/c-locale.err | 0 | ||||
-rw-r--r-- | tests/c-locale.in | 35 | ||||
-rw-r--r-- | tests/c-locale.out | 4 | ||||
-rw-r--r-- | tests/c-locale.status | 1 |
14 files changed, 214 insertions, 164 deletions
diff --git a/src/builtin.cpp b/src/builtin.cpp index 4597b18e..f4de72c7 100644 --- a/src/builtin.cpp +++ b/src/builtin.cpp @@ -1907,18 +1907,18 @@ static int builtin_echo(parser_t &parser, io_streams_t &streams, wchar_t **argv) return STATUS_BUILTIN_OK; } -/** The pwd builtin. We don't respect -P to resolve symbolic links because we try to always resolve them. */ +// The pwd builtin. We don't respect -P to resolve symbolic links because we +// try to always resolve them. static int builtin_pwd(parser_t &parser, io_streams_t &streams, wchar_t **argv) { - wchar_t dir_path[4096]; - wchar_t *res = wgetcwd(dir_path, 4096); - if (res == NULL) + wcstring res = wgetcwd(); + if (res.empty()) { return STATUS_BUILTIN_ERROR; } else { - streams.out.append(dir_path); + streams.out.append(res); streams.out.push_back(L'\n'); return STATUS_BUILTIN_OK; } @@ -2699,9 +2699,8 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv) while (1) { - int finished=0; - - wchar_t res=0; + int finished = 0; + wchar_t res = 0; mbstate_t state = {}; while (!finished) @@ -2713,24 +2712,26 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv) break; } - size_t sz = mbrtowc(&res, &b, 1, &state); - - switch (sz) + if (MB_CUR_MAX == 1) // single-byte locale { - case (size_t)(-1): - memset(&state, '\0', sizeof(state)); - break; - - case (size_t)(-2): - break; - case 0: - finished = 1; - break; + res = (unsigned char)b; + finished = 1; + } + else { + size_t sz = mbrtowc(&res, &b, 1, &state); + switch (sz) + { + case (size_t)-1: + memset(&state, 0, sizeof(state)); + break; - default: - finished=1; - break; + case (size_t)-2: + break; + default: + finished = 1; + break; + } } } diff --git a/src/common.cpp b/src/common.cpp index 2aa76cc5..a796baca 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f) { errno=0; - c = getwc(f); - + c = fgetwc(f); if (errno == EILSEQ || errno == EINTR) { continue; @@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) wcstring result; result.reserve(in_len); - mbstate_t state = {}; size_t in_pos = 0; + + if (MB_CUR_MAX == 1) // single-byte locale, all values are legal + { + while (in_pos < in_len) + { + result.push_back((unsigned char)in[in_pos]); + in_pos++; + } + return result; + } + + mbstate_t state = {}; while (in_pos < in_len) { wchar_t wc = 0; @@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) { use_encode_direct = true; } - else if (ret == (size_t)(-2)) + else if (ret == (size_t)-2) { /* Incomplete sequence */ use_encode_direct = true; } - else if (ret == (size_t)(-1)) + else if (ret == (size_t)-1) { /* Invalid data */ use_encode_direct = true; @@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input) std::string result; result.reserve(input.size()); - mbstate_t state; - memset(&state, 0, sizeof(state)); - + mbstate_t state = {}; char converted[MB_LEN_MAX + 1]; for (size_t i=0; i < input.size(); i++) @@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input) wchar_t wc = input[i]; if (wc == INTERNAL_SEPARATOR) { + // Do nothing. } - else if ((wc >= ENCODE_DIRECT_BASE) && - (wc < ENCODE_DIRECT_BASE+256)) + else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) { result.push_back(wc - ENCODE_DIRECT_BASE); } + else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (wc & ~0xFF) + { + wc = '?'; + } + converted[0] = wc; + result.append(converted, 1); + } else { memset(converted, 0, sizeof converted); @@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input) */ static char *wcs2str_internal(const wchar_t *in, char *out) { - size_t res=0; - size_t in_pos=0; - size_t out_pos = 0; - mbstate_t state; - CHECK(in, 0); CHECK(out, 0); - memset(&state, 0, sizeof(state)); + size_t in_pos = 0; + size_t out_pos = 0; + mbstate_t state = {}; while (in[in_pos]) { if (in[in_pos] == INTERNAL_SEPARATOR) { + // Do nothing. } - else if ((in[in_pos] >= ENCODE_DIRECT_BASE) && - (in[in_pos] < ENCODE_DIRECT_BASE+256)) + else if (in[in_pos] >= ENCODE_DIRECT_BASE && + in[in_pos] < ENCODE_DIRECT_BASE + 256) { out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE; } + else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (in[in_pos] & ~0xFF) + { + out[out_pos++] = '?'; + } + else + { + out[out_pos++] = (unsigned char)in[in_pos]; + } + } else { - res = wcrtomb(&out[out_pos], in[in_pos], &state); - - if (res == (size_t)(-1)) + size_t len = wcrtomb(&out[out_pos], in[in_pos], &state); + if (len == (size_t)-1) { debug(1, L"Wide character %d has no narrow representation", in[in_pos]); memset(&state, 0, sizeof(state)); } else { - out_pos += res; + out_pos += len; } } in_pos++; diff --git a/src/env.cpp b/src/env.cpp index c24280bc..56c9deb0 100644 --- a/src/env.cpp +++ b/src/env.cpp @@ -377,14 +377,13 @@ static void setup_path() int env_set_pwd() { - wchar_t dir_path[4096]; - wchar_t *res = wgetcwd(dir_path, 4096); - if (!res) + wcstring res = wgetcwd(); + if (res.empty()) { debug(0, _(L"Could not determine current working directory. Is your locale set correctly?")); return 0; } - env_set(L"PWD", dir_path, ENV_EXPORT | ENV_GLOBAL); + env_set(L"PWD", res.c_str(), ENV_EXPORT | ENV_GLOBAL); return 1; } diff --git a/src/fallback.cpp b/src/fallback.cpp index 7c5f84d2..ed29f5de 100644 --- a/src/fallback.cpp +++ b/src/fallback.cpp @@ -606,7 +606,7 @@ static FILE *fw_data; static void fw_writer(wchar_t c) { - putwc(c, fw_data); + fputwc(c, fw_data); } /* @@ -648,33 +648,30 @@ int wprintf(const wchar_t *filter, ...) #endif #ifndef HAVE_FGETWC - wint_t fgetwc(FILE *stream) { - wchar_t res=0; - mbstate_t state; - memset(&state, '\0', sizeof(state)); + wchar_t res; + mbstate_t state = {}; while (1) { int b = fgetc(stream); - char bb; - - int sz; - if (b == EOF) + { return WEOF; + } - bb=b; - - sz = mbrtowc(&res, &bb, 1, &state); + if (MB_CUR_MAX == 1) // single-byte locale, all values are legal + { + return b; + } + char bb = b; + size_t sz = mbrtowc(&res, &bb, 1, &state); switch (sz) { case -1: - memset(&state, '\0', sizeof(state)); return WEOF; - case -2: break; case 0: @@ -683,35 +680,40 @@ wint_t fgetwc(FILE *stream) return res; } } - -} - - -wint_t getwc(FILE *stream) -{ - return fgetwc(stream); } - - #endif #ifndef HAVE_FPUTWC - wint_t fputwc(wchar_t wc, FILE *stream) { - int res; - char s[MB_CUR_MAX+1]; - memset(s, 0, MB_CUR_MAX+1); - wctomb(s, wc); - res = fputs(s, stream); - return res==EOF?WEOF:wc; -} + int res = 0; + mbstate_t state = {}; + char s[MB_CUR_MAX + 1] = {}; -wint_t putwc(wchar_t wc, FILE *stream) -{ - return fputwc(wc, stream); -} + if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (wc & ~0xFF) + { + wc = '?'; + } + s[0] = (char)wc; + res = fputs(s, stream); + } + else + { + size_t len = wcrtomb(s, wc, &state); + if (len == (size_t)-1) + { + debug(1, L"Wide character %d has no narrow representation", wc); + } + else { + res = fputs(s, stream); + } + } + return res == EOF ? WEOF : wc; +} #endif #ifndef HAVE_WCSTOK diff --git a/src/fallback.h b/src/fallback.h index e3044449..f3252c14 100644 --- a/src/fallback.h +++ b/src/fallback.h @@ -158,29 +158,13 @@ int vswprintf(wchar_t *out, size_t n, const wchar_t *filter, va_list va); #endif #ifndef HAVE_FGETWC -/** - Fallback implementation of fgetwc -*/ +// Fallback implementation of fgetwc. wint_t fgetwc(FILE *stream); - -/** - Fallback implementation of getwc -*/ -wint_t getwc(FILE *stream); - #endif #ifndef HAVE_FPUTWC - -/** - Fallback implementation of fputwc -*/ +// Fallback implementation of fputwc. wint_t fputwc(wchar_t wc, FILE *stream); -/** - Fallback implementation of putwc -*/ -wint_t putwc(wchar_t wc, FILE *stream); - #endif #ifndef HAVE_WCSTOK diff --git a/src/history.cpp b/src/history.cpp index 75e42638..a37d2b77 100644 --- a/src/history.cpp +++ b/src/history.cpp @@ -926,10 +926,9 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length) { const char *end = begin + length; - const char *pos=begin; - - bool was_backslash = 0; + const char *pos = begin; wcstring out; + bool was_backslash = false; bool first_char = true; bool timestamp_mode = false; time_t timestamp = 0; @@ -937,12 +936,18 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length) while (1) { wchar_t c; - mbstate_t state; size_t res; + mbstate_t state = {}; - memset(&state, 0, sizeof(state)); - - res = mbrtowc(&c, pos, end-pos, &state); + if (MB_CUR_MAX == 1) // single-byte locale + { + c = (unsigned char)*pos; + res = 1; + } + else + { + res = mbrtowc(&c, pos, end - pos, &state); + } if (res == (size_t)-1) { diff --git a/src/input_common.cpp b/src/input_common.cpp index ebe95fce..7c018c2d 100644 --- a/src/input_common.cpp +++ b/src/input_common.cpp @@ -263,16 +263,17 @@ wchar_t input_common_readch(int timed) while (1) { wint_t b = readb(); - char bb; - size_t sz; + if (MB_CUR_MAX == 1) // single-byte locale, all values are legal + { + return (unsigned char)b; + } if ((b >= R_NULL) && (b < R_NULL + 1000)) return b; - bb=b; - - sz = mbrtowc(&res, &bb, 1, &state); + char bb = b; + size_t sz = mbrtowc(&res, &bb, 1, &state); switch (sz) { diff --git a/src/output.cpp b/src/output.cpp index 1707f4fb..dbd97eb4 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -386,32 +386,35 @@ int writeb(tputs_arg_t b) int writech(wint_t ch) { - mbstate_t state; - size_t i; char buff[MB_LEN_MAX+1]; - size_t bytes; + size_t len; - if ((ch >= ENCODE_DIRECT_BASE) && - (ch < ENCODE_DIRECT_BASE+256)) + if (ch >= ENCODE_DIRECT_BASE && ch < ENCODE_DIRECT_BASE + 256) { buff[0] = ch - ENCODE_DIRECT_BASE; - bytes=1; + len = 1; + } + else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (ch & ~0xFF) + { + ch = '?'; + } + buff[0] = ch; + len = 1; } else { - memset(&state, 0, sizeof(state)); - bytes= wcrtomb(buff, ch, &state); - - switch (bytes) + mbstate_t state = {}; + len = wcrtomb(buff, ch, &state); + if (len == (size_t)-1) { - case (size_t)(-1): - { - return 1; - } + return 1; } } - for (i=0; i<bytes; i++) + for (size_t i = 0; i < len; i++) { out(buff[i]); } @@ -420,29 +423,26 @@ int writech(wint_t ch) void writestr(const wchar_t *str) { - char *pos; - CHECK(str,); - // while( *str ) - // writech( *str++ ); - - /* - Check amount of needed space - */ - size_t len = wcstombs(0, str, 0); + if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + while( *str ) + { + writech( *str++ ); + } + return; + } + size_t len = wcstombs(0, str, 0); // figure amount of space needed if (len == (size_t)-1) { debug(1, L"Tried to print invalid wide character string"); return; } + // Convert the string. len++; - - /* - Convert - */ char *buffer, static_buffer[256]; if (len <= sizeof static_buffer) buffer = static_buffer; @@ -456,7 +456,7 @@ void writestr(const wchar_t *str) /* Write */ - for (pos = buffer; *pos; pos++) + for (char *pos = buffer; *pos; pos++) { out(*pos); } diff --git a/src/wutil.cpp b/src/wutil.cpp index a3a51dac..2ff358bc 100644 --- a/src/wutil.cpp +++ b/src/wutil.cpp @@ -145,30 +145,23 @@ bool wreaddir_for_dirs(DIR *dir, wcstring *out_name) } -wchar_t *wgetcwd(wchar_t *buff, size_t sz) +const wcstring wgetcwd() { - char *buffc = (char *)malloc(sz*MAX_UTF8_BYTES); - char *res; - wchar_t *ret = 0; + wcstring retval; - if (!buffc) + char *res = getcwd(NULL, 0); + if (res) { - errno = ENOMEM; - return 0; + retval = str2wcstring(res); + free(res); } - - res = getcwd(buffc, sz*MAX_UTF8_BYTES); - if (res) + else { - if ((size_t)-1 != mbstowcs(buff, buffc, sizeof(wchar_t) * sz)) - { - ret = buff; - } + debug(0, _(L"getcwd() failed with errno %d/%s"), errno, strerror(errno)); + retval = wcstring(); } - free(buffc); - - return ret; + return retval; } int wchdir(const wcstring &dir) diff --git a/src/wutil.h b/src/wutil.h index 9897439a..a14dfa9b 100644 --- a/src/wutil.h +++ b/src/wutil.h @@ -70,10 +70,8 @@ void safe_perror(const char *message); */ const char *safe_strerror(int err); -/** - Wide character version of getcwd(). -*/ -wchar_t *wgetcwd(wchar_t *buff, size_t sz); +// Wide character version of getcwd(). +const wcstring wgetcwd(); /** Wide character version of chdir() diff --git a/tests/c-locale.err b/tests/c-locale.err new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/tests/c-locale.err diff --git a/tests/c-locale.in b/tests/c-locale.in new file mode 100644 index 00000000..d2f2bd5f --- /dev/null +++ b/tests/c-locale.in @@ -0,0 +1,35 @@ +# Verify that fish can pass through non-ASCII characters in the C/POSIX +# locale. This is to prevent regression of +# https://github.com/fish-shell/fish-shell/issues/2802. +# +# These tests are needed because the relevant standards allow the functions +# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid +# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid. +# Other libc implementations (e.g., BSD) treat them as valid. We want fish to +# always treat those bytes as valid. + +# The fish in the middle of the pipeline should be receiving a UTF-8 encoded +# version of the unicode from the echo. It should pass those bytes thru +# literally since it is in the C locale. We verify this by first passing the +# echo output directly to the `xxd` program then via a fish instance. The +# output should be "58c3bb58" for the first statement and "58c3bc58" for the +# second. +echo -n X\u00fbX | \ + xxd --plain +echo X\u00fcX | env LC_ALL=C ../test/root/bin/fish -c 'read foo; echo -n $foo' | \ + xxd --plain + +# This test is subtle. Despite the presence of the \u00fc unicode char (a "u" +# with an umlaut) the fact the locale is C/POSIX will cause the \xfc byte to +# be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because the +# few single-byte unicode chars (that are not ASCII) are generally in the +# ISO-8859-1 char set which is encompased by the C locale. The output should +# be "59fc59". +env LC_ALL=C ../test/root/bin/fish -c 'echo -n Y\u00fcY' | \ + xxd --plain + +# The user can specify a wide unicode character (one requiring more than a +# single byte). In the C/POSIX locales we substitute a question-mark for the +# unencodable wide char. The output should be "543f54". +env LC_ALL=C ../test/root/bin/fish -c 'echo -n T\u01fdT' | \ + xxd --plain diff --git a/tests/c-locale.out b/tests/c-locale.out new file mode 100644 index 00000000..10a94d3e --- /dev/null +++ b/tests/c-locale.out @@ -0,0 +1,4 @@ +58c3bb58 +58c3bc58 +59fc59 +543f54 diff --git a/tests/c-locale.status b/tests/c-locale.status new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/tests/c-locale.status @@ -0,0 +1 @@ +0 |