aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Kurtis Rader <krader@skepticism.us>2016-03-10 18:17:39 -0800
committerGravatar Kurtis Rader <krader@skepticism.us>2016-03-20 18:47:38 -0700
commitc2f1df1d4af0c7e633528cb4c8caa79ef04b0b5a (patch)
tree0776e975779488cb842c09a5d79d193cb7cf9fdc
parentfb0921249f4584e68699e336be249a655b9c8ede (diff)
fix handling of non-ASCII chars in C locale
The relevant standards allow the mbtowc/mbrtowc functions to reject non-ASCII characters (i.e., chars with the high bit set) when the locale is C or POSIX. The BSD libraries (e.g., on OS X) don't do this but the GNU libraries (e.g., on Linux) do. Like most programs we need the C/POSIX locales to allow arbitrary bytes. So explicitly check if we're in a single-byte locale (which would also include ISO-8859 variants) and simply pass-thru the chars without encoding or decoding. Fixes #2802.
-rw-r--r--src/builtin.cpp47
-rw-r--r--src/common.cpp71
-rw-r--r--src/env.cpp7
-rw-r--r--src/fallback.cpp72
-rw-r--r--src/fallback.h20
-rw-r--r--src/history.cpp19
-rw-r--r--src/input_common.cpp11
-rw-r--r--src/output.cpp58
-rw-r--r--src/wutil.cpp27
-rw-r--r--src/wutil.h6
-rw-r--r--tests/c-locale.err0
-rw-r--r--tests/c-locale.in35
-rw-r--r--tests/c-locale.out4
-rw-r--r--tests/c-locale.status1
14 files changed, 214 insertions, 164 deletions
diff --git a/src/builtin.cpp b/src/builtin.cpp
index 4597b18e..f4de72c7 100644
--- a/src/builtin.cpp
+++ b/src/builtin.cpp
@@ -1907,18 +1907,18 @@ static int builtin_echo(parser_t &parser, io_streams_t &streams, wchar_t **argv)
return STATUS_BUILTIN_OK;
}
-/** The pwd builtin. We don't respect -P to resolve symbolic links because we try to always resolve them. */
+// The pwd builtin. We don't respect -P to resolve symbolic links because we
+// try to always resolve them.
static int builtin_pwd(parser_t &parser, io_streams_t &streams, wchar_t **argv)
{
- wchar_t dir_path[4096];
- wchar_t *res = wgetcwd(dir_path, 4096);
- if (res == NULL)
+ wcstring res = wgetcwd();
+ if (res.empty())
{
return STATUS_BUILTIN_ERROR;
}
else
{
- streams.out.append(dir_path);
+ streams.out.append(res);
streams.out.push_back(L'\n');
return STATUS_BUILTIN_OK;
}
@@ -2699,9 +2699,8 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
while (1)
{
- int finished=0;
-
- wchar_t res=0;
+ int finished = 0;
+ wchar_t res = 0;
mbstate_t state = {};
while (!finished)
@@ -2713,24 +2712,26 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
break;
}
- size_t sz = mbrtowc(&res, &b, 1, &state);
-
- switch (sz)
+ if (MB_CUR_MAX == 1) // single-byte locale
{
- case (size_t)(-1):
- memset(&state, '\0', sizeof(state));
- break;
-
- case (size_t)(-2):
- break;
- case 0:
- finished = 1;
- break;
+ res = (unsigned char)b;
+ finished = 1;
+ }
+ else {
+ size_t sz = mbrtowc(&res, &b, 1, &state);
+ switch (sz)
+ {
+ case (size_t)-1:
+ memset(&state, 0, sizeof(state));
+ break;
- default:
- finished=1;
- break;
+ case (size_t)-2:
+ break;
+ default:
+ finished = 1;
+ break;
+ }
}
}
diff --git a/src/common.cpp b/src/common.cpp
index 2aa76cc5..a796baca 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f)
{
errno=0;
- c = getwc(f);
-
+ c = fgetwc(f);
if (errno == EILSEQ || errno == EINTR)
{
continue;
@@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
wcstring result;
result.reserve(in_len);
- mbstate_t state = {};
size_t in_pos = 0;
+
+ if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
+ {
+ while (in_pos < in_len)
+ {
+ result.push_back((unsigned char)in[in_pos]);
+ in_pos++;
+ }
+ return result;
+ }
+
+ mbstate_t state = {};
while (in_pos < in_len)
{
wchar_t wc = 0;
@@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
{
use_encode_direct = true;
}
- else if (ret == (size_t)(-2))
+ else if (ret == (size_t)-2)
{
/* Incomplete sequence */
use_encode_direct = true;
}
- else if (ret == (size_t)(-1))
+ else if (ret == (size_t)-1)
{
/* Invalid data */
use_encode_direct = true;
@@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input)
std::string result;
result.reserve(input.size());
- mbstate_t state;
- memset(&state, 0, sizeof(state));
-
+ mbstate_t state = {};
char converted[MB_LEN_MAX + 1];
for (size_t i=0; i < input.size(); i++)
@@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input)
wchar_t wc = input[i];
if (wc == INTERNAL_SEPARATOR)
{
+ // Do nothing.
}
- else if ((wc >= ENCODE_DIRECT_BASE) &&
- (wc < ENCODE_DIRECT_BASE+256))
+ else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256)
{
result.push_back(wc - ENCODE_DIRECT_BASE);
}
+ else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+ {
+ // If `wc` contains a wide character we emit a question-mark.
+ if (wc & ~0xFF)
+ {
+ wc = '?';
+ }
+ converted[0] = wc;
+ result.append(converted, 1);
+ }
else
{
memset(converted, 0, sizeof converted);
@@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input)
*/
static char *wcs2str_internal(const wchar_t *in, char *out)
{
- size_t res=0;
- size_t in_pos=0;
- size_t out_pos = 0;
- mbstate_t state;
-
CHECK(in, 0);
CHECK(out, 0);
- memset(&state, 0, sizeof(state));
+ size_t in_pos = 0;
+ size_t out_pos = 0;
+ mbstate_t state = {};
while (in[in_pos])
{
if (in[in_pos] == INTERNAL_SEPARATOR)
{
+ // Do nothing.
}
- else if ((in[in_pos] >= ENCODE_DIRECT_BASE) &&
- (in[in_pos] < ENCODE_DIRECT_BASE+256))
+ else if (in[in_pos] >= ENCODE_DIRECT_BASE &&
+ in[in_pos] < ENCODE_DIRECT_BASE + 256)
{
out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE;
}
+ else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+ {
+ // If `wc` contains a wide character we emit a question-mark.
+ if (in[in_pos] & ~0xFF)
+ {
+ out[out_pos++] = '?';
+ }
+ else
+ {
+ out[out_pos++] = (unsigned char)in[in_pos];
+ }
+ }
else
{
- res = wcrtomb(&out[out_pos], in[in_pos], &state);
-
- if (res == (size_t)(-1))
+ size_t len = wcrtomb(&out[out_pos], in[in_pos], &state);
+ if (len == (size_t)-1)
{
debug(1, L"Wide character %d has no narrow representation", in[in_pos]);
memset(&state, 0, sizeof(state));
}
else
{
- out_pos += res;
+ out_pos += len;
}
}
in_pos++;
diff --git a/src/env.cpp b/src/env.cpp
index c24280bc..56c9deb0 100644
--- a/src/env.cpp
+++ b/src/env.cpp
@@ -377,14 +377,13 @@ static void setup_path()
int env_set_pwd()
{
- wchar_t dir_path[4096];
- wchar_t *res = wgetcwd(dir_path, 4096);
- if (!res)
+ wcstring res = wgetcwd();
+ if (res.empty())
{
debug(0, _(L"Could not determine current working directory. Is your locale set correctly?"));
return 0;
}
- env_set(L"PWD", dir_path, ENV_EXPORT | ENV_GLOBAL);
+ env_set(L"PWD", res.c_str(), ENV_EXPORT | ENV_GLOBAL);
return 1;
}
diff --git a/src/fallback.cpp b/src/fallback.cpp
index 7c5f84d2..ed29f5de 100644
--- a/src/fallback.cpp
+++ b/src/fallback.cpp
@@ -606,7 +606,7 @@ static FILE *fw_data;
static void fw_writer(wchar_t c)
{
- putwc(c, fw_data);
+ fputwc(c, fw_data);
}
/*
@@ -648,33 +648,30 @@ int wprintf(const wchar_t *filter, ...)
#endif
#ifndef HAVE_FGETWC
-
wint_t fgetwc(FILE *stream)
{
- wchar_t res=0;
- mbstate_t state;
- memset(&state, '\0', sizeof(state));
+ wchar_t res;
+ mbstate_t state = {};
while (1)
{
int b = fgetc(stream);
- char bb;
-
- int sz;
-
if (b == EOF)
+ {
return WEOF;
+ }
- bb=b;
-
- sz = mbrtowc(&res, &bb, 1, &state);
+ if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
+ {
+ return b;
+ }
+ char bb = b;
+ size_t sz = mbrtowc(&res, &bb, 1, &state);
switch (sz)
{
case -1:
- memset(&state, '\0', sizeof(state));
return WEOF;
-
case -2:
break;
case 0:
@@ -683,35 +680,40 @@ wint_t fgetwc(FILE *stream)
return res;
}
}
-
-}
-
-
-wint_t getwc(FILE *stream)
-{
- return fgetwc(stream);
}
-
-
#endif
#ifndef HAVE_FPUTWC
-
wint_t fputwc(wchar_t wc, FILE *stream)
{
- int res;
- char s[MB_CUR_MAX+1];
- memset(s, 0, MB_CUR_MAX+1);
- wctomb(s, wc);
- res = fputs(s, stream);
- return res==EOF?WEOF:wc;
-}
+ int res = 0;
+ mbstate_t state = {};
+ char s[MB_CUR_MAX + 1] = {};
-wint_t putwc(wchar_t wc, FILE *stream)
-{
- return fputwc(wc, stream);
-}
+ if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+ {
+ // If `wc` contains a wide character we emit a question-mark.
+ if (wc & ~0xFF)
+ {
+ wc = '?';
+ }
+ s[0] = (char)wc;
+ res = fputs(s, stream);
+ }
+ else
+ {
+ size_t len = wcrtomb(s, wc, &state);
+ if (len == (size_t)-1)
+ {
+ debug(1, L"Wide character %d has no narrow representation", wc);
+ }
+ else {
+ res = fputs(s, stream);
+ }
+ }
+ return res == EOF ? WEOF : wc;
+}
#endif
#ifndef HAVE_WCSTOK
diff --git a/src/fallback.h b/src/fallback.h
index e3044449..f3252c14 100644
--- a/src/fallback.h
+++ b/src/fallback.h
@@ -158,29 +158,13 @@ int vswprintf(wchar_t *out, size_t n, const wchar_t *filter, va_list va);
#endif
#ifndef HAVE_FGETWC
-/**
- Fallback implementation of fgetwc
-*/
+// Fallback implementation of fgetwc.
wint_t fgetwc(FILE *stream);
-
-/**
- Fallback implementation of getwc
-*/
-wint_t getwc(FILE *stream);
-
#endif
#ifndef HAVE_FPUTWC
-
-/**
- Fallback implementation of fputwc
-*/
+// Fallback implementation of fputwc.
wint_t fputwc(wchar_t wc, FILE *stream);
-/**
- Fallback implementation of putwc
-*/
-wint_t putwc(wchar_t wc, FILE *stream);
-
#endif
#ifndef HAVE_WCSTOK
diff --git a/src/history.cpp b/src/history.cpp
index 75e42638..a37d2b77 100644
--- a/src/history.cpp
+++ b/src/history.cpp
@@ -926,10 +926,9 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
{
const char *end = begin + length;
- const char *pos=begin;
-
- bool was_backslash = 0;
+ const char *pos = begin;
wcstring out;
+ bool was_backslash = false;
bool first_char = true;
bool timestamp_mode = false;
time_t timestamp = 0;
@@ -937,12 +936,18 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
while (1)
{
wchar_t c;
- mbstate_t state;
size_t res;
+ mbstate_t state = {};
- memset(&state, 0, sizeof(state));
-
- res = mbrtowc(&c, pos, end-pos, &state);
+ if (MB_CUR_MAX == 1) // single-byte locale
+ {
+ c = (unsigned char)*pos;
+ res = 1;
+ }
+ else
+ {
+ res = mbrtowc(&c, pos, end - pos, &state);
+ }
if (res == (size_t)-1)
{
diff --git a/src/input_common.cpp b/src/input_common.cpp
index ebe95fce..7c018c2d 100644
--- a/src/input_common.cpp
+++ b/src/input_common.cpp
@@ -263,16 +263,17 @@ wchar_t input_common_readch(int timed)
while (1)
{
wint_t b = readb();
- char bb;
- size_t sz;
+ if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
+ {
+ return (unsigned char)b;
+ }
if ((b >= R_NULL) && (b < R_NULL + 1000))
return b;
- bb=b;
-
- sz = mbrtowc(&res, &bb, 1, &state);
+ char bb = b;
+ size_t sz = mbrtowc(&res, &bb, 1, &state);
switch (sz)
{
diff --git a/src/output.cpp b/src/output.cpp
index 1707f4fb..dbd97eb4 100644
--- a/src/output.cpp
+++ b/src/output.cpp
@@ -386,32 +386,35 @@ int writeb(tputs_arg_t b)
int writech(wint_t ch)
{
- mbstate_t state;
- size_t i;
char buff[MB_LEN_MAX+1];
- size_t bytes;
+ size_t len;
- if ((ch >= ENCODE_DIRECT_BASE) &&
- (ch < ENCODE_DIRECT_BASE+256))
+ if (ch >= ENCODE_DIRECT_BASE && ch < ENCODE_DIRECT_BASE + 256)
{
buff[0] = ch - ENCODE_DIRECT_BASE;
- bytes=1;
+ len = 1;
+ }
+ else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+ {
+ // If `wc` contains a wide character we emit a question-mark.
+ if (ch & ~0xFF)
+ {
+ ch = '?';
+ }
+ buff[0] = ch;
+ len = 1;
}
else
{
- memset(&state, 0, sizeof(state));
- bytes= wcrtomb(buff, ch, &state);
-
- switch (bytes)
+ mbstate_t state = {};
+ len = wcrtomb(buff, ch, &state);
+ if (len == (size_t)-1)
{
- case (size_t)(-1):
- {
- return 1;
- }
+ return 1;
}
}
- for (i=0; i<bytes; i++)
+ for (size_t i = 0; i < len; i++)
{
out(buff[i]);
}
@@ -420,29 +423,26 @@ int writech(wint_t ch)
void writestr(const wchar_t *str)
{
- char *pos;
-
CHECK(str,);
- // while( *str )
- // writech( *str++ );
-
- /*
- Check amount of needed space
- */
- size_t len = wcstombs(0, str, 0);
+ if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+ {
+ while( *str )
+ {
+ writech( *str++ );
+ }
+ return;
+ }
+ size_t len = wcstombs(0, str, 0); // figure amount of space needed
if (len == (size_t)-1)
{
debug(1, L"Tried to print invalid wide character string");
return;
}
+ // Convert the string.
len++;
-
- /*
- Convert
- */
char *buffer, static_buffer[256];
if (len <= sizeof static_buffer)
buffer = static_buffer;
@@ -456,7 +456,7 @@ void writestr(const wchar_t *str)
/*
Write
*/
- for (pos = buffer; *pos; pos++)
+ for (char *pos = buffer; *pos; pos++)
{
out(*pos);
}
diff --git a/src/wutil.cpp b/src/wutil.cpp
index a3a51dac..2ff358bc 100644
--- a/src/wutil.cpp
+++ b/src/wutil.cpp
@@ -145,30 +145,23 @@ bool wreaddir_for_dirs(DIR *dir, wcstring *out_name)
}
-wchar_t *wgetcwd(wchar_t *buff, size_t sz)
+const wcstring wgetcwd()
{
- char *buffc = (char *)malloc(sz*MAX_UTF8_BYTES);
- char *res;
- wchar_t *ret = 0;
+ wcstring retval;
- if (!buffc)
+ char *res = getcwd(NULL, 0);
+ if (res)
{
- errno = ENOMEM;
- return 0;
+ retval = str2wcstring(res);
+ free(res);
}
-
- res = getcwd(buffc, sz*MAX_UTF8_BYTES);
- if (res)
+ else
{
- if ((size_t)-1 != mbstowcs(buff, buffc, sizeof(wchar_t) * sz))
- {
- ret = buff;
- }
+ debug(0, _(L"getcwd() failed with errno %d/%s"), errno, strerror(errno));
+ retval = wcstring();
}
- free(buffc);
-
- return ret;
+ return retval;
}
int wchdir(const wcstring &dir)
diff --git a/src/wutil.h b/src/wutil.h
index 9897439a..a14dfa9b 100644
--- a/src/wutil.h
+++ b/src/wutil.h
@@ -70,10 +70,8 @@ void safe_perror(const char *message);
*/
const char *safe_strerror(int err);
-/**
- Wide character version of getcwd().
-*/
-wchar_t *wgetcwd(wchar_t *buff, size_t sz);
+// Wide character version of getcwd().
+const wcstring wgetcwd();
/**
Wide character version of chdir()
diff --git a/tests/c-locale.err b/tests/c-locale.err
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/tests/c-locale.err
diff --git a/tests/c-locale.in b/tests/c-locale.in
new file mode 100644
index 00000000..d2f2bd5f
--- /dev/null
+++ b/tests/c-locale.in
@@ -0,0 +1,35 @@
+# Verify that fish can pass through non-ASCII characters in the C/POSIX
+# locale. This is to prevent regression of
+# https://github.com/fish-shell/fish-shell/issues/2802.
+#
+# These tests are needed because the relevant standards allow the functions
+# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid
+# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid.
+# Other libc implementations (e.g., BSD) treat them as valid. We want fish to
+# always treat those bytes as valid.
+
+# The fish in the middle of the pipeline should be receiving a UTF-8 encoded
+# version of the unicode from the echo. It should pass those bytes thru
+# literally since it is in the C locale. We verify this by first passing the
+# echo output directly to the `xxd` program then via a fish instance. The
+# output should be "58c3bb58" for the first statement and "58c3bc58" for the
+# second.
+echo -n X\u00fbX | \
+ xxd --plain
+echo X\u00fcX | env LC_ALL=C ../test/root/bin/fish -c 'read foo; echo -n $foo' | \
+ xxd --plain
+
+# This test is subtle. Despite the presence of the \u00fc unicode char (a "u"
+# with an umlaut) the fact the locale is C/POSIX will cause the \xfc byte to
+# be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because the
+# few single-byte unicode chars (that are not ASCII) are generally in the
+# ISO-8859-1 char set which is encompased by the C locale. The output should
+# be "59fc59".
+env LC_ALL=C ../test/root/bin/fish -c 'echo -n Y\u00fcY' | \
+ xxd --plain
+
+# The user can specify a wide unicode character (one requiring more than a
+# single byte). In the C/POSIX locales we substitute a question-mark for the
+# unencodable wide char. The output should be "543f54".
+env LC_ALL=C ../test/root/bin/fish -c 'echo -n T\u01fdT' | \
+ xxd --plain
diff --git a/tests/c-locale.out b/tests/c-locale.out
new file mode 100644
index 00000000..10a94d3e
--- /dev/null
+++ b/tests/c-locale.out
@@ -0,0 +1,4 @@
+58c3bb58
+58c3bc58
+59fc59
+543f54
diff --git a/tests/c-locale.status b/tests/c-locale.status
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/tests/c-locale.status
@@ -0,0 +1 @@
+0