From 8f420b9272e526097a73ad10ce74ce3edbb67b9e Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Sun, 22 May 2016 19:00:13 -0700 Subject: Fix wide char related tests on Cygwin This makes the wide char tests run by `./fish_tests` pass on systems where sizeof wchar_t is two (e.g., Cygwin). In doing so it corrects several problems with the underlying code in module *utf8.cpp* such as allowing five and six byte UTF-8 sequences. They were allowed by the original Unicode proposal but are not allowed by the adopted standard. --- src/common.cpp | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'src/common.cpp') diff --git a/src/common.cpp b/src/common.cpp index 91b08824..1ad51482 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -106,8 +106,8 @@ demangled_backtrace(int max_frames, int skip_levels) { return backtrace_text; } -void __attribute__((noinline)) show_stackframe(const wchar_t msg_level, int frame_count, - int skip_levels) { +void __attribute__((noinline)) +show_stackframe(const wchar_t msg_level, int frame_count, int skip_levels) { ASSERT_IS_NOT_FORKED_CHILD(); // TODO: Decide if this is still needed. 
I'm commenting it out because it caused me some grief @@ -125,13 +125,13 @@ void __attribute__((noinline)) show_stackframe(const wchar_t msg_level, int fram } } -#else // HAVE_BACKTRACE_SYMBOLS +#else // HAVE_BACKTRACE_SYMBOLS -void __attribute__((noinline)) show_stackframe(const wchar_t msg_level, int frame_count, - int skip_levels) { +void __attribute__((noinline)) +show_stackframe(const wchar_t msg_level, int frame_count, int skip_levels) { debug_shared(msg_level, L"Sorry, but your system does not support backtraces"); } -#endif // HAVE_BACKTRACE_SYMBOLS +#endif // HAVE_BACKTRACE_SYMBOLS int fgetws2(wcstring *s, FILE *f) { int i = 0; @@ -179,8 +179,8 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) { result.reserve(in_len); size_t in_pos = 0; - if (MB_CUR_MAX == 1) // single-byte locale, all values are legal - { + if (MB_CUR_MAX == 1) { + // Single-byte locale, all values are legal. while (in_pos < in_len) { result.push_back((unsigned char)in[in_pos]); in_pos++; @@ -198,10 +198,16 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) { // Protect against broken mbrtowc() implementations which attempt to encode UTF-8 // sequences longer than four bytes (e.g., OS X Snow Leopard). use_encode_direct = true; + } else if (sizeof(wchar_t) == 2 && (in[in_pos] & 0xF8) == 0xF0) { + // Assume we are in a UTF-16 environment (e.g., Cygwin) using a UTF-8 encoding. + // The bit-mask check will be true for a four-byte UTF-8 sequence that requires + // two UTF-16 chars, which our simple use of mbrtowc() cannot handle. + use_encode_direct = true; } else { ret = mbrtowc(&wc, &in[in_pos], in_len - in_pos, &state); + // fprintf(stderr, "WTF in_pos %d ret %d\n", in_pos, ret); - // Determine whether to encode this characters with our crazy scheme. + // Determine whether to encode this character with our crazy scheme. 
if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) { use_encode_direct = true; } else if (wc == INTERNAL_SEPARATOR) { @@ -215,20 +221,27 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) { } else if (ret > in_len - in_pos) { // Other error codes? Terrifying, should never happen. use_encode_direct = true; + } else if (sizeof(wchar_t) == 2 && wc >= 0xD800 && wc <= 0xDFFF) { + // If we get a surrogate code unit on a UTF-16 system (e.g., Cygwin) then + // it's guaranteed the UTF-8 decoding is wrong so use direct encoding. + use_encode_direct = true; } } if (use_encode_direct) { + // fprintf(stderr, "WTF use_encode_direct\n"); wc = ENCODE_DIRECT_BASE + (unsigned char)in[in_pos]; result.push_back(wc); in_pos++; memset(&state, 0, sizeof state); } else if (ret == 0) { + // fprintf(stderr, "WTF null byte\n"); // Embedded null byte! result.push_back(L'\0'); in_pos++; memset(&state, 0, sizeof state); } else { + // fprintf(stderr, "WTF normal case\n"); // Normal case. result.push_back(wc); in_pos += ret; -- cgit v1.2.3