Fix wide char related tests on Cygwin

This makes the wide char tests run by `./fish_tests` pass on systems where sizeof wchar_t is two (e.g., Cygwin). In doing so it corrects several problems with the underlying code in module *utf8.cpp* such as allowing five and six byte UTF-8 sequences. They were allowed by the original Unicode proposal but are not allowed by the adopted standard.
author: Kurtis Rader <krader@skepticism.us> 2016-05-22 19:00:13 -0700
committer: Kurtis Rader <krader@skepticism.us> 2016-05-26 20:24:03 -0700
commit: 8f420b9272e526097a73ad10ce74ce3edbb67b9e (patch)
tree: 40dcaa9009a9c7057ae0b38e67f3f26c4bacb3d7 /src/fish_tests.cpp
parent: d55b226f191749282daba2c8c96315df993ecc3b (diff)
1 files changed, 78 insertions, 110 deletions
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 2457cee4..49c9327b 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -6,7 +6,6 @@
 #include <assert.h>
 #include <libgen.h>
 #include <limits.h>
-#include <locale.h>
 #include <pthread.h>
 #include <signal.h>
 #include <stdarg.h>
@@ -17,6 +16,7 @@
 #include <string.h>
 #include <sys/select.h>
 #include <sys/types.h>
+#include <sys/utsname.h>
 #include <sys/wait.h>
 #include <time.h>
 #include <unistd.h>
@@ -68,6 +68,8 @@
 static const char *const *s_arguments;
 static int s_test_run_count = 0;
 
+bool is_wchar_ucs2() { return sizeof(wchar_t) == 2; }
+
 // Indicate if we should test the given function. Either we test everything (all arguments) or we
 // run only tests that have a prefix in s_arguments.
 static bool should_test_function(const char *func_name) {
@@ -194,8 +196,12 @@ static void test_unescape_sane() {
     if (unescape_string(L"echo \\U110000", &output, UNESCAPE_DEFAULT)) {
         err(L"Should not have been able to unescape \\U110000\n");
     }
-    if (!unescape_string(L"echo \\U10FFFF", &output, UNESCAPE_DEFAULT)) {
-        err(L"Should have been able to unescape \\U10FFFF\n");
+    if (is_wchar_ucs2()) {
+        // TODO: Make this work on MS Windows.
+    } else {
+        if (!unescape_string(L"echo \\U10FFFF", &output, UNESCAPE_DEFAULT)) {
+            err(L"Should have been able to unescape \\U10FFFF\n");
+        }
     }
 }
 
@@ -271,36 +277,8 @@ static char *str2hex(const char *input) {
 }
 
 /// Test wide/narrow conversion by creating random strings and verifying that the original string
-/// comes back thorugh double conversion.
+/// comes back through double conversion.
 static void test_convert() {
-#if 0
-    char o[] = {-17, -128, -121, -68, 0};
-
-    wchar_t *w = str2wcs(o);
-    char *n = wcs2str(w);
-    int i;
-
-    for (i = 0; o[i]; i++) {
-        bitprint(o[i]);
-        // wprintf(L"%d ", o[i]);
-    }
-    wprintf(L"\n");
-
-    for (i = 0; w[i]; i++) {
-        wbitprint(w[i]);
-        // wprintf(L"%d ", w[i]);
-    }
-    wprintf(L"\n");
-
-    for (i = 0; n[i]; i++) {
-        bitprint(n[i]);
-        // wprintf(L"%d ", n[i]);
-    }
-    wprintf(L"\n");
-
-    return;
-#endif
-
     int i;
     std::vector<char> sb;
 
@@ -308,11 +286,9 @@ static void test_convert() {
 
     for (i = 0; i < ESCAPE_TEST_COUNT; i++) {
         const char *o, *n;
-
         char c;
 
         sb.clear();
-
         while (rand() % ESCAPE_TEST_LENGTH) {
             c = rand();
             sb.push_back(c);
@@ -344,7 +320,7 @@ static void test_convert() {
 
 /// Verify correct behavior with embedded nulls.
 static void test_convert_nulls(void) {
-    say(L"Testing embedded nulls in string conversion");
+    say(L"Testing convert_nulls");
     const wchar_t in[] = L"AAA\0BBB";
     const size_t in_len = (sizeof in / sizeof *in) - 1;
     const wcstring in_str = wcstring(in, in_len);
@@ -370,9 +346,9 @@ static void test_convert_nulls(void) {
 }
 
 /// Test the tokenizer.
-static void test_tok() {
-    tok_t token;
+static void test_tokenizer() {
     say(L"Testing tokenizer");
+    tok_t token;
 
     const wchar_t *str =
         L"string <redirection  2>&1 'nested \"quoted\" '(string containing subshells "
@@ -515,7 +491,6 @@ static parser_test_error_bits_t detect_argument_errors(const wcstring &src) {
 /// Test the parser.
 static void test_parser() {
     say(L"Testing parser");
-
     parser_t parser;
 
     say(L"Testing block nesting");
@@ -961,88 +936,55 @@ static void test_wchar2utf8(const wchar_t *src, size_t slen, const unsigned char
 }
 
 static void test_utf8() {
+    say(L"Testing utf8");
     wchar_t w1[] = {0x54, 0x65, 0x73, 0x74};
     wchar_t w2[] = {0x0422, 0x0435, 0x0441, 0x0442};
     wchar_t w3[] = {0x800, 0x1e80, 0x98c4, 0x9910, 0xff00};
-    wchar_t w4[] = {0x15555, 0xf7777, 0xa};
-    wchar_t w5[] = {0x255555, 0x1fa04ff, 0xddfd04, 0xa};
-    wchar_t w6[] = {0xf255555, 0x1dfa04ff, 0x7fddfd04, 0xa};
+    wchar_t w4[] = {0x15555, 0xf7777, 0x0a};
     wchar_t wb[] = {(wchar_t)-2, 0xa, (wchar_t)0xffffffff, 0x0441};
-    wchar_t wm[] = {0x41, 0x0441, 0x3042, 0xff67, 0x9b0d, 0x2e05da67};
-    wchar_t wb1[] = {0xa, 0x0422};
-    wchar_t wb2[] = {0xd800, 0xda00, 0x41, 0xdfff, 0xa};
-    wchar_t wbom[] = {0xfeff, 0x41, 0xa};
+    wchar_t wm[] = {0x41, 0x0441, 0x3042, 0xff67, 0x9b0d};
+    wchar_t wb1[] = {0x0a, 0x0422};
+    wchar_t wb2[] = {0xd800, 0xda00, 0x41, 0xdfff, 0x0a};
+    wchar_t wbom[] = {0xfeff, 0x41, 0x0a};
     wchar_t wbom2[] = {0x41, 0xa};
-    wchar_t wbom22[] = {0xfeff, 0x41, 0xa};
+    wchar_t wbom22[] = {0xfeff, 0x41, 0x0a};
     unsigned char u1[] = {0x54, 0x65, 0x73, 0x74};
     unsigned char u2[] = {0xd0, 0xa2, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82};
     unsigned char u3[] = {0xe0, 0xa0, 0x80, 0xe1, 0xba, 0x80, 0xe9, 0xa3,
                           0x84, 0xe9, 0xa4, 0x90, 0xef, 0xbc, 0x80};
-    unsigned char u4[] = {0xf0, 0x95, 0x95, 0x95, 0xf3, 0xb7, 0x9d, 0xb7, 0xa};
-    unsigned char u5[] = {0xf8, 0x89, 0x95, 0x95, 0x95, 0xf9, 0xbe, 0xa0,
-                          0x93, 0xbf, 0xf8, 0xb7, 0x9f, 0xb4, 0x84, 0x0a};
-    unsigned char u6[] = {0xfc, 0x8f, 0x89, 0x95, 0x95, 0x95, 0xfc, 0x9d, 0xbe, 0xa0,
-                          0x93, 0xbf, 0xfd, 0xbf, 0xb7, 0x9f, 0xb4, 0x84, 0x0a};
+    unsigned char u4[] = {0xf0, 0x95, 0x95, 0x95, 0xf3, 0xb7, 0x9d, 0xb7, 0x0a};
     unsigned char ub[] = {0xa, 0xd1, 0x81};
-    unsigned char um[] = {0x41, 0xd1, 0x81, 0xe3, 0x81, 0x82, 0xef, 0xbd, 0xa7,
-                          0xe9, 0xac, 0x8d, 0xfc, 0xae, 0x81, 0x9d, 0xa9, 0xa7};
+    unsigned char um[] = {0x41, 0xd1, 0x81, 0xe3, 0x81, 0x82, 0xef, 0xbd, 0xa7, 0xe9, 0xac, 0x8d};
     unsigned char ub1[] = {0xa, 0xff, 0xd0, 0xa2, 0xfe, 0x8f, 0xe0, 0x80};
     unsigned char uc080[] = {0xc0, 0x80};
-    unsigned char ub2[] = {0xed, 0xa1, 0x8c, 0xed, 0xbe, 0xb4, 0xa};
+    unsigned char ub2[] = {0xed, 0xa1, 0x8c, 0xed, 0xbe, 0xb4, 0x0a};
     unsigned char ubom[] = {0x41, 0xa};
-    unsigned char ubom2[] = {0xef, 0xbb, 0xbf, 0x41, 0xa};
+    unsigned char ubom2[] = {0xef, 0xbb, 0xbf, 0x41, 0x0a};
 
     // UTF-8 -> UCS-4 string.
     test_utf82wchar(ubom2, sizeof(ubom2), wbom2, sizeof(wbom2) / sizeof(*wbom2), UTF8_SKIP_BOM,
-                    sizeof(wbom2) / sizeof(*wbom2), "skip BOM");
+                    sizeof(wbom2) / sizeof(*wbom2), "ubom2 skip BOM");
     test_utf82wchar(ubom2, sizeof(ubom2), wbom22, sizeof(wbom22) / sizeof(*wbom22), 0,
-                    sizeof(wbom22) / sizeof(*wbom22), "BOM");
-    test_utf82wchar(uc080, sizeof(uc080), NULL, 0, 0, 0, "c0 80 - forbitten by rfc3629");
-    test_utf82wchar(ub2, sizeof(ub2), NULL, 0, 0, is_wchar_ucs2() ? 0 : 3,
-                    "resulted in forbitten wchars (len)");
+                    sizeof(wbom22) / sizeof(*wbom22), "ubom2 BOM");
+    test_utf82wchar(uc080, sizeof(uc080), NULL, 0, 0, 0, "uc080 c0 80 - forbitten by rfc3629");
+    test_utf82wchar(ub2, sizeof(ub2), NULL, 0, 0, 3, "ub2 resulted in forbitten wchars (len)");
     test_utf82wchar(ub2, sizeof(ub2), wb2, sizeof(wb2) / sizeof(*wb2), 0, 0,
-                    "resulted in forbitten wchars");
+                    "ub2 resulted in forbitten wchars");
     test_utf82wchar(ub2, sizeof(ub2), L"\x0a", 1, UTF8_IGNORE_ERROR, 1,
-                    "resulted in ignored forbitten wchars");
+                    "ub2 resulted in ignored forbitten wchars");
     test_utf82wchar(u1, sizeof(u1), w1, sizeof(w1) / sizeof(*w1), 0, sizeof(w1) / sizeof(*w1),
-                    "1 octet chars");
+                    "u1/w1 1 octet chars");
     test_utf82wchar(u2, sizeof(u2), w2, sizeof(w2) / sizeof(*w2), 0, sizeof(w2) / sizeof(*w2),
-                    "2 octets chars");
+                    "u2/w2 2 octets chars");
     test_utf82wchar(u3, sizeof(u3), w3, sizeof(w3) / sizeof(*w3), 0, sizeof(w3) / sizeof(*w3),
-                    "3 octets chars");
-    test_utf82wchar(u4, sizeof(u4), w4, sizeof(w4) / sizeof(*w4), 0, sizeof(w4) / sizeof(*w4),
-                    "4 octets chars");
-    test_utf82wchar(u5, sizeof(u5), w5, sizeof(w5) / sizeof(*w5), 0, sizeof(w5) / sizeof(*w5),
-                    "5 octets chars");
-    test_utf82wchar(u6, sizeof(u6), w6, sizeof(w6) / sizeof(*w6), 0, sizeof(w6) / sizeof(*w6),
-                    "6 octets chars");
+                    "u3/w3 3 octets chars");
     test_utf82wchar("\xff", 1, NULL, 0, 0, 0, "broken utf-8 0xff symbol");
     test_utf82wchar("\xfe", 1, NULL, 0, 0, 0, "broken utf-8 0xfe symbol");
     test_utf82wchar("\x8f", 1, NULL, 0, 0, 0, "broken utf-8, start from 10 higher bits");
-    if (!is_wchar_ucs2())
-        test_utf82wchar(ub1, sizeof(ub1), wb1, sizeof(wb1) / sizeof(*wb1), UTF8_IGNORE_ERROR,
-                        sizeof(wb1) / sizeof(*wb1), "ignore bad chars");
-    test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm), 0, sizeof(wm) / sizeof(*wm),
-                    "mixed languages");
-    // PCA this test was to ensure that if the output buffer was too small, we'd get 0
-    // we no longer have statically sized result buffers, so this test is disabled
-    //    test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) - 1, 0,
-    //                    0, "boundaries -1");
-    test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) + 1, 0, sizeof(wm) / sizeof(*wm),
-                    "boundaries +1");
-    test_utf82wchar(um, sizeof(um), NULL, 0, 0, sizeof(wm) / sizeof(*wm), "calculate length");
-    test_utf82wchar(ub1, sizeof(ub1), NULL, 0, 0, 0, "calculate length of bad chars");
-    test_utf82wchar(ub1, sizeof(ub1), NULL, 0, UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1),
-                    "calculate length, ignore bad chars");
     test_utf82wchar((const char *)NULL, 0, NULL, 0, 0, 0, "invalid params, all 0");
     test_utf82wchar(u1, 0, NULL, 0, 0, 0, "invalid params, src buf not NULL");
     test_utf82wchar((const char *)NULL, 10, NULL, 0, 0, 0, "invalid params, src length is not 0");
 
-    // PCA this test was to ensure that converting into a zero length output buffer would return 0
-    // we no longer statically size output buffers, so the test is disabled
-    //    test_utf82wchar(u1, sizeof(u1), w1, 0, 0, 0,
-    //                    "invalid params, dst is not NULL");
-
     // UCS-4 -> UTF-8 string.
     const char *const nullc = NULL;
     test_wchar2utf8(wbom, sizeof(wbom) / sizeof(*wbom), ubom, sizeof(ubom), UTF8_SKIP_BOM,
@@ -1050,31 +992,54 @@ static void test_utf8() {
     test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), nullc, 0, 0, 0, "prohibited wchars");
     test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), nullc, 0, UTF8_IGNORE_ERROR, 2,
                     "ignore prohibited wchars");
-    test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, sizeof(u1), 0, sizeof(u1), "1 octet chars");
-    test_wchar2utf8(w2, sizeof(w2) / sizeof(*w2), u2, sizeof(u2), 0, sizeof(u2), "2 octets chars");
-    test_wchar2utf8(w3, sizeof(w3) / sizeof(*w3), u3, sizeof(u3), 0, sizeof(u3), "3 octets chars");
-    test_wchar2utf8(w4, sizeof(w4) / sizeof(*w4), u4, sizeof(u4), 0, sizeof(u4), "4 octets chars");
-    test_wchar2utf8(w5, sizeof(w5) / sizeof(*w5), u5, sizeof(u5), 0, sizeof(u5), "5 octets chars");
-    test_wchar2utf8(w6, sizeof(w6) / sizeof(*w6), u6, sizeof(u6), 0, sizeof(u6), "6 octets chars");
-    test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub), 0, 0, "bad chars");
+    test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, sizeof(u1), 0, sizeof(u1),
+                    "w1/u1 1 octet chars");
+    test_wchar2utf8(w2, sizeof(w2) / sizeof(*w2), u2, sizeof(u2), 0, sizeof(u2),
+                    "w2/u2 2 octets chars");
+    test_wchar2utf8(w3, sizeof(w3) / sizeof(*w3), u3, sizeof(u3), 0, sizeof(u3),
+                    "w3/u3 3 octets chars");
+    test_wchar2utf8(NULL, 0, nullc, 0, 0, 0, "invalid params, all 0");
+    test_wchar2utf8(w1, 0, nullc, 0, 0, 0, "invalid params, src buf not NULL");
+    test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, 0, 0, 0, "invalid params, dst is not NULL");
+    test_wchar2utf8(NULL, 10, nullc, 0, 0, 0, "invalid params, src length is not 0");
+
+    // The following tests won't pass on systems (e.g., Cygwin) where sizeof wchar_t is 2. That's
+    // due to several reasons but the primary one is that narrowing conversions of literals assigned
+    // to the wchar_t arrays above don't result in values that will be treated as errors by the
+    // conversion functions.
+    if (is_wchar_ucs2()) return;
+    test_utf82wchar(u4, sizeof(u4), w4, sizeof(w4) / sizeof(*w4), 0, sizeof(w4) / sizeof(*w4),
+                    "u4/w4 4 octets chars");
+    test_wchar2utf8(w4, sizeof(w4) / sizeof(*w4), u4, sizeof(u4), 0, sizeof(u4),
+                    "w4/u4 4 octets chars");
+    test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub), 0, 0, "wb/ub bad chars");
     test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub), UTF8_IGNORE_ERROR, sizeof(ub),
-                    "ignore bad chars");
-    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um), 0, sizeof(um), "mixed languages");
-    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) - 1, 0, 0, "boundaries -1");
+                    "wb/ub ignore bad chars");
+    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um), 0, sizeof(um),
+                    "wm/um mixed languages");
+    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) - 1, 0, 0, "wm/um boundaries -1");
     test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) + 1, 0, sizeof(um),
-                    "boundaries +1");
-    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), nullc, 0, 0, sizeof(um), "calculate length");
-    test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), nullc, 0, 0, 0, "calculate length of bad chars");
+                    "wm/um boundaries +1");
+    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), nullc, 0, 0, sizeof(um),
+                    "wm/um calculate length");
+    test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), nullc, 0, 0, 0,
+                    "wb calculate length of bad chars");
     test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), nullc, 0, UTF8_IGNORE_ERROR, sizeof(ub),
                     "calculate length, ignore bad chars");
-    test_wchar2utf8(NULL, 0, nullc, 0, 0, 0, "invalid params, all 0");
-    test_wchar2utf8(w1, 0, nullc, 0, 0, 0, "invalid params, src buf not NULL");
-    test_wchar2utf8(NULL, 10, nullc, 0, 0, 0, "invalid params, src length is not 0");
-    test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, 0, 0, 0, "invalid params, dst is not NULL");
+    test_utf82wchar(ub1, sizeof(ub1), wb1, sizeof(wb1) / sizeof(*wb1), UTF8_IGNORE_ERROR,
+                    sizeof(wb1) / sizeof(*wb1), "ub1/wb1 ignore bad chars");
+    test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm), 0, sizeof(wm) / sizeof(*wm),
+                    "um/wm mixed languages");
+    test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) + 1, 0, sizeof(wm) / sizeof(*wm),
+                    "um/wm boundaries +1");
+    test_utf82wchar(um, sizeof(um), NULL, 0, 0, sizeof(wm) / sizeof(*wm), "um/wm calculate length");
+    test_utf82wchar(ub1, sizeof(ub1), NULL, 0, 0, 0, "ub1 calculate length of bad chars");
+    test_utf82wchar(ub1, sizeof(ub1), NULL, 0, UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1),
+                    "ub1 calculate length, ignore bad chars");
 }
 
 static void test_escape_sequences(void) {
-    say(L"Testing escape codes");
+    say(L"Testing escape_sequences");
     if (escape_code_length(L"") != 0) err(L"test_escape_sequences failed on line %d\n", __LINE__);
     if (escape_code_length(L"abcd") != 0)
         err(L"test_escape_sequences failed on line %d\n", __LINE__);
@@ -3905,6 +3870,9 @@ int main(int argc, char **argv) {
     program_name = L"(ignore)";
     s_arguments = argv + 1;
 
+    struct utsname uname_info;
+    uname(&uname_info);
+
     say(L"Testing low-level functionality");
     set_main_thread();
     setup_fork_guards();
@@ -3931,7 +3899,7 @@ int main(int argc, char **argv) {
     if (should_test_function("format")) test_format();
     if (should_test_function("convert")) test_convert();
     if (should_test_function("convert_nulls")) test_convert_nulls();
-    if (should_test_function("tok")) test_tok();
+    if (should_test_function("tok")) test_tokenizer();
     if (should_test_function("iothread")) test_iothread();
     if (should_test_function("parser")) test_parser();
     if (should_test_function("cancellation")) test_cancellation();
author	Kurtis Rader <krader@skepticism.us>	2016-05-22 19:00:13 -0700
committer	Kurtis Rader <krader@skepticism.us>	2016-05-26 20:24:03 -0700
commit	8f420b9272e526097a73ad10ce74ce3edbb67b9e (patch)
tree	40dcaa9009a9c7057ae0b38e67f3f26c4bacb3d7 /src/fish_tests.cpp
parent	d55b226f191749282daba2c8c96315df993ecc3b (diff)