From aabed8279e4a086cf953006023bc14ec1d1d83b8 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Sat, 22 Mar 2014 23:46:58 -0700 Subject: Incorporate a modified UTF8 <-> wchar_t implementation from Alexey Vatchenko (http://www.bsdua.org/libbsdua.html) in preparation for eliminating our dependency on iconv --- Makefile.in | 4 +- doc_src/license.hdr | 16 ++ fish.xcodeproj/project.pbxproj | 8 + fish_tests.cpp | 307 ++++++++++++++++++++++-- utf8.cpp | 514 +++++++++++++++++++++++++++++++++++++++++ utf8.h | 39 ++++ 6 files changed, 861 insertions(+), 27 deletions(-) create mode 100644 utf8.cpp create mode 100644 utf8.h diff --git a/Makefile.in b/Makefile.in index 8d18ce1d..3063438c 100644 --- a/Makefile.in +++ b/Makefile.in @@ -91,7 +91,7 @@ FISH_OBJS := function.o builtin.o complete.o env.o exec.o expand.o \ signal.o io.o parse_util.o common.o screen.o path.o autoload.o \ parser_keywords.o iothread.o color.o postfork.o \ builtin_test.o parse_tree.o parse_productions.o parse_execution.cpp \ - pager.cpp + pager.cpp utf8.o FISH_INDENT_OBJS := fish_indent.o print_help.o common.o \ parser_keywords.o wutil.o tokenizer.o @@ -117,7 +117,7 @@ FISH_TESTS_OBJS := $(FISH_OBJS) fish_tests.o # FISHD_OBJS := fishd.o env_universal_common.o wutil.o print_help.o \ - common.o + common.o utf8.o # diff --git a/doc_src/license.hdr b/doc_src/license.hdr index c07a94ad..7f14383a 100644 --- a/doc_src/license.hdr +++ b/doc_src/license.hdr @@ -1402,4 +1402,20 @@ POSSIBILITY OF SUCH DAMAGES. */ +

License for UTF8

+ +

Copyright (c) 2007 Alexey Vatchenko + +

Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + \htmlonly \endhtmlonly diff --git a/fish.xcodeproj/project.pbxproj b/fish.xcodeproj/project.pbxproj index d51f53e3..47d04c4c 100644 --- a/fish.xcodeproj/project.pbxproj +++ b/fish.xcodeproj/project.pbxproj @@ -117,6 +117,8 @@ D0A564FE168D23D800AF6161 /* man in CopyFiles */ = {isa = PBXBuildFile; fileRef = D0A564F1168D0BAB00AF6161 /* man */; }; D0A56501168D258300AF6161 /* man in Copy Files */ = {isa = PBXBuildFile; fileRef = D0A564F1168D0BAB00AF6161 /* man */; }; D0C52F371765284C00BFAB82 /* parse_tree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C52F351765284C00BFAB82 /* parse_tree.cpp */; }; + D0C9733818DE5449002D7C81 /* utf8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C9733718DE5449002D7C81 /* utf8.cpp */; }; + D0C9733918DE5449002D7C81 /* utf8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C9733718DE5449002D7C81 /* utf8.cpp */; }; D0CBD587159EF0E10024809C /* launch_fish.scpt in Resources */ = {isa = PBXBuildFile; fileRef = D0CBD586159EF0E10024809C /* launch_fish.scpt */; }; D0D02A67159837AD008E62BD /* complete.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0A0853713B3ACEE0099B651 /* complete.cpp */; }; D0D02A69159837B2008E62BD /* env.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0A0853A13B3ACEE0099B651 /* env.cpp */; }; @@ -475,6 +477,8 @@ D0C6FCC914CFA4B0004CE8AD /* autoload.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = autoload.cpp; sourceTree = ""; }; D0C6FCCB14CFA4B7004CE8AD /* autoload.h 
*/ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = autoload.h; sourceTree = ""; }; D0C861EA16CC7054003B5A04 /* builtin_set_color.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = builtin_set_color.cpp; sourceTree = ""; }; + D0C9733718DE5449002D7C81 /* utf8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = utf8.cpp; sourceTree = ""; }; + D0C9733A18DE5451002D7C81 /* utf8.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = utf8.h; sourceTree = ""; }; D0CA63F316FC275F00093BD4 /* builtin_printf.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = builtin_printf.cpp; sourceTree = ""; }; D0CBD580159EE48F0024809C /* config.fish */ = {isa = PBXFileReference; lastKnownFileType = text; name = config.fish; path = share/config.fish; sourceTree = ""; }; D0CBD583159EEE010024809C /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; @@ -686,6 +690,8 @@ D0A0855C13B3ACEE0099B651 /* signal.cpp */, D0A0852513B3ACEE0099B651 /* tokenizer.h */, D0A0855D13B3ACEE0099B651 /* tokenizer.cpp */, + D0C9733A18DE5451002D7C81 /* utf8.h */, + D0C9733718DE5449002D7C81 /* utf8.cpp */, D0A0852613B3ACEE0099B651 /* util.h */, D0A0855E13B3ACEE0099B651 /* util.cpp */, D0A0852713B3ACEE0099B651 /* wgetopt.h */, @@ -1120,6 +1126,7 @@ files = ( D0D02AC215985F3F008E62BD /* fishd.cpp in Sources */, D0D02AC315985F43008E62BD /* env_universal_common.cpp in Sources */, + D0C9733918DE5449002D7C81 /* utf8.cpp in Sources */, D0D02AC415985F4D008E62BD /* wutil.cpp in Sources */, D0D02AC515985F5B008E62BD /* print_help.cpp in Sources */, D0D02AC615985F65008E62BD /* common.cpp in Sources */, @@ -1157,6 +1164,7 @@ D0D02A86159839D5008E62BD /* postfork.cpp in Sources */, D0D02A87159839D5008E62BD /* screen.cpp in 
Sources */, D0D02A88159839D5008E62BD /* signal.cpp in Sources */, + D0C9733818DE5449002D7C81 /* utf8.cpp in Sources */, D0D2694A15983779005D9B9C /* builtin.cpp in Sources */, D0D2694915983772005D9B9C /* function.cpp in Sources */, D0D02A67159837AD008E62BD /* complete.cpp in Sources */, diff --git a/fish_tests.cpp b/fish_tests.cpp index 5b7bcea6..f07b568c 100644 --- a/fish_tests.cpp +++ b/fish_tests.cpp @@ -62,6 +62,7 @@ #include "parse_util.h" #include "pager.h" #include "input.h" +#include "utf8.h" static const char * const * s_arguments; static int s_test_run_count = 0; @@ -140,17 +141,17 @@ static void err(const wchar_t *blah, ...) va_list va; va_start(va, blah); err_count++; - + // show errors in red fputs("\x1b[31m", stdout); wprintf(L"Error: "); vwprintf(blah, va); va_end(va); - + // return to normal color fputs("\x1b[0m", stdout); - + wprintf(L"\n"); } @@ -857,6 +858,260 @@ static void test_utils() if (begin != a + wcslen(L"echo (echo (")) err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__); } +/* UTF8 tests taken from Alexey Vatchenko's utf8 library. See http://www.bsdua.org/libbsdua.html */ + +static void test_utf82wchar(const char *src, size_t slen, const wchar_t *dst, size_t dlen, + int flags, size_t res, const char *descr) +{ + size_t size; + wchar_t *mem = NULL; + + /* Hack: if wchar is only UCS-2, and the UTF-8 input string contains astral characters, then tweak the expected size to 0 */ + if (src != NULL && is_wchar_ucs2()) + { + /* A UTF-8 code unit may represent an astral code point if it has 4 or more leading 1s */ + const unsigned char astral_mask = 0xF0; + for (size_t i=0; i < slen; i++) + { + if ((src[i] & astral_mask) == astral_mask) + { + /* Astral char. We expect this conversion to just fail. 
*/ + res = 0; + break; + } + } + } + + if (dst != NULL) + { + mem = (wchar_t *)malloc(dlen * sizeof(*mem)); + if (mem == NULL) + { + err(L"u2w: %s: MALLOC FAILED\n", descr); + return; + } + } + + do + { + size = utf8_to_wchar(src, slen, mem, dlen, flags); + if (res != size) + { + err(L"u2w: %s: FAILED (rv: %lu, must be %lu)", descr, size, res); + break; + } + + if (mem == NULL) + break; /* OK */ + + if (memcmp(mem, dst, size * sizeof(*mem)) != 0) + { + err(L"u2w: %s: BROKEN", descr); + break; + } + + } + while (0); + + free(mem); +} + +static void test_wchar2utf8(const wchar_t *src, size_t slen, const char *dst, size_t dlen, + int flags, size_t res, const char *descr) +{ + size_t size; + char *mem = NULL; + + /* Hack: if wchar is simulating UCS-2, and the wchar_t input string contains astral characters, then tweak the expected size to 0 */ + if (src != NULL && is_wchar_ucs2()) + { + const uint32_t astral_mask = 0xFFFF0000U; + for (size_t i=0; i < slen; i++) + { + if ((src[i] & astral_mask) != 0) + { + /* astral char */ + res = 0; + break; + } + } + } + + if (dst != NULL) + { + mem = (char *)malloc(dlen); + if (mem == NULL) + { + err(L"w2u: %s: MALLOC FAILED", descr); + return; + } + } + + do + { + size = wchar_to_utf8(src, slen, mem, dlen, flags); + if (res != size) + { + err(L"w2u: %s: FAILED (rv: %lu, must be %lu)", descr, size, res); + break; + } + + if (mem == NULL) + break; /* OK */ + + if (memcmp(mem, dst, size) != 0) + { + err(L"w2u: %s: BROKEN", descr); + break; + } + + } + while (0); + + if (mem != NULL) + free(mem); +} + +static void test_utf8() +{ + wchar_t w1[] = {0x54, 0x65, 0x73, 0x74}; + wchar_t w2[] = {0x0422, 0x0435, 0x0441, 0x0442}; + wchar_t w3[] = {0x800, 0x1e80, 0x98c4, 0x9910, 0xff00}; + wchar_t w4[] = {0x15555, 0xf7777, 0xa}; + wchar_t w5[] = {0x255555, 0x1fa04ff, 0xddfd04, 0xa}; + wchar_t w6[] = {0xf255555, 0x1dfa04ff, 0x7fddfd04, 0xa}; + wchar_t wb[] = {-2, 0xa, 0xffffffff, 0x0441}; + wchar_t wm[] = {0x41, 0x0441, 0x3042, 0xff67, 0x9b0d, 
0x2e05da67}; + wchar_t wb1[] = {0xa, 0x0422}; + wchar_t wb2[] = {0xd800, 0xda00, 0x41, 0xdfff, 0xa}; + wchar_t wbom[] = {0xfeff, 0x41, 0xa}; + wchar_t wbom2[] = {0x41, 0xa}; + wchar_t wbom22[] = {0xfeff, 0x41, 0xa}; + char u1[] = {0x54, 0x65, 0x73, 0x74}; + char u2[] = {0xd0, 0xa2, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82}; + char u3[] = {0xe0, 0xa0, 0x80, 0xe1, 0xba, 0x80, 0xe9, 0xa3, 0x84, + 0xe9, 0xa4, 0x90, 0xef, 0xbc, 0x80 + }; + char u4[] = {0xf0, 0x95, 0x95, 0x95, 0xf3, 0xb7, 0x9d, 0xb7, 0xa}; + char u5[] = {0xf8, 0x89, 0x95, 0x95, 0x95, 0xf9, 0xbe, 0xa0, 0x93, + 0xbf, 0xf8, 0xb7, 0x9f, 0xb4, 0x84, 0x0a + }; + char u6[] = {0xfc, 0x8f, 0x89, 0x95, 0x95, 0x95, 0xfc, 0x9d, 0xbe, + 0xa0, 0x93, 0xbf, 0xfd, 0xbf, 0xb7, 0x9f, 0xb4, 0x84, 0x0a + }; + char ub[] = {0xa, 0xd1, 0x81}; + char um[] = {0x41, 0xd1, 0x81, 0xe3, 0x81, 0x82, 0xef, 0xbd, 0xa7, + 0xe9, 0xac, 0x8d, 0xfc, 0xae, 0x81, 0x9d, 0xa9, 0xa7 + }; + char ub1[] = {0xa, 0xff, 0xd0, 0xa2, 0xfe, 0x8f, 0xe0, 0x80}; + char uc080[] = {0xc0, 0x80}; + char ub2[] = {0xed, 0xa1, 0x8c, 0xed, 0xbe, 0xb4, 0xa}; + char ubom[] = {0x41, 0xa}; + char ubom2[] = {0xef, 0xbb, 0xbf, 0x41, 0xa}; + + /* + * UTF-8 -> UCS-4 string. + */ + test_utf82wchar(ubom2, sizeof(ubom2), wbom2, + sizeof(wbom2) / sizeof(*wbom2), UTF8_SKIP_BOM, + sizeof(wbom2) / sizeof(*wbom2), "skip BOM"); + test_utf82wchar(ubom2, sizeof(ubom2), wbom22, + sizeof(wbom22) / sizeof(*wbom22), 0, + sizeof(wbom22) / sizeof(*wbom22), "BOM"); + test_utf82wchar(uc080, sizeof(uc080), NULL, 0, 0, 0, + "c0 80 - forbitten by rfc3629"); + test_utf82wchar(ub2, sizeof(ub2), NULL, 0, 0, is_wchar_ucs2() ? 
0 : 3, + "resulted in forbitten wchars (len)"); + test_utf82wchar(ub2, sizeof(ub2), wb2, sizeof(wb2) / sizeof(*wb2), 0, 0, + "resulted in forbitten wchars"); + test_utf82wchar(ub2, sizeof(ub2), L"\x0a", 1, UTF8_IGNORE_ERROR, + 1, "resulted in ignored forbitten wchars"); + test_utf82wchar(u1, sizeof(u1), w1, sizeof(w1) / sizeof(*w1), 0, + sizeof(w1) / sizeof(*w1), "1 octet chars"); + test_utf82wchar(u2, sizeof(u2), w2, sizeof(w2) / sizeof(*w2), 0, + sizeof(w2) / sizeof(*w2), "2 octets chars"); + test_utf82wchar(u3, sizeof(u3), w3, sizeof(w3) / sizeof(*w3), 0, + sizeof(w3) / sizeof(*w3), "3 octets chars"); + test_utf82wchar(u4, sizeof(u4), w4, sizeof(w4) / sizeof(*w4), 0, + sizeof(w4) / sizeof(*w4), "4 octets chars"); + test_utf82wchar(u5, sizeof(u5), w5, sizeof(w5) / sizeof(*w5), 0, + sizeof(w5) / sizeof(*w5), "5 octets chars"); + test_utf82wchar(u6, sizeof(u6), w6, sizeof(w6) / sizeof(*w6), 0, + sizeof(w6) / sizeof(*w6), "6 octets chars"); + test_utf82wchar("\xff", 1, NULL, 0, 0, 0, "broken utf-8 0xff symbol"); + test_utf82wchar("\xfe", 1, NULL, 0, 0, 0, "broken utf-8 0xfe symbol"); + test_utf82wchar("\x8f", 1, NULL, 0, 0, 0, + "broken utf-8, start from 10 higher bits"); + if (! 
is_wchar_ucs2()) test_utf82wchar(ub1, sizeof(ub1), wb1, sizeof(wb1) / sizeof(*wb1), + UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1), "ignore bad chars"); + test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm), 0, + sizeof(wm) / sizeof(*wm), "mixed languages"); + test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) - 1, 0, + 0, "boundaries -1"); + test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) + 1, 0, + sizeof(wm) / sizeof(*wm), "boundaries +1"); + test_utf82wchar(um, sizeof(um), NULL, 0, 0, + sizeof(wm) / sizeof(*wm), "calculate length"); + test_utf82wchar(ub1, sizeof(ub1), NULL, 0, 0, + 0, "calculate length of bad chars"); + test_utf82wchar(ub1, sizeof(ub1), NULL, 0, + UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1), + "calculate length, ignore bad chars"); + test_utf82wchar(NULL, 0, NULL, 0, 0, 0, "invalid params, all 0"); + test_utf82wchar(u1, 0, NULL, 0, 0, 0, + "invalid params, src buf not NULL"); + test_utf82wchar(NULL, 10, NULL, 0, 0, 0, + "invalid params, src length is not 0"); + test_utf82wchar(u1, sizeof(u1), w1, 0, 0, 0, + "invalid params, dst is not NULL"); + + /* + * UCS-4 -> UTF-8 string. 
+ */ + test_wchar2utf8(wbom, sizeof(wbom) / sizeof(*wbom), ubom, sizeof(ubom), + UTF8_SKIP_BOM, sizeof(ubom), "BOM"); + test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), NULL, 0, 0, + 0, "prohibited wchars"); + test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), NULL, 0, + UTF8_IGNORE_ERROR, 2, "ignore prohibited wchars"); + test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, sizeof(u1), 0, + sizeof(u1), "1 octet chars"); + test_wchar2utf8(w2, sizeof(w2) / sizeof(*w2), u2, sizeof(u2), 0, + sizeof(u2), "2 octets chars"); + test_wchar2utf8(w3, sizeof(w3) / sizeof(*w3), u3, sizeof(u3), 0, + sizeof(u3), "3 octets chars"); + test_wchar2utf8(w4, sizeof(w4) / sizeof(*w4), u4, sizeof(u4), 0, + sizeof(u4), "4 octets chars"); + test_wchar2utf8(w5, sizeof(w5) / sizeof(*w5), u5, sizeof(u5), 0, + sizeof(u5), "5 octets chars"); + test_wchar2utf8(w6, sizeof(w6) / sizeof(*w6), u6, sizeof(u6), 0, + sizeof(u6), "6 octets chars"); + test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub), 0, + 0, "bad chars"); + test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub), + UTF8_IGNORE_ERROR, sizeof(ub), "ignore bad chars"); + test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um), 0, + sizeof(um), "mixed languages"); + test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) - 1, 0, + 0, "boundaries -1"); + test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) + 1, 0, + sizeof(um), "boundaries +1"); + test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), NULL, 0, 0, + sizeof(um), "calculate length"); + test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), NULL, 0, 0, + 0, "calculate length of bad chars"); + test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), NULL, 0, + UTF8_IGNORE_ERROR, sizeof(ub), + "calculate length, ignore bad chars"); + test_wchar2utf8(NULL, 0, NULL, 0, 0, 0, "invalid params, all 0"); + test_wchar2utf8(w1, 0, NULL, 0, 0, 0, + "invalid params, src buf not NULL"); + test_wchar2utf8(NULL, 10, NULL, 0, 0, 0, + "invalid params, src length is not 0"); + 
test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, 0, 0, 0, + "invalid params, dst is not NULL"); +} + static void test_escape_sequences(void) { say(L"Testing escape codes"); @@ -1110,9 +1365,9 @@ static void test_path() static void test_pager_navigation() { say(L"Testing pager navigation"); - + /* Generate 19 strings of width 10. There's 2 spaces between completions, and our term size is 80; these can therefore fit into 6 columns (6 * 12 - 2 = 70) or 5 columns (58) but not 7 columns (7 * 12 - 2 = 82). - + You can simulate this test by creating 19 files named "file00.txt" through "file_18.txt". */ completion_list_t completions; @@ -1120,31 +1375,31 @@ static void test_pager_navigation() { append_completion(completions, L"abcdefghij"); } - + pager_t pager; pager.set_completions(completions); pager.set_term_size(80, 24); page_rendering_t render = pager.render(); - + if (render.term_width != 80) err(L"Wrong term width"); if (render.term_height != 24) err(L"Wrong term height"); - + size_t rows = 4, cols = 5; - + /* We have 19 completions. We can fit into 6 columns with 4 rows or 5 columns with 4 rows; the second one is better and so is what we ought to have picked. 
*/ if (render.rows != rows) err(L"Wrong row count"); if (render.cols != cols) err(L"Wrong column count"); - + /* Initially expect to have no completion index */ if (render.selected_completion_idx != (size_t)(-1)) { err(L"Wrong initial selection"); } - + /* Here are navigation directions and where we expect the selection to be */ const struct { @@ -1155,31 +1410,31 @@ static void test_pager_navigation() { /* Tab completion to get into the list */ {direction_next, 0}, - + /* Westward motion in upper left wraps along the top row */ {direction_west, 16}, {direction_east, 1}, - + /* "Next" motion goes down the column */ {direction_next, 2}, {direction_next, 3}, - + {direction_west, 18}, {direction_east, 3}, {direction_east, 7}, {direction_east, 11}, {direction_east, 15}, {direction_east, 3}, - + {direction_west, 18}, {direction_east, 3}, - + /* Eastward motion wraps along the bottom, westward goes to the prior column */ {direction_east, 7}, {direction_east, 11}, {direction_east, 15}, {direction_east, 3}, - + /* Column memory */ {direction_west, 18}, {direction_south, 15}, @@ -1197,7 +1452,7 @@ static void test_pager_navigation() err(L"For command %lu, expected selection %lu, but found instead %lu\n", i, cmds[i].sel, render.selected_completion_idx); } } - + } enum word_motion_t @@ -1536,14 +1791,14 @@ static void test_complete(void) completions.clear(); complete(L"echo (builtin scuttlebut", completions, COMPLETION_REQUEST_DEFAULT); do_test(completions.size() == 0); - + /* Trailing spaces (#1261) */ complete_add(L"foobarbaz", false, 0, NULL, 0, NO_FILES, NULL, L"qux", NULL, COMPLETE_AUTO_SPACE); completions.clear(); complete(L"foobarbaz ", completions, COMPLETION_REQUEST_DEFAULT); do_test(completions.size() == 1); do_test(completions.at(0).completion == L"qux"); - + /* Don't complete variable names in single quotes (#1023) */ completions.clear(); complete(L"echo '$Foo", completions, COMPLETION_REQUEST_DEFAULT); @@ -1814,14 +2069,14 @@ static void test_input() wcstring 
desired_binding = prefix_binding + L'a'; input_mapping_add(prefix_binding.c_str(), L"up-line"); input_mapping_add(desired_binding.c_str(), L"down-line"); - + /* Push the desired binding on the stack (backwards!) */ size_t idx = desired_binding.size(); while (idx--) { input_unreadch(desired_binding.at(idx)); } - + /* Now test */ wint_t c = input_readch(); if (c != R_DOWN_LINE) @@ -2748,7 +3003,7 @@ static void test_highlighting(void) {L"'single_quote", highlight_spec_error}, {NULL, -1} }; - + const highlight_component_t components11[] = { {L"echo", highlight_spec_command}, @@ -2761,7 +3016,7 @@ static void test_highlighting(void) {L"]", highlight_spec_operator}, {NULL, -1} }; - + const highlight_component_t components12[] = { {L"for", highlight_spec_command}, @@ -2867,6 +3122,7 @@ int main(int argc, char **argv) if (should_test_function("cancellation")) test_cancellation(); if (should_test_function("indents")) test_indents(); if (should_test_function("utils")) test_utils(); + if (should_test_function("utf8")) test_utf8(); if (should_test_function("escape_sequences")) test_escape_sequences(); if (should_test_function("lru")) test_lru(); if (should_test_function("expand")) test_expand(); @@ -2906,7 +3162,8 @@ int main(int argc, char **argv) event_destroy(); proc_destroy(); - if(err_count != 0) { + if (err_count != 0) + { return(1); } } diff --git a/utf8.cpp b/utf8.cpp new file mode 100644 index 00000000..60d83d33 --- /dev/null +++ b/utf8.cpp @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2007 Alexey Vatchenko + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include + +#include + +#include "utf8.h" + +#include +#include +#include + + +#define _NXT 0x80 +#define _SEQ2 0xc0 +#define _SEQ3 0xe0 +#define _SEQ4 0xf0 +#define _SEQ5 0xf8 +#define _SEQ6 0xfc + +#define _BOM 0xfeff + +/* We can tweak the following typedef to allow us to simulate Windows-style 16 bit wchar's on Unix */ +typedef wchar_t utf8_wchar_t; +#define UTF8_WCHAR_MAX ((size_t)std::numeric_limits::max()) + +bool is_wchar_ucs2() +{ + return UTF8_WCHAR_MAX <= 0xFFFF; +} + +static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t *out, size_t outsize, int flags); +static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags); + +static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count) +{ + bool result = true; + for (size_t i=0; i < count; i++) + { + wchar_t c = in[i]; + if (c > UTF8_WCHAR_MAX) + { + result = false; + break; + } + out[i] = c; + } + return result; +} + +bool utf8_to_wchar_string(const std::string &str, std::wstring *result) +{ + result->clear(); + const size_t inlen = str.size(); + if (inlen == 0) + { + return true; + } + + bool success = false; + const char *input = str.c_str(); + size_t outlen = utf8_to_wchar(input, inlen, NULL, 0, 0); + if (outlen > 0) + { + wchar_t *tmp = new wchar_t[outlen]; + size_t outlen2 = utf8_to_wchar(input, inlen, tmp, outlen, 0); + if (outlen2 > 0) + { + result->assign(tmp, outlen2); + success = true; + } + delete[] tmp; + } + return success; +} + +bool wchar_to_utf8_string(const std::wstring &str, std::string *result) +{ + result->clear(); + const size_t inlen = 
str.size(); + if (inlen == 0) + { + return true; + } + + bool success = false; + const wchar_t *input = str.c_str(); + size_t outlen = wchar_to_utf8(input, inlen, NULL, 0, 0); + if (outlen > 0) + { + char *tmp = new char[outlen]; + size_t outlen2 = wchar_to_utf8(input, inlen, tmp, outlen, 0); + if (outlen2 > 0) + { + result->assign(tmp, outlen2); + success = true; + } + delete[] tmp; + } + return success; +} + +size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags) +{ + if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) + { + return 0; + } + + size_t result; + if (sizeof(wchar_t) == sizeof(utf8_wchar_t)) + { + result = utf8_to_wchar_internal(in, insize, reinterpret_cast(out), outsize, flags); + } + else + { + // Allocate a temporary buffer to hold the output + // note: outsize may be 0 + utf8_wchar_t *tmp_output = new utf8_wchar_t[outsize]; + + // Invoke the conversion with the temporary + result = utf8_to_wchar_internal(in, insize, tmp_output, outsize, flags); + + // Copy back from tmp to the function's output, then clean it up + size_t amount_to_copy = std::min(result, outsize); + std::copy(tmp_output, tmp_output + amount_to_copy, out); + delete[] tmp_output; + } + return result; +} + +size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags) +{ + if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) + { + return 0; + } + + size_t result; + if (sizeof(wchar_t) == sizeof(utf8_wchar_t)) + { + result = wchar_to_utf8_internal(reinterpret_cast(in), insize, out, outsize, flags); + } + else + { + // Allocate a temporary buffer to hold the input + // the std::copy performs the size conversion + // note: insize may be 0 + utf8_wchar_t *tmp_input = new utf8_wchar_t[insize]; + if (! 
safe_copy_wchar_to_utf8_wchar(in, tmp_input, insize)) + { + // our utf8_wchar_t is UCS-2 and there was an astral character + result = 0; + } + else + { + // Invoke the conversion with the temporary, then clean up the input + result = wchar_to_utf8_internal(tmp_input, insize, out, outsize, flags); + } + delete[] tmp_input; + } + return result; +} + + +static int __wchar_forbitten(utf8_wchar_t sym); +static int __utf8_forbitten(unsigned char octet); + +static int +__wchar_forbitten(utf8_wchar_t sym) +{ + + /* Surrogate pairs */ + if (sym >= 0xd800 && sym <= 0xdfff) + return (-1); + + return (0); +} + +static int +__utf8_forbitten(unsigned char octet) +{ + + switch (octet) + { + case 0xc0: + case 0xc1: + case 0xf5: + case 0xff: + return (-1); + } + + return (0); +} + +/* + * DESCRIPTION + * This function translates UTF-8 string into UCS-2 or UCS-4 string (all symbols + * will be in local machine byte order). + * + * It takes the following arguments: + * in - input UTF-8 string. It can be null-terminated. + * insize - size of input string in bytes. + * out - result buffer for UCS-2/4 string. If out is NULL, + * function returns size of result buffer. + * outsize - size of out buffer in wide characters. + * + * RETURN VALUES + * The function returns size of result buffer (in wide characters). + * Zero is returned in case of error. + * + * CAVEATS + * 1. If UTF-8 string contains zero symbols, they will be translated + * as regular symbols. + * 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary + * when `out' is NULL and not NULL. It's because of special UTF-8 + * sequences which may result in forbidden (by RFC3629) UNICODE + * characters. So, the caller must check return value every time and + * not prepare buffer in advance (\0 terminate) but after calling this + * function. 
+ */ +static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t *out, size_t outsize, int flags) +{ + unsigned char *p, *lim; + utf8_wchar_t *wlim, high; + size_t n, total, i, n_bits; + + if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) + return (0); + + total = 0; + p = (unsigned char *)in; + lim = p + insize; + wlim = out + outsize; + + for (; p < lim; p += n) + { + if (__utf8_forbitten(*p) != 0 && + (flags & UTF8_IGNORE_ERROR) == 0) + return (0); + + /* + * Get number of bytes for one wide character. + */ + n = 1; /* default: 1 byte. Used when skipping bytes. */ + if ((*p & 0x80) == 0) + high = (utf8_wchar_t)*p; + else if ((*p & 0xe0) == _SEQ2) + { + n = 2; + high = (utf8_wchar_t)(*p & 0x1f); + } + else if ((*p & 0xf0) == _SEQ3) + { + n = 3; + high = (utf8_wchar_t)(*p & 0x0f); + } + else if ((*p & 0xf8) == _SEQ4) + { + n = 4; + high = (utf8_wchar_t)(*p & 0x07); + } + else if ((*p & 0xfc) == _SEQ5) + { + n = 5; + high = (utf8_wchar_t)(*p & 0x03); + } + else if ((*p & 0xfe) == _SEQ6) + { + n = 6; + high = (utf8_wchar_t)(*p & 0x01); + } + else + { + if ((flags & UTF8_IGNORE_ERROR) == 0) + return (0); + continue; + } + + /* does the sequence header tell us truth about length? */ + if (lim - p <= n - 1) + { + if ((flags & UTF8_IGNORE_ERROR) == 0) + return (0); + n = 1; + continue; /* skip */ + } + + /* + * Validate sequence. 
+ * All symbols must have higher bits set to 10xxxxxx + */ + if (n > 1) + { + for (i = 1; i < n; i++) + { + if ((p[i] & 0xc0) != _NXT) + break; + } + if (i != n) + { + if ((flags & UTF8_IGNORE_ERROR) == 0) + return (0); + n = 1; + continue; /* skip */ + } + } + + total++; + + if (out == NULL) + continue; + + if (out >= wlim) + return (0); /* no space left */ + + uint32_t out_val = 0; + *out = 0; + n_bits = 0; + for (i = 1; i < n; i++) + { + out_val |= (utf8_wchar_t)(p[n - i] & 0x3f) << n_bits; + n_bits += 6; /* 6 low bits in every byte */ + } + out_val |= high << n_bits; + + bool skip = false; + if (__wchar_forbitten(out_val) != 0) + { + if ((flags & UTF8_IGNORE_ERROR) == 0) + { + return 0; /* forbitten character */ + } + else + { + skip = true; + } + } + else if (out_val == _BOM && (flags & UTF8_SKIP_BOM) != 0) + { + skip = true; + } + + if (skip) + { + total--; + } + else if (out_val > UTF8_WCHAR_MAX) + { + // wchar_t is UCS-2, but the UTF-8 specified an astral character + return 0; + } + else + { + *out++ = out_val; + } + } + + return (total); +} + +/* + * DESCRIPTION + * This function translates UCS-2/4 symbols (given in local machine + * byte order) into UTF-8 string. + * + * It takes the following arguments: + * in - input unicode string. It can be null-terminated. + * insize - size of input string in wide characters. + * out - result buffer for utf8 string. If out is NULL, + * function returns size of result buffer. + * outsize - size of result buffer. + * + * RETURN VALUES + * The function returns size of result buffer (in bytes). Zero is returned + * in case of error. + * + * CAVEATS + * If UCS-4 string contains zero symbols, they will be translated + * as regular symbols. 
+ */ +static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags) +{ + const utf8_wchar_t *w, *wlim; + unsigned char *p, *lim; + size_t total, n; + + if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) + return (0); + + w = in; + wlim = w + insize; + p = (unsigned char *)out; + lim = p + outsize; + total = 0; + for (; w < wlim; w++) + { + if (__wchar_forbitten(*w) != 0) + { + if ((flags & UTF8_IGNORE_ERROR) == 0) + return (0); + else + continue; + } + + if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0) + continue; + + const int32_t w_wide = *w; + if (w_wide < 0) + { + if ((flags & UTF8_IGNORE_ERROR) == 0) + return (0); + continue; + } + else if (w_wide <= 0x0000007f) + n = 1; + else if (w_wide <= 0x000007ff) + n = 2; + else if (w_wide <= 0x0000ffff) + n = 3; + else if (w_wide <= 0x001fffff) + n = 4; + else if (w_wide <= 0x03ffffff) + n = 5; + else /* if (w_wide <= 0x7fffffff) */ + n = 6; + + total += n; + + if (out == NULL) + continue; + + if (lim - p <= n - 1) + return (0); /* no space left */ + + /* extract the wchar_t as big-endian. 
If wchar_t is UCS-2, the first two bytes will be 0 */ + unsigned char oc[4]; + uint32_t w_tmp = *w; + oc[3] = w_tmp & 0xFF; + w_tmp >>= 8; + oc[2] = w_tmp & 0xFF; + w_tmp >>= 8; + oc[1] = w_tmp & 0xFF; + w_tmp >>= 8; + oc[0] = w_tmp & 0xFF; + + switch (n) + { + case 1: + p[0] = oc[3]; + break; + + case 2: + p[1] = _NXT | (oc[3] & 0x3f); + p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2); + break; + + case 3: + p[2] = _NXT | (oc[3] & 0x3f); + p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2); + p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4); + break; + + case 4: + p[3] = _NXT | (oc[3] & 0x3f); + p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2); + p[1] = _NXT | ((oc[2] & 0xf0) >> 4) | + ((oc[1] & 0x03) << 4); + p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2); + break; + + case 5: + p[4] = _NXT | (oc[3] & 0x3f); + p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2); + p[2] = _NXT | ((oc[2] & 0xf0) >> 4) | + ((oc[1] & 0x03) << 4); + p[1] = _NXT | (oc[1] >> 2); + p[0] = _SEQ5 | (oc[0] & 0x03); + break; + + case 6: + p[5] = _NXT | (oc[3] & 0x3f); + p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2); + p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4); + p[2] = _NXT | (oc[1] >> 2); + p[1] = _NXT | (oc[0] & 0x3f); + p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6); + break; + } + + /* + * NOTE: do not check here for forbidden UTF-8 characters. + * They cannot appear here because we do proper conversion. + */ + + p += n; + } + + return (total); +} diff --git a/utf8.h b/utf8.h new file mode 100644 index 00000000..18aa5265 --- /dev/null +++ b/utf8.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2007 Alexey Vatchenko + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * utf8: implementation of UTF-8 charset encoding (RFC3629). + */ +#ifndef _UTF8_H_ +#define _UTF8_H_ + +#include + +#include +#include + +#define UTF8_IGNORE_ERROR 0x01 +#define UTF8_SKIP_BOM 0x02 + +bool utf8_to_wchar_string(const std::string &input, std::wstring *result); +bool wchar_to_utf8_string(const std::wstring &input, std::string *result); + +size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags); +size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags); + +bool is_wchar_ucs2(); + +#endif /* !_UTF8_H_ */ -- cgit v1.2.3