aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r-- Makefile.in 4
-rw-r--r-- doc_src/license.hdr 16
-rw-r--r-- fish.xcodeproj/project.pbxproj 8
-rw-r--r-- fish_tests.cpp 307
-rw-r--r-- utf8.cpp 514
-rw-r--r-- utf8.h 39
6 files changed, 861 insertions, 27 deletions
diff --git a/Makefile.in b/Makefile.in
index 8d18ce1d..3063438c 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -91,7 +91,7 @@ FISH_OBJS := function.o builtin.o complete.o env.o exec.o expand.o \
signal.o io.o parse_util.o common.o screen.o path.o autoload.o \
parser_keywords.o iothread.o color.o postfork.o \
builtin_test.o parse_tree.o parse_productions.o parse_execution.cpp \
- pager.cpp
+ pager.cpp utf8.o
FISH_INDENT_OBJS := fish_indent.o print_help.o common.o \
parser_keywords.o wutil.o tokenizer.o
@@ -117,7 +117,7 @@ FISH_TESTS_OBJS := $(FISH_OBJS) fish_tests.o
#
FISHD_OBJS := fishd.o env_universal_common.o wutil.o print_help.o \
- common.o
+ common.o utf8.o
#
diff --git a/doc_src/license.hdr b/doc_src/license.hdr
index c07a94ad..7f14383a 100644
--- a/doc_src/license.hdr
+++ b/doc_src/license.hdr
@@ -1402,4 +1402,20 @@ POSSIBILITY OF SUCH DAMAGES.
*/
+<h2>License for UTF8</h2>
+
+<p>Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
+
+<p>Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+<p>THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
\htmlonly </div> \endhtmlonly
diff --git a/fish.xcodeproj/project.pbxproj b/fish.xcodeproj/project.pbxproj
index d51f53e3..47d04c4c 100644
--- a/fish.xcodeproj/project.pbxproj
+++ b/fish.xcodeproj/project.pbxproj
@@ -117,6 +117,8 @@
D0A564FE168D23D800AF6161 /* man in CopyFiles */ = {isa = PBXBuildFile; fileRef = D0A564F1168D0BAB00AF6161 /* man */; };
D0A56501168D258300AF6161 /* man in Copy Files */ = {isa = PBXBuildFile; fileRef = D0A564F1168D0BAB00AF6161 /* man */; };
D0C52F371765284C00BFAB82 /* parse_tree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C52F351765284C00BFAB82 /* parse_tree.cpp */; };
+ D0C9733818DE5449002D7C81 /* utf8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C9733718DE5449002D7C81 /* utf8.cpp */; };
+ D0C9733918DE5449002D7C81 /* utf8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C9733718DE5449002D7C81 /* utf8.cpp */; };
D0CBD587159EF0E10024809C /* launch_fish.scpt in Resources */ = {isa = PBXBuildFile; fileRef = D0CBD586159EF0E10024809C /* launch_fish.scpt */; };
D0D02A67159837AD008E62BD /* complete.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0A0853713B3ACEE0099B651 /* complete.cpp */; };
D0D02A69159837B2008E62BD /* env.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0A0853A13B3ACEE0099B651 /* env.cpp */; };
@@ -475,6 +477,8 @@
D0C6FCC914CFA4B0004CE8AD /* autoload.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = autoload.cpp; sourceTree = "<group>"; };
D0C6FCCB14CFA4B7004CE8AD /* autoload.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = autoload.h; sourceTree = "<group>"; };
D0C861EA16CC7054003B5A04 /* builtin_set_color.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = builtin_set_color.cpp; sourceTree = "<group>"; };
+ D0C9733718DE5449002D7C81 /* utf8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = utf8.cpp; sourceTree = "<group>"; };
+ D0C9733A18DE5451002D7C81 /* utf8.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = utf8.h; sourceTree = "<group>"; };
D0CA63F316FC275F00093BD4 /* builtin_printf.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = builtin_printf.cpp; sourceTree = "<group>"; };
D0CBD580159EE48F0024809C /* config.fish */ = {isa = PBXFileReference; lastKnownFileType = text; name = config.fish; path = share/config.fish; sourceTree = "<group>"; };
D0CBD583159EEE010024809C /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
@@ -686,6 +690,8 @@
D0A0855C13B3ACEE0099B651 /* signal.cpp */,
D0A0852513B3ACEE0099B651 /* tokenizer.h */,
D0A0855D13B3ACEE0099B651 /* tokenizer.cpp */,
+ D0C9733A18DE5451002D7C81 /* utf8.h */,
+ D0C9733718DE5449002D7C81 /* utf8.cpp */,
D0A0852613B3ACEE0099B651 /* util.h */,
D0A0855E13B3ACEE0099B651 /* util.cpp */,
D0A0852713B3ACEE0099B651 /* wgetopt.h */,
@@ -1120,6 +1126,7 @@
files = (
D0D02AC215985F3F008E62BD /* fishd.cpp in Sources */,
D0D02AC315985F43008E62BD /* env_universal_common.cpp in Sources */,
+ D0C9733918DE5449002D7C81 /* utf8.cpp in Sources */,
D0D02AC415985F4D008E62BD /* wutil.cpp in Sources */,
D0D02AC515985F5B008E62BD /* print_help.cpp in Sources */,
D0D02AC615985F65008E62BD /* common.cpp in Sources */,
@@ -1157,6 +1164,7 @@
D0D02A86159839D5008E62BD /* postfork.cpp in Sources */,
D0D02A87159839D5008E62BD /* screen.cpp in Sources */,
D0D02A88159839D5008E62BD /* signal.cpp in Sources */,
+ D0C9733818DE5449002D7C81 /* utf8.cpp in Sources */,
D0D2694A15983779005D9B9C /* builtin.cpp in Sources */,
D0D2694915983772005D9B9C /* function.cpp in Sources */,
D0D02A67159837AD008E62BD /* complete.cpp in Sources */,
diff --git a/fish_tests.cpp b/fish_tests.cpp
index 5b7bcea6..f07b568c 100644
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@@ -62,6 +62,7 @@
#include "parse_util.h"
#include "pager.h"
#include "input.h"
+#include "utf8.h"
static const char * const * s_arguments;
static int s_test_run_count = 0;
@@ -140,17 +141,17 @@ static void err(const wchar_t *blah, ...)
va_list va;
va_start(va, blah);
err_count++;
-
+
// show errors in red
fputs("\x1b[31m", stdout);
wprintf(L"Error: ");
vwprintf(blah, va);
va_end(va);
-
+
// return to normal color
fputs("\x1b[0m", stdout);
-
+
wprintf(L"\n");
}
@@ -857,6 +858,260 @@ static void test_utils()
if (begin != a + wcslen(L"echo (echo (")) err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
}
+/* UTF8 tests taken from Alexey Vatchenko's utf8 library. See http://www.bsdua.org/libbsdua.html */
+
+/**
+   Run one UTF-8 -> wide character conversion test case and report any mismatch through err().
+
+   \param src   UTF-8 input bytes (may be NULL to exercise parameter validation)
+   \param slen  length of src in bytes
+   \param dst   expected wide characters, or NULL to test length-only mode
+   \param dlen  capacity handed to utf8_to_wchar, in wide characters
+   \param flags UTF8_* flags passed through to utf8_to_wchar
+   \param res   expected return value of utf8_to_wchar
+   \param descr human readable name of the test case, used in error messages
+*/
+static void test_utf82wchar(const char *src, size_t slen, const wchar_t *dst, size_t dlen,
+                            int flags, size_t res, const char *descr)
+{
+    /* Hack: if wchar is only UCS-2, and the UTF-8 input string contains astral characters, then tweak the expected size to 0 */
+    if (src != NULL && is_wchar_ucs2())
+    {
+        /* A UTF-8 code unit may represent an astral code point if it has 4 or more leading 1s */
+        const unsigned char astral_mask = 0xF0;
+        for (size_t i=0; i < slen; i++)
+        {
+            if ((src[i] & astral_mask) == astral_mask)
+            {
+                /* Astral char. We expect this conversion to just fail. */
+                res = 0;
+                break;
+            }
+        }
+    }
+
+    wchar_t *mem = NULL;
+    if (dst != NULL)
+    {
+        /* Some test cases deliberately pass dlen == 0 with a non-NULL dst. malloc(0) is allowed
+           to return NULL, which must not be mistaken for an allocation failure (and must not
+           silently turn the case into a length-only call), so always request at least one element. */
+        const size_t alloc_count = (dlen > 0) ? dlen : 1;
+        mem = (wchar_t *)malloc(alloc_count * sizeof(*mem));
+        if (mem == NULL)
+        {
+            err(L"u2w: %s: MALLOC FAILED\n", descr);
+            return;
+        }
+    }
+
+    const size_t size = utf8_to_wchar(src, slen, mem, dlen, flags);
+    if (res != size)
+    {
+        /* size_t is not necessarily unsigned long, so cast explicitly for %lu */
+        err(L"u2w: %s: FAILED (rv: %lu, must be %lu)", descr, (unsigned long)size, (unsigned long)res);
+    }
+    else if (mem != NULL && memcmp(mem, dst, size * sizeof(*mem)) != 0)
+    {
+        /* The length matched but the converted contents differ from the expectation */
+        err(L"u2w: %s: BROKEN", descr);
+    }
+
+    /* free(NULL) is a no-op, so no guard is needed */
+    free(mem);
+}
+
+/**
+   Run one wide character -> UTF-8 conversion test case and report any mismatch through err().
+
+   \param src   wide character input (may be NULL to exercise parameter validation)
+   \param slen  length of src in wide characters
+   \param dst   expected UTF-8 bytes, or NULL to test length-only mode
+   \param dlen  capacity handed to wchar_to_utf8, in bytes
+   \param flags UTF8_* flags passed through to wchar_to_utf8
+   \param res   expected return value of wchar_to_utf8
+   \param descr human readable name of the test case, used in error messages
+*/
+static void test_wchar2utf8(const wchar_t *src, size_t slen, const char *dst, size_t dlen,
+                            int flags, size_t res, const char *descr)
+{
+    /* Hack: if wchar is simulating UCS-2, and the wchar_t input string contains astral characters, then tweak the expected size to 0 */
+    if (src != NULL && is_wchar_ucs2())
+    {
+        const uint32_t astral_mask = 0xFFFF0000U;
+        for (size_t i=0; i < slen; i++)
+        {
+            if ((src[i] & astral_mask) != 0)
+            {
+                /* astral char */
+                res = 0;
+                break;
+            }
+        }
+    }
+
+    char *mem = NULL;
+    if (dst != NULL)
+    {
+        /* Some test cases deliberately pass dlen == 0 with a non-NULL dst. malloc(0) is allowed
+           to return NULL, which must not be mistaken for an allocation failure (and must not
+           silently turn the case into a length-only call), so always request at least one byte. */
+        mem = (char *)malloc((dlen > 0) ? dlen : 1);
+        if (mem == NULL)
+        {
+            err(L"w2u: %s: MALLOC FAILED", descr);
+            return;
+        }
+    }
+
+    const size_t size = wchar_to_utf8(src, slen, mem, dlen, flags);
+    if (res != size)
+    {
+        /* size_t is not necessarily unsigned long, so cast explicitly for %lu */
+        err(L"w2u: %s: FAILED (rv: %lu, must be %lu)", descr, (unsigned long)size, (unsigned long)res);
+    }
+    else if (mem != NULL && memcmp(mem, dst, size) != 0)
+    {
+        /* The length matched but the converted bytes differ from the expectation */
+        err(L"w2u: %s: BROKEN", descr);
+    }
+
+    /* free(NULL) is a no-op, so no guard is needed (the old guard had a stray ';' and did nothing) */
+    free(mem);
+}
+
+/* Table-driven test of utf8_to_wchar and wchar_to_utf8. Each call below runs one case through
+ test_utf82wchar / test_wchar2utf8, which report mismatches via err(). */
+static void test_utf8()
+{
+ /* Wide character fixtures; most are paired with the same-numbered u* byte array below */
+ wchar_t w1[] = {0x54, 0x65, 0x73, 0x74};
+ wchar_t w2[] = {0x0422, 0x0435, 0x0441, 0x0442};
+ wchar_t w3[] = {0x800, 0x1e80, 0x98c4, 0x9910, 0xff00};
+ wchar_t w4[] = {0x15555, 0xf7777, 0xa};
+ wchar_t w5[] = {0x255555, 0x1fa04ff, 0xddfd04, 0xa};
+ wchar_t w6[] = {0xf255555, 0x1dfa04ff, 0x7fddfd04, 0xa};
+ wchar_t wb[] = {-2, 0xa, 0xffffffff, 0x0441};
+ wchar_t wm[] = {0x41, 0x0441, 0x3042, 0xff67, 0x9b0d, 0x2e05da67};
+ wchar_t wb1[] = {0xa, 0x0422};
+ wchar_t wb2[] = {0xd800, 0xda00, 0x41, 0xdfff, 0xa};
+ wchar_t wbom[] = {0xfeff, 0x41, 0xa};
+ wchar_t wbom2[] = {0x41, 0xa};
+ wchar_t wbom22[] = {0xfeff, 0x41, 0xa};
+ /* UTF-8 byte fixtures */
+ char u1[] = {0x54, 0x65, 0x73, 0x74};
+ char u2[] = {0xd0, 0xa2, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82};
+ char u3[] = {0xe0, 0xa0, 0x80, 0xe1, 0xba, 0x80, 0xe9, 0xa3, 0x84,
+ 0xe9, 0xa4, 0x90, 0xef, 0xbc, 0x80
+ };
+ char u4[] = {0xf0, 0x95, 0x95, 0x95, 0xf3, 0xb7, 0x9d, 0xb7, 0xa};
+ char u5[] = {0xf8, 0x89, 0x95, 0x95, 0x95, 0xf9, 0xbe, 0xa0, 0x93,
+ 0xbf, 0xf8, 0xb7, 0x9f, 0xb4, 0x84, 0x0a
+ };
+ char u6[] = {0xfc, 0x8f, 0x89, 0x95, 0x95, 0x95, 0xfc, 0x9d, 0xbe,
+ 0xa0, 0x93, 0xbf, 0xfd, 0xbf, 0xb7, 0x9f, 0xb4, 0x84, 0x0a
+ };
+ char ub[] = {0xa, 0xd1, 0x81};
+ char um[] = {0x41, 0xd1, 0x81, 0xe3, 0x81, 0x82, 0xef, 0xbd, 0xa7,
+ 0xe9, 0xac, 0x8d, 0xfc, 0xae, 0x81, 0x9d, 0xa9, 0xa7
+ };
+ char ub1[] = {0xa, 0xff, 0xd0, 0xa2, 0xfe, 0x8f, 0xe0, 0x80};
+ char uc080[] = {0xc0, 0x80};
+ char ub2[] = {0xed, 0xa1, 0x8c, 0xed, 0xbe, 0xb4, 0xa};
+ char ubom[] = {0x41, 0xa};
+ char ubom2[] = {0xef, 0xbb, 0xbf, 0x41, 0xa};
+
+ /*
+ * UTF-8 -> UCS-4 string.
+ */
+ test_utf82wchar(ubom2, sizeof(ubom2), wbom2,
+ sizeof(wbom2) / sizeof(*wbom2), UTF8_SKIP_BOM,
+ sizeof(wbom2) / sizeof(*wbom2), "skip BOM");
+ test_utf82wchar(ubom2, sizeof(ubom2), wbom22,
+ sizeof(wbom22) / sizeof(*wbom22), 0,
+ sizeof(wbom22) / sizeof(*wbom22), "BOM");
+ test_utf82wchar(uc080, sizeof(uc080), NULL, 0, 0, 0,
+ "c0 80 - forbitten by rfc3629");
+ /* In length-only mode the expected count depends on whether wchar_t is UCS-2 */
+ test_utf82wchar(ub2, sizeof(ub2), NULL, 0, 0, is_wchar_ucs2() ? 0 : 3,
+ "resulted in forbitten wchars (len)");
+ test_utf82wchar(ub2, sizeof(ub2), wb2, sizeof(wb2) / sizeof(*wb2), 0, 0,
+ "resulted in forbitten wchars");
+ test_utf82wchar(ub2, sizeof(ub2), L"\x0a", 1, UTF8_IGNORE_ERROR,
+ 1, "resulted in ignored forbitten wchars");
+ test_utf82wchar(u1, sizeof(u1), w1, sizeof(w1) / sizeof(*w1), 0,
+ sizeof(w1) / sizeof(*w1), "1 octet chars");
+ test_utf82wchar(u2, sizeof(u2), w2, sizeof(w2) / sizeof(*w2), 0,
+ sizeof(w2) / sizeof(*w2), "2 octets chars");
+ test_utf82wchar(u3, sizeof(u3), w3, sizeof(w3) / sizeof(*w3), 0,
+ sizeof(w3) / sizeof(*w3), "3 octets chars");
+ test_utf82wchar(u4, sizeof(u4), w4, sizeof(w4) / sizeof(*w4), 0,
+ sizeof(w4) / sizeof(*w4), "4 octets chars");
+ test_utf82wchar(u5, sizeof(u5), w5, sizeof(w5) / sizeof(*w5), 0,
+ sizeof(w5) / sizeof(*w5), "5 octets chars");
+ test_utf82wchar(u6, sizeof(u6), w6, sizeof(w6) / sizeof(*w6), 0,
+ sizeof(w6) / sizeof(*w6), "6 octets chars");
+ test_utf82wchar("\xff", 1, NULL, 0, 0, 0, "broken utf-8 0xff symbol");
+ test_utf82wchar("\xfe", 1, NULL, 0, 0, 0, "broken utf-8 0xfe symbol");
+ test_utf82wchar("\x8f", 1, NULL, 0, 0, 0,
+ "broken utf-8, start from 10 higher bits");
+ /* This case is only run when wchar_t is wider than UCS-2 */
+ if (! is_wchar_ucs2()) test_utf82wchar(ub1, sizeof(ub1), wb1, sizeof(wb1) / sizeof(*wb1),
+ UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1), "ignore bad chars");
+ test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm), 0,
+ sizeof(wm) / sizeof(*wm), "mixed languages");
+ /* Output buffer one element too small: conversion is expected to fail */
+ test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) - 1, 0,
+ 0, "boundaries -1");
+ test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) + 1, 0,
+ sizeof(wm) / sizeof(*wm), "boundaries +1");
+ test_utf82wchar(um, sizeof(um), NULL, 0, 0,
+ sizeof(wm) / sizeof(*wm), "calculate length");
+ test_utf82wchar(ub1, sizeof(ub1), NULL, 0, 0,
+ 0, "calculate length of bad chars");
+ test_utf82wchar(ub1, sizeof(ub1), NULL, 0,
+ UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1),
+ "calculate length, ignore bad chars");
+ /* Parameter validation: every one of these is expected to return 0 */
+ test_utf82wchar(NULL, 0, NULL, 0, 0, 0, "invalid params, all 0");
+ test_utf82wchar(u1, 0, NULL, 0, 0, 0,
+ "invalid params, src buf not NULL");
+ test_utf82wchar(NULL, 10, NULL, 0, 0, 0,
+ "invalid params, src length is not 0");
+ test_utf82wchar(u1, sizeof(u1), w1, 0, 0, 0,
+ "invalid params, dst is not NULL");
+
+ /*
+ * UCS-4 -> UTF-8 string.
+ */
+ test_wchar2utf8(wbom, sizeof(wbom) / sizeof(*wbom), ubom, sizeof(ubom),
+ UTF8_SKIP_BOM, sizeof(ubom), "BOM");
+ test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), NULL, 0, 0,
+ 0, "prohibited wchars");
+ test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), NULL, 0,
+ UTF8_IGNORE_ERROR, 2, "ignore prohibited wchars");
+ test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, sizeof(u1), 0,
+ sizeof(u1), "1 octet chars");
+ test_wchar2utf8(w2, sizeof(w2) / sizeof(*w2), u2, sizeof(u2), 0,
+ sizeof(u2), "2 octets chars");
+ test_wchar2utf8(w3, sizeof(w3) / sizeof(*w3), u3, sizeof(u3), 0,
+ sizeof(u3), "3 octets chars");
+ test_wchar2utf8(w4, sizeof(w4) / sizeof(*w4), u4, sizeof(u4), 0,
+ sizeof(u4), "4 octets chars");
+ test_wchar2utf8(w5, sizeof(w5) / sizeof(*w5), u5, sizeof(u5), 0,
+ sizeof(u5), "5 octets chars");
+ test_wchar2utf8(w6, sizeof(w6) / sizeof(*w6), u6, sizeof(u6), 0,
+ sizeof(u6), "6 octets chars");
+ test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub), 0,
+ 0, "bad chars");
+ test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub),
+ UTF8_IGNORE_ERROR, sizeof(ub), "ignore bad chars");
+ test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um), 0,
+ sizeof(um), "mixed languages");
+ /* Output buffer one byte too small: conversion is expected to fail */
+ test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) - 1, 0,
+ 0, "boundaries -1");
+ test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) + 1, 0,
+ sizeof(um), "boundaries +1");
+ test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), NULL, 0, 0,
+ sizeof(um), "calculate length");
+ test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), NULL, 0, 0,
+ 0, "calculate length of bad chars");
+ test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), NULL, 0,
+ UTF8_IGNORE_ERROR, sizeof(ub),
+ "calculate length, ignore bad chars");
+ /* Parameter validation: every one of these is expected to return 0 */
+ test_wchar2utf8(NULL, 0, NULL, 0, 0, 0, "invalid params, all 0");
+ test_wchar2utf8(w1, 0, NULL, 0, 0, 0,
+ "invalid params, src buf not NULL");
+ test_wchar2utf8(NULL, 10, NULL, 0, 0, 0,
+ "invalid params, src length is not 0");
+ test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, 0, 0, 0,
+ "invalid params, dst is not NULL");
+}
+
static void test_escape_sequences(void)
{
say(L"Testing escape codes");
@@ -1110,9 +1365,9 @@ static void test_path()
static void test_pager_navigation()
{
say(L"Testing pager navigation");
-
+
/* Generate 19 strings of width 10. There's 2 spaces between completions, and our term size is 80; these can therefore fit into 6 columns (6 * 12 - 2 = 70) or 5 columns (58) but not 7 columns (7 * 12 - 2 = 82).
-
+
You can simulate this test by creating 19 files named "file00.txt" through "file_18.txt".
*/
completion_list_t completions;
@@ -1120,31 +1375,31 @@ static void test_pager_navigation()
{
append_completion(completions, L"abcdefghij");
}
-
+
pager_t pager;
pager.set_completions(completions);
pager.set_term_size(80, 24);
page_rendering_t render = pager.render();
-
+
if (render.term_width != 80)
err(L"Wrong term width");
if (render.term_height != 24)
err(L"Wrong term height");
-
+
size_t rows = 4, cols = 5;
-
+
/* We have 19 completions. We can fit into 6 columns with 4 rows or 5 columns with 4 rows; the second one is better and so is what we ought to have picked. */
if (render.rows != rows)
err(L"Wrong row count");
if (render.cols != cols)
err(L"Wrong column count");
-
+
/* Initially expect to have no completion index */
if (render.selected_completion_idx != (size_t)(-1))
{
err(L"Wrong initial selection");
}
-
+
/* Here are navigation directions and where we expect the selection to be */
const struct
{
@@ -1155,31 +1410,31 @@ static void test_pager_navigation()
{
/* Tab completion to get into the list */
{direction_next, 0},
-
+
/* Westward motion in upper left wraps along the top row */
{direction_west, 16},
{direction_east, 1},
-
+
/* "Next" motion goes down the column */
{direction_next, 2},
{direction_next, 3},
-
+
{direction_west, 18},
{direction_east, 3},
{direction_east, 7},
{direction_east, 11},
{direction_east, 15},
{direction_east, 3},
-
+
{direction_west, 18},
{direction_east, 3},
-
+
/* Eastward motion wraps along the bottom, westward goes to the prior column */
{direction_east, 7},
{direction_east, 11},
{direction_east, 15},
{direction_east, 3},
-
+
/* Column memory */
{direction_west, 18},
{direction_south, 15},
@@ -1197,7 +1452,7 @@ static void test_pager_navigation()
err(L"For command %lu, expected selection %lu, but found instead %lu\n", i, cmds[i].sel, render.selected_completion_idx);
}
}
-
+
}
enum word_motion_t
@@ -1536,14 +1791,14 @@ static void test_complete(void)
completions.clear();
complete(L"echo (builtin scuttlebut", completions, COMPLETION_REQUEST_DEFAULT);
do_test(completions.size() == 0);
-
+
/* Trailing spaces (#1261) */
complete_add(L"foobarbaz", false, 0, NULL, 0, NO_FILES, NULL, L"qux", NULL, COMPLETE_AUTO_SPACE);
completions.clear();
complete(L"foobarbaz ", completions, COMPLETION_REQUEST_DEFAULT);
do_test(completions.size() == 1);
do_test(completions.at(0).completion == L"qux");
-
+
/* Don't complete variable names in single quotes (#1023) */
completions.clear();
complete(L"echo '$Foo", completions, COMPLETION_REQUEST_DEFAULT);
@@ -1814,14 +2069,14 @@ static void test_input()
wcstring desired_binding = prefix_binding + L'a';
input_mapping_add(prefix_binding.c_str(), L"up-line");
input_mapping_add(desired_binding.c_str(), L"down-line");
-
+
/* Push the desired binding on the stack (backwards!) */
size_t idx = desired_binding.size();
while (idx--)
{
input_unreadch(desired_binding.at(idx));
}
-
+
/* Now test */
wint_t c = input_readch();
if (c != R_DOWN_LINE)
@@ -2748,7 +3003,7 @@ static void test_highlighting(void)
{L"'single_quote", highlight_spec_error},
{NULL, -1}
};
-
+
const highlight_component_t components11[] =
{
{L"echo", highlight_spec_command},
@@ -2761,7 +3016,7 @@ static void test_highlighting(void)
{L"]", highlight_spec_operator},
{NULL, -1}
};
-
+
const highlight_component_t components12[] =
{
{L"for", highlight_spec_command},
@@ -2867,6 +3122,7 @@ int main(int argc, char **argv)
if (should_test_function("cancellation")) test_cancellation();
if (should_test_function("indents")) test_indents();
if (should_test_function("utils")) test_utils();
+ if (should_test_function("utf8")) test_utf8();
if (should_test_function("escape_sequences")) test_escape_sequences();
if (should_test_function("lru")) test_lru();
if (should_test_function("expand")) test_expand();
@@ -2906,7 +3162,8 @@ int main(int argc, char **argv)
event_destroy();
proc_destroy();
- if(err_count != 0) {
+ if (err_count != 0)
+ {
return(1);
}
}
diff --git a/utf8.cpp b/utf8.cpp
new file mode 100644
index 00000000..60d83d33
--- /dev/null
+++ b/utf8.cpp
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <sys/types.h>
+
+#include <wchar.h>
+
+#include "utf8.h"
+
+#include <string>
+#include <vector>
+#include <limits>
+
+
+#define _NXT 0x80
+#define _SEQ2 0xc0
+#define _SEQ3 0xe0
+#define _SEQ4 0xf0
+#define _SEQ5 0xf8
+#define _SEQ6 0xfc
+
+#define _BOM 0xfeff
+
+/* We can tweak the following typedef to allow us to simulate Windows-style 16 bit wchar's on Unix */
+typedef wchar_t utf8_wchar_t;
+#define UTF8_WCHAR_MAX ((size_t)std::numeric_limits<utf8_wchar_t>::max())
+
+/* Returns true when the largest value utf8_wchar_t can hold is no more than 0xFFFF, i.e. characters above the 16 bit range cannot be represented */
+bool is_wchar_ucs2()
+{
+    const bool can_hold_astral = UTF8_WCHAR_MAX > 0xFFFF;
+    return ! can_hold_astral;
+}
+
+static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t *out, size_t outsize, int flags);
+static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
+
+/* Copy count elements from in to out, converting each wchar_t to utf8_wchar_t.
+   Stops at the first element larger than UTF8_WCHAR_MAX and returns false; elements before it
+   have already been written to out. Returns true when every element was copied. */
+static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count)
+{
+    size_t idx = 0;
+    while (idx < count)
+    {
+        const wchar_t wc = in[idx];
+        if (wc > UTF8_WCHAR_MAX)
+        {
+            /* Does not fit in utf8_wchar_t */
+            return false;
+        }
+        out[idx] = wc;
+        idx++;
+    }
+    return true;
+}
+
+/* Convert the UTF-8 string str to a wide string stored in *result.
+   *result is always cleared first. Returns true on success (including for empty input),
+   false if utf8_to_wchar reports an error, in which case *result is left empty. */
+bool utf8_to_wchar_string(const std::string &str, std::wstring *result)
+{
+    result->clear();
+    const size_t inlen = str.size();
+    if (inlen == 0)
+    {
+        return true;
+    }
+
+    const char *input = str.c_str();
+
+    /* First pass: no output buffer, just ask how many wide characters are needed */
+    const size_t outlen = utf8_to_wchar(input, inlen, NULL, 0, 0);
+    if (outlen == 0)
+    {
+        return false;
+    }
+
+    /* Second pass: convert into a vector so the buffer is released even if assign() throws */
+    std::vector<wchar_t> tmp(outlen);
+    const size_t outlen2 = utf8_to_wchar(input, inlen, &tmp[0], outlen, 0);
+    if (outlen2 == 0)
+    {
+        return false;
+    }
+
+    result->assign(&tmp[0], outlen2);
+    return true;
+}
+
+/* Convert the wide string str to UTF-8 stored in *result.
+   *result is always cleared first. Returns true on success (including for empty input),
+   false if wchar_to_utf8 reports an error, in which case *result is left empty. */
+bool wchar_to_utf8_string(const std::wstring &str, std::string *result)
+{
+    result->clear();
+    const size_t inlen = str.size();
+    if (inlen == 0)
+    {
+        return true;
+    }
+
+    const wchar_t *input = str.c_str();
+
+    /* First pass: no output buffer, just ask how many bytes are needed */
+    const size_t outlen = wchar_to_utf8(input, inlen, NULL, 0, 0);
+    if (outlen == 0)
+    {
+        return false;
+    }
+
+    /* Second pass: convert into a vector so the buffer is released even if assign() throws */
+    std::vector<char> tmp(outlen);
+    const size_t outlen2 = wchar_to_utf8(input, inlen, &tmp[0], outlen, 0);
+    if (outlen2 == 0)
+    {
+        return false;
+    }
+
+    result->assign(&tmp[0], outlen2);
+    return true;
+}
+
+/* Public entry point for UTF-8 -> wchar_t conversion.
+   in/insize:   UTF-8 input and its length in bytes.
+   out/outsize: destination buffer and its capacity in wide characters; pass out == NULL to
+                only compute the number of wide characters.
+   flags:       UTF8_* flags, forwarded to the internal converter.
+   Returns the number of wide characters, or 0 on error or invalid parameters. */
+size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags)
+{
+    if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
+    {
+        return 0;
+    }
+
+    size_t result;
+    if (sizeof(wchar_t) == sizeof(utf8_wchar_t))
+    {
+        /* Same size: convert directly into the caller's buffer */
+        result = utf8_to_wchar_internal(in, insize, reinterpret_cast<utf8_wchar_t *>(out), outsize, flags);
+    }
+    else
+    {
+        /* Sizes differ: convert into a temporary buffer of utf8_wchar_t, then widen into out.
+           In length-only mode (out == NULL) the internal routine must also see a NULL buffer,
+           otherwise it rejects the call (non-NULL buffer with outsize 0) or we would end up
+           copying into a NULL destination. */
+        std::vector<utf8_wchar_t> tmp_output((out != NULL) ? outsize : 0);
+        utf8_wchar_t *tmp_ptr = (out != NULL) ? &tmp_output[0] : NULL;
+
+        result = utf8_to_wchar_internal(in, insize, tmp_ptr, outsize, flags);
+
+        if (out != NULL)
+        {
+            /* Copy back from tmp to the function's output, never past the caller's capacity */
+            const size_t amount_to_copy = (result < outsize) ? result : outsize;
+            for (size_t i = 0; i < amount_to_copy; i++)
+            {
+                out[i] = static_cast<wchar_t>(tmp_output[i]);
+            }
+        }
+    }
+    return result;
+}
+
+/* Public entry point for wchar_t -> UTF-8 conversion.
+   in/insize:   wide character input and its length in wide characters.
+   out/outsize: destination buffer and its capacity in bytes; out may be NULL.
+   flags:       UTF8_* flags, forwarded to the internal converter.
+   Returns the value produced by the internal converter, or 0 on invalid parameters. */
+size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags)
+{
+    const bool bad_params = (in == NULL || insize == 0 || (outsize == 0 && out != NULL));
+    if (bad_params)
+    {
+        return 0;
+    }
+
+    if (sizeof(wchar_t) == sizeof(utf8_wchar_t))
+    {
+        /* Same size: the input can be handed to the converter as-is */
+        return wchar_to_utf8_internal(reinterpret_cast<const utf8_wchar_t *>(in), insize, out, outsize, flags);
+    }
+
+    /* Sizes differ: narrow the input into a temporary buffer first (insize is nonzero here) */
+    std::vector<utf8_wchar_t> narrowed(insize);
+    if (! safe_copy_wchar_to_utf8_wchar(in, &narrowed[0], insize))
+    {
+        // some input character does not fit in utf8_wchar_t (e.g. an astral character when it is 16 bit)
+        return 0;
+    }
+
+    // Invoke the conversion with the temporary; the vector cleans itself up
+    return wchar_to_utf8_internal(&narrowed[0], insize, out, outsize, flags);
+}
+
+
+static int __wchar_forbitten(utf8_wchar_t sym);
+static int __utf8_forbitten(unsigned char octet);
+
+/* Returns -1 if sym lies in the surrogate range 0xd800..0xdfff, 0 otherwise */
+static int
+__wchar_forbitten(utf8_wchar_t sym)
+{
+    const bool is_surrogate = (sym >= 0xd800 && sym <= 0xdfff);
+    return is_surrogate ? (-1) : (0);
+}
+
+/* Returns -1 if octet is one of the bytes this decoder rejects outright (0xc0, 0xc1, 0xf5, 0xff), 0 otherwise */
+static int
+__utf8_forbitten(unsigned char octet)
+{
+    const bool rejected = (octet == 0xc0 || octet == 0xc1 || octet == 0xf5 || octet == 0xff);
+    return rejected ? (-1) : (0);
+}
+
+/*
+ * DESCRIPTION
+ * This function translates UTF-8 string into UCS-2 or UCS-4 string (all symbols
+ * will be in local machine byte order).
+ *
+ * It takes the following arguments:
+ * in - input UTF-8 string. It can be null-terminated.
+ * insize - size of input string in bytes.
+ * out - result buffer for UCS-2/4 string. If out is NULL,
+ * function returns size of result buffer.
+ * outsize - size of out buffer in wide characters.
+ *
+ * RETURN VALUES
+ * The function returns size of result buffer (in wide characters).
+ * Zero is returned in case of error.
+ *
+ * CAVEATS
+ * 1. If UTF-8 string contains zero symbols, they will be translated
+ * as regular symbols.
+ * 2. If the UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, the returned size
+ * may differ between a call with `out' NULL and one with `out' not NULL.
+ * This is because some UTF-8 sequences decode to UNICODE characters that
+ * are forbidden (by RFC3629). So, the caller must check the return value
+ * every time and must not prepare the buffer in advance (\0 terminate),
+ * but only after calling this function.
+ */
+/* Core UTF-8 decoder. Walks the input one sequence per loop iteration; returns the number of
+ wide characters counted, or 0 on error. When out is NULL nothing is written. */
+static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t *out, size_t outsize, int flags)
+{
+ unsigned char *p, *lim;
+ utf8_wchar_t *wlim, high;
+ size_t n, total, i, n_bits;
+
+ if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
+ return (0);
+
+ total = 0;
+ p = (unsigned char *)in;
+ lim = p + insize;
+ wlim = out + outsize;
+
+ /* n is set inside the body and is the number of input bytes consumed by this iteration */
+ for (; p < lim; p += n)
+ {
+ if (__utf8_forbitten(*p) != 0 &&
+ (flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+
+ /*
+ * Get number of bytes for one wide character.
+ * `high' receives the payload bits carried by the lead byte.
+ */
+ n = 1; /* default: 1 byte. Used when skipping bytes. */
+ if ((*p & 0x80) == 0)
+ high = (utf8_wchar_t)*p;
+ else if ((*p & 0xe0) == _SEQ2)
+ {
+ n = 2;
+ high = (utf8_wchar_t)(*p & 0x1f);
+ }
+ else if ((*p & 0xf0) == _SEQ3)
+ {
+ n = 3;
+ high = (utf8_wchar_t)(*p & 0x0f);
+ }
+ else if ((*p & 0xf8) == _SEQ4)
+ {
+ n = 4;
+ high = (utf8_wchar_t)(*p & 0x07);
+ }
+ else if ((*p & 0xfc) == _SEQ5)
+ {
+ n = 5;
+ high = (utf8_wchar_t)(*p & 0x03);
+ }
+ else if ((*p & 0xfe) == _SEQ6)
+ {
+ n = 6;
+ high = (utf8_wchar_t)(*p & 0x01);
+ }
+ else
+ {
+ /* Not a valid lead byte: fail, or skip this single byte when ignoring errors */
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ continue;
+ }
+
+ /* does the sequence header tell us truth about length? */
+ if (lim - p <= n - 1)
+ {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ n = 1;
+ continue; /* skip */
+ }
+
+ /*
+ * Validate sequence.
+ * All symbols must have higher bits set to 10xxxxxx
+ */
+ if (n > 1)
+ {
+ for (i = 1; i < n; i++)
+ {
+ if ((p[i] & 0xc0) != _NXT)
+ break;
+ }
+ if (i != n)
+ {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ n = 1;
+ continue; /* skip */
+ }
+ }
+
+ /* Count this sequence; the count is taken back below if the character ends up skipped */
+ total++;
+
+ /* Length-only mode: nothing is decoded, so the forbidden-character and BOM checks below do not run */
+ if (out == NULL)
+ continue;
+
+ if (out >= wlim)
+ return (0); /* no space left */
+
+ uint32_t out_val = 0;
+ *out = 0;
+ n_bits = 0;
+ /* Accumulate the continuation bytes from last to first, 6 payload bits each */
+ for (i = 1; i < n; i++)
+ {
+ out_val |= (utf8_wchar_t)(p[n - i] & 0x3f) << n_bits;
+ n_bits += 6; /* 6 low bits in every byte */
+ }
+ /* The lead byte's payload goes above all continuation bits */
+ out_val |= high << n_bits;
+
+ bool skip = false;
+ if (__wchar_forbitten(out_val) != 0)
+ {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ {
+ return 0; /* forbidden character */
+ }
+ else
+ {
+ skip = true;
+ }
+ }
+ else if (out_val == _BOM && (flags & UTF8_SKIP_BOM) != 0)
+ {
+ skip = true;
+ }
+
+ if (skip)
+ {
+ total--;
+ }
+ else if (out_val > UTF8_WCHAR_MAX)
+ {
+ // utf8_wchar_t cannot hold this value (e.g. wchar_t is UCS-2 but the UTF-8 specified an astral character)
+ return 0;
+ }
+ else
+ {
+ *out++ = out_val;
+ }
+ }
+
+ return (total);
+}
+
+/*
+ * DESCRIPTION
+ * This function translates UCS-2/4 symbols (given in local machine
+ * byte order) into UTF-8 string.
+ *
+ * It takes the following arguments:
+ * in - input unicode string. It can be null-terminated.
+ * insize - size of input string in wide characters.
+ * out - result buffer for utf8 string. If out is NULL,
+ * function returns size of result buffer.
+ * outsize - size of result buffer.
+ *
+ * RETURN VALUES
+ * The function returns size of result buffer (in bytes). Zero is returned
+ * in case of error.
+ *
+ * CAVEATS
+ * If UCS-4 string contains zero symbols, they will be translated
+ * as regular symbols.
+ */
+/* Core UTF-8 encoder. Encodes one input character per loop iteration; returns the number of
+ bytes counted, or 0 on error. When out is NULL nothing is written. */
+static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags)
+{
+ const utf8_wchar_t *w, *wlim;
+ unsigned char *p, *lim;
+ size_t total, n;
+
+ if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
+ return (0);
+
+ w = in;
+ wlim = w + insize;
+ p = (unsigned char *)out;
+ lim = p + outsize;
+ total = 0;
+ for (; w < wlim; w++)
+ {
+ /* Forbidden input characters: fail, or drop them when ignoring errors */
+ if (__wchar_forbitten(*w) != 0)
+ {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ else
+ continue;
+ }
+
+ if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
+ continue;
+
+ /* Pick the encoded length n from the magnitude of the value; negative values cannot be encoded */
+ const int32_t w_wide = *w;
+ if (w_wide < 0)
+ {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ continue;
+ }
+ else if (w_wide <= 0x0000007f)
+ n = 1;
+ else if (w_wide <= 0x000007ff)
+ n = 2;
+ else if (w_wide <= 0x0000ffff)
+ n = 3;
+ else if (w_wide <= 0x001fffff)
+ n = 4;
+ else if (w_wide <= 0x03ffffff)
+ n = 5;
+ else /* if (w_wide <= 0x7fffffff) */
+ n = 6;
+
+ total += n;
+
+ /* Length-only mode: just keep counting */
+ if (out == NULL)
+ continue;
+
+ /* At least n bytes must remain in the output buffer */
+ if (lim - p <= n - 1)
+ return (0); /* no space left */
+
+ /* extract the wchar_t as big-endian. If utf8_wchar_t is only 16 bits wide, the first two bytes will be 0 */
+ unsigned char oc[4];
+ uint32_t w_tmp = *w;
+ oc[3] = w_tmp & 0xFF;
+ w_tmp >>= 8;
+ oc[2] = w_tmp & 0xFF;
+ w_tmp >>= 8;
+ oc[1] = w_tmp & 0xFF;
+ w_tmp >>= 8;
+ oc[0] = w_tmp & 0xFF;
+
+ /* Write the lead byte p[0] and the n-1 continuation bytes; oc[3] is the least significant byte */
+ switch (n)
+ {
+ case 1:
+ p[0] = oc[3];
+ break;
+
+ case 2:
+ p[1] = _NXT | (oc[3] & 0x3f);
+ p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2);
+ break;
+
+ case 3:
+ p[2] = _NXT | (oc[3] & 0x3f);
+ p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
+ p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4);
+ break;
+
+ case 4:
+ p[3] = _NXT | (oc[3] & 0x3f);
+ p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
+ p[1] = _NXT | ((oc[2] & 0xf0) >> 4) |
+ ((oc[1] & 0x03) << 4);
+ p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2);
+ break;
+
+ case 5:
+ p[4] = _NXT | (oc[3] & 0x3f);
+ p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
+ p[2] = _NXT | ((oc[2] & 0xf0) >> 4) |
+ ((oc[1] & 0x03) << 4);
+ p[1] = _NXT | (oc[1] >> 2);
+ p[0] = _SEQ5 | (oc[0] & 0x03);
+ break;
+
+ case 6:
+ p[5] = _NXT | (oc[3] & 0x3f);
+ p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
+ p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4);
+ p[2] = _NXT | (oc[1] >> 2);
+ p[1] = _NXT | (oc[0] & 0x3f);
+ p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6);
+ break;
+ }
+
+ /*
+ * NOTE: do not check here for forbidden UTF-8 characters.
+ * They cannot appear here because we do proper conversion.
+ */
+
+ p += n;
+ }
+
+ return (total);
+}
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 00000000..18aa5265
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * utf8: implementation of UTF-8 charset encoding (RFC3629).
+ */
+#ifndef _UTF8_H_
+#define _UTF8_H_
+
+#include <sys/types.h>
+
+#include <string>
+#include <wchar.h>
+
+#define UTF8_IGNORE_ERROR 0x01
+#define UTF8_SKIP_BOM 0x02
+
+bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
+bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
+
+size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
+size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
+
+bool is_wchar_ucs2();
+
+#endif /* !_UTF8_H_ */