reduce number of Unicode private-use characters

This narrows the range of Unicode codepoints fish reserves for its own use from U+E000 through U+F8FE (6399 codepoints) to U+F600 through U+F73F (320 codepoints). This is still not ideal, since fish shouldn't be using any Unicode private-use codepoints, but it's a step in the right direction. This partially addresses issue #2684.
author: Kurtis Rader <krader@skepticism.us> 2016-01-21 19:56:39 -0800
committer: Kurtis Rader <krader@skepticism.us> 2016-02-28 18:36:34 -0800
commit: f2246dfb343bea19beb176fb2cc534f85513b2eb (patch)
tree: f95f0c4f4f48d445c1d561aa0986d2b98cdecec5 /src/common.h
parent: b41b96233616f26e52663c133f7a29d32b0e9142 (diff)
1 files changed, 45 insertions, 20 deletions
diff --git a/src/common.h b/src/common.h
index d255ac62..74640988 100644
--- a/src/common.h
+++ b/src/common.h
@@ -35,34 +35,59 @@
 typedef std::wstring wcstring;
 typedef std::vector<wcstring> wcstring_list_t;
 
-/**
-   Maximum number of bytes used by a single utf-8 character
-*/
+// Maximum number of bytes used by a single UTF-8 character.
 #define MAX_UTF8_BYTES 6
 
-/**
-   This is in the unicode private use area.
-*/
-#define ENCODE_DIRECT_BASE 0xf100
-
-/**
-  Highest legal ascii value
-*/
+// Highest legal ASCII value.
 #define ASCII_MAX 127u
 
-/**
-  Highest legal 16-bit unicode value
-*/
-#define UCS2_MAX 0xffffu
+// Highest legal 16-bit Unicode value.
+#define UCS2_MAX 0xFFFFu
 
-/**
-  Highest legal byte value
-*/
-#define BYTE_MAX 0xffu
+// Highest legal byte value.
+#define BYTE_MAX 0xFFu
 
-/** BOM value */
+// Unicode BOM value.
 #define UTF8_BOM_WCHAR 0xFEFFu
 
+// Unicode replacement character.
+#define REPLACEMENT_WCHAR 0xFFFDu
+
+// Use Unicode "noncharacters" for internal characters as much as we can. This
+// gives us 32 "characters" for internal use that we can guarantee should not
+// appear in our input stream. See http://www.unicode.org/faq/private_use.html.
+#define RESERVED_CHAR_BASE 0xFDD0u
+#define RESERVED_CHAR_END  0xFDF0u
+// Split the available noncharacter values into two ranges to ensure there are
+// no conflicts among the places we use these special characters.
+#define EXPAND_RESERVED_BASE RESERVED_CHAR_BASE
+#define EXPAND_RESERVED_END  (EXPAND_RESERVED_BASE + 16)
+#define WILDCARD_RESERVED_BASE EXPAND_RESERVED_END
+#define WILDCARD_RESERVED_END  (WILDCARD_RESERVED_BASE + 16)
+// Make sure the ranges defined above don't exceed the range for noncharacters.
+// This is to make sure we didn't do something stupid in subdividing the
+// Unicode range for our needs.
+#if WILDCARD_RESERVED_END > RESERVED_CHAR_END
+#error
+#endif
+
+// These are in the Unicode private-use range. We really shouldn't use this
+// range but have little choice in the matter given how our lexer/parser works.
+// We can't use noncharacters for these two ranges because there are only 66 of
+// them and we need at least 256 + 64.
+//
+// If sizeof(wchar_t) == 4 we could avoid using private-use chars; however, that
+// would result in fish having different behavior on machines with 16 versus 32
+// bit wchar_t. It's better that fish behave the same on both types of systems.
+//
+// Note: We don't use the highest 256-codepoint block (0xF800 - 0xF8FF) because
+// we know of at least one use of a codepoint in that range: the Apple symbol
+// (0xF8FF) on Mac OS X. See http://www.unicode.org/faq/private_use.html.
+#define ENCODE_DIRECT_BASE 0xF600u
+#define ENCODE_DIRECT_END  (ENCODE_DIRECT_BASE + 256)
+#define INPUT_COMMON_BASE  0xF700u
+#define INPUT_COMMON_END   (INPUT_COMMON_BASE + 64)
+
 /* Flags for unescape_string functions */
 enum
 {
author	Kurtis Rader <krader@skepticism.us>	2016-01-21 19:56:39 -0800
committer	Kurtis Rader <krader@skepticism.us>	2016-02-28 18:36:34 -0800
commit	f2246dfb343bea19beb176fb2cc534f85513b2eb (patch)
tree	f95f0c4f4f48d445c1d561aa0986d2b98cdecec5 /src/common.h
parent	b41b96233616f26e52663c133f7a29d32b0e9142 (diff)