From f2246dfb343bea19beb176fb2cc534f85513b2eb Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Thu, 21 Jan 2016 19:56:39 -0800 Subject: reduce number of Unicode private-use characters This narrows the range of Unicode codepoints fish reserves for its own use from U+E000 thru U+F8FE (6399 codepoints) to U+F600 thru U+F73F (320 codepoints). This is still not ideal since fish shouldn't be using any Unicode private-use codepoints but it's a step in the right direction. This partially addresses issue #2684. --- src/common.h | 65 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 20 deletions(-) (limited to 'src/common.h') diff --git a/src/common.h b/src/common.h index d255ac62..74640988 100644 --- a/src/common.h +++ b/src/common.h @@ -35,34 +35,59 @@ typedef std::wstring wcstring; typedef std::vector<wcstring> wcstring_list_t; -/** - Maximum number of bytes used by a single utf-8 character -*/ +// Maximum number of bytes used by a single utf-8 character. #define MAX_UTF8_BYTES 6 -/** - This is in the unicode private use area. -*/ -#define ENCODE_DIRECT_BASE 0xf100 - -/** - Highest legal ascii value -*/ +// Highest legal ASCII value. #define ASCII_MAX 127u -/** - Highest legal 16-bit unicode value -*/ -#define UCS2_MAX 0xffffu +// Highest legal 16-bit Unicode value. +#define UCS2_MAX 0xFFFFu -/** - Highest legal byte value -*/ -#define BYTE_MAX 0xffu +// Highest legal byte value. +#define BYTE_MAX 0xFFu -/** BOM value */ +// Unicode BOM value. #define UTF8_BOM_WCHAR 0xFEFFu +// Unicode replacement character. +#define REPLACEMENT_WCHAR 0xFFFDu + +// Use Unicode "noncharacters" for internal characters as much as we can. This +// gives us 32 "characters" for internal use that we can guarantee should not +// appear in our input stream. See http://www.unicode.org/faq/private_use.html. 
+#define RESERVED_CHAR_BASE 0xFDD0u +#define RESERVED_CHAR_END 0xFDF0u +// Split the available noncharacter values into two ranges to ensure there are +// no conflicts among the places we use these special characters. +#define EXPAND_RESERVED_BASE RESERVED_CHAR_BASE +#define EXPAND_RESERVED_END (EXPAND_RESERVED_BASE + 16) +#define WILDCARD_RESERVED_BASE EXPAND_RESERVED_END +#define WILDCARD_RESERVED_END (WILDCARD_RESERVED_BASE + 16) +// Make sure the ranges defined above don't exceed the range for noncharacters. +// This is to make sure we didn't do something stupid in subdividing the +// Unicode range for our needs. +#if WILDCARD_RESERVED_END > RESERVED_CHAR_END +#error +#endif + +// These are in the Unicode private-use range. We really shouldn't use this +// range but have little choice in the matter given how our lexer/parser works. +// We can't use non-characters for these two ranges because there are only 66 of +// them and we need at least 256 + 64. +// +// If sizeof(wchar_t) == 4 we could avoid using private-use chars; however, that +// would result in fish having different behavior on machines with 16 versus 32 +// bit wchar_t. It's better that fish behave the same on both types of systems. +// +// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know +// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF) +// on Mac OS X. See http://www.unicode.org/faq/private_use.html. +#define ENCODE_DIRECT_BASE 0xF600u +#define ENCODE_DIRECT_END (ENCODE_DIRECT_BASE + 256) +#define INPUT_COMMON_BASE 0xF700u +#define INPUT_COMMON_END (INPUT_COMMON_BASE + 64) + /* Flags for unescape_string functions */ enum { -- cgit v1.2.3