From f2246dfb343bea19beb176fb2cc534f85513b2eb Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Thu, 21 Jan 2016 19:56:39 -0800 Subject: reduce number of Unicode private-use characters This narrows the range of Unicode codepoints fish reserves for its own use from U+E000 thru U+F8FE (6399 codepoints) to U+F600 thru U+F73F (320 codepoints). This is still not ideal since fish shouldn't be using any Unicode private-use codepoints but it's a step in the right direction. This partially addresses issue #2684. --- doc_src/faq.hdr | 5 +++ src/common.h | 65 +++++++++++++++++++++++++++------------ src/expand.h | 90 +++++++++++++++++++----------------------------------- src/fish.cpp | 11 +++++++ src/highlight.cpp | 2 +- src/input.h | 66 --------------------------------------- src/input_common.h | 81 ++++++++++++++++++++++++++++++++++++++++-------- src/reader.cpp | 20 +++++++----- src/tokenizer.cpp | 4 +-- src/wildcard.h | 26 ++++++---------- 10 files changed, 184 insertions(+), 186 deletions(-) diff --git a/doc_src/faq.hdr b/doc_src/faq.hdr index 5ec12552..abbedb8b 100644 --- a/doc_src/faq.hdr +++ b/doc_src/faq.hdr @@ -215,6 +215,11 @@ cd /usr/local/bin rm -f fish fish_indent \endfish +
+\section faq-reserved-chars Unicode private-use characters reserved by fish + +Fish reserves the Unicode private-use character range from U+F600 thru U+F73F for internal use. Any attempt to feed characters in that range to fish will result in them being replaced by the Unicode "replacement character" U+FFFD. This includes both interactive input as well as any file read by fish (but not programs run by fish). + \htmlonly[block] \endhtmlonly diff --git a/src/common.h b/src/common.h index d255ac62..74640988 100644 --- a/src/common.h +++ b/src/common.h @@ -35,34 +35,59 @@ typedef std::wstring wcstring; typedef std::vector wcstring_list_t; -/** - Maximum number of bytes used by a single utf-8 character -*/ +// Maximum number of bytes used by a single utf-8 character. #define MAX_UTF8_BYTES 6 -/** - This is in the unicode private use area. -*/ -#define ENCODE_DIRECT_BASE 0xf100 - -/** - Highest legal ascii value -*/ +// Highest legal ASCII value. #define ASCII_MAX 127u -/** - Highest legal 16-bit unicode value -*/ -#define UCS2_MAX 0xffffu +// Highest legal 16-bit Unicode value. +#define UCS2_MAX 0xFFFFu -/** - Highest legal byte value -*/ -#define BYTE_MAX 0xffu +// Highest legal byte value. +#define BYTE_MAX 0xFFu -/** BOM value */ +// Unicode BOM value. #define UTF8_BOM_WCHAR 0xFEFFu +// Unicode replacement character. +#define REPLACEMENT_WCHAR 0xFFFDu + +// Use Unicode "noncharacters" for internal characters as much as we can. This +// gives us 32 "characters" for internal use that we can guarantee should not +// appear in our input stream. See http://www.unicode.org/faq/private_use.html. +#define RESERVED_CHAR_BASE 0xFDD0u +#define RESERVED_CHAR_END 0xFDF0u +// Split the available noncharacter values into two ranges to ensure there are +// no conflicts among the places we use these special characters. 
+#define EXPAND_RESERVED_BASE RESERVED_CHAR_BASE +#define EXPAND_RESERVED_END (EXPAND_RESERVED_BASE + 16) +#define WILDCARD_RESERVED_BASE EXPAND_RESERVED_END +#define WILDCARD_RESERVED_END (WILDCARD_RESERVED_BASE + 16) +// Make sure the ranges defined above don't exceed the range for noncharacters. +// This is to make sure we didn't do something stupid in subdividing the +// Unicode range for our needs. +#if WILDCARD_RESERVED_END > RESERVED_CHAR_END +#error +#endif + +// These are in the Unicode private-use range. We really shouldn't use this +// range but have little choice in the matter given how our lexer/parser works. +// We can't use non-characters for these two ranges because there are only 66 of +// them and we need at least 256 + 64. +// +// If sizeof(wchar_t) == 4 we could avoid using private-use chars; however, that +// would result in fish having different behavior on machines with 16 versus 32 +// bit wchar_t. It's better that fish behave the same on both types of systems. +// +// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know +// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF) +// on Mac OS X. See http://www.unicode.org/faq/private_use.html. +#define ENCODE_DIRECT_BASE 0xF600u +#define ENCODE_DIRECT_END (ENCODE_DIRECT_BASE + 256) +#define INPUT_COMMON_BASE 0xF700u +#define INPUT_COMMON_END (INPUT_COMMON_BASE + 64) + /* Flags for unescape_string functions */ enum { diff --git a/src/expand.h b/src/expand.h index 2c526358..731acd71 100644 --- a/src/expand.h +++ b/src/expand.h @@ -26,92 +26,64 @@ enum { - /** Flag specifying that cmdsubst expansion should be skipped */ + // Flag specifying that cmdsubst expansion should be skipped. EXPAND_SKIP_CMDSUBST = 1 << 0, - - /** Flag specifying that variable expansion should be skipped */ + // Flag specifying that variable expansion should be skipped. 
EXPAND_SKIP_VARIABLES = 1 << 1, - - /** Flag specifying that wildcard expansion should be skipped */ + // Flag specifying that wildcard expansion should be skipped. EXPAND_SKIP_WILDCARDS = 1 << 2, - - /** - The expansion is being done for tab or auto completions. Returned completions may have the wildcard as a prefix instead of a match. - */ + // The expansion is being done for tab or auto completions. Returned + // completions may have the wildcard as a prefix instead of a match. EXPAND_FOR_COMPLETIONS = 1 << 3, - - /** Only match files that are executable by the current user. Only applicable together with ACCEPT_INCOMPLETE. */ + // Only match files that are executable by the current user. Only + // applicable together with ACCEPT_INCOMPLETE. EXECUTABLES_ONLY = 1 << 4, - - /** Only match directories. Only applicable together with ACCEPT_INCOMPLETE. */ + // Only match directories. Only applicable together with ACCEPT_INCOMPLETE. DIRECTORIES_ONLY = 1 << 5, - - /** Don't generate descriptions */ + // Don't generate descriptions. EXPAND_NO_DESCRIPTIONS = 1 << 6, - - /** Don't expand jobs (but you can still expand processes). This is because job expansion is not thread safe. */ + // Don't expand jobs (but you can still expand processes). This is because + // job expansion is not thread safe. EXPAND_SKIP_JOBS = 1 << 7, - - /** Don't expand home directories */ + // Don't expand home directories. EXPAND_SKIP_HOME_DIRECTORIES = 1 << 8, - - /** Allow fuzzy matching */ + // Allow fuzzy matching. EXPAND_FUZZY_MATCH = 1 << 9, - - /** Disallow directory abbreviations like /u/l/b for /usr/local/bin. Only applicable if EXPAND_FUZZY_MATCH is set. */ + // Disallow directory abbreviations like /u/l/b for /usr/local/bin. Only + // applicable if EXPAND_FUZZY_MATCH is set. EXPAND_NO_FUZZY_DIRECTORIES = 1 << 10, - - /** Do expansions specifically to support cd (CDPATH, etc) */ + // Do expansions specifically to support cd (CDPATH, etc). 
EXPAND_SPECIAL_CD = 1 << 11 }; typedef int expand_flags_t; -/** - Use unencoded private-use keycodes for internal characters -*/ -#define EXPAND_RESERVED 0xf000 -/** - End of range reserved for expand - */ -#define EXPAND_RESERVED_END 0xf000f - class completion_t; enum { - /** Character represeting a home directory */ - HOME_DIRECTORY = EXPAND_RESERVED, - - /** Character represeting process expansion */ + // Character representing a home directory. + HOME_DIRECTORY = EXPAND_RESERVED_BASE, + // Character representing process expansion. PROCESS_EXPAND, - - /** Character representing variable expansion */ + // Character representing variable expansion. VARIABLE_EXPAND, - - /** Character rpresenting variable expansion into a single element*/ + // Character representing variable expansion into a single element. VARIABLE_EXPAND_SINGLE, - - /** Character representing the start of a bracket expansion */ + // Character representing the start of a bracket expansion. BRACKET_BEGIN, - - /** Character representing the end of a bracket expansion */ + // Character representing the end of a bracket expansion. BRACKET_END, - - /** Character representing separation between two bracket elements */ + // Character representing separation between two bracket elements. BRACKET_SEP, - /** - Separate subtokens in a token with this character. - */ + // Separate subtokens in a token with this character. INTERNAL_SEPARATOR, - - /** - Character representing an empty variable expansion. - Only used transitively while expanding variables. - */ + // Character representing an empty variable expansion. Only used + // transitively while expanding variables. VARIABLE_EXPAND_EMPTY, -} -; - + // This is a special pseudo-char that is not used other than to mark the + // end of the special characters so we can sanity check the enum range. + EXPAND_SENTINAL +}; /** These are the possible return values for expand_string. Note how zero value is the only error. 
*/ enum expand_error_t diff --git a/src/fish.cpp b/src/fish.cpp index 69b4b0de..91610232 100644 --- a/src/fish.cpp +++ b/src/fish.cpp @@ -22,6 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA #include "config.h" +#include #include #include #include @@ -63,6 +64,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA #include "input.h" #include "io.h" #include "fish_version.h" +#include "input_common.h" +#include "wildcard.h" /* PATH_MAX may not exist */ #ifndef PATH_MAX @@ -484,6 +487,14 @@ int main(int argc, char **argv) int res=1; int my_optind=0; + // We can't do this at compile time due to the use of enum symbols. + assert(EXPAND_SENTINAL >= EXPAND_RESERVED_BASE && + EXPAND_SENTINAL <= EXPAND_RESERVED_END); + assert(ANY_SENTINAL >= WILDCARD_RESERVED_BASE && + ANY_SENTINAL <= WILDCARD_RESERVED_END); + assert(R_SENTINAL >= INPUT_COMMON_BASE && + R_SENTINAL <= INPUT_COMMON_END); + set_main_thread(); setup_fork_guards(); diff --git a/src/highlight.cpp b/src/highlight.cpp index 90426803..3014ebdf 100644 --- a/src/highlight.cpp +++ b/src/highlight.cpp @@ -335,7 +335,7 @@ static bool has_expand_reserved(const wcstring &str) for (size_t i=0; i < str.size(); i++) { wchar_t wc = str.at(i); - if (wc >= EXPAND_RESERVED && wc <= EXPAND_RESERVED_END) + if (wc >= EXPAND_RESERVED_BASE && wc <= EXPAND_RESERVED_END) { result = true; break; diff --git a/src/input.h b/src/input.h index 36d49a51..47315c1c 100644 --- a/src/input.h +++ b/src/input.h @@ -16,77 +16,11 @@ inputrc information for key bindings. #include "env.h" #include "input_common.h" - #define DEFAULT_BIND_MODE L"default" #define FISH_BIND_MODE_VAR L"fish_bind_mode" -/** - Key codes for inputrc-style keyboard functions that are passed on - to the caller of input_read() - - NOTE: IF YOU MODIFY THIS YOU MUST UPDATE THE name_arr AND code_arr VARIABLES TO MATCH! 
-*/ -enum -{ - R_BEGINNING_OF_LINE = R_NULL+10, /* This give input_common ten slots for lowlevel keycodes */ - R_END_OF_LINE, - R_FORWARD_CHAR, - R_BACKWARD_CHAR, - R_FORWARD_WORD, - R_BACKWARD_WORD, - R_FORWARD_BIGWORD, - R_BACKWARD_BIGWORD, - R_HISTORY_SEARCH_BACKWARD, - R_HISTORY_SEARCH_FORWARD, - R_DELETE_CHAR, - R_BACKWARD_DELETE_CHAR, - R_KILL_LINE, - R_YANK, - R_YANK_POP, - R_COMPLETE, - R_COMPLETE_AND_SEARCH, - R_BEGINNING_OF_HISTORY, - R_END_OF_HISTORY, - R_BACKWARD_KILL_LINE, - R_KILL_WHOLE_LINE, - R_KILL_WORD, - R_KILL_BIGWORD, - R_BACKWARD_KILL_WORD, - R_BACKWARD_KILL_PATH_COMPONENT, - R_BACKWARD_KILL_BIGWORD, - R_HISTORY_TOKEN_SEARCH_BACKWARD, - R_HISTORY_TOKEN_SEARCH_FORWARD, - R_SELF_INSERT, - R_TRANSPOSE_CHARS, - R_TRANSPOSE_WORDS, - R_UPCASE_WORD, - R_DOWNCASE_WORD, - R_CAPITALIZE_WORD, - R_VI_ARG_DIGIT, - R_VI_DELETE_TO, - R_EXECUTE, - R_BEGINNING_OF_BUFFER, - R_END_OF_BUFFER, - R_REPAINT, - R_FORCE_REPAINT, - R_UP_LINE, - R_DOWN_LINE, - R_SUPPRESS_AUTOSUGGESTION, - R_ACCEPT_AUTOSUGGESTION, - R_BEGIN_SELECTION, - R_END_SELECTION, - R_KILL_SELECTION, - R_FORWARD_JUMP, - R_BACKWARD_JUMP, - R_AND, - R_CANCEL -}; - wcstring describe_char(wint_t c); -#define R_MIN R_NULL -#define R_MAX R_CANCEL - /** Initialize the terminal by calling setupterm, and set up arrays used by readch to detect escape sequences for special keys. diff --git a/src/input_common.h b/src/input_common.h index 72ccc073..e605d18b 100644 --- a/src/input_common.h +++ b/src/input_common.h @@ -8,22 +8,77 @@ Header file for the low level input library #include -/** - Use unencoded private-use keycodes for internal characters -*/ -#define INPUT_COMMON_RESERVED 0xe000 +#include "common.h" enum { - /** - R_NULL is sometimes returned by the input when a character was - requested but none could be delivered, or when an exception - happened. 
- */ - R_NULL = INPUT_COMMON_RESERVED, - R_EOF -} -; + R_MIN = INPUT_COMMON_BASE, + // R_NULL is sometimes returned by the input when a character was requested + // but none could be delivered, or when an exception happened. + R_NULL = R_MIN, + R_EOF, + // Key codes for inputrc-style keyboard functions that are passed on + // to the caller of input_read(). + // + // NOTE: If you modify this sequence of symbols you must update the + // name_arr, code_arr and desc_arr variables in input.cpp to match! + R_BEGINNING_OF_LINE, + R_END_OF_LINE, + R_FORWARD_CHAR, + R_BACKWARD_CHAR, + R_FORWARD_WORD, + R_BACKWARD_WORD, + R_FORWARD_BIGWORD, + R_BACKWARD_BIGWORD, + R_HISTORY_SEARCH_BACKWARD, + R_HISTORY_SEARCH_FORWARD, + R_DELETE_CHAR, + R_BACKWARD_DELETE_CHAR, + R_KILL_LINE, + R_YANK, + R_YANK_POP, + R_COMPLETE, + R_COMPLETE_AND_SEARCH, + R_BEGINNING_OF_HISTORY, + R_END_OF_HISTORY, + R_BACKWARD_KILL_LINE, + R_KILL_WHOLE_LINE, + R_KILL_WORD, + R_KILL_BIGWORD, + R_BACKWARD_KILL_WORD, + R_BACKWARD_KILL_PATH_COMPONENT, + R_BACKWARD_KILL_BIGWORD, + R_HISTORY_TOKEN_SEARCH_BACKWARD, + R_HISTORY_TOKEN_SEARCH_FORWARD, + R_SELF_INSERT, + R_TRANSPOSE_CHARS, + R_TRANSPOSE_WORDS, + R_UPCASE_WORD, + R_DOWNCASE_WORD, + R_CAPITALIZE_WORD, + R_VI_ARG_DIGIT, + R_VI_DELETE_TO, + R_EXECUTE, + R_BEGINNING_OF_BUFFER, + R_END_OF_BUFFER, + R_REPAINT, + R_FORCE_REPAINT, + R_UP_LINE, + R_DOWN_LINE, + R_SUPPRESS_AUTOSUGGESTION, + R_ACCEPT_AUTOSUGGESTION, + R_BEGIN_SELECTION, + R_END_SELECTION, + R_KILL_SELECTION, + R_FORWARD_JUMP, + R_BACKWARD_JUMP, + R_AND, + R_CANCEL, + R_MAX = R_CANCEL, + // This is a special pseudo-char that is not used other than to mark the + // end of the special characters so we can sanity check the enum range. 
+ R_SENTINAL +}; /** Init the library diff --git a/src/reader.cpp b/src/reader.cpp index 3998e5a0..a7e8ec7e 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -2964,16 +2964,20 @@ static int can_read(int fd) return select(fd + 1, &fds, 0, 0, &can_read_timeout) == 1; } -/** - Test if the specified character is in the private use area that - fish uses to store internal characters - - Note: Allow U+F8FF because that's the Apple symbol, which is in the - OS X US keyboard layout. -*/ +// Test if the specified character is in a range that fish uses internally to +// store special tokens. +// +// NOTE: This is used when tokenizing the input. It is also used when reading +// input, before tokenization, to replace such chars with REPLACEMENT_WCHAR if +// they're not part of a quoted string. We don't want external input to be able +// to feed reserved characters into our lexer/parser or code evaluator. +// +// TODO: Actually implement the replacement as documented above. static int wchar_private(wchar_t c) { - return ((c >= 0xe000) && (c < 0xf8ff)); + return ((c >= RESERVED_CHAR_BASE && c < RESERVED_CHAR_END) || + (c >= ENCODE_DIRECT_BASE && c < ENCODE_DIRECT_END) || + (c >= INPUT_COMMON_BASE && c < INPUT_COMMON_END)); } /** diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index ce28ecf0..4075c73b 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -632,8 +632,8 @@ void tokenizer_t::tok_next() /*fwprintf( stderr, L"End of string\n" );*/ this->has_next = false; break; - case 13: // carriage return - case L'\n': + case L'\r': // carriage-return + case L'\n': // newline case L';': this->last_type = TOK_END; this->buff++; diff --git a/src/wildcard.h b/src/wildcard.h index 3a9a869f..3e1d2b6f 100644 --- a/src/wildcard.h +++ b/src/wildcard.h @@ -18,27 +18,19 @@ #include "expand.h" #include "complete.h" -/* - Use unencoded private-use keycodes for internal characters -*/ - -#define WILDCARD_RESERVED 0xf400 - -/** - Enumeration of all wildcard types -*/ +// Enumeration of all 
wildcard types enum { - /** Character representing any character except '/' */ - ANY_CHAR = WILDCARD_RESERVED, - - /** Character representing any character string not containing '/' (A slash) */ + // Character representing any character except '/' (slash). + ANY_CHAR = WILDCARD_RESERVED_BASE, + // Character representing any character string not containing '/' (slash). ANY_STRING, - - /** Character representing any character string */ + // Character representing any character string. ANY_STRING_RECURSIVE, -} -; + // This is a special pseudo-char that is not used other than to mark the + // end of the special characters so we can sanity check the enum range. + ANY_SENTINAL +}; /** Expand the wildcard by matching against the filesystem. -- cgit v1.2.3