diff options
author | David Adam <zanchey@ucc.gu.uwa.edu.au> | 2015-07-26 10:20:13 +0800 |
---|---|---|
committer | David Adam <zanchey@ucc.gu.uwa.edu.au> | 2015-07-26 10:20:13 +0800 |
commit | 3929e9de0e69666b37df87347d5ce15663e81347 (patch) | |
tree | b2701c439c0260840ce1c68beaebf7de1178cc53 /src/tokenizer.h | |
parent | 793e1afa084982dac92c4fe19e50c25e326a79c2 (diff) | |
parent | f4d1657c22c81a7720a91026f915b80d2d6aa6e8 (diff) |
Merge branch 'master' into iwyu
Diffstat (limited to 'src/tokenizer.h')
-rw-r--r-- | src/tokenizer.h | 220 |
1 files changed, 220 insertions, 0 deletions
diff --git a/src/tokenizer.h b/src/tokenizer.h new file mode 100644 index 00000000..09a49509 --- /dev/null +++ b/src/tokenizer.h @@ -0,0 +1,220 @@ +/** \file tokenizer.h + + A specialized tokenizer for tokenizing the fish language. In the + future, the tokenizer should be extended to support marks, + tokenizing multiple strings and disposing of unused string + segments. +*/ + +#ifndef FISH_TOKENIZER_H +#define FISH_TOKENIZER_H + +#include <stddef.h> +#include "common.h" + +/** + Token types +*/ +enum token_type +{ + TOK_NONE, /**< Tokenizer not yet constructed */ + TOK_ERROR, /**< Error reading token */ + TOK_STRING,/**< String token */ + TOK_PIPE,/**< Pipe token */ + TOK_END,/**< End token (semicolon or newline, not literal end) */ + TOK_REDIRECT_OUT, /**< redirection token */ + TOK_REDIRECT_APPEND,/**< redirection append token */ + TOK_REDIRECT_IN,/**< input redirection token */ + TOK_REDIRECT_FD,/**< redirection to new fd token */ + TOK_REDIRECT_NOCLOB, /**<? redirection token */ + TOK_BACKGROUND,/**< send job to bg token */ + TOK_COMMENT/**< comment token */ +}; + +/** + Tokenizer error types +*/ +enum tokenizer_error +{ + TOK_UNTERMINATED_QUOTE, + TOK_UNTERMINATED_SUBSHELL, + TOK_UNTERMINATED_ESCAPE, + TOK_OTHER +} +; + + +/** + Flag telling the tokenizer to accept incomplete parameters, + i.e. parameters with mismatching paranthesis, etc. This is useful + for tab-completion. +*/ +#define TOK_ACCEPT_UNFINISHED 1 + +/** + Flag telling the tokenizer not to remove comments. Useful for + syntax highlighting. +*/ +#define TOK_SHOW_COMMENTS 2 + +/** Flag telling the tokenizer to not generate error messages, which we need to do when tokenizing off of the main thread (since wgettext is not thread safe). +*/ +#define TOK_SQUASH_ERRORS 4 + +/** Ordinarily, the tokenizer ignores newlines following a newline, or a semicolon. + This flag tells the tokenizer to return each of them as a separate END. */ +#define TOK_SHOW_BLANK_LINES 8 + +typedef unsigned int tok_flags_t; + +/** + The tokenizer struct. +*/ +struct tokenizer_t +{ + /** A pointer into the original string, showing where the next token begins */ + const wchar_t *buff; + /** A copy of the original string */ + const wchar_t *orig_buff; + /** The last token */ + wcstring last_token; + + /** Type of last token*/ + enum token_type last_type; + + /** Offset of last token*/ + size_t last_pos; + /** Whether there are more tokens*/ + bool has_next; + /** Whether incomplete tokens are accepted*/ + bool accept_unfinished; + /** Whether comments should be returned*/ + bool show_comments; + /** Whether all blank lines are returned */ + bool show_blank_lines; + /** Type of last quote, can be either ' or ".*/ + wchar_t last_quote; + /** Last error */ + int error; + /* Whether we are squashing errors */ + bool squash_errors; + + /* Cached line number information */ + size_t cached_lineno_offset; + int cached_lineno_count; + + /* Whether to continue the previous line after the comment */ + bool continue_line_after_comment; + + /** + Constructor for a tokenizer. b is the string that is to be + tokenized. It is not copied, and should not be freed by the caller + until after the tokenizer is destroyed. + + \param b The string to tokenize + \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer + to accept incomplete tokens, such as a subshell without a closing + parenthesis, as a valid token. Setting TOK_SHOW_COMMENTS will return comments as tokens + + */ + tokenizer_t(const wchar_t *b, tok_flags_t flags); +}; + +/** + Jump to the next token. +*/ +void tok_next(tokenizer_t *tok); + +/** + Returns the type of the last token. Must be one of the values in the token_type enum. +*/ +enum token_type tok_last_type(tokenizer_t *tok); + +/** + Returns the last token string. The string should not be freed by the caller. This returns nonsense results for some token types, like TOK_END. +*/ +const wchar_t *tok_last(tokenizer_t *tok); + +/** + Returns true as long as there are more tokens left +*/ +int tok_has_next(tokenizer_t *tok); + +/** + Returns the position of the beginning of the current token in the original string +*/ +int tok_get_pos(const tokenizer_t *tok); + +/** Returns the extent of the current token */ +size_t tok_get_extent(const tokenizer_t *tok); + +/** + Returns only the first token from the specified string. This is a + convenience function, used to retrieve the first token of a + string. This can be useful for error messages, etc. + + On failure, returns the empty string. +*/ +wcstring tok_first(const wchar_t *str); + +/** + Indicates whether a character can be part of a string, or is a string separator. + Separators include newline, tab, |, ^, >, <, etc. + + is_first should indicate whether this is the first character in a potential string. +*/ +bool tok_is_string_character(wchar_t c, bool is_first); + +/** + Move tokenizer position +*/ +void tok_set_pos(tokenizer_t *tok, int pos); + +/** + Returns a string description of the specified token type +*/ +const wchar_t *tok_get_desc(int type); + +/** + Get tokenizer error type. Should only be called if tok_last_tope returns TOK_ERROR. +*/ +int tok_get_error(tokenizer_t *tok); + +/* Helper function to determine redirection type from a string, or TOK_NONE if the redirection is invalid. Also returns the fd by reference. */ +enum token_type redirection_type_for_string(const wcstring &str, int *out_fd = NULL); + +/* Helper function to determine which fd is redirected by a pipe */ +int fd_redirected_by_pipe(const wcstring &str); + +/* Helper function to return oflags (as in open(2)) for a redirection type */ +int oflags_for_redirection_type(enum token_type type); + +enum move_word_style_t +{ + move_word_style_punctuation, //stop at punctuation + move_word_style_path_components, //stops at path components + move_word_style_whitespace // stops at whitespace +}; + +/* Our state machine that implements "one word" movement or erasure. */ +class move_word_state_machine_t +{ +private: + + bool consume_char_punctuation(wchar_t c); + bool consume_char_path_components(wchar_t c); + bool is_path_component_character(wchar_t c); + bool consume_char_whitespace(wchar_t c); + + int state; + move_word_style_t style; + +public: + + move_word_state_machine_t(move_word_style_t st); + bool consume_char(wchar_t c); + void reset(); +}; + + +#endif |