Continue migration to the new tokenizer interface

author: ridiculousfish <corydoras@ridiculousfish.com> 2015-07-26 00:12:36 -0700
committer: ridiculousfish <corydoras@ridiculousfish.com> 2015-07-26 00:12:36 -0700
commit: 4ebaa7b6bd595f37f66f8d77ee836c1e5a4ef647 (patch)
tree: ef3a2396c6307afb54697126a5c17f02c35835f4
parent: 618896c0436e6ca70feb2fa317b34171cc4e1a81 (diff)
8 files changed, 81 insertions, 160 deletions
diff --git a/src/builtin_commandline.cpp b/src/builtin_commandline.cpp
index 23aaaa52..71f593c1 100644
--- a/src/builtin_commandline.cpp
+++ b/src/builtin_commandline.cpp
@@ -196,17 +196,18 @@ static void write_part(const wchar_t *begin,
 //    fwprintf( stderr, L"Subshell: %ls, end char %lc\n", buff, *end );
         wcstring out;
         tokenizer_t tok(buff, TOK_ACCEPT_UNFINISHED);
-        for (; tok_has_next(&tok); tok_next(&tok))
+        tok_t token;
+        while (tok.next(&token))
         {
             if ((cut_at_cursor) &&
-                    (tok_get_pos(&tok)+wcslen(tok_last(&tok)) >= pos))
+                    (token.offset + token.text.size() >= pos))
                 break;
 
-            switch (tok_last_type(&tok))
+            switch (token.type)
             {
                 case TOK_STRING:
                 {
-                    wcstring tmp = tok_last(&tok);
+                    wcstring tmp = token.text;
                     unescape_string_in_place(&tmp, UNESCAPE_INCOMPLETE);
                     out.append(tmp);
                     out.push_back(L'\n');
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index cbece61c..791dbb17 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -435,34 +435,7 @@ static void test_convert_nulls(void)
 */
 static void test_tok()
 {
-
     say(L"Testing tokenizer");
-
-
-    say(L"Testing invalid input");
-    tokenizer_t t(NULL, 0);
-
-    if (tok_last_type(&t) != TOK_ERROR)
-    {
-        err(L"Invalid input to tokenizer was undetected");
-    }
-
-    say(L"Testing use of broken tokenizer");
-    if (!tok_has_next(&t))
-    {
-        err(L"tok_has_next() should return 1 once on broken tokenizer");
-    }
-
-    tok_next(&t);
-    if (tok_last_type(&t) != TOK_ERROR)
-    {
-        err(L"Invalid input to tokenizer was undetected");
-    }
-
-    /*
-      This should crash if there is a bug. No reliable way to detect otherwise.
-    */
-    say(L"Test destruction of broken tokenizer");
     {
 
         const wchar_t *str = L"string <redirection  2>&1 'nested \"quoted\" '(string containing subshells ){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect Compress_Newlines\n  \n\t\n   \nInto_Just_One";
diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp
index eda67ffb..c884f667 100644
--- a/src/parse_tree.cpp
+++ b/src/parse_tree.cpp
@@ -632,7 +632,7 @@ public:
     void accept_tokens(parse_token_t token1, parse_token_t token2);
 
     /* Report tokenizer errors */
-    void report_tokenizer_error(parse_token_t token, int tok_err, const wchar_t *tok_error);
+    void report_tokenizer_error(parse_token_t token, int tok_err, const wcstring &tok_error);
 
     /* Indicate if we hit a fatal error */
     bool has_fatal_error(void) const
@@ -887,9 +887,8 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta
     }
 }
 
-void parse_ll_t::report_tokenizer_error(parse_token_t token, int tok_err_code, const wchar_t *tok_error)
+void parse_ll_t::report_tokenizer_error(parse_token_t token, int tok_err_code, const wcstring &tok_error)
 {
-    assert(tok_error != NULL);
     parse_error_code_t parse_error_code;
     switch (tok_err_code)
     {
@@ -911,7 +910,7 @@ void parse_ll_t::report_tokenizer_error(parse_token_t token, int tok_err_code, c
             break;
 
     }
-    this->parse_error(token, parse_error_code, L"%ls", tok_error);
+    this->parse_error(token, parse_error_code, L"%ls", tok_error.c_str());
 }
 
 void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token)
@@ -1200,7 +1199,7 @@ static parse_keyword_t keyword_with_name(const wchar_t *name)
 }
 
 /* Given a token, returns the keyword it matches, or parse_keyword_none. */
-static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
+static parse_keyword_t keyword_for_token(token_type tok, const wcstring &token)
 {
     /* Only strings can be keywords */
     if (tok != TOK_STRING)
@@ -1211,6 +1210,7 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
     /* If tok_txt is clean (which most are), we can compare it directly. Otherwise we have to expand it. We only expand quotes, and we don't want to do expensive expansions like tilde expansions. So we do our own "cleanliness" check; if we find a character not in our allowed set we know it's not a keyword, and if we never find a quote we don't have to expand! Note that this lowercase set could be shrunk to be just the characters that are in keywords. */
     parse_keyword_t result = parse_keyword_none;
     bool needs_expand = false, all_chars_valid = true;
+    const wchar_t *tok_txt = token.c_str();
     const wchar_t *chars_allowed_in_keywords = L"abcdefghijklmnopqrstuvwxyz'\"";
     for (size_t i=0; tok_txt[i] != L'\0'; i++)
     {
@@ -1249,36 +1249,34 @@ static const parse_token_t kInvalidToken = {token_type_invalid, parse_keyword_no
 /* Terminal token */
 static const parse_token_t kTerminalToken = {parse_token_type_terminate, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0};
 
-static inline bool is_help_argument(const wchar_t *txt)
+static inline bool is_help_argument(const wcstring &txt)
 {
-    return ! wcscmp(txt, L"-h") || ! wcscmp(txt, L"--help");
+    return contains(txt, L"-h", L"--help");
 }
 
 /* Return a new parse token, advancing the tokenizer */
-static inline parse_token_t next_parse_token(tokenizer_t *tok)
+static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token)
 {
-    if (! tok_has_next(tok))
+    if (! tok->next(token))
     {
         return kTerminalToken;
     }
 
-    token_type tok_type = static_cast<token_type>(tok_last_type(tok));
-    int tok_start = tok_get_pos(tok);
-    size_t tok_extent = tok_get_extent(tok);
-    assert(tok_extent < 10000000); //paranoia
-    const wchar_t *tok_txt = tok_last(tok);
-
     parse_token_t result;
 
     /* Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy, because it ignores quotes. This is the historical behavior. For example, `builtin --names` lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as of this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it even starts to look like a feature. */
-    result.type = parse_token_type_from_tokenizer_token(tok_type);
-    result.keyword = keyword_for_token(tok_type, tok_txt);
-    result.has_dash_prefix = (tok_txt[0] == L'-');
-    result.is_help_argument = result.has_dash_prefix && is_help_argument(tok_txt);
-    result.source_start = (source_offset_t)tok_start;
-    result.source_length = (source_offset_t)tok_extent;
-
-    tok_next(tok);
+    result.type = parse_token_type_from_tokenizer_token(token->type);
+    result.keyword = keyword_for_token(token->type, token->text);
+    result.has_dash_prefix = !token->text.empty() && token->text.at(0) == L'-';
+    result.is_help_argument = result.has_dash_prefix && is_help_argument(token->text);
+    
+    /* These assertions are totally bogus. Basically our tokenizer works in size_t but we work in uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just crash. */
+    assert(token->offset < SOURCE_OFFSET_INVALID);
+    result.source_start = (source_offset_t)token->offset;
+    
+    assert(token->length <= SOURCE_OFFSET_INVALID);
+    result.source_length = (source_offset_t)token->length;
+
     return result;
 }
 
@@ -1307,11 +1305,12 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
     parse_token_t queue[2] = {kInvalidToken, kInvalidToken};
 
     /* Loop until we have a terminal token. */
+    tok_t tokenizer_token;
     for (size_t token_count = 0; queue[0].type != parse_token_type_terminate; token_count++)
     {
         /* Push a new token onto the queue */
         queue[0] = queue[1];
-        queue[1] = next_parse_token(&tok);
+        queue[1] = next_parse_token(&tok, &tokenizer_token);
 
         /* If we are leaving things unterminated, then don't pass parse_token_type_terminate */
         if (queue[0].type == parse_token_type_terminate && (parse_flags & parse_flag_leave_unterminated))
@@ -1328,7 +1327,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
         /* Handle tokenizer errors. This is a hack because really the parser should report this for itself; but it has no way of getting the tokenizer message */
         if (queue[1].type == parse_special_type_tokenizer_error)
         {
-            parser.report_tokenizer_error(queue[1], tok_get_error(&tok), tok_last(&tok));
+            parser.report_tokenizer_error(queue[1], tokenizer_token.error, tokenizer_token.text);
         }
 
         /* Handle errors */
diff --git a/src/parse_util.cpp b/src/parse_util.cpp
index 9aac4a6a..ac6cb8ca 100644
--- a/src/parse_util.cpp
+++ b/src/parse_util.cpp
@@ -358,7 +358,6 @@ static void job_or_process_extent(const wchar_t *buff,
                                   int process)
 {
     const wchar_t *begin, *end;
-    long pos;
     wchar_t *buffcpy;
     int finished=0;
 
@@ -380,7 +379,8 @@ static void job_or_process_extent(const wchar_t *buff,
         return;
     }
 
-    pos = cursor_pos - (begin - buff);
+    assert(cursor_pos >= (begin - buff));
+    const size_t pos = cursor_pos - (begin - buff);
 
     if (a)
     {
@@ -400,11 +400,12 @@ static void job_or_process_extent(const wchar_t *buff,
     }
 
     tokenizer_t tok(buffcpy, TOK_ACCEPT_UNFINISHED);
-    for (; tok_has_next(&tok) && !finished; tok_next(&tok))
+    tok_t token;
+    while (tok.next(&token) && !finished)
     {
-        int tok_begin = tok_get_pos(&tok);
+        size_t tok_begin = token.offset;
 
-        switch (tok_last_type(&tok))
+        switch (token.type)
         {
             case TOK_PIPE:
             {
@@ -501,17 +502,18 @@ void parse_util_token_extent(const wchar_t *buff,
     const wcstring buffcpy = wcstring(cmdsubst_begin, cmdsubst_end-cmdsubst_begin);
 
     tokenizer_t tok(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SQUASH_ERRORS);
-    for (; tok_has_next(&tok); tok_next(&tok))
+    tok_t token;
+    while (tok.next(&token))
     {
-        size_t tok_begin = tok_get_pos(&tok);
+        size_t tok_begin = token.offset;
         size_t tok_end = tok_begin;
 
         /*
           Calculate end of token
         */
-        if (tok_last_type(&tok) == TOK_STRING)
+        if (token.type == TOK_STRING)
         {
-            tok_end += wcslen(tok_last(&tok));
+            tok_end += token.text.size();
         }
 
         /*
@@ -529,20 +531,20 @@ void parse_util_token_extent(const wchar_t *buff,
           If cursor is inside the token, this is the token we are
           looking for. If so, set a and b and break
         */
-        if ((tok_last_type(&tok) == TOK_STRING) && (tok_end >= offset_within_cmdsubst))
+        if (token.type == TOK_STRING && tok_end >= offset_within_cmdsubst)
         {
-            a = cmdsubst_begin + tok_get_pos(&tok);
-            b = a + wcslen(tok_last(&tok));
+            a = cmdsubst_begin + token.offset;
+            b = a + token.text.size();
             break;
         }
 
         /*
           Remember previous string token
         */
-        if (tok_last_type(&tok) == TOK_STRING)
+        if (token.type == TOK_STRING)
         {
-            pa = cmdsubst_begin + tok_get_pos(&tok);
-            pb = pa + wcslen(tok_last(&tok));
+            pa = cmdsubst_begin + token.offset;
+            pb = pa + token.text.size();
         }
     }
 
@@ -684,10 +686,11 @@ wchar_t *parse_util_unescape_wildcards(const wchar_t *str)
    token is not quoted.
 
 */
-static wchar_t get_quote(const wchar_t *cmd, size_t len)
+static wchar_t get_quote(const wcstring &cmd_str, size_t len)
 {
     size_t i=0;
     wchar_t res=0;
+    const wchar_t * const cmd = cmd_str.c_str();
 
     while (1)
     {
@@ -722,26 +725,26 @@ static wchar_t get_quote(const wchar_t *cmd, size_t len)
     return res;
 }
 
-void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote, size_t *offset, int *type)
+void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote, size_t *offset, enum token_type *out_type)
 {
     size_t prev_pos=0;
     wchar_t last_quote = '\0';
     int unfinished;
 
     tokenizer_t tok(cmd.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SQUASH_ERRORS);
-    for (; tok_has_next(&tok); tok_next(&tok))
+    tok_t token;
+    while (tok.next(&token))
     {
-        if (tok_get_pos(&tok) > pos)
+        if (token.offset > pos)
             break;
 
-        if (tok_last_type(&tok) == TOK_STRING)
-            last_quote = get_quote(tok_last(&tok),
-                                   pos - tok_get_pos(&tok));
+        if (token.type == TOK_STRING)
+            last_quote = get_quote(token.text, pos - token.offset);
 
-        if (type != NULL)
-            *type = tok_last_type(&tok);
+        if (out_type != NULL)
+            *out_type = token.type;
 
-        prev_pos = tok_get_pos(&tok);
+        prev_pos = token.offset;
     }
 
     wchar_t *cmd_tmp = wcsdup(cmd.c_str());
diff --git a/src/parse_util.h b/src/parse_util.h
index 28286ed8..0e23946e 100644
--- a/src/parse_util.h
+++ b/src/parse_util.h
@@ -166,9 +166,9 @@ bool parse_util_argument_is_help(const wchar_t *s, int min_match);
    \param pos An index in the string which is inside the parameter
    \param quote If not NULL, store the type of quote this parameter has, can be either ', " or \\0, meaning the string is not quoted.
    \param offset If not NULL, get_param will store the offset to the beginning of the parameter.
-   \param type If not NULL, get_param will store the token type as returned by tok_last.
+   \param type If not NULL, get_param will store the token type.
 */
-void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote, size_t *offset, int *type);
+void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote, size_t *offset, enum token_type *out_type);
 
 /**
    Attempts to escape the string 'cmd' using the given quote type, as determined by the quote character. The quote can be a single quote or double quote, or L'\0' to indicate no quoting (and thus escaping should be with backslashes).
diff --git a/src/reader.cpp b/src/reader.cpp
index 5828aba1..0ee2746c 100644
--- a/src/reader.cpp
+++ b/src/reader.cpp
@@ -2336,7 +2336,7 @@ static void handle_token_history(int forward, int reset)
                 switch (token.type)
                 {
                     case TOK_STRING:
-                    {   
+                    {
                         if (token.text.find(data->search_buff) != wcstring::npos)
                         {
                             //debug( 3, L"Found token at pos %d\n", tok_get_pos( &tok ) );
@@ -3087,14 +3087,13 @@ static wchar_t unescaped_quote(const wcstring &str, size_t pos)
 /* Returns true if the last token is a comment. */
 static bool text_ends_in_comment(const wcstring &text)
 {
-    token_type last_type = TOK_NONE;
     tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS | TOK_SQUASH_ERRORS);
-    while (tok_has_next(&tok))
+    tok_t token;
+    while (tok.next(&token))
     {
-        last_type = tok_last_type(&tok);
-        tok_next(&tok);
+        // pass
     }
-    return last_type == TOK_COMMENT;
+    return token.type == TOK_COMMENT;
 }
 
 const wchar_t *reader_readline(int nchars)
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 22412c47..18a617b1 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -52,6 +52,10 @@ segments.
 */
 #define PIPE_ERROR _( L"Cannot use stdin (fd 0) as pipe output" )
 
+static void tok_next(tokenizer_t *tok);
+static enum token_type tok_last_type(tokenizer_t *tok);
+static const wchar_t *tok_last(tokenizer_t *tok);
+
 /**
    Set the latest tokens string to be the specified error message
 */
@@ -62,11 +66,6 @@ static void tok_call_error(tokenizer_t *tok, enum tokenizer_error error_type, co
     tok->last_token = error_message;
 }
 
-int tok_get_error(tokenizer_t *tok)
-{
-    return tok->error;
-}
-
 tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(TOK_ERROR_NONE), squash_errors(false), continue_line_after_comment(false)
 {
     CHECK(b,);
@@ -91,13 +90,18 @@ bool tokenizer_t::next(struct tok_t *result)
     result->text = this->last_token;
     result->type = this->last_type;
     result->offset = last_pos;
+    result->error = this->last_type == TOK_ERROR ? this->error : TOK_ERROR_NONE;
+    assert(this->buff >= this->orig_buff);
+    
     assert(this->buff >= this->orig_buff);
-    result->length = this->buff - this->orig_buff;
+    size_t current_pos = this->buff - this->orig_buff;
+    result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
+    
     tok_next(this);
     return true;
 }
 
-enum token_type tok_last_type(tokenizer_t *tok)
+static enum token_type tok_last_type(tokenizer_t *tok)
 {
     CHECK(tok, TOK_ERROR);
     CHECK(tok->buff, TOK_ERROR);
@@ -105,25 +109,13 @@ enum token_type tok_last_type(tokenizer_t *tok)
     return tok->last_type;
 }
 
-const wchar_t *tok_last(tokenizer_t *tok)
+static const wchar_t *tok_last(tokenizer_t *tok)
 {
     CHECK(tok, 0);
 
     return tok->last_token.c_str();
 }
 
-int tok_has_next(tokenizer_t *tok)
-{
-    /*
-      Return 1 on broken tokenizer
-    */
-    CHECK(tok, 1);
-    CHECK(tok->buff, 1);
-
-    /*  fwprintf( stderr, L"has_next is %ls \n", tok->has_next?L"true":L"false" );*/
-    return   tok->has_next;
-}
-
 /**
    Tests if this character can be a part of a string. The redirect ^ is allowed unless it's the first character.
    Hash (#) starts a comment if it's the first character in a token; otherwise it is considered a string character.
@@ -539,7 +531,7 @@ static bool my_iswspace(wchar_t c)
     return c != L'\n' && iswspace(c);
 }
 
-void tok_next(tokenizer_t *tok)
+static void tok_next(tokenizer_t *tok)
 {
 
     CHECK(tok,);
@@ -718,20 +710,6 @@ wcstring tok_first(const wchar_t *str)
     return result;
 }
 
-int tok_get_pos(const tokenizer_t *tok)
-{
-    CHECK(tok, 0);
-    return (int)tok->last_pos;
-}
-
-size_t tok_get_extent(const tokenizer_t *tok)
-{
-    CHECK(tok, 0);
-    size_t current_pos = tok->buff - tok->orig_buff;
-    return current_pos > tok->last_pos ? current_pos - tok->last_pos : 0;
-}
-
-
 bool move_word_state_machine_t::consume_char_punctuation(wchar_t c)
 {
     enum
diff --git a/src/tokenizer.h b/src/tokenizer.h
index 14e648fd..7e4003f8 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -76,16 +76,16 @@ struct tok_t
     /* The type of the token */
     token_type type;
     
+    /* If an error, this is the error code */
+    enum tokenizer_error error;
+    
     /* Offset of the token */
     size_t offset;
     
     /* Length of the token */
     size_t length;
     
-    /* If an error, this is the error code */
-    enum tokenizer_error error;
-    
-    tok_t() : type(TOK_NONE), offset(-1), length(-1), error(TOK_ERROR_NONE) {}
+    tok_t() : type(TOK_NONE), error(TOK_ERROR_NONE), offset(-1), length(-1) {}
 };
 
 /**
@@ -138,33 +138,6 @@ struct tokenizer_t
     bool next(struct tok_t *result);
 };
 
-/**
-  Jump to the next token.
-*/
-void tok_next(tokenizer_t *tok);
-
-/**
-  Returns the type of the last token. Must be one of the values in the token_type enum.
-*/
-enum token_type tok_last_type(tokenizer_t *tok);
-
-/**
-  Returns the last token string. The string should not be freed by the caller. This returns nonsense results for some token types, like TOK_END.
-*/
-const wchar_t *tok_last(tokenizer_t *tok);
-
-/**
-  Returns true as long as there are more tokens left
-*/
-int tok_has_next(tokenizer_t *tok);
-
-/**
-  Returns the position of the beginning of the current token in the original string
-*/
-int tok_get_pos(const tokenizer_t *tok);
-
-/** Returns the extent of the current token */
-size_t tok_get_extent(const tokenizer_t *tok);
 
 /**
    Returns only the first token from the specified string. This is a
@@ -175,11 +148,6 @@ size_t tok_get_extent(const tokenizer_t *tok);
 */
 wcstring tok_first(const wchar_t *str);
 
-/**
-   Get tokenizer error type. Should only be called if tok_last_tope returns TOK_ERROR.
-*/
-int tok_get_error(tokenizer_t *tok);
-
 /* Helper function to determine redirection type from a string, or TOK_NONE if the redirection is invalid. Also returns the fd by reference. */
 enum token_type redirection_type_for_string(const wcstring &str, int *out_fd = NULL);
author	ridiculousfish <corydoras@ridiculousfish.com>	2015-07-26 00:12:36 -0700
committer	ridiculousfish <corydoras@ridiculousfish.com>	2015-07-26 00:12:36 -0700
commit	4ebaa7b6bd595f37f66f8d77ee836c1e5a4ef647 (patch)
tree	ef3a2396c6307afb54697126a5c17f02c35835f4
parent	618896c0436e6ca70feb2fa317b34171cc4e1a81 (diff)