diff options
author | ridiculousfish <corydoras@ridiculousfish.com> | 2015-07-25 23:05:47 -0700 |
---|---|---|
committer | ridiculousfish <corydoras@ridiculousfish.com> | 2015-07-25 23:05:47 -0700 |
commit | 618896c0436e6ca70feb2fa317b34171cc4e1a81 (patch) | |
tree | 4f6d8130f52fe7c9523e034ab85fe3b49fd0ea0b /src | |
parent | 0dbd83ffaf571dce9b1e8449c28e3ae0040d4e75 (diff) |
Early reworking of tokenizer interface
Diffstat (limited to 'src')
-rw-r--r-- | src/fish_tests.cpp | 22 | ||||
-rw-r--r-- | src/reader.cpp | 24 | ||||
-rw-r--r-- | src/tokenizer.cpp | 20 | ||||
-rw-r--r-- | src/tokenizer.h | 26 |
4 files changed, 72 insertions, 20 deletions
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index a7587575..cbece61c 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -468,22 +468,34 @@ static void test_tok() const wchar_t *str = L"string <redirection 2>&1 'nested \"quoted\" '(string containing subshells ){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect Compress_Newlines\n \n\t\n \nInto_Just_One"; const int types[] = { - TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_STRING, TOK_END, TOK_STRING, TOK_END + TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_STRING, TOK_END, TOK_STRING }; say(L"Test correct tokenization"); tokenizer_t t(str, 0); - for (size_t i=0; i < sizeof types / sizeof *types; i++, tok_next(&t)) + tok_t token; + size_t i = 0; + while (t.next(&token)) { - if (types[i] != tok_last_type(&t)) + if (i > sizeof types / sizeof *types) + { + err(L"Too many tokens returned from tokenizer"); + break; + } + if (types[i] != token.type) { err(L"Tokenization error:"); - wprintf(L"Token number %d of string \n'%ls'\n, got token '%ls'\n", + wprintf(L"Token number %d of string \n'%ls'\n, got token type %ld\n", i+1, str, - tok_last(&t)); + (long)token.type); } + i++; + } + if (i < sizeof types / sizeof *types) + { + err(L"Too few tokens returned from tokenizer"); } } diff --git a/src/reader.cpp b/src/reader.cpp index 5e386dc7..5828aba1 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -246,7 +246,7 @@ public: /** Saved position used by token history search */ - int token_history_pos; + size_t token_history_pos; /** Saved search string for token history search. Not handled by command_line_changed. 
@@ -2256,7 +2256,7 @@ static void handle_token_history(int forward, int reset) return; wcstring str; - long current_pos; + size_t current_pos; if (reset) { @@ -2292,7 +2292,7 @@ static void handle_token_history(int forward, int reset) } else { - if (current_pos == -1) + if (current_pos == size_t(-1)) { data->token_history_buff.clear(); @@ -2330,26 +2330,26 @@ static void handle_token_history(int forward, int reset) //debug( 3, L"new '%ls'", data->token_history_buff.c_str() ); tokenizer_t tok(data->token_history_buff.c_str(), TOK_ACCEPT_UNFINISHED); - for (; tok_has_next(&tok); tok_next(&tok)) + tok_t token; + while (tok.next(&token)) { - switch (tok_last_type(&tok)) + switch (token.type) { case TOK_STRING: - { - if (wcsstr(tok_last(&tok), data->search_buff.c_str())) + { + if (token.text.find(data->search_buff) != wcstring::npos) { //debug( 3, L"Found token at pos %d\n", tok_get_pos( &tok ) ); - if (tok_get_pos(&tok) >= current_pos) + if (token.offset >= current_pos) { break; } //debug( 3, L"ok pos" ); - const wcstring last_tok = tok_last(&tok); - if (find(data->search_prev.begin(), data->search_prev.end(), last_tok) == data->search_prev.end()) + if (find(data->search_prev.begin(), data->search_prev.end(), token.text) == data->search_prev.end()) { - data->token_history_pos = tok_get_pos(&tok); - str = tok_last(&tok); + data->token_history_pos = token.offset; + str = token.text; } } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 55e9dc9c..22412c47 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -55,7 +55,7 @@ segments. 
/** Set the latest tokens string to be the specified error message */ -static void tok_call_error(tokenizer_t *tok, int error_type, const wchar_t *error_message) +static void tok_call_error(tokenizer_t *tok, enum tokenizer_error error_type, const wchar_t *error_message) { tok->last_type = TOK_ERROR; tok->error = error_type; @@ -67,7 +67,7 @@ int tok_get_error(tokenizer_t *tok) { return tok->error; } -tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(0), squash_errors(false), continue_line_after_comment(false) +tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(TOK_ERROR_NONE), squash_errors(false), continue_line_after_comment(false) { CHECK(b,); @@ -81,6 +81,22 @@ tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig tok_next(this); } +bool tokenizer_t::next(struct tok_t *result) +{ + assert(result != NULL); + if (! this->has_next)
+ { + return false; + } + result->text = this->last_token; + result->type = this->last_type; + result->offset = last_pos; + assert(this->buff >= this->orig_buff); + result->length = this->buff - this->orig_buff; + tok_next(this); + return true; +} + enum token_type tok_last_type(tokenizer_t *tok) { CHECK(tok, TOK_ERROR); diff --git a/src/tokenizer.h b/src/tokenizer.h index f5f0455d..14e648fd 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -36,6 +36,7 @@ enum token_type */ enum tokenizer_error { + TOK_ERROR_NONE, TOK_UNTERMINATED_QUOTE, TOK_UNTERMINATED_SUBSHELL, TOK_UNTERMINATED_ESCAPE, @@ -67,6 +68,26 @@ enum tokenizer_error typedef unsigned int tok_flags_t; +struct tok_t +{ + /* The text of the token, or an error message for type error */ + wcstring text; + + /* The type of the token */ + token_type type; + + /* Offset of the token */ + size_t offset; + + /* Length of the token */ + size_t length; + + /* If an error, this is the error code */ + enum tokenizer_error error; + + tok_t() : type(TOK_NONE), offset(-1), length(-1), error(TOK_ERROR_NONE) {} +}; + /** The tokenizer struct. */ @@ -93,7 +114,7 @@ struct tokenizer_t /** Whether all blank lines are returned */ bool show_blank_lines; /** Last error */ - int error; + tokenizer_error error; /* Whether we are squashing errors */ bool squash_errors; @@ -112,6 +133,9 @@ struct tokenizer_t */ tokenizer_t(const wchar_t *b, tok_flags_t flags); + + /** Returns the next token by reference. Returns true if we got one, false if we're at the end. */ + bool next(struct tok_t *result); }; /** |