diff options
author | ridiculousfish <corydoras@ridiculousfish.com> | 2014-11-24 01:20:57 -0800 |
---|---|---|
committer | ridiculousfish <corydoras@ridiculousfish.com> | 2014-11-24 01:23:42 -0800 |
commit | eafd5776292c37d37870fc6013029f7146f34f70 (patch) | |
tree | 6d9d81452eef02560933c42734ce92562407d875 | |
parent | 196a7c9d188304cd6b189b1bcf4e2c088fcf3434 (diff) |
Hack the tokenizer to compress multiple adjacent newlines into one
This slightly reduces the size of parse trees and is otherwise a
minor optimization.
-rw-r--r-- | fish_tests.cpp | 4 | ||||
-rw-r--r-- | tokenizer.cpp | 12 |
2 files changed, 12 insertions, 4 deletions
diff --git a/fish_tests.cpp b/fish_tests.cpp index 4df8322c..9fa6a110 100644 --- a/fish_tests.cpp +++ b/fish_tests.cpp @@ -458,10 +458,10 @@ static void test_tok() say(L"Test destruction of broken tokenizer"); { - const wchar_t *str = L"string <redirection 2>&1 'nested \"quoted\" '(string containing subshells ){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect"; + const wchar_t *str = L"string <redirection 2>&1 'nested \"quoted\" '(string containing subshells ){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect Compress_Newlines\n \n\t\n \nInto_Just_One"; const int types[] = { - TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_END + TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_STRING, TOK_END, TOK_STRING, TOK_END }; say(L"Test correct tokenization"); diff --git a/tokenizer.cpp b/tokenizer.cpp index 17999356..29db04bd 100644 --- a/tokenizer.cpp +++ b/tokenizer.cpp @@ -621,14 +621,22 @@ void tok_next(tokenizer_t *tok) switch (*tok->buff) { - case L'\0': tok->last_type = TOK_END; /*fwprintf( stderr, L"End of string\n" );*/ tok->has_next = false; break; - case 13: + case 13: // carriage return case L'\n': + // Hack: when we get a newline, swallow as many as we can + // This compresses multiple subsequent newlines into a single one + while (*tok->buff == L'\n' || *tok->buff == 13 || *tok->buff == ' ' || *tok->buff == '\t') + { + tok->buff++; + } + tok->last_type = TOK_END; + break; + case L';': tok->last_type = TOK_END; tok->buff++; |