summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
author: xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e> 2014-06-05 06:50:10 +0000
committer: xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e> 2014-06-05 06:50:10 +0000
commit: 16fc8b07cb8dbbabad0b665b9114925f4349cf38 (patch)
tree: 3ed603afd2e87c5ed70fe6e86d6691d876129024 /lib
parent: c8a892a09e9f61c3af7dae30d39509558f77462a (diff)
Cleaner, more resilient parsing of pragmas.
git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2507 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e
Diffstat (limited to 'lib')
-rw-r--r--  lib/Tokenize.mli  33
-rw-r--r--  lib/Tokenize.mll  45
2 files changed, 78 insertions, 0 deletions
diff --git a/lib/Tokenize.mli b/lib/Tokenize.mli
new file mode 100644
index 0000000..a9f22c4
--- /dev/null
+++ b/lib/Tokenize.mli
@@ -0,0 +1,33 @@
+(* *********************************************************************)
+(* *)
+(* The Compcert verified compiler *)
+(* *)
+(* Xavier Leroy, INRIA Paris-Rocquencourt *)
+(* *)
+(* Copyright Institut National de Recherche en Informatique et en *)
+(* Automatique. All rights reserved. This file is distributed *)
+(* under the terms of the GNU General Public License as published by *)
+(* the Free Software Foundation, either version 2 of the License, or *)
+(* (at your option) any later version. This file is also distributed *)
+(* under the terms of the INRIA Non-Commercial License Agreement. *)
+(* *)
+(* *********************************************************************)
+
+(* Parse a string as a list of tokens *)
+
+val string: string -> string list
+ (** [Tokenize.string s] decomposes [s] into a list of tokens.
+ Whitespace separates tokens. The following substrings
+ constitute tokens:
+ - A string enclosed in double quotes. Within the string,
+ the escape sequences '\t' '\n' '\"' and '\\' are recognized.
+ The token value is the contents of the string without the
+ enclosing double quotes.
+ - A string enclosed in single quotes. No escape sequences are
+ recognized. The token value is the contents of the string without the
+ enclosing single quotes.
+ - A sequence of letters, digits, or the [_], [$], [-] and [.]
+ characters. [-] and [.] cannot appear as the first character.
+ - Any other non-whitespace character is treated as a separate token
+ of length 1.
+ *)
diff --git a/lib/Tokenize.mll b/lib/Tokenize.mll
new file mode 100644
index 0000000..422068b
--- /dev/null
+++ b/lib/Tokenize.mll
@@ -0,0 +1,45 @@
+(* *********************************************************************)
+(* *)
+(* The Compcert verified compiler *)
+(* *)
+(* Xavier Leroy, INRIA Paris-Rocquencourt *)
+(* *)
+(* Copyright Institut National de Recherche en Informatique et en *)
+(* Automatique. All rights reserved. This file is distributed *)
+(* under the terms of the GNU General Public License as published by *)
+(* the Free Software Foundation, either version 2 of the License, or *)
+(* (at your option) any later version. This file is also distributed *)
+(* under the terms of the INRIA Non-Commercial License Agreement. *)
+(* *)
+(* *********************************************************************)
+
+(* Parse a string as a list of tokens *)
+
+let identstart = [ '0'-'9' 'A'-'Z' 'a'-'z' '$' '_' ]
+let identcont = [ '0'-'9' 'A'-'Z' 'a'-'z' '$' '_' '-' '.' ]
+
+rule tokenize acc = parse
+ | eof { List.rev acc }
+ | [' ' '\t' '\n'] + { tokenize acc lexbuf }
+ | "\"" { tok_dquote acc (Buffer.create 16) lexbuf }
+ | "'" { tok_squote acc (Buffer.create 16) lexbuf }
+ | (identstart identcont*) as s
+ { tokenize (s :: acc) lexbuf }
+ | _ as c { tokenize (String.make 1 c :: acc) lexbuf }
+
+and tok_dquote acc buf = parse
+ | "\"" | eof { tokenize (Buffer.contents buf :: acc) lexbuf }
+ | "\\t" { Buffer.add_char buf '\t'; tok_dquote acc buf lexbuf }
+ | "\\n" { Buffer.add_char buf '\n'; tok_dquote acc buf lexbuf }
+ | "\\" ([ '\\' '\"' ] as c)
+ { Buffer.add_char buf c; tok_dquote acc buf lexbuf }
+ | _ as c { Buffer.add_char buf c; tok_dquote acc buf lexbuf }
+
+and tok_squote acc buf = parse
+ | "\'" | eof { tokenize (Buffer.contents buf :: acc) lexbuf }
+ | _ as c { Buffer.add_char buf c; tok_squote acc buf lexbuf }
+
+{
+let string s =
+ tokenize [] (Lexing.from_string s)
+}