From 16fc8b07cb8dbbabad0b665b9114925f4349cf38 Mon Sep 17 00:00:00 2001
From: xleroy
Date: Thu, 5 Jun 2014 06:50:10 +0000
Subject: Cleaner, more resilient parsing of pragmas.

git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2507 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e
---
 lib/Tokenize.mli | 35 +++++++++++++++++++++++++++++++++++
 lib/Tokenize.mll | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 lib/Tokenize.mli
 create mode 100644 lib/Tokenize.mll

(limited to 'lib')

diff --git a/lib/Tokenize.mli b/lib/Tokenize.mli
new file mode 100644
index 0000000..a9f22c4
--- /dev/null
+++ b/lib/Tokenize.mli
@@ -0,0 +1,35 @@
+(* *********************************************************************)
+(*                                                                     *)
+(*              The Compcert verified compiler                         *)
+(*                                                                     *)
+(*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*                                                                     *)
+(*  Copyright Institut National de Recherche en Informatique et en     *)
+(*  Automatique.  All rights reserved.  This file is distributed       *)
+(*  under the terms of the GNU General Public License as published by  *)
+(*  the Free Software Foundation, either version 2 of the License, or  *)
+(*  (at your option) any later version.  This file is also distributed *)
+(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
+(*                                                                     *)
+(* *********************************************************************)
+
+(* Parse a string as a list of tokens *)
+
+val string: string -> string list
+  (** [Tokenize.string s] decomposes [s] into a list of tokens.
+      Whitespace (space, tab, newline, carriage return) separates
+      tokens.  The following substrings constitute tokens:
+      - A string enclosed in double quotes.  Within the string,
+        the escape sequences '\t' '\n' '\"' and '\\' are recognized.
+        The token value is the contents of the string without the
+        enclosing double quotes.
+      - A string enclosed in single quotes.  No escape sequences are
+        recognized.  The token value is the contents of the string without the
+        enclosing single quotes.
+      - A sequence of letters, digits, or the [_], [$], [-] and [.]
+        characters.  [-] and [.] cannot appear as the first character.
+      - Any other non-whitespace character is treated as a separate token
+        of length 1.
+      A quoted string left unterminated at the end of [s] is closed
+      implicitly.
+  *)
diff --git a/lib/Tokenize.mll b/lib/Tokenize.mll
new file mode 100644
index 0000000..422068b
--- /dev/null
+++ b/lib/Tokenize.mll
@@ -0,0 +1,58 @@
+(* *********************************************************************)
+(*                                                                     *)
+(*              The Compcert verified compiler                         *)
+(*                                                                     *)
+(*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*                                                                     *)
+(*  Copyright Institut National de Recherche en Informatique et en     *)
+(*  Automatique.  All rights reserved.  This file is distributed       *)
+(*  under the terms of the GNU General Public License as published by  *)
+(*  the Free Software Foundation, either version 2 of the License, or  *)
+(*  (at your option) any later version.  This file is also distributed *)
+(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
+(*                                                                     *)
+(* *********************************************************************)
+
+(* Parse a string as a list of tokens *)
+
+(* Characters that may start an identifier-like token. *)
+let identstart = [ '0'-'9' 'A'-'Z' 'a'-'z' '$' '_' ]
+(* Characters that may follow: the same set plus dash and dot. *)
+let identcont  = [ '0'-'9' 'A'-'Z' 'a'-'z' '$' '_' '-' '.' ]
+
+(* Main rule.  [acc] holds the tokens found so far, most recent first;
+   it is reversed once when the end of the input is reached. *)
+rule tokenize acc = parse
+  | eof                 { List.rev acc }
+  (* Whitespace only separates tokens.  Carriage return is included so
+     that text with CRLF line endings yields no stray one-character
+     token. *)
+  | [' ' '\t' '\n' '\r'] + { tokenize acc lexbuf }
+  | "\""                { tok_dquote acc (Buffer.create 16) lexbuf }
+  | "'"                 { tok_squote acc (Buffer.create 16) lexbuf }
+  | (identstart identcont*) as s
+                        { tokenize (s :: acc) lexbuf }
+  | _ as c              { tokenize (String.make 1 c :: acc) lexbuf }
+
+(* Contents of a double-quoted string, collected in [buf].  Reaching the
+   end of the input closes the string instead of signalling an error. *)
+and tok_dquote acc buf = parse
+  | "\"" | eof          { tokenize (Buffer.contents buf :: acc) lexbuf }
+  | "\\t"               { Buffer.add_char buf '\t'; tok_dquote acc buf lexbuf }
+  | "\\n"               { Buffer.add_char buf '\n'; tok_dquote acc buf lexbuf }
+  | "\\" ([ '\\' '\"' ] as c)
+                        { Buffer.add_char buf c; tok_dquote acc buf lexbuf }
+  (* Anything else, including a backslash that starts no recognized
+     escape, is copied as is. *)
+  | _ as c              { Buffer.add_char buf c; tok_dquote acc buf lexbuf }
+
+(* Contents of a single-quoted string: no escapes are interpreted, and
+   the end of the input closes the string as well. *)
+and tok_squote acc buf = parse
+  | "\'" | eof          { tokenize (Buffer.contents buf :: acc) lexbuf }
+  | _ as c              { Buffer.add_char buf c; tok_squote acc buf lexbuf }
+
+{
+let string s =
+  tokenize [] (Lexing.from_string s)
+}
-- 
cgit v1.2.3