Assorted fixes to fix parsing issues and be more GCC-like:

- Moved scanning of char constants and string literals entirely to Lexer - Parser: separate STRING_LITERAL from CONSTANT to be closer to ISO C99 grammar - pre_parser: adapted + "asm" takes string_literal, not CONSTANT - Revised errors "inline doesnt belong here" git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2492 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e
author: xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e> 2014-05-12 15:52:42 +0000
committer: xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e> 2014-05-12 15:52:42 +0000
commit: edc00e0c90a5598f653add89f42a095d8ee1b629 (patch)
tree: 2d2539335cc7e916a8964847b2ed7489f9340d00 /cparser/Lexer.mll
parent: 951bf7bdb208f500c86e8d45c45247cd25adb4ab (diff)
1 files changed, 101 insertions, 62 deletions
diff --git a/cparser/Lexer.mll b/cparser/Lexer.mll
index e4cb9a6..276aead 100644
--- a/cparser/Lexer.mll
+++ b/cparser/Lexer.mll
@@ -20,8 +20,6 @@ open Pre_parser_aux
 open Cabshelper
 open Camlcoq
 
-module SMap = Map.Make(String)
-
 let contexts : string list list ref = ref []
 let lexicon : (string, Cabs.cabsloc -> token) Hashtbl.t = Hashtbl.create 0
 
@@ -128,6 +126,32 @@ let currentLoc =
             byteno   = p.Lexing.pos_cnum;
             ident    = getident ();})
 
+(* Error reporting *)
+
+let fatal_error lb fmt =
+  Cerrors.fatal_error ("%s:%d: Error:@ " ^^ fmt)
+      lb.lex_curr_p.pos_fname lb.lex_curr_p.pos_lnum
+
+let error lb fmt =
+  Cerrors.error ("%s:%d: Error:@ " ^^ fmt)
+      lb.lex_curr_p.pos_fname lb.lex_curr_p.pos_lnum
+
+let warning lb fmt =
+  Cerrors.warning ("%s:%d: Warning:@ " ^^ fmt)
+      lb.lex_curr_p.pos_fname lb.lex_curr_p.pos_lnum
+
+(* Simple character escapes *)
+
+let convert_escape = function
+  | 'a' -> 7L  (* bell *)
+  | 'b' -> 8L  (* backspace  *)
+  | 'e' -> 27L (* escape (GCC extension) *)
+  | 'f' -> 12L (* form feed *)
+  | 'n' -> 10L (* new line *)
+  | 'r' -> 13L (* carriage return *)
+  | 't' -> 9L  (* horizontal tab *)
+  | 'v' -> 11L (* vertical tab *)
+  | c   -> Int64.of_int (Char.code c)
 }
 
 (* Identifiers *)
@@ -138,8 +162,8 @@ let nondigit = ['_' 'a'-'z' 'A'-'Z']
 let hex_quad = hexadecimal_digit hexadecimal_digit
                  hexadecimal_digit hexadecimal_digit
 let universal_character_name =
-    "\\u" hex_quad
-  | "\\U" hex_quad hex_quad
+    "\\u" (hex_quad as n)
+  | "\\U" (hex_quad hex_quad as n)
 
 let identifier_nondigit =
     nondigit
@@ -204,36 +228,19 @@ let hexadecimal_floating_constant =
   | hexadecimal_prefix (hexadecimal_digit_sequence as intpart)
         binary_exponent_part floating_suffix?
 
-(* Charater constants *)
+(* Character and string constants *)
 let simple_escape_sequence =
-    "\\'" | "\\\"" | "\\?" | "\\\\" | "\\a" | "\\b" | "\\f" | "\\n"
-  | "\\r" | "\\t" | "\\v"
+  '\\' ( ['\''  '\"'  '?'  '\\'  'a'  'b'  'e'  'f'  'n'  'r'  't'  'v'] as c)
 let octal_escape_sequence =
-    '\\' octal_digit
-  | '\\' octal_digit octal_digit
-  | '\\' octal_digit octal_digit octal_digit
-let hexadecimal_escape_sequence = "\\x" hexadecimal_digit+
+  '\\' ((octal_digit
+         | octal_digit octal_digit
+         | octal_digit octal_digit octal_digit) as n)
+let hexadecimal_escape_sequence = "\\x" (hexadecimal_digit+ as n)
 let escape_sequence =
     simple_escape_sequence
   | octal_escape_sequence
   | hexadecimal_escape_sequence
   | universal_character_name
-let c_char =
-    [^ '\'' '\\' '\n']
-  | escape_sequence
-let c_char_sequence = c_char+
-let character_constant =
-    "'" c_char_sequence "'"
-  | "L'" c_char_sequence "'"
-
-(* String literals *)
-let s_char =
-    [^ '"' '\\' '\n']
-  | escape_sequence
-let s_char_sequence = s_char+
-let string_literal =
-    '"' s_char_sequence? '"'
-  | 'L' '"' s_char_sequence? '"'
 
 rule initial = parse
   | '\n'                          { new_line lexbuf; initial_linebegin lexbuf }
@@ -261,8 +268,16 @@ rule initial = parse
                                            | None -> None
                                            | Some c -> Some (String.make 1 c) },
                                       currentLoc lexbuf)}
-  | character_constant as s       { CONSTANT (Cabs.CONST_CHAR s, currentLoc lexbuf) }
-  | string_literal as s           { STRING_LITERAL (s, currentLoc lexbuf) }
+  | "'"                           { let l = char_literal [] lexbuf in
+                                    CONSTANT (Cabs.CONST_CHAR(false, l),
+                                              currentLoc lexbuf) }
+  | "L'"                          { let l = char_literal [] lexbuf in
+                                    CONSTANT (Cabs.CONST_CHAR(true, l),
+                                              currentLoc lexbuf) }
+  | "\""                          { let l = string_literal [] lexbuf in
+                                    STRING_LITERAL(false, l, currentLoc lexbuf) }
+  | "L\""                         { let l = string_literal [] lexbuf in
+                                    STRING_LITERAL(true, l, currentLoc lexbuf) }
   | "..."                         { ELLIPSIS(currentLoc lexbuf) }
   | "+="                          { ADD_ASSIGN(currentLoc lexbuf) }
   | "-="                          { SUB_ASSIGN(currentLoc lexbuf) }
@@ -313,12 +328,7 @@ rule initial = parse
         try Hashtbl.find lexicon id (currentLoc lexbuf)
         with Not_found -> VAR_NAME (id, ref VarId, currentLoc lexbuf) }
   | eof                           { EOF }
-  | '"' ("" | 'L') s_char* '\\' (_ as c) {
-      Cerrors.fatal_error "%s:%d Error:@ invalid escape sequence in string litteral %S"
-        lexbuf.lex_curr_p.pos_fname lexbuf.lex_curr_p.pos_lnum (Printf.sprintf "\\%c" c) }
-  | _ as c                        {
-      Cerrors.fatal_error "%s:%d Error:@ invalid symbol %C"
-        lexbuf.lex_curr_p.pos_fname lexbuf.lex_curr_p.pos_lnum c }
+  | _ as c                        { fatal_error lexbuf "invalid symbol %C" c }
 
 and initial_linebegin = parse
   | '\n'                          { new_line lexbuf; initial_linebegin lexbuf }
@@ -326,6 +336,41 @@ and initial_linebegin = parse
   | '#'                           { hash lexbuf }
   | ""                            { initial lexbuf }
 
+and char = parse
+  | universal_character_name
+      { try
+          Int64.of_string ("0x" ^ n)
+        with Failure _ ->
+          error lexbuf "overflow in universal character name";
+          0L
+      }
+  | hexadecimal_escape_sequence
+      { try
+          Int64.of_string ("0x" ^ n)
+        with Failure _ ->
+          error lexbuf "overflow in hexadecimal escape sequence";
+          0L
+      }
+  | octal_escape_sequence
+      { Int64.of_string  ("0o" ^ n) }
+  | simple_escape_sequence
+      { convert_escape c }
+  | '\\' (_ as c)
+      { warning lexbuf "incorrect escape sequence '\\%c', treating as '%c'" c c;
+        Int64.of_int (Char.code c) }
+  | _ as c
+      { Int64.of_int (Char.code c) }
+
+and char_literal accu = parse
+  | '\''       { List.rev accu }
+  | '\n' | eof { fatal_error lexbuf "missing terminating \"'\" character" }
+  | ""         { let c = char lexbuf in char_literal (c :: accu) lexbuf }
+
+and string_literal accu = parse
+  | '\"'       { List.rev accu }
+  | '\n' | eof { fatal_error lexbuf "missing terminating '\"' character" }
+  | ""         { let c = char lexbuf in string_literal (c :: accu) lexbuf }
+
 (* We assume gcc -E syntax but try to tolerate variations. *)
 and hash = parse
   | whitespace_char_no_newline +
@@ -334,11 +379,10 @@ and hash = parse
     "\"" ([^ '\n' '\"']* as file) "\""
     [^ '\n']* '\n'
       { let n =
-          try int_of_string n
-          with Failure "int_of_string" ->
-            Cerrors.warning "%s:%d Warning:@ invalid line number"
-              lexbuf.lex_curr_p.pos_fname lexbuf.lex_curr_p.pos_lnum;
-            lexbuf.lex_curr_p.pos_lnum
+          try
+            int_of_string n
+          with Failure _ ->
+            warning lexbuf "invalid line number"; lexbuf.lex_curr_p.pos_lnum
         in
         lexbuf.lex_curr_p <- {
           lexbuf.lex_curr_p with
@@ -353,29 +397,24 @@ and hash = parse
     ([^ '\n']* as s) '\n'
       { new_line lexbuf; PRAGMA (s, currentLoc lexbuf) }
   | [^ '\n']* '\n'
-      { Cerrors.warning "%s:%d Warning:@ unrecognized '#' line"
-          lexbuf.lex_curr_p.pos_fname lexbuf.lex_curr_p.pos_lnum;
+      { warning lexbuf "unrecognized '#' line";
         new_line lexbuf; initial_linebegin lexbuf }
   | [^ '\n']* eof
-      { Cerrors.fatal_error "%s:%d Error:@ unexpected end of file"
-          lexbuf.lex_curr_p.pos_fname lexbuf.lex_curr_p.pos_lnum }
+      { fatal_error lexbuf "unexpected end of file" }
   | _ as c
-      { Cerrors.fatal_error "%s:%d Error:@ invalid symbol %C"
-          lexbuf.lex_curr_p.pos_fname lexbuf.lex_curr_p.pos_lnum c }
+      { fatal_error lexbuf "invalid symbol %C" c }
 
 (* Multi-line comment terminated by "*/" *)
 and multiline_comment = parse
   | "*/"   { () }
-  | eof    { Cerrors.error "%s:%d Error: unterminated comment"
-                lexbuf.lex_curr_p.pos_fname lexbuf.lex_curr_p.pos_lnum }
+  | eof    { error lexbuf "unterminated comment" }
   | '\n'   { new_line lexbuf; multiline_comment lexbuf }
   | _      { multiline_comment lexbuf }
 
 (* Single-line comment terminated by a newline *)
 and singleline_comment = parse
   | '\n'   { new_line lexbuf }
-  | eof    { Cerrors.error "%s:%d Error: unterminated comment"
-                lexbuf.lex_curr_p.pos_fname lexbuf.lex_curr_p.pos_lnum }
+  | eof    { () }
   | _      { singleline_comment lexbuf }
 
 {
@@ -477,22 +516,22 @@ and singleline_comment = parse
       | SLASH loc -> loop SLASH't loc
       | STAR loc -> loop STAR't loc
       | STATIC loc -> loop STATIC't loc
-      | STRING_LITERAL (str, loc) ->
-          let buf = Buffer.create (String.length str) in
-          Buffer.add_string buf str;
+      | STRING_LITERAL (wide, str, loc) ->
           (* Merge consecutive string literals *)
-          let rec doConcat () =
+          let rec doConcat wide str =
             try
               match Queue.peek tokens with
-              | STRING_LITERAL (str, loc) ->
+              | STRING_LITERAL (wide', str', loc) ->
                   ignore (Queue.pop tokens);
-                  Buffer.add_string buf str;
-                  doConcat ()
-              | _ -> ()
-            with Queue.Empty -> ()
-          in
-          doConcat ();
-          loop CONSTANT't (Cabs.CONST_STRING (Buffer.contents buf), loc)
+                  let (wide'', str'') = doConcat wide' str' in
+                  if str'' <> []
+                  then (wide || wide'', str @ str'')
+                  else (wide, str)
+              | _ ->
+                  (wide, str)
+            with Queue.Empty -> (wide, str) in
+          let (wide', str') = doConcat wide str in
+          loop STRING_LITERAL't ((wide', str'), loc)
       | STRUCT loc -> loop STRUCT't loc
       | SUB_ASSIGN loc -> loop SUB_ASSIGN't loc
       | SWITCH loc -> loop SWITCH't loc
author	xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>	2014-05-12 15:52:42 +0000
committer	xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>	2014-05-12 15:52:42 +0000
commit	edc00e0c90a5598f653add89f42a095d8ee1b629 (patch)
tree	2d2539335cc7e916a8964847b2ed7489f9340d00 /cparser/Lexer.mll
parent	951bf7bdb208f500c86e8d45c45247cd25adb4ab (diff)