From 891377ce1962cdb31357d6580d6546ec22df2b4f Mon Sep 17 00:00:00 2001 From: xleroy Date: Wed, 3 Mar 2010 10:22:27 +0000 Subject: Switching to the new C parser/elaborator/simplifier git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@1269 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e --- cparser/Lexer.mll | 604 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 604 insertions(+) create mode 100644 cparser/Lexer.mll (limited to 'cparser/Lexer.mll') diff --git a/cparser/Lexer.mll b/cparser/Lexer.mll new file mode 100644 index 0000000..d4947ad --- /dev/null +++ b/cparser/Lexer.mll @@ -0,0 +1,604 @@ +(* + * + * Copyright (c) 2001-2003, + * George C. Necula + * Scott McPeak + * Wes Weimer + * Ben Liblit + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. The names of the contributors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + *) +(* FrontC -- lexical analyzer +** +** 1.0 3.22.99 Hugues Cassé First version. +** 2.0 George Necula 12/12/00: Many extensions +*) +{ +open Lexing +open Parse_aux +open Parser + +exception Eof + +module H = Hashtbl + +let newline lb = + let cp = lb.lex_curr_p in + lb.lex_curr_p <- { cp with pos_lnum = 1 + cp.pos_lnum } + +let setCurrentLine lb lineno = + let cp = lb.lex_curr_p in + lb.lex_curr_p <- { cp with pos_lnum = lineno } + +let setCurrentFile lb file = + let cp = lb.lex_curr_p in + lb.lex_curr_p <- { cp with pos_fname = file } + +let matchingParsOpen = ref 0 + +let currentLoc = Cabshelper.currentLoc_lexbuf + +let int64_to_char value = + assert (value <= 255L && value >= 0L); + Char.chr (Int64.to_int value) + +(* takes a not-nul-terminated list, and converts it to a string. *) +let rec intlist_to_string (str: int64 list):string = + match str with + [] -> "" (* add nul-termination *) + | value::rest -> + let this_char = int64_to_char value in + (String.make 1 this_char) ^ (intlist_to_string rest) + +(* +** Keyword hashtable +*) +let lexicon = H.create 211 +let init_lexicon _ = + H.clear lexicon; + List.iter + (fun (key, builder) -> H.add lexicon key builder) + [ ("_Bool", fun loc -> UNDERSCORE_BOOL loc); + ("auto", fun loc -> AUTO loc); + ("const", fun loc -> CONST loc); + ("__const", fun loc -> CONST loc); + ("__const__", fun loc -> CONST loc); + ("static", fun loc -> STATIC loc); + ("extern", fun loc -> EXTERN loc); + ("long", fun loc -> LONG loc); + ("short", fun loc -> SHORT loc); + ("register", fun loc -> REGISTER loc); + ("signed", fun loc -> SIGNED loc); + ("__signed", fun loc -> SIGNED loc); + ("unsigned", fun loc -> UNSIGNED loc); + ("volatile", fun loc -> VOLATILE loc); + ("__volatile", fun loc -> VOLATILE loc); + (* WW: see /usr/include/sys/cdefs.h for why __signed and __volatile + * are accepted GCC-isms *) + ("char", fun loc -> CHAR loc); + ("int", fun loc -> INT loc); + ("float", fun loc -> FLOAT loc); + ("double", fun loc -> DOUBLE loc); + ("void", fun loc -> VOID loc); + ("enum", fun loc -> ENUM loc); + ("struct", fun loc -> STRUCT loc); + ("typedef", fun loc -> TYPEDEF loc); + ("union", fun loc -> UNION loc); + ("break", fun loc -> BREAK loc); + ("continue", fun loc -> CONTINUE loc); + ("goto", fun loc -> GOTO loc); + ("return", fun loc -> RETURN loc); + ("switch", fun loc -> SWITCH loc); + ("case", fun loc -> CASE loc); + ("default", fun loc -> DEFAULT loc); + ("while", fun loc -> WHILE loc); + ("do", fun loc -> DO loc); + ("for", fun loc -> FOR loc); + ("if", fun loc -> IF loc); + ("else", fun _ -> ELSE); + (*** Implementation specific keywords ***) + ("__signed__", fun loc -> SIGNED loc); + ("__inline__", fun loc -> INLINE loc); + ("inline", fun loc -> INLINE loc); + ("__inline", fun loc -> INLINE loc); + ("_inline", fun loc -> + if !msvcMode then + INLINE loc + else + IDENT ("_inline", loc)); + ("__attribute__", fun loc -> ATTRIBUTE loc); + ("__attribute", fun loc -> ATTRIBUTE loc); +(* + ("__attribute_used__", fun loc -> ATTRIBUTE_USED loc); +*) + ("__blockattribute__", fun _ -> BLOCKATTRIBUTE); + ("__blockattribute", fun _ -> BLOCKATTRIBUTE); + ("__asm__", fun loc -> ASM loc); + ("asm", fun loc -> ASM loc); + ("__typeof__", fun loc -> TYPEOF loc); + ("__typeof", fun loc -> TYPEOF loc); + ("typeof", fun loc -> TYPEOF loc); + ("__alignof", fun loc -> ALIGNOF loc); + ("__alignof__", fun loc -> ALIGNOF loc); + ("__volatile__", fun loc -> VOLATILE loc); + ("__volatile", fun loc -> VOLATILE loc); + + ("__FUNCTION__", fun loc -> FUNCTION__ loc); + ("__func__", fun loc -> FUNCTION__ loc); (* ISO 6.4.2.2 *) + ("__PRETTY_FUNCTION__", fun loc -> PRETTY_FUNCTION__ loc); + ("__label__", fun _ -> LABEL__); + (*** weimer: GCC arcana ***) + ("__restrict", fun loc -> RESTRICT loc); + ("restrict", fun loc -> RESTRICT loc); +(* ("__extension__", EXTENSION); *) + (**** MS VC ***) + ("__int64", fun loc -> INT64 loc); + ("__int32", fun loc -> INT loc); + ("_cdecl", fun loc -> MSATTR ("_cdecl", loc)); + ("__cdecl", fun loc -> MSATTR ("__cdecl", loc)); + ("_stdcall", fun loc -> MSATTR ("_stdcall", loc)); + ("__stdcall", fun loc -> MSATTR ("__stdcall", loc)); + ("_fastcall", fun loc -> MSATTR ("_fastcall", loc)); + ("__fastcall", fun loc -> MSATTR ("__fastcall", loc)); + ("__w64", fun loc -> MSATTR("__w64", loc)); + ("__declspec", fun loc -> DECLSPEC loc); + ("__forceinline", fun loc -> INLINE loc); (* !! we turn forceinline + * into inline *) + ("__try", fun loc -> TRY loc); + ("__except", fun loc -> EXCEPT loc); + ("__finally", fun loc -> FINALLY loc); + (* weimer: some files produced by 'GCC -E' expect this type to be + * defined *) + ("__builtin_va_list", fun loc -> NAMED_TYPE ("__builtin_va_list", loc)); + ("__builtin_va_arg", fun loc -> BUILTIN_VA_ARG loc); + ("__builtin_types_compatible_p", fun loc -> BUILTIN_TYPES_COMPAT loc); + ("__builtin_offsetof", fun loc -> BUILTIN_OFFSETOF loc); + (* On some versions of GCC __thread is a regular identifier *) + ("__thread", fun loc -> THREAD loc) + ] + +(* Mark an identifier as a type name. The old mapping is preserved and will + * be reinstated when we exit this context *) +let add_type name = + (* ignore (print_string ("adding type name " ^ name ^ "\n")); *) + H.add lexicon name (fun loc -> NAMED_TYPE (name, loc)) + +let context : string list list ref = ref [] + +let push_context _ = context := []::!context + +let pop_context _ = + match !context with + [] -> assert false + | con::sub -> + (context := sub; + List.iter (fun name -> + (* ignore (print_string ("removing lexicon for " ^ name ^ "\n")); *) + H.remove lexicon name) con) + +(* Mark an identifier as a variable name. The old mapping is preserved and + * will be reinstated when we exit this context *) +let add_identifier name = + match !context with + [] -> () (* Just ignore raise (InternalError "Empty context stack") *) + | con::sub -> + context := (name::con)::sub; + H.add lexicon name (fun loc -> IDENT (name, loc)) + + +(* +** Useful primitives +*) +let scan_ident lb id = + let here = currentLoc lb in + try (H.find lexicon id) here + (* default to variable name, as opposed to type *) + with Not_found -> IDENT (id, here) + + +(* +** Buffer processor +*) + + +let init ~(filename: string) ic : Lexing.lexbuf = + init_lexicon (); + (* Inititialize the pointer in Errormsg *) + Parse_aux.add_type := add_type; + Parse_aux.push_context := push_context; + Parse_aux.pop_context := pop_context; + Parse_aux.add_identifier := add_identifier; + (* Build lexbuf *) + let lb = Lexing.from_channel ic in + let cp = lb.lex_curr_p in + lb.lex_curr_p <- {cp with pos_fname = filename; pos_lnum = 1}; + lb + +let finish () = + () + +(*** Error handling ***) +let error = parse_error + + +(*** escape character management ***) +let scan_escape (char: char) : int64 = + let result = match char with + 'n' -> '\n' + | 'r' -> '\r' + | 't' -> '\t' + | 'b' -> '\b' + | 'f' -> '\012' (* ASCII code 12 *) + | 'v' -> '\011' (* ASCII code 11 *) + | 'a' -> '\007' (* ASCII code 7 *) + | 'e' | 'E' -> '\027' (* ASCII code 27. This is a GCC extension *) + | '\'' -> '\'' + | '"'-> '"' (* '"' *) + | '?' -> '?' + | '(' when not !msvcMode -> '(' + | '{' when not !msvcMode -> '{' + | '[' when not !msvcMode -> '[' + | '%' when not !msvcMode -> '%' + | '\\' -> '\\' + | other -> error ("Unrecognized escape sequence: \\" ^ (String.make 1 other)); raise Parsing.Parse_error + in + Int64.of_int (Char.code result) + +let scan_hex_escape str = + let radix = Int64.of_int 16 in + let the_value = ref Int64.zero in + (* start at character 2 to skip the \x *) + for i = 2 to (String.length str) - 1 do + let thisDigit = Cabshelper.valueOfDigit (String.get str i) in + (* the_value := !the_value * 16 + thisDigit *) + the_value := Int64.add (Int64.mul !the_value radix) thisDigit + done; + !the_value + +let scan_oct_escape str = + let radix = Int64.of_int 8 in + let the_value = ref Int64.zero in + (* start at character 1 to skip the \x *) + for i = 1 to (String.length str) - 1 do + let thisDigit = Cabshelper.valueOfDigit (String.get str i) in + (* the_value := !the_value * 8 + thisDigit *) + the_value := Int64.add (Int64.mul !the_value radix) thisDigit + done; + !the_value + +let lex_hex_escape remainder lexbuf = + let prefix = scan_hex_escape (Lexing.lexeme lexbuf) in + prefix :: remainder lexbuf + +let lex_oct_escape remainder lexbuf = + let prefix = scan_oct_escape (Lexing.lexeme lexbuf) in + prefix :: remainder lexbuf + +let lex_simple_escape remainder lexbuf = + let lexchar = Lexing.lexeme_char lexbuf 1 in + let prefix = scan_escape lexchar in + prefix :: remainder lexbuf + +let lex_unescaped remainder lexbuf = + let prefix = Int64.of_int (Char.code (Lexing.lexeme_char lexbuf 0)) in + prefix :: remainder lexbuf + +let lex_comment remainder lexbuf = + let ch = Lexing.lexeme_char lexbuf 0 in + let prefix = Int64.of_int (Char.code ch) in + if ch = '\n' then newline lexbuf; + prefix :: remainder lexbuf + +let make_char (i:int64):char = + let min_val = Int64.zero in + let max_val = Int64.of_int 255 in + (* if i < 0 || i > 255 then error*) + if compare i min_val < 0 || compare i max_val > 0 then begin + let msg = Printf.sprintf "clexer:make_char: character 0x%Lx too big" i in + error msg + end; + Char.chr (Int64.to_int i) + + +(* ISO standard locale-specific function to convert a wide character + * into a sequence of normal characters. Here we work on strings. + * We convert L"Hi" to "H\000i\000" + matth: this seems unused. +let wbtowc wstr = + let len = String.length wstr in + let dest = String.make (len * 2) '\000' in + for i = 0 to len-1 do + dest.[i*2] <- wstr.[i] ; + done ; + dest +*) + +(* This function converst the "Hi" in L"Hi" to { L'H', L'i', L'\0' } + matth: this seems unused. +let wstr_to_warray wstr = + let len = String.length wstr in + let res = ref "{ " in + for i = 0 to len-1 do + res := !res ^ (Printf.sprintf "L'%c', " wstr.[i]) + done ; + res := !res ^ "}" ; + !res +*) + +} + +let decdigit = ['0'-'9'] +let octdigit = ['0'-'7'] +let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] +let letter = ['a'- 'z' 'A'-'Z'] + + +let usuffix = ['u' 'U'] +let lsuffix = "l"|"L"|"ll"|"LL" +let intsuffix = lsuffix | usuffix | usuffix lsuffix | lsuffix usuffix + | usuffix ? "i64" + + +let hexprefix = '0' ['x' 'X'] + +let intnum = decdigit+ intsuffix? +let octnum = '0' octdigit+ intsuffix? +let hexnum = hexprefix hexdigit+ intsuffix? + +let exponent = ['e' 'E']['+' '-']? decdigit+ +let fraction = '.' decdigit+ +let decfloat = (intnum? fraction) + |(intnum exponent) + |(intnum? fraction exponent) + | (intnum '.') + | (intnum '.' exponent) + +let hexfraction = hexdigit* '.' hexdigit+ | hexdigit+ +let binexponent = ['p' 'P'] ['+' '-']? decdigit+ +let hexfloat = hexprefix hexfraction binexponent + | hexprefix hexdigit+ binexponent + +let floatsuffix = ['f' 'F' 'l' 'L'] +let floatnum = (decfloat | hexfloat) floatsuffix? + +let ident = (letter|'_'|'$')(letter|decdigit|'_'|'$')* +let blank = [' ' '\t' '\012' '\r']+ +let escape = '\\' _ +let hex_escape = '\\' ['x' 'X'] hexdigit+ +let oct_escape = '\\' octdigit octdigit? octdigit? + +rule initial = + parse "/*" { comment lexbuf; + initial lexbuf} +| "//" { onelinecomment lexbuf; + newline lexbuf; + initial lexbuf + } +| blank { initial lexbuf} +| '\n' { newline lexbuf; + initial lexbuf } +| '\\' '\r' * '\n' { newline lexbuf; + initial lexbuf + } +| '#' { hash lexbuf} +(* +| "_Pragma" { PRAGMA (currentLoc lexbuf) } +*) +| '\'' { CST_CHAR (chr lexbuf, currentLoc lexbuf)} +| "L'" { CST_WCHAR (chr lexbuf, currentLoc lexbuf) } +| '"' { (* '"' *) +(* matth: BUG: this could be either a regular string or a wide string. + * e.g. if it's the "world" in + * L"Hello, " "world" + * then it should be treated as wide even though there's no L immediately + * preceding it. See test/small1/wchar5.c for a failure case. *) + CST_STRING (str lexbuf, currentLoc lexbuf) } +| "L\"" { (* weimer: wchar_t string literal *) + CST_WSTRING(str lexbuf, currentLoc lexbuf) } +| floatnum {CST_FLOAT (Lexing.lexeme lexbuf, currentLoc lexbuf)} +| hexnum {CST_INT (Lexing.lexeme lexbuf, currentLoc lexbuf)} +| octnum {CST_INT (Lexing.lexeme lexbuf, currentLoc lexbuf)} +| intnum {CST_INT (Lexing.lexeme lexbuf, currentLoc lexbuf)} +| "!quit!" {EOF} +| "..." {ELLIPSIS} +| "+=" {PLUS_EQ} +| "-=" {MINUS_EQ} +| "*=" {STAR_EQ} +| "/=" {SLASH_EQ} +| "%=" {PERCENT_EQ} +| "|=" {PIPE_EQ} +| "&=" {AND_EQ} +| "^=" {CIRC_EQ} +| "<<=" {INF_INF_EQ} +| ">>=" {SUP_SUP_EQ} +| "<<" {INF_INF} +| ">>" {SUP_SUP} +| "==" {EQ_EQ} +| "!=" {EXCLAM_EQ} +| "<=" {INF_EQ} +| ">=" {SUP_EQ} +| "=" {EQ} +| "<" {INF} +| ">" {SUP} +| "++" {PLUS_PLUS (currentLoc lexbuf)} +| "--" {MINUS_MINUS (currentLoc lexbuf)} +| "->" {ARROW} +| '+' {PLUS (currentLoc lexbuf)} +| '-' {MINUS (currentLoc lexbuf)} +| '*' {STAR (currentLoc lexbuf)} +| '/' {SLASH} +| '%' {PERCENT} +| '!' {EXCLAM (currentLoc lexbuf)} +| "&&" {AND_AND (currentLoc lexbuf)} +| "||" {PIPE_PIPE} +| '&' {AND (currentLoc lexbuf)} +| '|' {PIPE} +| '^' {CIRC} +| '?' {QUEST} +| ':' {COLON} +| '~' {TILDE (currentLoc lexbuf)} + +| '{' {LBRACE (currentLoc lexbuf)} +| '}' {RBRACE (currentLoc lexbuf)} +| '[' {LBRACKET} +| ']' {RBRACKET} +| '(' { (LPAREN (currentLoc lexbuf)) } +| ')' {RPAREN} +| ';' { (SEMICOLON (currentLoc lexbuf)) } +| ',' {COMMA} +| '.' {DOT} +| "sizeof" {SIZEOF (currentLoc lexbuf)} +| "__asm" { if !msvcMode then + MSASM (msasm lexbuf, currentLoc lexbuf) + else (ASM (currentLoc lexbuf)) } + +(* If we see __pragma we eat it and the matching parentheses as well *) +| "__pragma" { matchingParsOpen := 0; + let _ = matchingpars lexbuf in + initial lexbuf + } + +(* __extension__ is a black. The parser runs into some conflicts if we let it + * pass *) +| "__extension__" {initial lexbuf } +| ident {scan_ident lexbuf (Lexing.lexeme lexbuf)} +| eof {EOF} +| _ {parse_error "Invalid symbol"; raise Parsing.Parse_error } +and comment = + parse + "*/" { () } +| eof { () } +| '\n' { newline lexbuf; comment lexbuf } +| _ { comment lexbuf } + + +and onelinecomment = parse + '\n'|eof { () } +| _ { onelinecomment lexbuf } + +and matchingpars = parse + '\n' { newline lexbuf; matchingpars lexbuf } +| blank { matchingpars lexbuf } +| '(' { incr matchingParsOpen; matchingpars lexbuf } +| ')' { decr matchingParsOpen; + if !matchingParsOpen = 0 then + () + else + matchingpars lexbuf + } +| "/*" { comment lexbuf; matchingpars lexbuf} +| '"' { (* '"' *) + let _ = str lexbuf in + matchingpars lexbuf + } +| _ { matchingpars lexbuf } + +(* # ... *) +and hash = parse + '\n' { newline lexbuf; initial lexbuf} +| blank { hash lexbuf} +| intnum { (* We are seeing a line number. This is the number for the + * next line *) + let s = Lexing.lexeme lexbuf in + begin try + setCurrentLine lexbuf (int_of_string s - 1) + with Failure ("int_of_string") -> + (* the int is too big. *) + () + end; + (* A file name may follow *) + file lexbuf } +| "line" { hash lexbuf } (* MSVC line number info *) +| "pragma" blank + { let here = currentLoc lexbuf in + PRAGMA_LINE (pragma lexbuf, here) + } +| _ { endline lexbuf} + +and file = parse + '\n' { newline lexbuf; initial lexbuf} +| blank { file lexbuf} +| '"' [^ '\012' '\t' '"']* '"' { (* '"' *) + let n = Lexing.lexeme lexbuf in + let n1 = String.sub n 1 + ((String.length n) - 2) in + setCurrentFile lexbuf n1; + endline lexbuf} + +| _ { endline lexbuf} + +and endline = parse + '\n' { newline lexbuf; initial lexbuf} +| eof { EOF } +| _ { endline lexbuf} + +and pragma = parse + '\n' { newline lexbuf; "" } +| _ { let cur = Lexing.lexeme lexbuf in + cur ^ (pragma lexbuf) } + +and str = parse + '"' {[]} (* no nul terminiation in CST_STRING '"' *) +| hex_escape { lex_hex_escape str lexbuf} +| oct_escape { lex_oct_escape str lexbuf} +| escape { lex_simple_escape str lexbuf} +| _ { lex_unescaped str lexbuf} + +and chr = parse + '\'' {[]} +| hex_escape {lex_hex_escape chr lexbuf} +| oct_escape {lex_oct_escape chr lexbuf} +| escape {lex_simple_escape chr lexbuf} +| _ {lex_unescaped chr lexbuf} + +and msasm = parse + blank { msasm lexbuf } +| '{' { msasminbrace lexbuf } +| _ { let cur = Lexing.lexeme lexbuf in + cur ^ (msasmnobrace lexbuf) } + +and msasminbrace = parse + '}' { "" } +| _ { let cur = Lexing.lexeme lexbuf in + cur ^ (msasminbrace lexbuf) } +and msasmnobrace = parse + ['}' ';' '\n'] { lexbuf.Lexing.lex_curr_pos <- + lexbuf.Lexing.lex_curr_pos - 1; + "" } +| "__asm" { lexbuf.Lexing.lex_curr_pos <- + lexbuf.Lexing.lex_curr_pos - 5; + "" } +| _ { let cur = Lexing.lexeme lexbuf in + + cur ^ (msasmnobrace lexbuf) } + +{ + +} -- cgit v1.2.3