(* *********************************************************************)
(*                                                                     *)
(*              The Compcert verified compiler                         *)
(*                                                                     *)
(*          Jacques-Henri Jourdan, INRIA Paris-Rocquencourt            *)
(*                                                                     *)
(*  Copyright Institut National de Recherche en Informatique et en     *)
(*  Automatique.  All rights reserved.  This file is distributed       *)
(*  under the terms of the GNU General Public License as published by  *)
(*  the Free Software Foundation, either version 2 of the License, or  *)
(*  (at your option) any later version.  This file is also distributed *)
(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
(*                                                                     *)
(* *********************************************************************)

{
open Lexing
open Pre_parser
open Pre_parser_aux
open Cabshelper
open Camlcoq

let contexts : string list list ref = ref []
let lexicon : (string, Cabs.cabsloc -> token) Hashtbl.t = Hashtbl.create 0

let init filename channel : Lexing.lexbuf =
  assert (!contexts = []);
  Hashtbl.clear lexicon;
  List.iter
    (fun (key, builder) -> Hashtbl.add lexicon key builder)
    [ ("auto", fun loc -> AUTO loc);
      ("break", fun loc -> BREAK loc);
      ("case", fun loc -> CASE loc);
      ("char", fun loc -> CHAR loc);
      ("const", fun loc -> CONST loc);
      ("__const", fun loc -> CONST loc);
      ("__const__", fun loc -> CONST loc);
      ("continue", fun loc -> CONTINUE loc);
      ("default", fun loc -> DEFAULT loc);
      ("do", fun loc -> DO loc);
      ("double", fun loc -> DOUBLE loc);
      ("else", fun loc -> ELSE loc);
      ("enum", fun loc -> ENUM loc);
      ("extern", fun loc -> EXTERN loc);
      ("float", fun loc -> FLOAT loc);
      ("for", fun loc -> FOR loc);
      ("goto", fun loc -> GOTO loc);
      ("if", fun loc -> IF loc);
      ("inline", fun loc -> INLINE loc);
      ("__inline", fun loc -> INLINE loc);
      ("__inline__", fun loc -> INLINE loc);
      ("int", fun loc -> INT loc);
      ("long", fun loc -> LONG loc);
      ("register", fun loc -> REGISTER loc);
      ("restrict", fun loc -> RESTRICT loc);
      ("__restrict", fun loc -> RESTRICT loc);
      ("__restrict__", fun loc -> RESTRICT loc);
      ("return", fun loc -> RETURN loc);
      ("short", fun loc -> SHORT loc);
      ("signed", fun loc -> SIGNED loc);
      ("sizeof", fun loc -> SIZEOF loc);
      ("static", fun loc -> STATIC loc);
      ("struct", fun loc -> STRUCT loc);
      ("switch", fun loc -> SWITCH loc);
      ("typedef", fun loc -> TYPEDEF loc);
      ("union", fun loc -> UNION loc);
      ("unsigned", fun loc -> UNSIGNED loc);
      ("void", fun loc -> VOID loc);
      ("volatile", fun loc -> VOLATILE loc);
      ("while", fun loc -> WHILE loc);
      ("_Alignas", fun loc -> ALIGNAS loc);
      ("_Alignof", fun loc -> ALIGNOF loc);
      ("__alignof", fun loc -> ALIGNOF loc);
      ("__alignof__", fun loc -> ALIGNOF loc);
      ("__attribute", fun loc -> ATTRIBUTE loc);
      ("__attribute__", fun loc -> ATTRIBUTE loc);
      ("_Bool", fun loc -> UNDERSCORE_BOOL loc);
      ("__builtin_va_arg", fun loc -> BUILTIN_VA_ARG loc);
      ("__packed__", fun loc -> PACKED loc);
      ("__asm__", fun loc -> ASM loc);
      ("__asm", fun loc -> ASM loc);
      ("asm", fun loc -> ASM loc);
    ];

  push_context := begin fun () -> contexts := []::!contexts end;
  pop_context := begin fun () ->
    match !contexts with
      | [] -> assert false
      | t::q -> List.iter (Hashtbl.remove lexicon) t;
                contexts := q
  end;

 declare_varname := begin fun id ->
   if Hashtbl.mem lexicon id then begin
     Hashtbl.add lexicon id (fun loc -> VAR_NAME (id, ref VarId, loc));
     match !contexts with
       | [] -> ()
       | t::q -> contexts := (id::t)::q
     end
  end;

  declare_typename := begin fun id ->
    Hashtbl.add lexicon id (fun loc -> TYPEDEF_NAME (id, ref TypedefId, loc));
    match !contexts with
      | [] -> ()
      | t::q -> contexts := (id::t)::q
  end;

  !declare_typename "__builtin_va_list";

  let lb = Lexing.from_channel channel in
  lb.lex_curr_p <-
    {lb.lex_curr_p with pos_fname = filename; pos_lnum = 1};
  lb

let currentLoc =
  let nextident = ref 0 in
  let getident () =
    nextident := !nextident + 1;
    !nextident
  in
  fun lb ->
    let p = Lexing.lexeme_start_p lb in
    Cabs.({ lineno   = p.Lexing.pos_lnum;
            filename = p.Lexing.pos_fname;
            byteno   = p.Lexing.pos_cnum;
            ident    = getident ();})

(* Error reporting *)

let fatal_error lb fmt =
  Cerrors.fatal_error ("%s:%d: Error:@ " ^^ fmt)
      lb.lex_curr_p.pos_fname lb.lex_curr_p.pos_lnum

let error lb fmt =
  Cerrors.error ("%s:%d: Error:@ " ^^ fmt)
      lb.lex_curr_p.pos_fname lb.lex_curr_p.pos_lnum

let warning lb fmt =
  Cerrors.warning ("%s:%d: Warning:@ " ^^ fmt)
      lb.lex_curr_p.pos_fname lb.lex_curr_p.pos_lnum

(* Simple character escapes *)

let convert_escape = function
  | 'a' -> 7L  (* bell *)
  | 'b' -> 8L  (* backspace  *)
  | 'e' -> 27L (* escape (GCC extension) *)
  | 'f' -> 12L (* form feed *)
  | 'n' -> 10L (* new line *)
  | 'r' -> 13L (* carriage return *)
  | 't' -> 9L  (* horizontal tab *)
  | 'v' -> 11L (* vertical tab *)
  | c   -> Int64.of_int (Char.code c)
}

(* Identifiers *)
let digit = ['0'-'9']
let hexadecimal_digit = ['0'-'9' 'A'-'F' 'a'-'f']
let nondigit = ['_' 'a'-'z' 'A'-'Z']

let hex_quad = hexadecimal_digit hexadecimal_digit
                 hexadecimal_digit hexadecimal_digit
let universal_character_name =
    "\\u" (hex_quad as n)
  | "\\U" (hex_quad hex_quad as n)

let identifier_nondigit =
    nondigit
(*| universal_character_name*)
  | '$'

let identifier = identifier_nondigit (identifier_nondigit|digit)*

(* Whitespaces *)
let whitespace_char_no_newline = [' ' '\t' '\012' '\r']

(* Integer constants *)
let nonzero_digit = ['1'-'9']
let decimal_constant = nonzero_digit digit*

let octal_digit = ['0'-'7']
let octal_constant = '0' octal_digit*

let hexadecimal_prefix = "0x" | "0X"
let hexadecimal_constant =
  hexadecimal_prefix hexadecimal_digit+

let unsigned_suffix = ['u' 'U']
let long_suffix = ['l' 'L']
let long_long_suffix = "ll" | "LL"
let integer_suffix =
    unsigned_suffix long_suffix?
  | unsigned_suffix long_long_suffix
  | long_suffix unsigned_suffix?
  | long_long_suffix unsigned_suffix?

let integer_constant =
    decimal_constant integer_suffix?
  | octal_constant integer_suffix?
  | hexadecimal_constant integer_suffix?

(* Floating constants *)
let sign = ['-' '+']
let digit_sequence = digit+
let floating_suffix = ['f' 'l' 'F' 'L'] as suffix

let fractional_constant =
    (digit_sequence as intpart)? '.' (digit_sequence as frac)
  | (digit_sequence as intpart) '.'
let exponent_part =
    'e' ((sign? digit_sequence) as expo)
  | 'E' ((sign? digit_sequence) as expo)
let decimal_floating_constant =
    fractional_constant exponent_part? floating_suffix?
  | (digit_sequence as intpart) exponent_part floating_suffix?

let hexadecimal_digit_sequence = hexadecimal_digit+
let hexadecimal_fractional_constant =
    (hexadecimal_digit_sequence as intpart)? '.' (hexadecimal_digit_sequence as frac)
  | (hexadecimal_digit_sequence as intpart) '.'
let binary_exponent_part =
    'p' ((sign? digit_sequence) as expo)
  | 'P' ((sign? digit_sequence) as expo)
let hexadecimal_floating_constant =
    hexadecimal_prefix hexadecimal_fractional_constant
        binary_exponent_part floating_suffix?
  | hexadecimal_prefix (hexadecimal_digit_sequence as intpart)
        binary_exponent_part floating_suffix?

(* Character and string constants *)
let simple_escape_sequence =
  '\\' ( ['\''  '\"'  '?'  '\\'  'a'  'b'  'e'  'f'  'n'  'r'  't'  'v'] as c)
let octal_escape_sequence =
  '\\' ((octal_digit
         | octal_digit octal_digit
         | octal_digit octal_digit octal_digit) as n)
let hexadecimal_escape_sequence = "\\x" (hexadecimal_digit+ as n)
let escape_sequence =
    simple_escape_sequence
  | octal_escape_sequence
  | hexadecimal_escape_sequence
  | universal_character_name

rule initial = parse
  | '\n'                          { new_line lexbuf; initial_linebegin lexbuf }
  | whitespace_char_no_newline +  { initial lexbuf }
  | "/*"                          { multiline_comment lexbuf; initial lexbuf }
  | "//"                          { singleline_comment lexbuf; initial lexbuf }
  | integer_constant as s         { CONSTANT (Cabs.CONST_INT s, currentLoc lexbuf) }
  | decimal_floating_constant     { CONSTANT (Cabs.CONST_FLOAT
                                      {Cabs.isHex_FI = false;
                                       Cabs.integer_FI = intpart;
                                       Cabs.fraction_FI = frac;
                                       Cabs.exponent_FI = expo;
                                       Cabs.suffix_FI =
                                         match suffix with
                                         | None -> None
                                         | Some c -> Some (String.make 1 c) },
                                      currentLoc lexbuf) }
  | hexadecimal_floating_constant { CONSTANT (Cabs.CONST_FLOAT
                                      {Cabs.isHex_FI = true;
                                       Cabs.integer_FI = intpart;
                                       Cabs.fraction_FI = frac;
                                       Cabs.exponent_FI = Some expo;
                                       Cabs.suffix_FI =
                                         match suffix with
                                           | None -> None
                                           | Some c -> Some (String.make 1 c) },
                                      currentLoc lexbuf)}
  | "'"                           { let l = char_literal [] lexbuf in
                                    CONSTANT (Cabs.CONST_CHAR(false, l),
                                              currentLoc lexbuf) }
  | "L'"                          { let l = char_literal [] lexbuf in
                                    CONSTANT (Cabs.CONST_CHAR(true, l),
                                              currentLoc lexbuf) }
  | "\""                          { let l = string_literal [] lexbuf in
                                    STRING_LITERAL(false, l, currentLoc lexbuf) }
  | "L\""                         { let l = string_literal [] lexbuf in
                                    STRING_LITERAL(true, l, currentLoc lexbuf) }
  | "..."                         { ELLIPSIS(currentLoc lexbuf) }
  | "+="                          { ADD_ASSIGN(currentLoc lexbuf) }
  | "-="                          { SUB_ASSIGN(currentLoc lexbuf) }
  | "*="                          { MUL_ASSIGN(currentLoc lexbuf) }
  | "/="                          { DIV_ASSIGN(currentLoc lexbuf) }
  | "%="                          { MOD_ASSIGN(currentLoc lexbuf) }
  | "|="                          { OR_ASSIGN(currentLoc lexbuf) }
  | "&="                          { AND_ASSIGN(currentLoc lexbuf) }
  | "^="                          { XOR_ASSIGN(currentLoc lexbuf) }
  | "<<="                         { LEFT_ASSIGN(currentLoc lexbuf) }
  | ">>="                         { RIGHT_ASSIGN(currentLoc lexbuf) }
  | "<<"                          { LEFT(currentLoc lexbuf) }
  | ">>"                          { RIGHT(currentLoc lexbuf) }
  | "=="                          { EQEQ(currentLoc lexbuf) }
  | "!="                          { NEQ(currentLoc lexbuf) }
  | "<="                          { LEQ(currentLoc lexbuf) }
  | ">="                          { GEQ(currentLoc lexbuf) }
  | "="                           { EQ(currentLoc lexbuf) }
  | "<"                           { LT(currentLoc lexbuf) }
  | ">"                           { GT(currentLoc lexbuf) }
  | "++"                          { INC(currentLoc lexbuf) }
  | "--"                          { DEC(currentLoc lexbuf) }
  | "->"                          { PTR(currentLoc lexbuf) }
  | "+"                           { PLUS(currentLoc lexbuf) }
  | "-"                           { MINUS(currentLoc lexbuf) }
  | "*"                           { STAR(currentLoc lexbuf) }
  | "/"                           { SLASH(currentLoc lexbuf) }
  | "%"                           { PERCENT(currentLoc lexbuf) }
  | "!"                           { BANG(currentLoc lexbuf) }
  | "&&"                          { ANDAND(currentLoc lexbuf) }
  | "||"                          { BARBAR(currentLoc lexbuf) }
  | "&"                           { AND(currentLoc lexbuf) }
  | "|"                           { BAR(currentLoc lexbuf) }
  | "^"                           { HAT(currentLoc lexbuf) }
  | "?"                           { QUESTION(currentLoc lexbuf) }
  | ":"                           { COLON(currentLoc lexbuf) }
  | "~"                           { TILDE(currentLoc lexbuf) }
  | "{"|"<%"                      { LBRACE(currentLoc lexbuf) }
  | "}"|"%>"                      { RBRACE(currentLoc lexbuf) }
  | "["|"<:"                      { LBRACK(currentLoc lexbuf) }
  | "]"|":>"                      { RBRACK(currentLoc lexbuf) }
  | "("                           { LPAREN(currentLoc lexbuf) }
  | ")"                           { RPAREN(currentLoc lexbuf) }
  | ";"                           { SEMICOLON(currentLoc lexbuf) }
  | ","                           { COMMA(currentLoc lexbuf) }
  | "."                           { DOT(currentLoc lexbuf) }
  | identifier as id              {
        try Hashtbl.find lexicon id (currentLoc lexbuf)
        with Not_found -> VAR_NAME (id, ref VarId, currentLoc lexbuf) }
  | eof                           { EOF }
  | _ as c                        { fatal_error lexbuf "invalid symbol %C" c }

and initial_linebegin = parse
  | '\n'                          { new_line lexbuf; initial_linebegin lexbuf }
  | whitespace_char_no_newline    { initial_linebegin lexbuf }
  | '#'                           { hash lexbuf }
  | ""                            { initial lexbuf }

and char = parse
  | universal_character_name
      { try
          Int64.of_string ("0x" ^ n)
        with Failure _ ->
          error lexbuf "overflow in universal character name";
          0L
      }
  | hexadecimal_escape_sequence
      { try
          Int64.of_string ("0x" ^ n)
        with Failure _ ->
          error lexbuf "overflow in hexadecimal escape sequence";
          0L
      }
  | octal_escape_sequence
      { Int64.of_string  ("0o" ^ n) }
  | simple_escape_sequence
      { convert_escape c }
  | '\\' (_ as c)
      { warning lexbuf "incorrect escape sequence '\\%c', treating as '%c'" c c;
        Int64.of_int (Char.code c) }
  | _ as c
      { Int64.of_int (Char.code c) }

and char_literal accu = parse
  | '\''       { List.rev accu }
  | '\n' | eof { fatal_error lexbuf "missing terminating \"'\" character" }
  | ""         { let c = char lexbuf in char_literal (c :: accu) lexbuf }

and string_literal accu = parse
  | '\"'       { List.rev accu }
  | '\n' | eof { fatal_error lexbuf "missing terminating '\"' character" }
  | ""         { let c = char lexbuf in string_literal (c :: accu) lexbuf }

(* We assume gcc -E syntax but try to tolerate variations. *)
and hash = parse
  | whitespace_char_no_newline +
    (decimal_constant as n)
    whitespace_char_no_newline *
    "\"" ([^ '\n' '\"']* as file) "\""
    [^ '\n']* '\n'
      { let n =
          try
            int_of_string n
          with Failure _ ->
            warning lexbuf "invalid line number"; lexbuf.lex_curr_p.pos_lnum
        in
        lexbuf.lex_curr_p <- {
          lexbuf.lex_curr_p with
            pos_fname = file;
            pos_lnum = n;
            pos_bol = lexbuf.lex_curr_p.pos_cnum
        };
        initial_linebegin lexbuf }
  | whitespace_char_no_newline *
    "pragma"
    whitespace_char_no_newline +
    ([^ '\n']* as s) '\n'
      { new_line lexbuf; PRAGMA (s, currentLoc lexbuf) }
  | [^ '\n']* '\n'
      { warning lexbuf "unrecognized '#' line";
        new_line lexbuf; initial_linebegin lexbuf }
  | [^ '\n']* eof
      { fatal_error lexbuf "unexpected end of file" }
  | _ as c
      { fatal_error lexbuf "invalid symbol %C" c }

(* Multi-line comment terminated by "*/" *)
and multiline_comment = parse
  | "*/"   { () }
  | eof    { error lexbuf "unterminated comment" }
  | '\n'   { new_line lexbuf; multiline_comment lexbuf }
  | _      { multiline_comment lexbuf }

(* Single-line comment terminated by a newline *)
and singleline_comment = parse
  | '\n'   { new_line lexbuf }
  | eof    { () }
  | _      { singleline_comment lexbuf }

{
  open Streams
  open Specif
  open Parser
  open Aut.GramDefs

  let tokens_stream lexbuf : token coq_Stream =
    let tokens = Queue.create () in
    let lexer_wraper lexbuf : Pre_parser.token =
      let res =
        if lexbuf.lex_curr_p.pos_cnum = lexbuf.lex_curr_p.pos_bol then
          initial_linebegin lexbuf
        else
          initial lexbuf
      in
      Queue.push res tokens;
      res
    in
    Pre_parser.translation_unit_file lexer_wraper lexbuf;
    assert (!contexts = []);
    let rec compute_token_stream () =
      let loop t v =
        Cons (Coq_existT (t, Obj.magic v), Lazy.from_fun compute_token_stream)
      in
      match Queue.pop tokens with
      | ADD_ASSIGN loc -> loop ADD_ASSIGN't loc
      | AND loc -> loop AND't loc
      | ANDAND loc -> loop ANDAND't loc
      | AND_ASSIGN loc -> loop AND_ASSIGN't loc
      | AUTO loc -> loop AUTO't loc
      | BANG loc -> loop BANG't loc
      | BAR loc -> loop BAR't loc
      | BARBAR loc -> loop BARBAR't loc
      | UNDERSCORE_BOOL loc -> loop UNDERSCORE_BOOL't loc
      | BREAK loc -> loop BREAK't loc
      | BUILTIN_VA_ARG loc -> loop BUILTIN_VA_ARG't loc
      | CASE loc -> loop CASE't loc
      | CHAR loc -> loop CHAR't loc
      | COLON loc -> loop COLON't loc
      | COMMA loc -> loop COMMA't loc
      | CONST loc -> loop CONST't loc
      | CONSTANT (cst, loc) -> loop CONSTANT't (cst, loc)
      | CONTINUE loc -> loop CONTINUE't loc
      | DEC loc -> loop DEC't loc
      | DEFAULT loc -> loop DEFAULT't loc
      | DIV_ASSIGN loc -> loop DIV_ASSIGN't loc
      | DO loc -> loop DO't loc
      | DOT loc -> loop DOT't loc
      | DOUBLE loc -> loop DOUBLE't loc
      | ELLIPSIS loc -> loop ELLIPSIS't loc
      | ELSE loc -> loop ELSE't loc
      | ENUM loc -> loop ENUM't loc
      | EOF -> loop EOF't ()
      | EQ loc -> loop EQ't loc
      | EQEQ loc -> loop EQEQ't loc
      | EXTERN loc -> loop EXTERN't loc
      | FLOAT loc -> loop FLOAT't loc
      | FOR loc -> loop FOR't loc
      | GEQ loc -> loop GEQ't loc
      | GOTO loc -> loop GOTO't loc
      | GT loc -> loop GT't loc
      | HAT loc -> loop HAT't loc
      | IF loc -> loop IF't loc
      | INC loc -> loop INC't loc
      | INLINE loc -> loop INLINE't loc
      | INT loc -> loop INT't loc
      | LBRACE loc -> loop LBRACE't loc
      | LBRACK loc -> loop LBRACK't loc
      | LEFT loc -> loop LEFT't loc
      | LEFT_ASSIGN loc -> loop LEFT_ASSIGN't loc
      | LEQ loc -> loop LEQ't loc
      | LONG loc -> loop LONG't loc
      | LPAREN loc -> loop LPAREN't loc
      | LT loc -> loop LT't loc
      | MINUS loc -> loop MINUS't loc
      | MOD_ASSIGN loc -> loop MOD_ASSIGN't loc
      | MUL_ASSIGN loc -> loop MUL_ASSIGN't loc
      | NEQ loc -> loop NEQ't loc
      | OR_ASSIGN loc -> loop OR_ASSIGN't loc
      | PACKED loc -> loop PACKED't loc
      | PERCENT loc -> loop PERCENT't loc
      | PLUS loc -> loop PLUS't loc
      | PTR loc -> loop PTR't loc
      | QUESTION loc -> loop QUESTION't loc
      | RBRACE loc -> loop RBRACE't loc
      | RBRACK loc -> loop RBRACK't loc
      | REGISTER loc -> loop REGISTER't loc
      | RESTRICT loc -> loop RESTRICT't loc
      | RETURN loc -> loop RETURN't loc
      | RIGHT loc -> loop RIGHT't loc
      | RIGHT_ASSIGN loc -> loop RIGHT_ASSIGN't loc
      | RPAREN loc -> loop RPAREN't loc
      | SEMICOLON loc -> loop SEMICOLON't loc
      | SHORT loc -> loop SHORT't loc
      | SIGNED loc -> loop SIGNED't loc
      | SIZEOF loc -> loop SIZEOF't loc
      | SLASH loc -> loop SLASH't loc
      | STAR loc -> loop STAR't loc
      | STATIC loc -> loop STATIC't loc
      | STRING_LITERAL (wide, str, loc) ->
          (* Merge consecutive string literals *)
          let rec doConcat wide str =
            try
              match Queue.peek tokens with
              | STRING_LITERAL (wide', str', loc) ->
                  ignore (Queue.pop tokens);
                  let (wide'', str'') = doConcat wide' str' in
                  if str'' <> []
                  then (wide || wide'', str @ str'')
                  else (wide, str)
              | _ ->
                  (wide, str)
            with Queue.Empty -> (wide, str) in
          let (wide', str') = doConcat wide str in
          loop STRING_LITERAL't ((wide', str'), loc)
      | STRUCT loc -> loop STRUCT't loc
      | SUB_ASSIGN loc -> loop SUB_ASSIGN't loc
      | SWITCH loc -> loop SWITCH't loc
      | TILDE loc -> loop TILDE't loc
      | TYPEDEF loc -> loop TYPEDEF't loc
      | TYPEDEF_NAME (id, typ, loc)
      | VAR_NAME (id, typ, loc) ->
          let terminal = match !typ with
            | VarId -> VAR_NAME't
            | TypedefId -> TYPEDEF_NAME't
            | OtherId -> OTHER_NAME't
          in
          loop terminal (id, loc)
      | UNION loc -> loop UNION't loc
      | UNSIGNED loc -> loop UNSIGNED't loc
      | VOID loc -> loop VOID't loc
      | VOLATILE loc -> loop VOLATILE't loc
      | WHILE loc -> loop WHILE't loc
      | XOR_ASSIGN loc -> loop XOR_ASSIGN't loc
      | ALIGNAS loc -> loop ALIGNAS't loc
      | ALIGNOF loc -> loop ALIGNOF't loc
      | ATTRIBUTE loc -> loop ATTRIBUTE't loc
      | ASM loc -> loop ASM't loc
      | PRAGMA (s, loc) -> loop PRAGMA't (s, loc)
    in
    Lazy.from_fun compute_token_stream

}