From 0577c06262f89785b41f110840a1646c7981fe12 Mon Sep 17 00:00:00 2001 From: filliatr Date: Tue, 30 Jan 2001 09:19:30 +0000 Subject: backtrack sur le lexeur de la V6 git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@1289 85f007b7-540e-0410-9357-904b9bb8a0f7 --- parsing/lexer.ml4 | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ parsing/lexer.mli | 2 + parsing/lexer.mll | 290 --------------------------------------------------- parsing/pcoq.ml4 | 4 +- 4 files changed, 311 insertions(+), 292 deletions(-) create mode 100644 parsing/lexer.ml4 delete mode 100644 parsing/lexer.mll (limited to 'parsing') diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 new file mode 100644 index 000000000..adc15c8a7 --- /dev/null +++ b/parsing/lexer.ml4 @@ -0,0 +1,307 @@ + +(*i $Id$ i*) + +open Token + +(* Dictionaries: trees annotated with string options, each node being a map + from chars to dictionaries (the subtrees). A trie, in other words. *) + +module CharMap = Map.Make (struct type t = char let compare = compare end) + +type ttree = { + node : string option; + branch : ttree CharMap.t } + +let empty_ttree = { node = None; branch = CharMap.empty } + +let ttree_add ttree str = + let rec insert tt i = + if i == String.length str then + {node = Some str; branch = tt.branch} + else + let c = str.[i] in + let br = + match try Some (CharMap.find c tt.branch) with Not_found -> None with + | Some tt' -> + CharMap.add c (insert tt' (i + 1)) (CharMap.remove c tt.branch) + | None -> + let tt' = {node = None; branch = CharMap.empty} in + CharMap.add c (insert tt' (i + 1)) tt.branch + in + { node = tt.node; branch = br } + in + insert ttree 0 + +(* Search a string in a dictionary: raises [Not_found] + if the word is not present. *) + +let ttree_find ttree str = + let rec proc_rec tt i = + if i == String.length str then + match tt.node with + | Some s -> s + | None -> raise Not_found + else + proc_rec (CharMap.find str.[i] tt.branch) (i+1) + in + proc_rec ttree 0 + +(* Lexer conventions on tokens *) + +type error = + | Illegal_character + | Unterminated_comment + | Unterminated_string + | Undefined_token + | Bad_token of string + +exception Error of error + +let bad_token str = raise (Error (Bad_token str)) + +let check_special_token str = + let rec loop = parser + | [< ' (' ' | '\n' | '\r' | '\t') >] -> bad_token str + | [< _ = Stream.empty >] -> () + | [< '_ ; s >] -> loop s + in + loop (Stream.of_string str) + +let check_ident str = + let rec loop = parser + | [< ''$' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246' + | '\248'..'\255' | '0'..'9' | ''' | '_'; s >] -> loop s + | [< _ = Stream.empty >] -> () + | [< >] -> bad_token str + in + loop (Stream.of_string str) + +(* Special token dictionary *) +let token_tree = ref empty_ttree + +let add_special_token str = + check_special_token str; + token_tree := ttree_add !token_tree str + +(* Keyword identifier dictionary *) +let keywords = ref empty_ttree + +let find_keyword s = ttree_find !keywords s + +let add_keyword str = + check_ident str; + keywords := ttree_add !keywords str + +let is_keyword s = + try let _ = ttree_find !keywords s in true with Not_found -> false + + +(* Adding a new token (keyword or special token). *) +let add_token (con, str) = match con with + | "" -> + let normal_token = + if String.length str > 0 then + match str.[0] with + | ' ' | '\n' | '\r' | '\t' | '0'..'9' | '"' -> bad_token str + | '_' | '$' | 'a'..'z' | 'A'..'Z' -> true + | _ -> false + else + true + in + if normal_token then add_keyword str else add_special_token str + | "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI" -> () + | _ -> + raise (Token.Error ("\ +the constructor \"" ^ con ^ "\" is not recognized by Lexer")) + + +(* Freeze and unfreeze the state of the lexer *) +type frozen_t = ttree * ttree + +let freeze () = (!keywords, !token_tree) + +let unfreeze (kw,tt) = + keywords := kw; + token_tree := tt + +let init () = + unfreeze (empty_ttree, empty_ttree); + List.iter add_keyword + [ "Grammar"; "Syntax"; "Quit"; "Load"; "Compile"; + "of"; "with"; "end"; "as"; "in"; "using"; + "Cases"; "Fixpoint"; "CoFixpoint"; + "Definition"; "Inductive"; "CoInductive"; + "Theorem"; "Variable"; "Axiom"; "Parameter"; "Hypothesis"; + "Orelse"; "Proof"; "Qed"; + "Prop"; "Set"; "Type" + ]; (* "let" is not a keyword because #Core#let.cci would not parse *) + List.iter add_special_token + [ ":"; "("; ")"; "["; "]"; "{"; "}"; "_"; ";"; "`"; "``"; + "()"; "->"; "\\/"; "/\\"; "|-"; ":="; "<->"; "!"; "$"; "%"; "&"; + "*"; "+"; ","; "@"; "^"; "#"; "\\"; "/"; "-"; "<"; ">"; + "|"; "?"; "="; "~"; "'"; "<<"; ">>"; "<>"; "::"; + "<:"; ":<"; "=>"; ">->" ] + +let _ = init() + +(* Errors occuring while lexing (explained as "Lexer error: ...") *) +let err loc str = Stdpp.raise_with_loc loc (Error str) + +(* The string buffering machinery *) + +let buff = ref (String.create 80) + +let store len x = + if len >= String.length !buff then + buff := !buff ^ String.create (String.length !buff); + !buff.[len] <- x; + succ len + +let mstore len s = + let rec add_rec len i = + if i == String.length s then len else add_rec (store len s.[i]) (succ i) + in + add_rec len 0 + +let get_buff len = String.sub !buff 0 len + + +(* The classical lexer: idents, numbers, quoted strings, comments *) + +let rec ident len = parser + | [< ' ('a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246' + |'\248'..'\255' | '0'..'9' | ''' | '_' as c); s >] -> + ident (store len c) s + | [< >] -> len + +let rec number len = parser + | [< ' ('0'..'9' as c); s >] -> number (store len c) s + | [< >] -> len + +let escape len c = store len c + +let rec string bp len = parser + | [< ''"' >] -> len + | [< ''\\'; 'c; s >] -> string bp (escape len c) s + | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string + | [< 'c; s >] -> string bp (store len c) s + +let rec comment bp = parser + | [< ''('; + _ = (parser + | [< ''*'; _ = comment bp >] -> () + | [< >] -> ()); + s >] -> comment bp s + | [< ''*'; + _ = parser + | [< '')' >] -> () + | [< s >] -> comment bp s >] -> () + | [< ''"'; _ = (parser bp [< _ = string bp 0 >] -> ()); s >] -> + comment bp s + | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment + | [< '_; s >] -> comment bp s + +(* Parse a special token, using the [token_tree] *) + +let process_chars bp c cs = + let rec proc_rec tt = + match + match Stream.peek cs with + | Some c -> + (try Some (CharMap.find c tt.branch) with Not_found -> None) + | None -> None + with + | Some tt' -> Stream.junk cs; proc_rec tt' + | None -> tt.node + in + let t = + try proc_rec (CharMap.find c !token_tree.branch) + with Not_found -> !token_tree.node + in + let ep = Stream.count cs in + match t with + | Some t -> (("", t), (bp, ep)) + | None -> err (bp, ep) Undefined_token + +(* Parse a token in a char stream *) + +let rec next_token = parser bp + | [< '' ' | '\n' | '\r'| '\t'; s >] -> next_token s + | [< ''$'; len = ident (store 0 '$') >] ep -> + (("METAIDENT", get_buff len), (bp,ep)) + | [< ''.'; t = parser + | [< ' ('_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' + | '\216'..'\246' | '\248'..'\255' as c); + len = ident (store 0 c) >] -> ("FIELD", get_buff len) + | [< >] -> ("", ".") >] ep -> (t, (bp,ep)) + | [< ' ('_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' + | '\216'..'\246' | '\248'..'\255' as c); + len = ident (store 0 c) >] ep -> + let id = get_buff len in + (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) + | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> + (("INT", get_buff len), (bp, ep)) + | [< ''"'; len = string bp 0 >] ep -> + (("STRING", get_buff len), (bp, ep)) + | [< ' ('(' as c); + t = parser + | [< ''*'; _ = comment bp; s >] -> next_token s + | [< t = process_chars bp c >] -> t >] -> t + | [< 'c; t = process_chars bp c >] -> t + | [< _ = Stream.empty >] -> (("EOI", ""), (bp, bp + 1)) + +(* Location table system for creating tables associating a token count + to its location in a char stream (the source) *) + +let locerr () = invalid_arg "Lexer: location function" + +let loct_create () = ref (Array.create 1024 None) + +let loct_func loct i = + match + if i < 0 || i >= Array.length !loct then None + else Array.unsafe_get !loct i + with + | Some loc -> loc + | _ -> locerr () + +let loct_add loct i loc = + if i >= Array.length !loct then begin + let new_tmax = Array.length !loct * 2 in + let new_loct = Array.create new_tmax None in + Array.blit !loct 0 new_loct 0 (Array.length !loct); + loct := new_loct + end; + !loct.(i) <- Some loc + +let func cs = + let loct = loct_create () in + let ts = + Stream.from + (fun i -> + let (tok, loc) = next_token cs in + loct_add loct i loc; Some tok) + in + (ts, loct_func loct) + +(* Names of tokens, for this lexer, used in Grammar error messages *) + +let token_text = function + | ("", t) -> "'" ^ t ^ "'" + | ("IDENT", "") -> "identifier" + | ("IDENT", t) -> "'" ^ t ^ "'" + | ("INT", "") -> "integer" + | ("INT", s) -> "'" ^ s ^ "'" + | ("STRING", "") -> "string" + | ("EOI", "") -> "end of input" + | (con, "") -> con + | (con, prm) -> con ^ " \"" ^ prm ^ "\"" + +let tparse (p_con, p_prm) = + None + (*i was + if p_prm = "" then + (parser [< '(con, prm) when con = p_con >] -> prm) + else + (parser [< '(con, prm) when con = p_con && prm = p_prm >] -> prm) + i*) diff --git a/parsing/lexer.mli b/parsing/lexer.mli index 8ecfbd069..b7dc6e25d 100644 --- a/parsing/lexer.mli +++ b/parsing/lexer.mli @@ -5,6 +5,8 @@ type error = | Illegal_character | Unterminated_comment | Unterminated_string + | Undefined_token + | Bad_token of string exception Error of error diff --git a/parsing/lexer.mll b/parsing/lexer.mll deleted file mode 100644 index 06531fb0f..000000000 --- a/parsing/lexer.mll +++ /dev/null @@ -1,290 +0,0 @@ - -(* $Id$ *) - -{ -open Util - -type error = - | Illegal_character - | Unterminated_comment - | Unterminated_string - -exception Error of error - -(* token trees *) - -module Charmap = Map.Make (struct type t = char let compare = compare end) - -type ttree = { node : string option; branch : ttree Charmap.t } - -let ttree_empty = { node = None; branch = Charmap.empty } - -let ttree_add ttree str = - let n = String.length str in - let rec insert tt i = - if i == n then - { node = Some str; branch = tt.branch } - else - let c = str.[i] in - let br = - match - try Some (Charmap.find c tt.branch) with Not_found -> None - with - | Some tt' -> - Charmap.add c (insert tt' (i + 1)) (Charmap.remove c tt.branch) - | None -> - let tt' = { node = None; branch = Charmap.empty } in - Charmap.add c (insert tt' (i + 1)) tt.branch - in - { node = tt.node; branch = br } - in - insert ttree 0 - -let ttree_find ttree str = - let n = String.length str in - let rec proc_rec tt i = - if i == n then - match tt.node with - | Some s -> s - | None -> raise Not_found - else - proc_rec (Charmap.find str.[i] tt.branch) (i+1) - in - proc_rec ttree 0 - -let kw_table = ref ttree_empty - -let init () = - kw_table := ttree_empty; - List.iter - (fun kw -> kw_table := ttree_add !kw_table kw) - [ "Grammar"; "Syntax"; "Quit"; "Load"; "Compile"; - "of"; "with"; "end"; "as"; "in"; "using"; - "Cases"; "Fixpoint"; "CoFixpoint"; - "Definition"; "Inductive"; "CoInductive"; - "Theorem"; "Variable"; "Axiom"; "Parameter"; "Hypothesis"; - "Orelse"; "Proof"; "Qed"; - "Prop"; "Set"; "Type" ] - -let _ = init () - -let add_keyword s = kw_table := ttree_add !kw_table s - -let is_keyword s = - try let _ = ttree_find !kw_table s in true with Not_found -> false - -type frozen_t = ttree - -let freeze () = !kw_table - -let unfreeze ft = kw_table := ft - -let char_for_backslash = - match Sys.os_type with - | "Unix" | "Win32" -> - begin function - | 'n' -> '\010' - | 'r' -> '\013' - | 'b' -> '\008' - | 't' -> '\009' - | c -> c - end - | "MacOS" -> - begin function - | 'n' -> '\013' - | 'r' -> '\010' - | 'b' -> '\008' - | 't' -> '\009' - | c -> c - end - | x -> error "Lexer: unknown system type" - -let char_for_decimal_code lexbuf i = - let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) + - 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) + - (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in - Char.chr(c land 0xFF) - -let string_buffer = Buffer.create 80 -let string_start_pos = ref 0 - -let comment_depth = ref 0 -let comment_start_pos = ref 0 - -} - -let blank = [' ' '\010' '\013' '\009' '\012'] -let firstchar = - ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'] -let identchar = - ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' - '\'' '0'-'9'] -let symbolchar = ['!' '$' '%' '&' '*' '+' ',' '@' '^' '#'] -let extrachar = symbolchar | ['\\' '/' '-' '<' '>' '|' ':' '?' '=' '~' '\''] -let decimal_literal = ['0'-'9']+ -let hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+ -let oct_literal = '0' ['o' 'O'] ['0'-'7']+ -let bin_literal = '0' ['b' 'B'] ['0'-'1']+ - -rule token = parse - | blank+ - { token lexbuf } - | "$" identchar* - { ("METAIDENT",Lexing.lexeme lexbuf) } - | firstchar identchar* - { let s = Lexing.lexeme lexbuf in - if is_keyword s then ("",s) else ("IDENT",s) } - | decimal_literal | hex_literal | oct_literal | bin_literal - { ("INT", Lexing.lexeme lexbuf) } - | "(" | ")" | "[" | "]" | "{" | "}" | "." | "_" | ";"| "`" | "``" | "()" - | "->" | "\\/" | "/\\" | "|-" | ":=" | "<->" | '\'' | extrachar - { ("", Lexing.lexeme lexbuf) } - | "." blank - { ("", ".") } - | "." firstchar identchar* - { let s = Lexing.lexeme lexbuf in - ("FIELD", String.sub s 1 (String.length s - 1)) } - | ('\\'(symbolchar | '\\' | '-' | '>' | '|' | ':' | '?' | '=' | '<' | '~')+ | - '/' (symbolchar | '/' | '-' | '>' | '|' | ':' | '?' | '=' | '<' | '~')+ | - '-' (symbolchar | '\\' | '/' | '-' | '|' | ':' | '?' | '=' | '<' | '~')+ | - '|' (symbolchar | '\\' | '/' | '|' | '>' | ':' | '?' | '=' | '<' | '~')+ | - '=' (symbolchar | '\\' | '/' | '|' | '-' | '>' | ':' | '=' | '<' | '~')+ | - '<' (symbolchar | '\\' | '/' | '|' | '>' | '?' | ':' | '=' | '<' | '~')+ | - "<-"(symbolchar | '\\' | '/' | '|' | '-' | '?' | ':' | '=' | '<' | '~')+ | - '~' (symbolchar | '\\' | '/' | '|' | '-' | '>' | ':' | '=' | '<' )+ | - ':' (symbolchar | '\\' | '/' | '|' | '-' | '>' | ':' | '=' | '<' )+ | - '\'' symbolchar+ | '>' | '?') extrachar* | "<-" - { ("", Lexing.lexeme lexbuf) } - | symbolchar+ extrachar* - { ("", Lexing.lexeme lexbuf) } - (*** - | '`' [^'`']* '`' - { let s = Lexing.lexeme lexbuf in - ("QUOTED", String.sub s 1 (String.length s - 2)) } - ***) - | "\"" - { Buffer.reset string_buffer; - let string_start = Lexing.lexeme_start lexbuf in - string_start_pos := string_start; - string lexbuf; - ("STRING", Buffer.contents string_buffer) } - | "(*" - { comment_depth := 1; - comment_start_pos := Lexing.lexeme_start lexbuf; - comment lexbuf; - token lexbuf } - | eof - { ("EOI","") } - | _ { let loc = (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf) in - raise (Stdpp.Exc_located (loc, Error Illegal_character)) } - -and comment = parse - | "(*" - { comment_depth := succ !comment_depth; comment lexbuf } - | "*)" - { comment_depth := pred !comment_depth; - if !comment_depth > 0 then comment lexbuf } - | "\"" - { Buffer.reset string_buffer; - string_start_pos := Lexing.lexeme_start lexbuf; - string lexbuf; - comment lexbuf } - | "''" - { comment lexbuf } - | "'" [^ '\\' '\''] "'" - { comment lexbuf } - | "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'" - { comment lexbuf } - | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" - { comment lexbuf } - | eof - { let loc = (!comment_start_pos, !comment_start_pos+2) in - raise (Stdpp.Exc_located (loc, Error Unterminated_comment)) } - | _ - { comment lexbuf } - -and string = parse - | '"' - { () } - | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] * - { string lexbuf } - | '\\' ['\\' '"' 'n' 't' 'b' 'r'] - { let c = char_for_backslash (Lexing.lexeme_char lexbuf 1) in - Buffer.add_char string_buffer c; - string lexbuf } - | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] - { Buffer.add_char string_buffer (char_for_decimal_code lexbuf 1); - string lexbuf } - | eof - { let loc = (!string_start_pos, !string_start_pos+1) in - raise (Stdpp.Exc_located (loc, Error Unterminated_string)) } - | _ - { Buffer.add_char string_buffer (Lexing.lexeme_char lexbuf 0); - string lexbuf } - -{ - -let create_loc_table () = ref (Array.create 1024 None) - -let find_loc t i = - if i < 0 || i >= Array.length !t then invalid_arg "find_loc"; - match Array.unsafe_get !t i with - | None -> invalid_arg "find_loc" - | Some l -> l - -let add_loc t i l = - while i >= Array.length !t do - let new_t = Array.create (2 * Array.length !t) None in - Array.blit !t 0 new_t 0 (Array.length !t); - t := new_t - done; - !t.(i) <- Some l - -let func cs = - let loct = create_loc_table () in - let lexbuf = - Lexing.from_function - (fun s _ -> match cs with parser - | [< 'c >] -> String.unsafe_set s 0 c; 1 - | [< >] -> 0) - in - let next_token i = - let tok = token lexbuf in - let loc = (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf) in - add_loc loct i loc; Some tok - in - let ts = Stream.from next_token in - (ts, find_loc loct) - -let is_ident s = - String.length s > 0 && - (match s.[0] with - | 'A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' - | '\248'..'\255' -> true - | _ -> false) - -let add_token = function - | ("",kw) -> if is_ident kw then add_keyword kw - | _ -> () - -let token_text = function - | ("", t) -> "'" ^ t ^ "'" - | ("IDENT", "") -> "identifier" - | ("IDENT", t) -> "'" ^ t ^ "'" - | ("INT", "") -> "integer" - | ("INT", s) -> "'" ^ s ^ "'" - | ("STRING", "") -> "string" - | ("EOI", "") -> "end of input" - | (con, "") -> con - | (con, prm) -> con ^ " \"" ^ prm ^ "\"" - -let tparse (p_con, p_prm) = - None - (*i etait en ocaml 2.xx : - if p_prm = "" then - parser [< '(con, prm) when con = p_con >] -> prm - else - parser [< '(con, prm) when con = p_con && prm = p_prm >] -> prm - i*) - -} diff --git a/parsing/pcoq.ml4 b/parsing/pcoq.ml4 index 968a4cba4..5776cb811 100644 --- a/parsing/pcoq.ml4 +++ b/parsing/pcoq.ml4 @@ -7,8 +7,8 @@ open Util (* The lexer of Coq *) (* Note: removing a token. - We do nothing because remove_token is called only when removing a grammar - rule with Grammar.delete_rule. The latter command is called only when + We do nothing because [remove_token] is called only when removing a grammar + rule with [Grammar.delete_rule]. The latter command is called only when unfreezing the state of the grammar entries (see GRAMMAR summary, file env/metasyntax.ml). Therefore, instead of removing tokens one by one, we unfreeze the state of the lexer. This restores the behaviour of the -- cgit v1.2.3