(***********************************************************************) (* v * The Coq Proof Assistant / The Coq Development Team *) (* None with | Some tt' -> CharMap.add c (insert tt' (i + 1)) (CharMap.remove c tt.branch) | None -> let tt' = {node = None; branch = CharMap.empty} in CharMap.add c (insert tt' (i + 1)) tt.branch in { node = tt.node; branch = br } in insert ttree 0 (* Search a string in a dictionary: raises [Not_found] if the word is not present. *) let ttree_find ttree str = let rec proc_rec tt i = if i == String.length str then match tt.node with | Some s -> s | None -> raise Not_found else proc_rec (CharMap.find str.[i] tt.branch) (i+1) in proc_rec ttree 0 (* Lexer conventions on tokens *) type error = | Illegal_character | Unterminated_comment | Unterminated_string | Undefined_token | Bad_token of string exception Error of error let bad_token str = raise (Error (Bad_token str)) let check_special_token str = let rec loop = parser | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str | [< _ = Stream.empty >] -> () | [< '_ ; s >] -> loop s in loop (Stream.of_string str) let check_ident str = let rec loop = parser | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] -> loop s (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop s (* iso 8859-1 accentuated letters *) | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255'); s >] -> loop s | [< _ = Stream.empty >] -> () | [< >] -> bad_token str in loop (Stream.of_string str) (* Special token dictionary *) let token_tree = ref empty_ttree let add_special_token str = check_special_token str; token_tree := ttree_add !token_tree str (* Keyword identifier dictionary *) let keywords = ref empty_ttree let find_keyword s = ttree_find !keywords s let add_keyword str = check_ident str; keywords := ttree_add !keywords str let is_keyword s = try let _ = ttree_find !keywords s in true with Not_found -> false let is_normal_token str = if String.length str > 0 then match str.[0] with | ' ' | '\n' | '\r' | '\t' | '"' -> bad_token str | '_' | '$' | 'a'..'z' | 'A'..'Z' -> true (* utf-8 symbols of the form "E2 xx xx" [E2=226] *) | '\226' when String.length str > 2 -> (match str.[1], str.[2] with | ('\132', '\128'..'\191') | ('\133', '\128'..'\143') -> (* utf8 letter-like unicode 2100-214F *) true | (('\134'..'\143' | '\152'..'\155' | '\164'..'\165' | '\168'..'\171'),_) -> (* utf8 symbols (see [parse_226_tail] *) false | _ -> (* default to iso 8859-1 "â" *) true) (* iso 8859-1 accentuated letters *) | '\192'..'\214' | '\216'..'\246' | '\248'..'\255' -> true | _ -> false else bad_token str (* Adding a new token (keyword or special token). *) let add_token (con, str) = match con with | "" -> if is_normal_token str then add_keyword str else add_special_token str | "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI" -> () | _ -> raise (Token.Error ("\ the constructor \"" ^ con ^ "\" is not recognized by Lexer")) (* Freeze and unfreeze the state of the lexer *) type frozen_t = ttree * ttree let freeze () = (!keywords, !token_tree) let unfreeze (kw,tt) = keywords := kw; token_tree := tt let init () = unfreeze (empty_ttree, empty_ttree) let _ = init() (* Errors occuring while lexing (explained as "Lexer error: ...") *) let err loc str = Stdpp.raise_with_loc loc (Error str) (* The string buffering machinery *) let buff = ref (String.create 80) let store len x = if len >= String.length !buff then buff := !buff ^ String.create (String.length !buff); !buff.[len] <- x; succ len let mstore len s = let rec add_rec len i = if i == String.length s then len else add_rec (store len s.[i]) (succ i) in add_rec len 0 let get_buff len = String.sub !buff 0 len (* The classical lexer: idents, numbers, quoted strings, comments *) let rec ident len = parser | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' | '@' as c); s >] -> ident (store len c) s (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2) ; s >] -> ident (store (store len c1) c2) s (* iso 8859-1 accentuated letters *) | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); s >] -> ident (store len c) s | [< >] -> len let rec number len = parser | [< ' ('0'..'9' as c); s >] -> number (store len c) s | [< >] -> len let escape len c = store len c let rec string bp len = parser | [< ''"' >] -> len | [< ''\\'; c = (parser [< ' ('"' | '\\' as c) >] -> c | [< >] -> '\\'); s >] -> string bp (escape len c) s | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string | [< 'c; s >] -> string bp (store len c) s let comments = ref None type commentar = | InVernac | InVernacEsc of string | BeVernac of string | InComment of string let current = ref (BeVernac "") let add_comment () = let reinit () = match ! current with | InVernac -> () | InVernacEsc s -> current := InVernacEsc "" | BeVernac s -> current := BeVernac "" | InComment s -> current := InComment "" in match (!comments,!current) with | None , InVernac -> () | None , BeVernac s | None , InComment s | None , InVernacEsc s -> reinit (); comments := Some [str s] | Some _ , InVernac -> () | Some l , BeVernac s | Some l , InComment s | Some l , InVernacEsc s -> reinit (); comments := Some (str s::l) let add_comment_pp s = match !comments with | None -> comments := Some [s] | Some l -> comments := Some (s::l) let add_string s = match !current with | InVernac -> () | InVernacEsc t -> current := InVernacEsc (t^s) | BeVernac t -> current := BeVernac (t^s) | InComment t -> current := InComment (t^s) let rec comment bp = parser | [< ''('; _ = (parser | [< ''*'; s >] -> add_string "(*"; comment bp s | [< >] -> add_string "(" ); s >] -> comment bp s | [< ''*'; _ = parser | [< '')' >] -> add_string "*)"; add_comment () | [< s >] -> add_string "*"; comment bp s >] -> () | [< ''"'; _ = (parser bp [< _ = string bp 0 >] -> ()); s >] -> comment bp s | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment | [< '_ as z; s >] -> (match z with | '\n' -> add_comment (); add_comment_pp (fnl ()) | _ -> add_string (String.make 1 z)); comment bp s (* Parse a special token, using the [token_tree] *) let progress_special c = function | None -> None | Some tt -> try Some (CharMap.find c tt.branch) with Not_found -> None let rec special tt cs = match tt with | None -> None | Some tt -> match match Stream.peek cs with | Some c -> (try Some (CharMap.find c tt.branch) with Not_found -> None) | None -> None with | Some _ as tt' -> Stream.junk cs; special tt' cs | None -> tt.node let process_chars bp c cs = let t = try special (Some (CharMap.find c !token_tree.branch)) cs with Not_found -> !token_tree.node in let ep = Stream.count cs in match t with | Some t -> (("", t), (bp, ep)) | None -> err (bp, ep) Undefined_token type token_226_tail = | TokSymbol of string option | TokIdent of string let parse_226_tail tk = parser | [< ''\132' as c2; ' ('\128'..'\191' as c3); (* utf8 letter-like unicode 2100-214F *) len = ident (store (store (store 0 '\226') c2) c3) >] -> TokIdent (get_buff len) | [< ''\133' as c2; ' ('\128'..'\143' as c3); (* utf8 letter-like unicode 2100-214F *) len = ident (store (store (store 0 '\226') c2) c3) >] -> TokIdent (get_buff len) | [< ' ('\134'..'\143' | '\152'..'\155' | '\164'..'\165' | '\168'..'\171' as c2); 'c3; (* utf8 arrows A unicode 2190-21FF *) (* utf8 mathematical operators unicode 2200-22FF *) (* utf8 miscellaneous technical unicode 2300-23FF *) (* utf8 miscellaneous symbols unicode 2600-26FF *) (* utf8 arrows B unicode 2900-297F *) (* utf8 mathematical operators unicode 2A00-2AFF *) t = special (progress_special c3 (progress_special c2 (progress_special '\226' tk))) >] -> TokSymbol t | [< len = ident (store 0 '\226') >] -> TokIdent (get_buff len) (* Parse what foolows a dot *) let parse_after_dot bp c strm = if !Options.v7 then match strm with parser | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); len = ident (store 0 c) >] -> ("FIELD", get_buff len) (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); len = ident (store (store 0 c1) c2) >] -> ("FIELD", get_buff len) (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *) | [< ''\226' as c1; t = parse_226_tail (progress_special '.' (Some !token_tree)) >] ep -> (match t with | TokSymbol (Some t) -> ("", t) | TokSymbol None -> err (bp, ep) Undefined_token | TokIdent t -> ("FIELD", t)) (* iso 8859-1 accentuated letters *) | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); len = ident (store 0 c) >] -> ("FIELD", get_buff len) | [< (t,_) = process_chars bp c >] -> t else match strm with parser | [< (t,_) = process_chars bp c >] -> t (* Parse a token in a char stream *) let rec next_token = parser bp | [< ''\n'; s >] -> (match !current with | BeVernac s -> current := InComment s | InComment _ -> add_comment_pp (fnl ()) | _ -> ()); next_token s | [< '' ' | '\t'; s >] -> (match !current with | BeVernac _ | InComment _ -> add_comment_pp (spc ()) | _ -> ()); next_token s | [< ''\r'; s >] -> next_token s | [< ''$'; len = ident (store 0 '$') >] ep -> (("METAIDENT", get_buff len), (bp,ep)) | [< ''.' as c; t = parse_after_dot bp c >] ep -> current:=BeVernac ""; (t, (bp,ep)) | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); len = ident (store 0 c) >] ep -> let id = get_buff len in current:=InVernac; (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); len = ident (store (store 0 c1) c2) >] ep -> let id = get_buff len in current:=InVernac; (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *) | [< ''\226' as c1; t = parse_226_tail (Some !token_tree) >] ep -> (match t with | TokSymbol (Some t) -> ("", t), (bp, ep) | TokSymbol None -> err (bp, ep) Undefined_token | TokIdent id -> current:=InVernac; (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)) (* iso 8859-1 accentuated letters *) | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); len = ident (store 0 c) >] ep -> let id = get_buff len in current:=InVernac; (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> (("INT", get_buff len), (bp, ep)) | [< ''"'; len = string bp 0 >] ep -> (("STRING", get_buff len), (bp, ep)) | [< ' ('(' as c); t = parser | [< ''*'; c; s >] -> (match !current with | InVernac -> current := InVernacEsc "(*" | _ -> current := InComment "(*"); comment bp c; (match !current with | InVernacEsc _ -> add_comment_pp (fnl ()); current := InVernac | _ -> ()); next_token s | [< t = process_chars bp c >] -> t >] -> t | [< 'c; t = process_chars bp c >] -> t | [< _ = Stream.empty >] -> (("EOI", ""), (bp, bp + 1)) (* Location table system for creating tables associating a token count to its location in a char stream (the source) *) let locerr () = invalid_arg "Lexer: location function" let loct_create () = ref (Array.create 1024 None) let loct_func loct i = match if i < 0 || i >= Array.length !loct then None else Array.unsafe_get !loct i with | Some loc -> loc | _ -> locerr () let loct_add loct i loc = if i >= Array.length !loct then begin let new_tmax = Array.length !loct * 2 in let new_loct = Array.create new_tmax None in Array.blit !loct 0 new_loct 0 (Array.length !loct); loct := new_loct end; !loct.(i) <- Some loc let func cs = let loct = loct_create () in let ts = Stream.from (fun i -> let (tok, loc) = next_token cs in loct_add loct i loc; Some tok) in (ts, loct_func loct) (* Names of tokens, for this lexer, used in Grammar error messages *) let token_text = function | ("", t) -> "'" ^ t ^ "'" | ("IDENT", "") -> "identifier" | ("IDENT", t) -> "'" ^ t ^ "'" | ("INT", "") -> "integer" | ("INT", s) -> "'" ^ s ^ "'" | ("STRING", "") -> "string" | ("EOI", "") -> "end of input" | (con, "") -> con | (con, prm) -> con ^ " \"" ^ prm ^ "\"" let tparse (p_con, p_prm) = None (*i was if p_prm = "" then (parser [< '(con, prm) when con = p_con >] -> prm) else (parser [< '(con, prm) when con = p_con && prm = p_prm >] -> prm) i*)