From 5b7eafd0f00a16d78f99a27f5c7d5a0de77dc7e6 Mon Sep 17 00:00:00 2001 From: Stephane Glondu Date: Wed, 21 Jul 2010 09:46:51 +0200 Subject: Imported Upstream snapshot 8.3~beta0+13298 --- parsing/lexer.ml4 | 157 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 105 insertions(+), 52 deletions(-) (limited to 'parsing/lexer.ml4') diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 index 52b5ede7..4ec61a23 100644 --- a/parsing/lexer.ml4 +++ b/parsing/lexer.ml4 @@ -6,13 +6,12 @@ (* * GNU Lesser General Public License Version 2.1 *) (************************************************************************) -(*i $Id: lexer.ml4 12891 2010-03-30 11:40:02Z herbelin $ i*) - - -(*i camlp4use: "pr_o.cmo" i*) +(*i camlp4use: "pr_o.cmo pa_macro.cmo" i*) (* Add pr_o.cmo to circumvent a useless-warning bug when preprocessed with * ast-based camlp4 *) +(*i $Id$ i*) + open Pp open Util open Token @@ -22,7 +21,7 @@ open Token module CharMap = Map.Make (struct type t = char let compare = compare end) -type ttree = { +type ttree = { node : string option; branch : ttree CharMap.t } @@ -30,7 +29,7 @@ let empty_ttree = { node = None; branch = CharMap.empty } let ttree_add ttree str = let rec insert tt i = - if i == String.length str then + if i == String.length str then {node = Some str; branch = tt.branch} else let c = str.[i] in @@ -43,7 +42,7 @@ let ttree_add ttree str = CharMap.add c (insert tt' (i + 1)) tt.branch in { node = tt.node; branch = br } - in + in insert ttree 0 (* Search a string in a dictionary: raises [Not_found] @@ -51,15 +50,30 @@ let ttree_add ttree str = let ttree_find ttree str = let rec proc_rec tt i = - if i == String.length str then - match tt.node with - | Some s -> s - | None -> raise Not_found - else - proc_rec (CharMap.find str.[i] tt.branch) (i+1) - in + if i == String.length str then tt + else proc_rec (CharMap.find str.[i] tt.branch) (i+1) + in proc_rec ttree 0 +(* Removes a string from a dictionary: returns an equal dictionary + if the word not present. *) +let ttree_remove ttree str = + let rec remove tt i = + if i == String.length str then + {node = None; branch = tt.branch} + else + let c = str.[i] in + let br = + match try Some (CharMap.find c tt.branch) with Not_found -> None with + | Some tt' -> + CharMap.add c (remove tt' (i + 1)) (CharMap.remove c tt.branch) + | None -> tt.branch + in + { node = tt.node; branch = br } + in + remove ttree 0 + + (* Errors occuring while lexing (explained as "Lexer error: ...") *) type error = @@ -108,7 +122,7 @@ let check_utf8_trailing_byte cs c = (* but don't certify full utf8 compliance (e.g. no emptyness check) *) let lookup_utf8_tail c cs = let c1 = Char.code c in - if c1 land 0x40 = 0 or c1 land 0x38 = 0x38 then error_utf8 cs + if c1 land 0x40 = 0 or c1 land 0x38 = 0x38 then error_utf8 cs else let n, unicode = if c1 land 0x20 = 0 then @@ -121,20 +135,20 @@ let lookup_utf8_tail c cs = match Stream.npeek 3 cs with | [_;c2;c3] -> check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3; - 3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 + + 3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 + (Char.code c3 land 0x3F) | _ -> error_utf8 cs else match Stream.npeek 4 cs with | [_;c2;c3;c4] -> check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3; - check_utf8_trailing_byte cs c4; + check_utf8_trailing_byte cs c4; 4, (c1 land 0x07) lsl 18 + (Char.code c2 land 0x3F) lsl 12 + (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F) | _ -> error_utf8 cs in try classify_unicode unicode, n with UnsupportedUtf8 -> error_unsupported_unicode_character n cs - + let lookup_utf8 cs = match Stream.peek cs with | Some ('\x00'..'\x7F') -> AsciiChar @@ -171,15 +185,17 @@ let check_keyword str = (* Keyword and symbol dictionary *) let token_tree = ref empty_ttree -let find_keyword s = ttree_find !token_tree s - -let is_keyword s = - try let _ = ttree_find !token_tree s in true with Not_found -> false +let is_keyword s = + try match (ttree_find !token_tree s).node with None -> false | Some _ -> true + with Not_found -> false let add_keyword str = check_keyword str; token_tree := ttree_add !token_tree str +let remove_keyword str = + token_tree := ttree_remove !token_tree str + (* Adding a new token (keyword or special token). *) let add_token (con, str) = match con with | "" -> add_keyword str @@ -235,10 +251,18 @@ let rec number len = parser let escape len c = store len c -let rec string bp len = parser +let rec string in_comments bp len = parser | [< ''"'; esc=(parser [<''"' >] -> true | [< >] -> false); s >] -> - if esc then string bp (store len '"') s else len - | [< 'c; s >] -> string bp (store len c) s + if esc then string in_comments bp (store len '"') s else len + | [< ''*'; s >] -> + (parser + | [< '')'; s >] -> + if in_comments then + msg_warning (str "Not interpreting \"*)\" as the end of current non-terminated comment because it occurs in a non-terminated string of the comment."); + string in_comments bp (store (store len '*') ')') s + | [< >] -> + string in_comments bp (store len '*') s) s + | [< 'c; s >] -> string in_comments bp (store len c) s | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string (* Hook for exporting comment into xml theory files *) @@ -254,8 +278,8 @@ let between_com = ref true type com_state = int option * string * bool let restore_com_state (o,s,b) = - comment_begin := o; - Buffer.clear current; Buffer.add_string current s; + comment_begin := o; + Buffer.clear current; Buffer.add_string current s; between_com := b let dflt_com = (None,"",true) let com_state () = @@ -310,13 +334,13 @@ let rec comm_string bp = parser | [< >] -> real_push_char '\\'); s >] -> comm_string bp s | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string - | [< 'c; s >] -> real_push_char c; comm_string bp s + | [< 'c; s >] -> real_push_char c; comm_string bp s let rec comment bp = parser bp2 | [< ''('; _ = (parser | [< ''*'; s >] -> push_string "(*"; comment bp s - | [< >] -> push_string "(" ); + | [< >] -> push_string "(" ); s >] -> comment bp s | [< ''*'; _ = parser @@ -324,7 +348,7 @@ let rec comment bp = parser bp2 | [< s >] -> real_push_char '*'; comment bp s >] -> () | [< ''"'; s >] -> if Flags.do_beautify() then (push_string"\"";comm_string bp2 s) - else ignore (string bp2 0 s); + else ignore (string true bp2 0 s); comment bp s | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment | [< 'z; s >] -> real_push_char z; comment bp s @@ -340,12 +364,12 @@ let rec progress_further last nj tt cs = and update_longest_valid_token last nj tt cs = match tt.node with | Some _ as last' -> - for i=1 to nj do Stream.junk cs done; + for i=1 to nj do Stream.junk cs done; progress_further last' 0 tt cs | None -> progress_further last nj tt cs -(* nr is the number of char peeked since last valid token *) +(* nj is the number of char peeked since last valid token *) (* n the number of char in utf8 block *) and progress_utf8 last nj n c tt cs = try @@ -358,7 +382,7 @@ and progress_utf8 last nj n c tt cs = List.iter (check_utf8_trailing_byte cs) l; let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in update_longest_valid_token last (nj+n) tt cs - | _ -> + | _ -> error_utf8 cs with Not_found -> last @@ -366,6 +390,12 @@ and progress_utf8 last nj n c tt cs = and progress_from_byte last nj tt cs c = progress_utf8 last nj (utf8_char_size cs c) c tt cs +let find_keyword id s = + let tt = ttree_find !token_tree id in + match progress_further tt.node 0 tt s with + | None -> raise Not_found + | Some c -> c + (* Must be a special token *) let process_chars bp c cs = let t = progress_from_byte None (-1) !token_tree cs c in @@ -379,7 +409,7 @@ let process_chars bp c cs = let parse_after_dollar bp = parser - | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] -> + | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] -> ("METAIDENT", get_buff len) | [< s >] -> match lookup_utf8 s with @@ -394,9 +424,9 @@ let parse_after_dot bp c = ("FIELD", get_buff len) | [< s >] -> match lookup_utf8 s with - | Utf8Token (UnicodeLetter, n) -> + | Utf8Token (UnicodeLetter, n) -> ("FIELD", get_buff (ident_tail (nstore n 0 s) s)) - | AsciiChar | Utf8Token _ | EmptyStream -> + | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s) (* Parse what follows a question mark *) @@ -409,6 +439,7 @@ let parse_after_qmark bp s = | Utf8Token (UnicodeLetter, _) -> ("LEFTQMARK", "") | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp '?' s) + (* Parse a token in a char stream *) let rec next_token = parser bp | [< '' ' | '\t' | '\n' |'\r' as c; s >] -> @@ -422,14 +453,14 @@ let rec next_token = parser bp | [< ''?'; s >] ep -> let t = parse_after_qmark bp s in comment_stop bp; (t, (ep, bp)) | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); - len = ident_tail (store 0 c) >] ep -> - let id = get_buff len in + len = ident_tail (store 0 c); s >] ep -> + let id = get_buff len in comment_stop bp; - (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) + (try ("", find_keyword id s) with Not_found -> ("IDENT", id)), (bp, ep) | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> comment_stop bp; (("INT", get_buff len), (bp, ep)) - | [< ''\"'; len = string bp 0 >] ep -> + | [< ''\"'; len = string false bp 0 >] ep -> comment_stop bp; (("STRING", get_buff len), (bp, ep)) | [< ' ('(' as c); @@ -448,8 +479,8 @@ let rec next_token = parser bp let id = get_buff len in let ep = Stream.count s in comment_stop bp; - (try ("",find_keyword id) with Not_found -> ("IDENT",id)), (bp, ep) - | AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) -> + (try ("",find_keyword id s) with Not_found -> ("IDENT",id)), (bp, ep) + | AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) -> let t = process_chars bp (Stream.next s) s in comment_stop bp; t | EmptyStream -> @@ -514,16 +545,38 @@ let token_text = function | ("STRING", "") -> "string" | ("EOI", "") -> "end of input" | (con, "") -> con - | (con, prm) -> con ^ " \"" ^ prm ^ "\"" + | (con, prm) -> con ^ " \"" ^ prm ^ "\"" -let tparse (p_con, p_prm) = - None - (*i was - if p_prm = "" then - (parser [< '(con, prm) when con = p_con >] -> prm) - else - (parser [< '(con, prm) when con = p_con && prm = p_prm >] -> prm) - i*) +(* The lexer of Coq *) + +(* Note: removing a token. + We do nothing because [remove_token] is called only when removing a grammar + rule with [Grammar.delete_rule]. The latter command is called only when + unfreezing the state of the grammar entries (see GRAMMAR summary, file + env/metasyntax.ml). Therefore, instead of removing tokens one by one, + we unfreeze the state of the lexer. This restores the behaviour of the + lexer. B.B. *) + +IFDEF CAMLP5 THEN + +let lexer = { + Token.tok_func = func; + Token.tok_using = add_token; + Token.tok_removing = (fun _ -> ()); + Token.tok_match = default_match; + Token.tok_comm = None; + Token.tok_text = token_text } + +ELSE + +let lexer = { + Token.func = func; + Token.using = add_token; + Token.removing = (fun _ -> ()); + Token.tparse = (fun _ -> None); + Token.text = token_text } + +END (* Terminal symbols interpretation *) @@ -534,7 +587,7 @@ let is_ident_not_keyword s = let is_number s = let rec aux i = - String.length s = i or + String.length s = i or match s.[i] with '0'..'9' -> aux (i+1) | _ -> false in aux 0 -- cgit v1.2.3