From 9043add656177eeac1491a73d2f3ab92bec0013c Mon Sep 17 00:00:00 2001 From: Benjamin Barenblat Date: Sat, 29 Dec 2018 14:31:27 -0500 Subject: Imported Upstream version 8.8.2 --- parsing/cLexer.ml4 | 203 ++++++++++++++++++++++++++--------------------------- 1 file changed, 100 insertions(+), 103 deletions(-) (limited to 'parsing/cLexer.ml4') diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4 index aec6a326..d65b35c4 100644 --- a/parsing/cLexer.ml4 +++ b/parsing/cLexer.ml4 @@ -1,15 +1,38 @@ (************************************************************************) -(* v * The Coq Proof Assistant / The Coq Development Team *) -(* "" +| Loc.InFile f -> f + +let coq_file_of_ploc_file s = + if s = "" then Loc.ToplevelInput else Loc.InFile s + +let from_coqloc fname line_nb bol_pos bp ep = + Ploc.make_loc (ploc_file_of_coq_file fname) line_nb bol_pos (bp, ep) "" + +let to_coqloc loc = + { Loc.fname = coq_file_of_ploc_file (Ploc.file_name loc); + Loc.line_nb = Ploc.line_nb loc; + Loc.bol_pos = Ploc.bol_pos loc; + Loc.bp = Ploc.first_pos loc; + Loc.ep = Ploc.last_pos loc; + Loc.line_nb_last = Ploc.line_nb_last loc; + Loc.bol_pos_last = Ploc.bol_pos_last loc; } + +let (!@) = to_coqloc (* Dictionaries: trees annotated with string options, each node being a map from chars to dictionaries (the subtrees). A trie, in other words. *) @@ -89,7 +112,6 @@ module Error = struct | Unterminated_string | Undefined_token | Bad_token of string - | UnsupportedUnicode of int exception E of t @@ -100,21 +122,45 @@ module Error = struct | Unterminated_comment -> "Unterminated comment" | Unterminated_string -> "Unterminated string" | Undefined_token -> "Undefined token" - | Bad_token tok -> Format.sprintf "Bad token %S" tok - | UnsupportedUnicode x -> - Printf.sprintf "Unsupported Unicode character (0x%x)" x) - - (* Require to fix the Camlp4 signature *) - let print ppf x = Pp.pp_with ~pp_tag:Ppstyle.pp_tag ppf (Pp.str (to_string x)) + | Bad_token tok -> Format.sprintf "Bad token %S" tok) end open Error -let err loc str = Loc.raise (Compat.to_coqloc loc) (Error.E str) +let err loc str = Loc.raise ~loc:(to_coqloc loc) (Error.E str) let bad_token str = raise (Error.E (Bad_token str)) -(* Lexer conventions on tokens *) +(* Update a loc without allocating an intermediate pair *) +let set_loc_pos loc bp ep = + Ploc.sub loc (bp - Ploc.first_pos loc) (ep - bp) + +(* Increase line number by 1 and update position of beginning of line *) +let bump_loc_line loc bol_pos = + Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb loc + 1) bol_pos + (Ploc.first_pos loc, Ploc.last_pos loc) (Ploc.comment loc) + +(* Same as [bump_loc_line], but for the last line in location *) +(* For an obscure reason, camlp5 does not give an easy way to set line_nb_stop, + so we have to resort to a hack merging two locations. *) +(* Warning: [bump_loc_line_last] changes the end position. You may need to call + [set_loc_pos] to fix it. *) +let bump_loc_line_last loc bol_pos = + let loc' = + Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb_last loc + 1) bol_pos + (Ploc.first_pos loc + 1, Ploc.last_pos loc + 1) (Ploc.comment loc) + in + Ploc.encl loc loc' + +(* For some reason, the [Ploc.after] function of Camlp5 does not update line + numbers, so we define our own function that does it. *) +let after loc = + let line_nb = Ploc.line_nb_last loc in + let bol_pos = Ploc.bol_pos_last loc in + Ploc.make_loc (Ploc.file_name loc) line_nb bol_pos + (Ploc.last_pos loc, Ploc.last_pos loc) (Ploc.comment loc) + +(** Lexer conventions on tokens *) type token_kind = | Utf8Token of (Unicode.status * int) @@ -186,7 +232,7 @@ let check_keyword str = let rec loop_symb = parser | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str | [< s >] -> - match unlocated lookup_utf8 Compat.CompatLoc.ghost s with + match unlocated lookup_utf8 Ploc.dummy s with | Utf8Token (_,n) -> njunk n s; loop_symb s | AsciiChar -> Stream.junk s; loop_symb s | EmptyStream -> () @@ -200,9 +246,9 @@ let check_ident str = | [< ' ('0'..'9' | ''') when intail; s >] -> loop_id true s | [< s >] -> - match unlocated lookup_utf8 Compat.CompatLoc.ghost s with - | Utf8Token (Unicode.Letter, n) -> njunk n s; loop_id true s - | Utf8Token (Unicode.IdentPart, n) when intail -> + match unlocated lookup_utf8 Ploc.dummy s with + | Utf8Token (st, n) when not intail && Unicode.is_valid_ident_initial st -> njunk n s; loop_id true s + | Utf8Token (st, n) when intail && Unicode.is_valid_ident_trailing st -> njunk n s; loop_id true s | EmptyStream -> () @@ -233,25 +279,26 @@ let remove_keyword str = let keywords () = ttree_elements !token_tree (* Freeze and unfreeze the state of the lexer *) -type frozen_t = ttree +type keyword_state = ttree -let freeze () = !token_tree -let unfreeze tt = (token_tree := tt) +let get_keyword_state () = !token_tree +let set_keyword_state tt = (token_tree := tt) (* The string buffering machinery *) -let buff = ref (String.create 80) +let buff = ref (Bytes.create 80) let store len x = - if len >= String.length !buff then - buff := !buff ^ String.create (String.length !buff); - !buff.[len] <- x; + let open Bytes in + if len >= length !buff then + buff := cat !buff (create (length !buff)); + set !buff len x; succ len let rec nstore n len cs = if n>0 then nstore (n-1) (store len (Stream.next cs)) cs else len -let get_buff len = String.sub !buff 0 len +let get_buff len = Bytes.sub_string !buff 0 len (* The classical lexer: idents, numbers, quoted strings, comments *) @@ -266,9 +313,9 @@ let rec ident_tail loc len = parser ident_tail loc (store len c) s | [< s >] -> match lookup_utf8 loc s with - | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) -> + | Utf8Token (st, n) when Unicode.is_valid_ident_trailing st -> ident_tail loc (nstore n len s) s - | Utf8Token (Unicode.Unknown, n) -> + | Utf8Token (st, n) when Unicode.is_unknown st -> let id = get_buff len in let u = String.concat "" (List.map (String.make 1) (Stream.npeek n s)) in warn_unrecognized_unicode ~loc:!@loc (u,id); len @@ -293,13 +340,13 @@ let rec string loc ~comm_level bp len = parser if esc then string loc ~comm_level bp (store len '"') s else (loc, len) | [< ''('; s >] -> (parser - | [< ''*'; s >] -> - string loc - (Option.map succ comm_level) + | [< ''*'; s >] -> + let comm_level = Option.map succ comm_level in + string loc ~comm_level bp (store (store len '(') '*') s | [< >] -> - string loc comm_level bp (store len '(') s) s + string loc ~comm_level bp (store len '(') s) s | [< ''*'; s >] -> (parser | [< '')'; s >] -> @@ -309,9 +356,9 @@ let rec string loc ~comm_level bp len = parser | _ -> () in let comm_level = Option.map pred comm_level in - string loc comm_level bp (store (store len '*') ')') s + string loc ~comm_level bp (store (store len '*') ')') s | [< >] -> - string loc comm_level bp (store len '*') s) s + string loc ~comm_level bp (store len '*') s) s | [< ''\n' as c; s >] ep -> (* If we are parsing a comment, the string if not part of a token so we update the first line of the location. Otherwise, we update the last @@ -320,17 +367,14 @@ let rec string loc ~comm_level bp len = parser if Option.has_some comm_level then bump_loc_line loc ep else bump_loc_line_last loc ep in - string loc comm_level bp (store len c) s - | [< 'c; s >] -> string loc comm_level bp (store len c) s + string loc ~comm_level bp (store len c) s + | [< 'c; s >] -> string loc ~comm_level bp (store len c) s | [< _ = Stream.empty >] ep -> let loc = set_loc_pos loc bp ep in err loc Unterminated_string -(* Hook for exporting comment into xml theory files *) -let (f_xml_output_comment, xml_output_comment) = Hook.make ~default:ignore () - (* To associate locations to a file name *) -let current_file = ref None +let current_file = ref Loc.ToplevelInput (* Utilities for comments in beautify *) let comment_begin = ref None @@ -342,18 +386,8 @@ let comments = ref [] let current_comment = Buffer.create 8192 let between_commands = ref true -let rec split_comments comacc acc pos = function - [] -> comments := List.rev acc; comacc - | ((b,e),c as com)::coms -> - (* Take all comments that terminates before pos, or begin exactly - at pos (used to print comments attached after an expression) *) - if e<=pos || pos=b then split_comments (c::comacc) acc pos coms - else split_comments comacc (com::acc) pos coms - -let extract_comments pos = split_comments [] [] pos !comments - (* The state of the lexer visible from outside *) -type lexer_state = int option * string * bool * ((int * int) * string) list * string option +type lexer_state = int option * string * bool * ((int * int) * string) list * Loc.source let init_lexer_state f = (None,"",true,[],f) let set_lexer_state (o,s,b,c,f) = @@ -362,10 +396,13 @@ let set_lexer_state (o,s,b,c,f) = between_commands := b; comments := c; current_file := f -let release_lexer_state () = +let get_lexer_state () = (!comment_begin, Buffer.contents current_comment, !between_commands, !comments, !current_file) +let release_lexer_state = get_lexer_state let drop_lexer_state () = - set_lexer_state (init_lexer_state None) + set_lexer_state (init_lexer_state Loc.ToplevelInput) + +let get_comment_state (_,_,_,c,_) = c let real_push_char c = Buffer.add_char current_comment c @@ -390,9 +427,6 @@ let null_comment s = let comment_stop ep = let current_s = Buffer.contents current_comment in - if !Flags.xml_export && Buffer.length current_comment > 0 && - (!between_commands || not(null_comment current_s)) then - Hook.get f_xml_output_comment current_s; (if !Flags.beautify && Buffer.length current_comment > 0 && (!between_commands || not(null_comment current_s)) then let bp = match !comment_begin with @@ -500,7 +534,7 @@ let parse_after_dot loc c bp = (try find_keyword loc ("."^field) s with Not_found -> FIELD field) | [< s >] -> match lookup_utf8 loc s with - | Utf8Token (Unicode.Letter, n) -> + | Utf8Token (st, n) when Unicode.is_valid_ident_initial st -> let len = ident_tail loc (nstore n 0 s) s in let field = get_buff len in (try find_keyword loc ("."^field) s with Not_found -> FIELD field) @@ -514,7 +548,7 @@ let parse_after_qmark loc bp s = | None -> KEYWORD "?" | _ -> match lookup_utf8 loc s with - | Utf8Token (Unicode.Letter, _) -> LEFTQMARK + | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp '?' s) @@ -560,7 +594,7 @@ let rec next_token loc = parser bp | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> comment_stop bp; (INT (get_buff len), set_loc_pos loc bp ep) - | [< ''\"'; (loc,len) = string loc None bp 0 >] ep -> + | [< ''\"'; (loc,len) = string loc ~comm_level:None bp 0 >] ep -> comment_stop bp; (STRING (get_buff len), set_loc_pos loc bp ep) | [< ' ('(' as c); @@ -579,13 +613,13 @@ let rec next_token loc = parser bp comment_stop bp; between_commands := new_between_commands; t | [< s >] -> match lookup_utf8 loc s with - | Utf8Token (Unicode.Letter, n) -> + | Utf8Token (st, n) when Unicode.is_valid_ident_initial st -> let len = ident_tail loc (nstore n 0 s) s in let id = get_buff len in let ep = Stream.count s in comment_stop bp; (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep - | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart | Unicode.Unknown), _) -> + | AsciiChar | Utf8Token _ -> let t = process_chars loc bp (Stream.next s) s in comment_stop bp; t | EmptyStream -> @@ -619,8 +653,6 @@ let loct_add loct i loc = Hashtbl.add loct i loc we unfreeze the state of the lexer. This restores the behaviour of the lexer. B.B. *) -IFDEF CAMLP5 THEN - type te = Tok.t (** Names of tokens, for this lexer, used in Grammar error messages *) @@ -638,12 +670,12 @@ let token_text = function let func cs = let loct = loct_create () in - let cur_loc = ref (Compat.make_loc !current_file 1 0 0 0) in + let cur_loc = ref (from_coqloc !current_file 1 0 0 0) in let ts = Stream.from (fun i -> let (tok, loc) = next_token !cur_loc cs in - cur_loc := Compat.after loc; + cur_loc := after loc; loct_add loct i loc; Some tok) in (ts, loct_func loct) @@ -659,41 +691,6 @@ let lexer = { Token.tok_comm = None; Token.tok_text = token_text } -ELSE (* official camlp4 for ocaml >= 3.10 *) - -module M_ = Camlp4.ErrorHandler.Register (Error) - -module Loc = Compat.CompatLoc -module Token = struct - include Tok (* Cf. tok.ml *) - module Loc = Compat.CompatLoc - module Error = Camlp4.Struct.EmptyError - module Filter = struct - type token_filter = (Tok.t * Loc.t) Stream.t -> (Tok.t * Loc.t) Stream.t - type t = unit - let mk _is_kwd = () - let keyword_added () kwd _ = add_keyword kwd - let keyword_removed () _ = () - let filter () x = x - let define_filter () _ = () - end -end - -let mk () = - let loct = loct_create () in - let cur_loc = ref (Compat.make_loc !current_file 1 0 0 0) in - let rec self init_loc (* FIXME *) = - parser i - [< (tok, loc) = next_token !cur_loc; s >] -> - cur_loc := Compat.set_loc_file loc !current_file; - loct_add loct i loc; - [< '(tok, loc); self init_loc s >] - | [< >] -> [< >] - in - self - -END - (** Terminal symbols interpretation *) let is_ident_not_keyword s = @@ -716,13 +713,13 @@ let strip s = in if len == String.length s then s else - let s' = String.create len in + let s' = Bytes.create len in let rec loop i i' = if i == String.length s then s' else if s.[i] == ' ' then loop (i + 1) i' - else begin s'.[i'] <- s.[i]; loop (i + 1) (i' + 1) end + else begin Bytes.set s' i' s.[i]; loop (i + 1) (i' + 1) end in - loop 0 0 + Bytes.to_string (loop 0 0) let terminal s = let s = strip s in -- cgit v1.2.3