Diffstat (limited to 'parsing/cLexer.ml4')
-rw-r--r--  parsing/cLexer.ml4  203
1 file changed, 100 insertions, 103 deletions
diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4
index aec6a326..d65b35c4 100644
--- a/parsing/cLexer.ml4
+++ b/parsing/cLexer.ml4
@@ -1,15 +1,38 @@
(************************************************************************)
-(* v * The Coq Proof Assistant / The Coq Development Team *)
-(* <O___,, * INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2016 *)
+(* * The Coq Proof Assistant / The Coq Development Team *)
+(* v * INRIA, CNRS and contributors - Copyright 1999-2018 *)
+(* <O___,, * (see CREDITS file for the list of authors) *)
(* \VV/ **************************************************************)
-(* // * This file is distributed under the terms of the *)
-(* * GNU Lesser General Public License Version 2.1 *)
+(* // * This file is distributed under the terms of the *)
+(* * GNU Lesser General Public License Version 2.1 *)
+(* * (see LICENSE file for the text of the license) *)
(************************************************************************)
open Pp
open Util
open Tok
-open Compat
+
+(** Location utilities *)
+let ploc_file_of_coq_file = function
+| Loc.ToplevelInput -> ""
+| Loc.InFile f -> f
+
+let coq_file_of_ploc_file s =
+ if s = "" then Loc.ToplevelInput else Loc.InFile s
+
+let from_coqloc fname line_nb bol_pos bp ep =
+ Ploc.make_loc (ploc_file_of_coq_file fname) line_nb bol_pos (bp, ep) ""
+
+let to_coqloc loc =
+ { Loc.fname = coq_file_of_ploc_file (Ploc.file_name loc);
+ Loc.line_nb = Ploc.line_nb loc;
+ Loc.bol_pos = Ploc.bol_pos loc;
+ Loc.bp = Ploc.first_pos loc;
+ Loc.ep = Ploc.last_pos loc;
+ Loc.line_nb_last = Ploc.line_nb_last loc;
+ Loc.bol_pos_last = Ploc.bol_pos_last loc; }
+
+let (!@) = to_coqloc
(* Dictionaries: trees annotated with string options, each node being a map
from chars to dictionaries (the subtrees). A trie, in other words. *)
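
(Aside, not part of the patch: a minimal sketch of how the new conversion helpers compose. The function name below is hypothetical; from_coqloc, to_coqloc, (!@) and Loc.InFile are the definitions introduced above, on top of Camlp5's Ploc API.)

(* Illustrative sketch only: build a Camlp5 location from Coq data and
   convert it back with (!@); field names follow the Loc record above. *)
let _roundtrip_example () =
  let ploc = from_coqloc (Loc.InFile "foo.v") 1 0 0 5 in
  let coq_loc = !@ ploc in
  assert (coq_loc.Loc.fname = Loc.InFile "foo.v"
          && coq_loc.Loc.bp = 0 && coq_loc.Loc.ep = 5)
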
@@ -89,7 +112,6 @@ module Error = struct
| Unterminated_string
| Undefined_token
| Bad_token of string
- | UnsupportedUnicode of int
exception E of t
@@ -100,21 +122,45 @@ module Error = struct
| Unterminated_comment -> "Unterminated comment"
| Unterminated_string -> "Unterminated string"
| Undefined_token -> "Undefined token"
- | Bad_token tok -> Format.sprintf "Bad token %S" tok
- | UnsupportedUnicode x ->
- Printf.sprintf "Unsupported Unicode character (0x%x)" x)
-
- (* Require to fix the Camlp4 signature *)
- let print ppf x = Pp.pp_with ~pp_tag:Ppstyle.pp_tag ppf (Pp.str (to_string x))
+ | Bad_token tok -> Format.sprintf "Bad token %S" tok)
end
open Error
-let err loc str = Loc.raise (Compat.to_coqloc loc) (Error.E str)
+let err loc str = Loc.raise ~loc:(to_coqloc loc) (Error.E str)
let bad_token str = raise (Error.E (Bad_token str))
-(* Lexer conventions on tokens *)
+(* Update a loc without allocating an intermediate pair *)
+let set_loc_pos loc bp ep =
+ Ploc.sub loc (bp - Ploc.first_pos loc) (ep - bp)
+
+(* Increase line number by 1 and update position of beginning of line *)
+let bump_loc_line loc bol_pos =
+ Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb loc + 1) bol_pos
+ (Ploc.first_pos loc, Ploc.last_pos loc) (Ploc.comment loc)
+
+(* Same as [bump_loc_line], but for the last line in location *)
+(* For an obscure reason, camlp5 does not give an easy way to set line_nb_stop,
+ so we have to resort to a hack merging two locations. *)
+(* Warning: [bump_loc_line_last] changes the end position. You may need to call
+ [set_loc_pos] to fix it. *)
+let bump_loc_line_last loc bol_pos =
+ let loc' =
+ Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb_last loc + 1) bol_pos
+ (Ploc.first_pos loc + 1, Ploc.last_pos loc + 1) (Ploc.comment loc)
+ in
+ Ploc.encl loc loc'
+
+(* For some reason, the [Ploc.after] function of Camlp5 does not update line
+ numbers, so we define our own function that does it. *)
+let after loc =
+ let line_nb = Ploc.line_nb_last loc in
+ let bol_pos = Ploc.bol_pos_last loc in
+ Ploc.make_loc (Ploc.file_name loc) line_nb bol_pos
+ (Ploc.last_pos loc, Ploc.last_pos loc) (Ploc.comment loc)
+
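
(Aside, not part of the patch: a sketch of how these helpers are typically combined when the lexer crosses a newline; loc, bp and ep stand for the current Camlp5 location and stream offsets, and the function name is hypothetical.)

(* Illustrative sketch only: crossing a newline bumps the line counter and
   records the new beginning-of-line, then the token span is re-anchored. *)
let _newline_example loc bp ep =
  let loc = bump_loc_line loc ep in   (* next line starts at offset ep *)
  set_loc_pos loc bp ep               (* token still spans bp..ep *)
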
+(** Lexer conventions on tokens *)
type token_kind =
| Utf8Token of (Unicode.status * int)
@@ -186,7 +232,7 @@ let check_keyword str =
let rec loop_symb = parser
| [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str
| [< s >] ->
- match unlocated lookup_utf8 Compat.CompatLoc.ghost s with
+ match unlocated lookup_utf8 Ploc.dummy s with
| Utf8Token (_,n) -> njunk n s; loop_symb s
| AsciiChar -> Stream.junk s; loop_symb s
| EmptyStream -> ()
@@ -200,9 +246,9 @@ let check_ident str =
| [< ' ('0'..'9' | ''') when intail; s >] ->
loop_id true s
| [< s >] ->
- match unlocated lookup_utf8 Compat.CompatLoc.ghost s with
- | Utf8Token (Unicode.Letter, n) -> njunk n s; loop_id true s
- | Utf8Token (Unicode.IdentPart, n) when intail ->
+ match unlocated lookup_utf8 Ploc.dummy s with
+ | Utf8Token (st, n) when not intail && Unicode.is_valid_ident_initial st -> njunk n s; loop_id true s
+ | Utf8Token (st, n) when intail && Unicode.is_valid_ident_trailing st ->
njunk n s;
loop_id true s
| EmptyStream -> ()
@@ -233,25 +279,26 @@ let remove_keyword str =
let keywords () = ttree_elements !token_tree
(* Freeze and unfreeze the state of the lexer *)
-type frozen_t = ttree
+type keyword_state = ttree
-let freeze () = !token_tree
-let unfreeze tt = (token_tree := tt)
+let get_keyword_state () = !token_tree
+let set_keyword_state tt = (token_tree := tt)
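
(Aside, not part of the patch: the renamed accessors can be used to run a computation against a temporarily extended keyword table; add_keyword already exists in this file, and the wrapper name is hypothetical.)

(* Illustrative sketch only: save the keyword table, extend it, and restore
   it afterwards, mirroring the old freeze/unfreeze idiom. *)
let _with_extra_keyword kw f =
  let saved = get_keyword_state () in
  add_keyword kw;
  let result = f () in
  set_keyword_state saved;
  result
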
(* The string buffering machinery *)
-let buff = ref (String.create 80)
+let buff = ref (Bytes.create 80)
let store len x =
- if len >= String.length !buff then
- buff := !buff ^ String.create (String.length !buff);
- !buff.[len] <- x;
+ let open Bytes in
+ if len >= length !buff then
+ buff := cat !buff (create (length !buff));
+ set !buff len x;
succ len
let rec nstore n len cs =
if n>0 then nstore (n-1) (store len (Stream.next cs)) cs else len
-let get_buff len = String.sub !buff 0 len
+let get_buff len = Bytes.sub_string !buff 0 len
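
(Aside, not part of the patch: from the caller's point of view the Bytes-based buffer behaves like the old string buffer; the example name is hypothetical.)

(* Illustrative sketch only: store characters one by one, then extract the
   accumulated prefix as an immutable string. *)
let _buffer_example () =
  let len = store (store (store 0 'f') 'o') 'o' in
  get_buff len   (* = "foo" *)
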
(* The classical lexer: idents, numbers, quoted strings, comments *)
@@ -266,9 +313,9 @@ let rec ident_tail loc len = parser
ident_tail loc (store len c) s
| [< s >] ->
match lookup_utf8 loc s with
- | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) ->
+ | Utf8Token (st, n) when Unicode.is_valid_ident_trailing st ->
ident_tail loc (nstore n len s) s
- | Utf8Token (Unicode.Unknown, n) ->
+ | Utf8Token (st, n) when Unicode.is_unknown st ->
let id = get_buff len in
let u = String.concat "" (List.map (String.make 1) (Stream.npeek n s)) in
warn_unrecognized_unicode ~loc:!@loc (u,id); len
@@ -293,13 +340,13 @@ let rec string loc ~comm_level bp len = parser
if esc then string loc ~comm_level bp (store len '"') s else (loc, len)
| [< ''('; s >] ->
(parser
- | [< ''*'; s >] ->
- string loc
- (Option.map succ comm_level)
+ | [< ''*'; s >] ->
+ let comm_level = Option.map succ comm_level in
+ string loc ~comm_level
bp (store (store len '(') '*')
s
| [< >] ->
- string loc comm_level bp (store len '(') s) s
+ string loc ~comm_level bp (store len '(') s) s
| [< ''*'; s >] ->
(parser
| [< '')'; s >] ->
@@ -309,9 +356,9 @@ let rec string loc ~comm_level bp len = parser
| _ -> ()
in
let comm_level = Option.map pred comm_level in
- string loc comm_level bp (store (store len '*') ')') s
+ string loc ~comm_level bp (store (store len '*') ')') s
| [< >] ->
- string loc comm_level bp (store len '*') s) s
+ string loc ~comm_level bp (store len '*') s) s
| [< ''\n' as c; s >] ep ->
 (* If we are parsing a comment, the string is not part of a token, so we
update the first line of the location. Otherwise, we update the last
@@ -320,17 +367,14 @@ let rec string loc ~comm_level bp len = parser
if Option.has_some comm_level then bump_loc_line loc ep
else bump_loc_line_last loc ep
in
- string loc comm_level bp (store len c) s
- | [< 'c; s >] -> string loc comm_level bp (store len c) s
+ string loc ~comm_level bp (store len c) s
+ | [< 'c; s >] -> string loc ~comm_level bp (store len c) s
| [< _ = Stream.empty >] ep ->
let loc = set_loc_pos loc bp ep in
err loc Unterminated_string
-(* Hook for exporting comment into xml theory files *)
-let (f_xml_output_comment, xml_output_comment) = Hook.make ~default:ignore ()
-
(* To associate locations to a file name *)
-let current_file = ref None
+let current_file = ref Loc.ToplevelInput
(* Utilities for comments in beautify *)
let comment_begin = ref None
@@ -342,18 +386,8 @@ let comments = ref []
let current_comment = Buffer.create 8192
let between_commands = ref true
-let rec split_comments comacc acc pos = function
- [] -> comments := List.rev acc; comacc
- | ((b,e),c as com)::coms ->
- (* Take all comments that terminates before pos, or begin exactly
- at pos (used to print comments attached after an expression) *)
- if e<=pos || pos=b then split_comments (c::comacc) acc pos coms
- else split_comments comacc (com::acc) pos coms
-
-let extract_comments pos = split_comments [] [] pos !comments
-
(* The state of the lexer visible from outside *)
-type lexer_state = int option * string * bool * ((int * int) * string) list * string option
+type lexer_state = int option * string * bool * ((int * int) * string) list * Loc.source
let init_lexer_state f = (None,"",true,[],f)
let set_lexer_state (o,s,b,c,f) =
@@ -362,10 +396,13 @@ let set_lexer_state (o,s,b,c,f) =
between_commands := b;
comments := c;
current_file := f
-let release_lexer_state () =
+let get_lexer_state () =
(!comment_begin, Buffer.contents current_comment, !between_commands, !comments, !current_file)
+let release_lexer_state = get_lexer_state
let drop_lexer_state () =
- set_lexer_state (init_lexer_state None)
+ set_lexer_state (init_lexer_state Loc.ToplevelInput)
+
+let get_comment_state (_,_,_,c,_) = c
let real_push_char c = Buffer.add_char current_comment c
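
(Aside, not part of the patch: the last component of the lexer state is now a Loc.source rather than a string option; a sketch of initialising it for a named file, with a hypothetical helper name.)

(* Illustrative sketch only: install a fresh state for a named file and read
   the (still empty) comment component back. *)
let _state_example file =
  let st = init_lexer_state (Loc.InFile file) in
  set_lexer_state st;
  assert (get_comment_state st = [])
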
@@ -390,9 +427,6 @@ let null_comment s =
let comment_stop ep =
let current_s = Buffer.contents current_comment in
- if !Flags.xml_export && Buffer.length current_comment > 0 &&
- (!between_commands || not(null_comment current_s)) then
- Hook.get f_xml_output_comment current_s;
(if !Flags.beautify && Buffer.length current_comment > 0 &&
(!between_commands || not(null_comment current_s)) then
let bp = match !comment_begin with
@@ -500,7 +534,7 @@ let parse_after_dot loc c bp =
(try find_keyword loc ("."^field) s with Not_found -> FIELD field)
| [< s >] ->
match lookup_utf8 loc s with
- | Utf8Token (Unicode.Letter, n) ->
+ | Utf8Token (st, n) when Unicode.is_valid_ident_initial st ->
let len = ident_tail loc (nstore n 0 s) s in
let field = get_buff len in
(try find_keyword loc ("."^field) s with Not_found -> FIELD field)
@@ -514,7 +548,7 @@ let parse_after_qmark loc bp s =
| None -> KEYWORD "?"
| _ ->
match lookup_utf8 loc s with
- | Utf8Token (Unicode.Letter, _) -> LEFTQMARK
+ | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK
| AsciiChar | Utf8Token _ | EmptyStream ->
fst (process_chars loc bp '?' s)
@@ -560,7 +594,7 @@ let rec next_token loc = parser bp
| [< ' ('0'..'9' as c); len = number (store 0 c) >] ep ->
comment_stop bp;
(INT (get_buff len), set_loc_pos loc bp ep)
- | [< ''\"'; (loc,len) = string loc None bp 0 >] ep ->
+ | [< ''\"'; (loc,len) = string loc ~comm_level:None bp 0 >] ep ->
comment_stop bp;
(STRING (get_buff len), set_loc_pos loc bp ep)
| [< ' ('(' as c);
@@ -579,13 +613,13 @@ let rec next_token loc = parser bp
comment_stop bp; between_commands := new_between_commands; t
| [< s >] ->
match lookup_utf8 loc s with
- | Utf8Token (Unicode.Letter, n) ->
+ | Utf8Token (st, n) when Unicode.is_valid_ident_initial st ->
let len = ident_tail loc (nstore n 0 s) s in
let id = get_buff len in
let ep = Stream.count s in
comment_stop bp;
(try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
- | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart | Unicode.Unknown), _) ->
+ | AsciiChar | Utf8Token _ ->
let t = process_chars loc bp (Stream.next s) s in
comment_stop bp; t
| EmptyStream ->
@@ -619,8 +653,6 @@ let loct_add loct i loc = Hashtbl.add loct i loc
we unfreeze the state of the lexer. This restores the behaviour of the
lexer. B.B. *)
-IFDEF CAMLP5 THEN
-
type te = Tok.t
(** Names of tokens, for this lexer, used in Grammar error messages *)
@@ -638,12 +670,12 @@ let token_text = function
let func cs =
let loct = loct_create () in
- let cur_loc = ref (Compat.make_loc !current_file 1 0 0 0) in
+ let cur_loc = ref (from_coqloc !current_file 1 0 0 0) in
let ts =
Stream.from
(fun i ->
let (tok, loc) = next_token !cur_loc cs in
- cur_loc := Compat.after loc;
+ cur_loc := after loc;
loct_add loct i loc; Some tok)
in
(ts, loct_func loct)
@@ -659,41 +691,6 @@ let lexer = {
Token.tok_comm = None;
Token.tok_text = token_text }
-ELSE (* official camlp4 for ocaml >= 3.10 *)
-
-module M_ = Camlp4.ErrorHandler.Register (Error)
-
-module Loc = Compat.CompatLoc
-module Token = struct
- include Tok (* Cf. tok.ml *)
- module Loc = Compat.CompatLoc
- module Error = Camlp4.Struct.EmptyError
- module Filter = struct
- type token_filter = (Tok.t * Loc.t) Stream.t -> (Tok.t * Loc.t) Stream.t
- type t = unit
- let mk _is_kwd = ()
- let keyword_added () kwd _ = add_keyword kwd
- let keyword_removed () _ = ()
- let filter () x = x
- let define_filter () _ = ()
- end
-end
-
-let mk () =
- let loct = loct_create () in
- let cur_loc = ref (Compat.make_loc !current_file 1 0 0 0) in
- let rec self init_loc (* FIXME *) =
- parser i
- [< (tok, loc) = next_token !cur_loc; s >] ->
- cur_loc := Compat.set_loc_file loc !current_file;
- loct_add loct i loc;
- [< '(tok, loc); self init_loc s >]
- | [< >] -> [< >]
- in
- self
-
-END
-
(** Terminal symbols interpretation *)
let is_ident_not_keyword s =
@@ -716,13 +713,13 @@ let strip s =
in
if len == String.length s then s
else
- let s' = String.create len in
+ let s' = Bytes.create len in
let rec loop i i' =
if i == String.length s then s'
else if s.[i] == ' ' then loop (i + 1) i'
- else begin s'.[i'] <- s.[i]; loop (i + 1) (i' + 1) end
+ else begin Bytes.set s' i' s.[i]; loop (i + 1) (i' + 1) end
in
- loop 0 0
+ Bytes.to_string (loop 0 0)
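
(Aside, not part of the patch: strip removes only spaces, so multi-word terminal notations compare equal regardless of internal spacing.)

(* Illustrative sketch only: spaces are removed, all other characters kept. *)
let _ = assert (strip "forall x , P" = "forallx,P")
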
let terminal s =
let s = strip s in