Diffstat (limited to 'parsing/cLexer.ml4')
-rw-r--r--  parsing/cLexer.ml4  203
1 file changed, 100 insertions, 103 deletions
diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4
index aec6a326..d65b35c4 100644
--- a/parsing/cLexer.ml4
+++ b/parsing/cLexer.ml4
@@ -1,15 +1,38 @@
(************************************************************************)
-(* v * The Coq Proof Assistant / The Coq Development Team *)
-(* <O___,, * INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2016 *)
+(* * The Coq Proof Assistant / The Coq Development Team *)
+(* v * INRIA, CNRS and contributors - Copyright 1999-2018 *)
+(* <O___,, * (see CREDITS file for the list of authors) *)
(* \VV/ **************************************************************)
-(* // * This file is distributed under the terms of the *)
-(* * GNU Lesser General Public License Version 2.1 *)
+(* // * This file is distributed under the terms of the *)
+(* * GNU Lesser General Public License Version 2.1 *)
+(* * (see LICENSE file for the text of the license) *)
(************************************************************************)
open Pp
open Util
open Tok
-open Compat
+
+(** Location utilities *)
+let ploc_file_of_coq_file = function
+| Loc.ToplevelInput -> ""
+| Loc.InFile f -> f
+
+let coq_file_of_ploc_file s =
+ if s = "" then Loc.ToplevelInput else Loc.InFile s
+
+let from_coqloc fname line_nb bol_pos bp ep =
+ Ploc.make_loc (ploc_file_of_coq_file fname) line_nb bol_pos (bp, ep) ""
+
+let to_coqloc loc =
+ { Loc.fname = coq_file_of_ploc_file (Ploc.file_name loc);
+ Loc.line_nb = Ploc.line_nb loc;
+ Loc.bol_pos = Ploc.bol_pos loc;
+ Loc.bp = Ploc.first_pos loc;
+ Loc.ep = Ploc.last_pos loc;
+ Loc.line_nb_last = Ploc.line_nb_last loc;
+ Loc.bol_pos_last = Ploc.bol_pos_last loc; }
+
+let (!@) = to_coqloc
(* Dictionaries: trees annotated with string options, each node being a map
from chars to dictionaries (the subtrees). A trie, in other words. *)
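
(Aside, not part of the patch: a minimal sketch of how the new conversion helpers compose. The function name below is hypothetical; from_coqloc, to_coqloc, (!@) and Loc.InFile are the definitions introduced above, on top of Camlp5's Ploc API.)

(* Illustrative sketch only: build a Camlp5 location from Coq data and
   convert it back with (!@); field names follow the Loc record above. *)
let _roundtrip_example () =
  let ploc = from_coqloc (Loc.InFile "foo.v") 1 0 0 5 in
  let coq_loc = !@ ploc in
  assert (coq_loc.Loc.fname = Loc.InFile "foo.v"
          && coq_loc.Loc.bp = 0 && coq_loc.Loc.ep = 5)
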
@@ -89,7 +112,6 @@ module Error = struct
| Unterminated_string
| Undefined_token
| Bad_token of string
- | UnsupportedUnicode of int
exception E of t
@@ -100,21 +122,45 @@ module Error = struct
| Unterminated_comment -> "Unterminated comment"
| Unterminated_string -> "Unterminated string"
| Undefined_token -> "Undefined token"
- | Bad_token tok -> Format.sprintf "Bad token %S" tok
- | UnsupportedUnicode x ->
- Printf.sprintf "Unsupported Unicode character (0x%x)" x)
-
- (* Require to fix the Camlp4 signature *)
- let print ppf x = Pp.pp_with ~pp_tag:Ppstyle.pp_tag ppf (Pp.str (to_string x))
+ | Bad_token tok -> Format.sprintf "Bad token %S" tok)
end
open Error
-let err loc str = Loc.raise (Compat.to_coqloc loc) (Error.E str)
+let err loc str = Loc.raise ~loc:(to_coqloc loc) (Error.E str)
let bad_token str = raise (Error.E (Bad_token str))
-(* Lexer conventions on tokens *)
+(* Update a loc without allocating an intermediate pair *)
+let set_loc_pos loc bp ep =
+ Ploc.sub loc (bp - Ploc.first_pos loc) (ep - bp)
+
+(* Increase line number by 1 and update position of beginning of line *)
+let bump_loc_line loc bol_pos =
+ Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb loc + 1) bol_pos
+ (Ploc.first_pos loc, Ploc.last_pos loc) (Ploc.comment loc)
+
+(* Same as [bump_loc_line], but for the last line in location *)
+(* For an obscure reason, camlp5 does not give an easy way to set line_nb_stop,
+ so we have to resort to a hack merging two locations. *)
+(* Warning: [bump_loc_line_last] changes the end position. You may need to call
+ [set_loc_pos] to fix it. *)
+let bump_loc_line_last loc bol_pos =
+ let loc' =
+ Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb_last loc + 1) bol_pos
+ (Ploc.first_pos loc + 1, Ploc.last_pos loc + 1) (Ploc.comment loc)
+ in
+ Ploc.encl loc loc'
+
+(* For some reason, the [Ploc.after] function of Camlp5 does not update line
+ numbers, so we define our own function that does it. *)
+let after loc =
+ let line_nb = Ploc.line_nb_last loc in
+ let bol_pos = Ploc.bol_pos_last loc in
+ Ploc.make_loc (Ploc.file_name loc) line_nb bol_pos
+ (Ploc.last_pos loc, Ploc.last_pos loc) (Ploc.comment loc)
+
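
(Aside, not part of the patch: a sketch of how these helpers are typically combined when the lexer crosses a newline; loc, bp and ep stand for the current Camlp5 location and stream offsets, and the function name is hypothetical.)

(* Illustrative sketch only: crossing a newline bumps the line counter and
   records the new beginning-of-line, then the token span is re-anchored. *)
let _newline_example loc bp ep =
  let loc = bump_loc_line loc ep in   (* next line starts at offset ep *)
  set_loc_pos loc bp ep               (* token still spans bp..ep *)
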
+(** Lexer conventions on tokens *)
type token_kind =
| Utf8Token of (Unicode.status * int)
@@ -186,7 +232,7 @@ let check_keyword str =
let rec loop_symb = parser
| [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str
| [< s >] ->
- match unlocated lookup_utf8 Compat.CompatLoc.ghost s with
+ match unlocated lookup_utf8 Ploc.dummy s with
| Utf8Token (_,n) -> njunk n s; loop_symb s
| AsciiChar -> Stream.junk s; loop_symb s
| EmptyStream -> ()
@@ -200,9 +246,9 @@ let check_ident str =
| [< ' ('0'..'9' | ''') when intail; s >] ->
loop_id true s
| [< s >] ->
- match unlocated lookup_utf8 Compat.CompatLoc.ghost s with
- | Utf8Token (Unicode.Letter, n) -> njunk n s; loop_id true s
- | Utf8Token (Unicode.IdentPart, n) when intail ->
+ match unlocated lookup_utf8 Ploc.dummy s with
+ | Utf8Token (st, n) when not intail && Unicode.is_valid_ident_initial st -> njunk n s; loop_id true s
+ | Utf8Token (st, n) when intail && Unicode.is_valid_ident_trailing st ->
njunk n s;
loop_id true s
| EmptyStream -> ()
@@ -233,25 +279,26 @@ let remove_keyword str =
let keywords () = ttree_elements !token_tree
(* Freeze and unfreeze the state of the lexer *)
-type frozen_t = ttree
+type keyword_state = ttree
-let freeze () = !token_tree
-let unfreeze tt = (token_tree := tt)
+let get_keyword_state () = !token_tree
+let set_keyword_state tt = (token_tree := tt)
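
(Aside, not part of the patch: the renamed accessors can be used to run a computation against a temporarily extended keyword table; add_keyword already exists in this file, and the wrapper name is hypothetical.)

(* Illustrative sketch only: save the keyword table, extend it, and restore
   it afterwards, mirroring the old freeze/unfreeze idiom. *)
let _with_extra_keyword kw f =
  let saved = get_keyword_state () in
  add_keyword kw;
  let result = f () in
  set_keyword_state saved;
  result
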
(* The string buffering machinery *)
-let buff = ref (String.create 80)
+let buff = ref (Bytes.create 80)
let store len x =
- if len >= String.length !buff then
- buff := !buff ^ String.create (String.length !buff);
- !buff.[len] <- x;
+ let open Bytes in
+ if len >= length !buff then
+ buff := cat !buff (create (length !buff));
+ set !buff len x;
succ len
let rec nstore n len cs =
if n>0 then nstore (n-1) (store len (Stream.next cs)) cs else len
-let get_buff len = String.sub !buff 0 len
+let get_buff len = Bytes.sub_string !buff 0 len
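
(Aside, not part of the patch: from the caller's point of view the Bytes-based buffer behaves like the old string buffer; the example name is hypothetical.)

(* Illustrative sketch only: store characters one by one, then extract the
   accumulated prefix as an immutable string. *)
let _buffer_example () =
  let len = store (store (store 0 'f') 'o') 'o' in
  get_buff len   (* = "foo" *)
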
(* The classical lexer: idents, numbers, quoted strings, comments *)
@@ -266,9 +313,9 @@ let rec ident_tail loc len = parser
ident_tail loc (store len c) s
| [< s >] ->
match lookup_utf8 loc s with
- | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) ->
+ | Utf8Token (st, n) when Unicode.is_valid_ident_trailing st ->
ident_tail loc (nstore n len s) s
- | Utf8Token (Unicode.Unknown, n) ->
+ | Utf8Token (st, n) when Unicode.is_unknown st ->
let id = get_buff len in
let u = String.concat "" (List.map (String.make 1) (Stream.npeek n s)) in
warn_unrecognized_unicode ~loc:!@loc (u,id); len
@@ -293,13 +340,13 @@ let rec string loc ~comm_level bp len = parser
if esc then string loc ~comm_level bp (store len '"') s else (loc, len)
| [< ''('; s >] ->
(parser
- | [< ''*'; s >] ->
- string loc
- (Option.map succ comm_level)
+ | [< ''*'; s >] ->
+ let comm_level = Option.map succ comm_level in
+ string loc ~comm_level
bp (store (store len '(') '*')
s
| [< >] ->
- string loc comm_level bp (store len '(') s) s
+ string loc ~comm_level bp (store len '(') s) s
| [< ''*'; s >] ->
(parser
| [< '')'; s >] ->
@@ -309,9 +356,9 @@ let rec string loc ~comm_level bp len = parser
| _ -> ()
in
let comm_level = Option.map pred comm_level in
- string loc comm_level bp (store (store len '*') ')') s
+ string loc ~comm_level bp (store (store len '*') ')') s
| [< >] ->
- string loc comm_level bp (store len '*') s) s
+ string loc ~comm_level bp (store len '*') s) s
| [< ''\n' as c; s >] ep ->
 (* If we are parsing a comment, the string is not part of a token, so we
update the first line of the location. Otherwise, we update the last
@@ -320,17 +367,14 @@ let rec string loc ~comm_level bp len = parser
if Option.has_some comm_level then bump_loc_line loc ep
else bump_loc_line_last loc ep
in
- string loc comm_level bp (store len c) s
- | [< 'c; s >] -> string loc comm_level bp (store len c) s
+ string loc ~comm_level bp (store len c) s
+ | [< 'c; s >] -> string loc ~comm_level bp (store len c) s
| [< _ = Stream.empty >] ep ->
let loc = set_loc_pos loc bp ep in
err loc Unterminated_string
-(* Hook for exporting comment into xml theory files *)
-let (f_xml_output_comment, xml_output_comment) = Hook.make ~default:ignore ()
-
(* To associate locations to a file name *)
-let current_file = ref None
+let current_file = ref Loc.ToplevelInput
(* Utilities for comments in beautify *)
let comment_begin = ref None
@@ -342,18 +386,8 @@ let comments = ref []
let current_comment = Buffer.create 8192
let between_commands = ref true
-let rec split_comments comacc acc pos = function
- [] -> comments := List.rev acc; comacc
- | ((b,e),c as com)::coms ->
- (* Take all comments that terminates before pos, or begin exactly
- at pos (used to print comments attached after an expression) *)
- if e<=pos || pos=b then split_comments (c::comacc) acc pos coms
- else split_comments comacc (com::acc) pos coms
-
-let extract_comments pos = split_comments [] [] pos !comments
-
(* The state of the lexer visible from outside *)
-type lexer_state = int option * string * bool * ((int * int) * string) list * string option
+type lexer_state = int option * string * bool * ((int * int) * string) list * Loc.source
let init_lexer_state f = (None,"",true,[],f)
let set_lexer_state (o,s,b,c,f) =
@@ -362,10 +396,13 @@ let set_lexer_state (o,s,b,c,f) =
between_commands := b;
comments := c;
current_file := f
-let release_lexer_state () =
+let get_lexer_state () =
(!comment_begin, Buffer.contents current_comment, !between_commands, !comments, !current_file)
+let release_lexer_state = get_lexer_state
let drop_lexer_state () =
- set_lexer_state (init_lexer_state None)
+ set_lexer_state (init_lexer_state Loc.ToplevelInput)
+
+let get_comment_state (_,_,_,c,_) = c
let real_push_char c = Buffer.add_char current_comment c
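
(Aside, not part of the patch: the last component of the lexer state is now a Loc.source rather than a string option; a sketch of initialising it for a named file, with a hypothetical helper name.)

(* Illustrative sketch only: install a fresh state for a named file and read
   the (still empty) comment component back. *)
let _state_example file =
  let st = init_lexer_state (Loc.InFile file) in
  set_lexer_state st;
  assert (get_comment_state st = [])
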
@@ -390,9 +427,6 @@ let null_comment s =
let comment_stop ep =
let current_s = Buffer.contents current_comment in
- if !Flags.xml_export && Buffer.length current_comment > 0 &&
- (!between_commands || not(null_comment current_s)) then
- Hook.get f_xml_output_comment current_s;
(if !Flags.beautify && Buffer.length current_comment > 0 &&
(!between_commands || not(null_comment current_s)) then
let bp = match !comment_begin with
@@ -500,7 +534,7 @@ let parse_after_dot loc c bp =
(try find_keyword loc ("."^field) s with Not_found -> FIELD field)
| [< s >] ->
match lookup_utf8 loc s with
- | Utf8Token (Unicode.Letter, n) ->
+ | Utf8Token (st, n) when Unicode.is_valid_ident_initial st ->
let len = ident_tail loc (nstore n 0 s) s in
let field = get_buff len in
(try find_keyword loc ("."^field) s with Not_found -> FIELD field)
@@ -514,7 +548,7 @@ let parse_after_qmark loc bp s =
| None -> KEYWORD "?"
| _ ->
match lookup_utf8 loc s with
- | Utf8Token (Unicode.Letter, _) -> LEFTQMARK
+ | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK
| AsciiChar | Utf8Token _ | EmptyStream ->
fst (process_chars loc bp '?' s)
@@ -560,7 +594,7 @@ let rec next_token loc = parser bp
| [< ' ('0'..'9' as c); len = number (store 0 c) >] ep ->
comment_stop bp;
(INT (get_buff len), set_loc_pos loc bp ep)
- | [< ''\"'; (loc,len) = string loc None bp 0 >] ep ->
+ | [< ''\"'; (loc,len) = string loc ~comm_level:None bp 0 >] ep ->
comment_stop bp;
(STRING (get_buff len), set_loc_pos loc bp ep)
| [< ' ('(' as c);
@@ -579,13 +613,13 @@ let rec next_token loc = parser bp
comment_stop bp; between_commands := new_between_commands; t
| [< s >] ->
match lookup_utf8 loc s with
- | Utf8Token (Unicode.Letter, n) ->
+ | Utf8Token (st, n) when Unicode.is_valid_ident_initial st ->
let len = ident_tail loc (nstore n 0 s) s in
let id = get_buff len in
let ep = Stream.count s in
comment_stop bp;
(try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
- | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart | Unicode.Unknown), _) ->
+ | AsciiChar | Utf8Token _ ->
let t = process_chars loc bp (Stream.next s) s in
comment_stop bp; t
| EmptyStream ->
@@ -619,8 +653,6 @@ let loct_add loct i loc = Hashtbl.add loct i loc
we unfreeze the state of the lexer. This restores the behaviour of the
lexer. B.B. *)
-IFDEF CAMLP5 THEN
-
type te = Tok.t
(** Names of tokens, for this lexer, used in Grammar error messages *)
@@ -638,12 +670,12 @@ let token_text = function
let func cs =
let loct = loct_create () in
- let cur_loc = ref (Compat.make_loc !current_file 1 0 0 0) in
+ let cur_loc = ref (from_coqloc !current_file 1 0 0 0) in
let ts =
Stream.from
(fun i ->
let (tok, loc) = next_token !cur_loc cs in
- cur_loc := Compat.after loc;
+ cur_loc := after loc;
loct_add loct i loc; Some tok)
in
(ts, loct_func loct)
@@ -659,41 +691,6 @@ let lexer = {
Token.tok_comm = None;
Token.tok_text = token_text }
-ELSE (* official camlp4 for ocaml >= 3.10 *)
-
-module M_ = Camlp4.ErrorHandler.Register (Error)
-
-module Loc = Compat.CompatLoc
-module Token = struct
- include Tok (* Cf. tok.ml *)
- module Loc = Compat.CompatLoc
- module Error = Camlp4.Struct.EmptyError
- module Filter = struct
- type token_filter = (Tok.t * Loc.t) Stream.t -> (Tok.t * Loc.t) Stream.t
- type t = unit
- let mk _is_kwd = ()
- let keyword_added () kwd _ = add_keyword kwd
- let keyword_removed () _ = ()
- let filter () x = x
- let define_filter () _ = ()
- end
-end
-
-let mk () =
- let loct = loct_create () in
- let cur_loc = ref (Compat.make_loc !current_file 1 0 0 0) in
- let rec self init_loc (* FIXME *) =
- parser i
- [< (tok, loc) = next_token !cur_loc; s >] ->
- cur_loc := Compat.set_loc_file loc !current_file;
- loct_add loct i loc;
- [< '(tok, loc); self init_loc s >]
- | [< >] -> [< >]
- in
- self
-
-END
-
(** Terminal symbols interpretation *)
let is_ident_not_keyword s =
@@ -716,13 +713,13 @@ let strip s =
in
if len == String.length s then s
else
- let s' = String.create len in
+ let s' = Bytes.create len in
let rec loop i i' =
if i == String.length s then s'
else if s.[i] == ' ' then loop (i + 1) i'
- else begin s'.[i'] <- s.[i]; loop (i + 1) (i' + 1) end
+ else begin Bytes.set s' i' s.[i]; loop (i + 1) (i' + 1) end
in
- loop 0 0
+ Bytes.to_string (loop 0 0)
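
(Aside, not part of the patch: strip removes only spaces, so multi-word terminal notations compare equal regardless of internal spacing.)

(* Illustrative sketch only: spaces are removed, all other characters kept. *)
let _ = assert (strip "forall x , P" = "forallx,P")
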
let terminal s =
let s = strip s in