summaryrefslogtreecommitdiff
path: root/parsing/lexer.ml4
diff options
context:
space:
mode:
Diffstat (limited to 'parsing/lexer.ml4')
-rw-r--r--parsing/lexer.ml4322
1 files changed, 183 insertions, 139 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 50349e22..e351061d 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -1,20 +1,15 @@
(************************************************************************)
(* v * The Coq Proof Assistant / The Coq Development Team *)
-(* <O___,, * INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2011 *)
+(* <O___,, * INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2010 *)
(* \VV/ **************************************************************)
(* // * This file is distributed under the terms of the *)
(* * GNU Lesser General Public License Version 2.1 *)
(************************************************************************)
-(*i camlp4use: "pr_o.cmo pa_macro.cmo" i*)
-(* Add pr_o.cmo to circumvent a useless-warning bug when preprocessed with
- * ast-based camlp4 *)
-
-(*i $Id: lexer.ml4 14641 2011-11-06 11:59:10Z herbelin $ i*)
-
open Pp
open Util
-open Token
+open Compat
+open Tok
(* Dictionaries: trees annotated with string options, each node being a map
from chars to dictionaries (the subtrees). A trie, in other words. *)
@@ -76,18 +71,37 @@ let ttree_remove ttree str =
(* Errors occuring while lexing (explained as "Lexer error: ...") *)
-type error =
- | Illegal_character
- | Unterminated_comment
- | Unterminated_string
- | Undefined_token
- | Bad_token of string
+module Error = struct
+
+ type t =
+ | Illegal_character
+ | Unterminated_comment
+ | Unterminated_string
+ | Undefined_token
+ | Bad_token of string
+ | UnsupportedUnicode of int
+
+ exception E of t
+
+ let to_string x =
+ "Syntax Error: Lexer: " ^
+ (match x with
+ | Illegal_character -> "Illegal character"
+ | Unterminated_comment -> "Unterminated comment"
+ | Unterminated_string -> "Unterminated string"
+ | Undefined_token -> "Undefined token"
+ | Bad_token tok -> Format.sprintf "Bad token %S" tok
+ | UnsupportedUnicode x ->
+ Printf.sprintf "Unsupported Unicode character (0x%x)" x)
-exception Error of error
+ let print ppf x = Format.fprintf ppf "%s@." (to_string x)
-let err loc str = Stdpp.raise_with_loc (Util.make_loc loc) (Error str)
+end
+open Error
-let bad_token str = raise (Error (Bad_token str))
+let err loc str = Loc.raise (make_loc loc) (Error.E str)
+
+let bad_token str = raise (Error.E (Bad_token str))
(* Lexer conventions on tokens *)
@@ -96,9 +110,9 @@ type token_kind =
| AsciiChar
| EmptyStream
-let error_unsupported_unicode_character n cs =
+let error_unsupported_unicode_character n unicode cs =
let bp = Stream.count cs in
- err (bp,bp+n) (Bad_token "Unsupported Unicode character")
+ err (bp,bp+n) (UnsupportedUnicode unicode)
let error_utf8 cs =
let bp = Stream.count cs in
@@ -147,7 +161,8 @@ let lookup_utf8_tail c cs =
| _ -> error_utf8 cs
in
try classify_unicode unicode, n
- with UnsupportedUtf8 -> njunk n cs; error_unsupported_unicode_character n cs
+ with UnsupportedUtf8 ->
+ njunk n cs; error_unsupported_unicode_character n unicode cs
let lookup_utf8 cs =
match Stream.peek cs with
@@ -155,14 +170,26 @@ let lookup_utf8 cs =
| Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs)
| None -> EmptyStream
-let check_special_token str =
+let unlocated f x =
+ try f x with Loc.Exc_located (_,exc) -> raise exc
+
+let check_keyword str =
let rec loop_symb = parser
| [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str
- | [< _ = Stream.empty >] -> ()
- | [< '_ ; s >] -> loop_symb s
+ | [< s >] ->
+ match unlocated lookup_utf8 s with
+ | Utf8Token (_,n) -> njunk n s; loop_symb s
+ | AsciiChar -> Stream.junk s; loop_symb s
+ | EmptyStream -> ()
in
loop_symb (Stream.of_string str)
+let check_keyword_to_add s =
+ try check_keyword s
+ with Error.E (UnsupportedUnicode unicode) ->
+ Flags.if_verbose msg_warning
+ (strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x which will not be parsable." s unicode))
+
let check_ident str =
let rec loop_id intail = parser
| [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] ->
@@ -170,7 +197,7 @@ let check_ident str =
| [< ' ('0'..'9' | ''') when intail; s >] ->
loop_id true s
| [< s >] ->
- match lookup_utf8 s with
+ match unlocated lookup_utf8 s with
| Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s
| Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s
| EmptyStream -> ()
@@ -178,9 +205,8 @@ let check_ident str =
in
loop_id false (Stream.of_string str)
-let check_keyword str =
- try check_special_token str
- with Error _ -> check_ident str
+let is_ident str =
+ try let _ = check_ident str in true with Error.E _ -> false
(* Keyword and symbol dictionary *)
let token_tree = ref empty_ttree
@@ -190,22 +216,15 @@ let is_keyword s =
with Not_found -> false
let add_keyword str =
- check_keyword str;
- token_tree := ttree_add !token_tree str
+ if not (is_keyword str) then
+ begin
+ check_keyword_to_add str;
+ token_tree := ttree_add !token_tree str
+ end
let remove_keyword str =
token_tree := ttree_remove !token_tree str
-(* Adding a new token (keyword or special token). *)
-let add_token (con, str) = match con with
- | "" -> add_keyword str
- | "METAIDENT" | "LEFTQMARK" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI"
- -> ()
- | _ ->
- raise (Token.Error ("\
-the constructor \"" ^ con ^ "\" is not recognized by Lexer"))
-
-
(* Freeze and unfreeze the state of the lexer *)
type frozen_t = ttree
@@ -249,17 +268,22 @@ let rec number len = parser
| [< ' ('0'..'9' as c); s >] -> number (store len c) s
| [< >] -> len
-let escape len c = store len c
-
let rec string in_comments bp len = parser
| [< ''"'; esc=(parser [<''"' >] -> true | [< >] -> false); s >] ->
if esc then string in_comments bp (store len '"') s else len
+ | [< ''('; s >] ->
+ (parser
+ | [< ''*'; s >] ->
+ string (Option.map succ in_comments) bp (store (store len '(') '*') s
+ | [< >] ->
+ string in_comments bp (store len '(') s) s
| [< ''*'; s >] ->
(parser
| [< '')'; s >] ->
- if in_comments then
+ if in_comments = Some 0 then
msg_warning (str "Not interpreting \"*)\" as the end of current non-terminated comment because it occurs in a non-terminated string of the comment.");
- string in_comments bp (store (store len '*') ')') s
+ let in_comments = Option.map pred in_comments in
+ string in_comments bp (store (store len '*') ')') s
| [< >] ->
string in_comments bp (store len '*') s) s
| [< 'c; s >] -> string in_comments bp (store len c) s
@@ -348,7 +372,7 @@ let rec comment bp = parser bp2
| [< s >] -> real_push_char '*'; comment bp s >] -> ()
| [< ''"'; s >] ->
if Flags.do_beautify() then (push_string"\"";comm_string bp2 s)
- else ignore (string true bp2 0 s);
+ else ignore (string (Some 0) bp2 0 s);
comment bp s
| [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment
| [< 'z; s >] -> real_push_char z; comment bp s
@@ -394,61 +418,68 @@ let find_keyword id s =
let tt = ttree_find !token_tree id in
match progress_further tt.node 0 tt s with
| None -> raise Not_found
- | Some c -> c
+ | Some c -> KEYWORD c
(* Must be a special token *)
let process_chars bp c cs =
let t = progress_from_byte None (-1) !token_tree cs c in
let ep = Stream.count cs in
match t with
- | Some t -> (("", t), (bp, ep))
+ | Some t -> (KEYWORD t, (bp, ep))
| None ->
let ep' = bp + utf8_char_size cs c in
njunk (ep' - ep) cs;
err (bp, ep') Undefined_token
-let parse_after_dollar bp =
- parser
- | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ->
- ("METAIDENT", get_buff len)
- | [< s >] ->
- match lookup_utf8 s with
- | Utf8Token (UnicodeLetter, n) ->
- ("METAIDENT", get_buff (ident_tail (nstore n 0 s) s))
- | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp '$' s)
+let token_of_special c s = match c with
+ | '$' -> METAIDENT s
+ | '.' -> FIELD s
+ | _ -> assert false
-(* Parse what follows a dot *)
-let parse_after_dot bp c =
+(* Parse what follows a dot / a dollar *)
+
+let parse_after_special c bp =
parser
- | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ->
- ("FIELD", get_buff len)
+ | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail (store 0 d) >] ->
+ token_of_special c (get_buff len)
| [< s >] ->
match lookup_utf8 s with
| Utf8Token (UnicodeLetter, n) ->
- ("FIELD", get_buff (ident_tail (nstore n 0 s) s))
- | AsciiChar | Utf8Token _ | EmptyStream ->
- fst (process_chars bp c s)
+ token_of_special c (get_buff (ident_tail (nstore n 0 s) s))
+ | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s)
(* Parse what follows a question mark *)
+
let parse_after_qmark bp s =
match Stream.peek s with
- |Some ('a'..'z' | 'A'..'Z' | '_') -> ("LEFTQMARK", "")
- |None -> ("","?")
+ | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK
+ | None -> KEYWORD "?"
| _ ->
match lookup_utf8 s with
- | Utf8Token (UnicodeLetter, _) -> ("LEFTQMARK", "")
+ | Utf8Token (UnicodeLetter, _) -> LEFTQMARK
| AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp '?' s)
+let blank_or_eof cs =
+ match Stream.peek cs with
+ | None -> true
+ | Some (' ' | '\t' | '\n' |'\r') -> true
+ | _ -> false
(* Parse a token in a char stream *)
+
let rec next_token = parser bp
| [< '' ' | '\t' | '\n' |'\r' as c; s >] ->
comm_loc bp; push_char c; next_token s
- | [< ''$'; t = parse_after_dollar bp >] ep ->
+ | [< ''$' as c; t = parse_after_special c bp >] ep ->
comment_stop bp; (t, (ep, bp))
- | [< ''.' as c; t = parse_after_dot bp c >] ep ->
+ | [< ''.' as c; t = parse_after_special c bp; s >] ep ->
comment_stop bp;
- if Flags.do_beautify() & t=("",".") then between_com := true;
+ (* We enforce that "." should either be part of a larger keyword,
+ for instance ".(", or followed by a blank or eof. *)
+ if t = KEYWORD "." then begin
+ if not (blank_or_eof s) then err (bp,ep+1) Undefined_token;
+ if Flags.do_beautify() then between_com := true;
+ end;
(t, (bp,ep))
| [< ''?'; s >] ep ->
let t = parse_after_qmark bp s in comment_stop bp; (t, (ep, bp))
@@ -456,13 +487,13 @@ let rec next_token = parser bp
len = ident_tail (store 0 c); s >] ep ->
let id = get_buff len in
comment_stop bp;
- (try ("", find_keyword id s) with Not_found -> ("IDENT", id)), (bp, ep)
+ (try find_keyword id s with Not_found -> IDENT id), (bp, ep)
| [< ' ('0'..'9' as c); len = number (store 0 c) >] ep ->
comment_stop bp;
- (("INT", get_buff len), (bp, ep))
- | [< ''\"'; len = string false bp 0 >] ep ->
+ (INT (get_buff len), (bp, ep))
+ | [< ''\"'; len = string None bp 0 >] ep ->
comment_stop bp;
- (("STRING", get_buff len), (bp, ep))
+ (STRING (get_buff len), (bp, ep))
| [< ' ('(' as c);
t = parser
| [< ''*'; s >] ->
@@ -479,62 +510,53 @@ let rec next_token = parser bp
let id = get_buff len in
let ep = Stream.count s in
comment_stop bp;
- (try ("",find_keyword id s) with Not_found -> ("IDENT",id)), (bp, ep)
+ (try find_keyword id s with Not_found -> IDENT id), (bp, ep)
| AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) ->
let t = process_chars bp (Stream.next s) s in
comment_stop bp; t
| EmptyStream ->
- comment_stop bp; (("EOI", ""), (bp, bp + 1))
+ comment_stop bp; (EOI, (bp, bp + 1))
+
+(* (* Debug: uncomment this for tracing tokens seen by coq...*)
+let next_token s =
+ let (t,(bp,ep)) = next_token s in Printf.eprintf "[%s]\n%!" (Tok.to_string t);
+ (t,(bp,ep))
+*)
(* Location table system for creating tables associating a token count
to its location in a char stream (the source) *)
let locerr () = invalid_arg "Lexer: location function"
-let tsz = 256 (* up to 2^29 entries on a 32-bit machine, 2^61 on 64-bit *)
-
-let loct_create () = ref [| [| |] |]
+let loct_create () = Hashtbl.create 207
let loct_func loct i =
- match
- if i < 0 || i/tsz >= Array.length !loct then None
- else if !loct.(i/tsz) = [| |] then None
- else !loct.(i/tsz).(i mod tsz)
- with
- | Some loc -> Util.make_loc loc
- | _ -> locerr ()
-
-let loct_add loct i loc =
- while i/tsz >= Array.length !loct do
- let new_tmax = Array.length !loct * 2 in
- let new_loct = Array.make new_tmax [| |] in
- Array.blit !loct 0 new_loct 0 (Array.length !loct);
- loct := new_loct;
- done;
- if !loct.(i/tsz) = [| |] then !loct.(i/tsz) <- Array.make tsz None;
- !loct.(i/tsz).(i mod tsz) <- Some loc
-
-let current_location_table = ref (ref [| [| |] |])
-
-let location_function n =
- loct_func !current_location_table n
+ try Hashtbl.find loct i with Not_found -> locerr ()
-let func cs =
- let loct = loct_create () in
- let ts =
- Stream.from
- (fun i ->
- let (tok, loc) = next_token cs in
- loct_add loct i loc; Some tok)
- in
- current_location_table := loct;
- (ts, loct_func loct)
+let loct_add loct i loc = Hashtbl.add loct i loc
+
+let current_location_table = ref (loct_create ())
-type location_table = (int * int) option array array ref
+type location_table = (int, loc) Hashtbl.t
let location_table () = !current_location_table
let restore_location_table t = current_location_table := t
+let location_function n = loct_func !current_location_table n
-(* Names of tokens, for this lexer, used in Grammar error messages *)
+(** {6 The lexer of Coq} *)
+
+(** Note: removing a token.
+ We do nothing because [remove_token] is called only when removing a grammar
+ rule with [Grammar.delete_rule]. The latter command is called only when
+ unfreezing the state of the grammar entries (see GRAMMAR summary, file
+ env/metasyntax.ml). Therefore, instead of removing tokens one by one,
+ we unfreeze the state of the lexer. This restores the behaviour of the
+ lexer. B.B. *)
+
+IFDEF CAMLP5 THEN
+
+type te = Tok.t
+
+(** Names of tokens, for this lexer, used in Grammar error messages *)
let token_text = function
| ("", t) -> "'" ^ t ^ "'"
@@ -547,43 +569,65 @@ let token_text = function
| (con, "") -> con
| (con, prm) -> con ^ " \"" ^ prm ^ "\""
-(* The lexer of Coq *)
-
-(* Note: removing a token.
- We do nothing because [remove_token] is called only when removing a grammar
- rule with [Grammar.delete_rule]. The latter command is called only when
- unfreezing the state of the grammar entries (see GRAMMAR summary, file
- env/metasyntax.ml). Therefore, instead of removing tokens one by one,
- we unfreeze the state of the lexer. This restores the behaviour of the
- lexer. B.B. *)
-
-IFDEF CAMLP5 THEN
+let func cs =
+ let loct = loct_create () in
+ let ts =
+ Stream.from
+ (fun i ->
+ let (tok, loc) = next_token cs in
+ loct_add loct i (make_loc loc); Some tok)
+ in
+ current_location_table := loct;
+ (ts, loct_func loct)
let lexer = {
Token.tok_func = func;
- Token.tok_using = add_token;
+ Token.tok_using =
+ (fun pat -> match Tok.of_pattern pat with
+ | KEYWORD s -> add_keyword s
+ | _ -> ());
Token.tok_removing = (fun _ -> ());
- Token.tok_match = default_match;
+ Token.tok_match = Tok.match_pattern;
Token.tok_comm = None;
Token.tok_text = token_text }
-ELSE
-
-let lexer = {
- Token.func = func;
- Token.using = add_token;
- Token.removing = (fun _ -> ());
- Token.tparse = (fun _ -> None);
- Token.text = token_text }
+ELSE (* official camlp4 for ocaml >= 3.10 *)
+
+module M_ = Camlp4.ErrorHandler.Register (Error)
+
+module Loc = Loc
+module Token = struct
+ include Tok (* Cf. tok.ml *)
+ module Loc = Loc
+ module Error = Camlp4.Struct.EmptyError
+ module Filter = struct
+ type token_filter = (Tok.t * Loc.t) Stream.t -> (Tok.t * Loc.t) Stream.t
+ type t = unit
+ let mk _is_kwd = ()
+ let keyword_added () kwd _ = add_keyword kwd
+ let keyword_removed () _ = ()
+ let filter () x = x
+ let define_filter () _ = ()
+ end
+end
+
+let mk () _init_loc(*FIXME*) cs =
+ let loct = loct_create () in
+ let rec self =
+ parser i
+ [< (tok, loc) = next_token; s >] ->
+ let loc = make_loc loc in
+ loct_add loct i loc;
+ [< '(tok, loc); self s >]
+ | [< >] -> [< >]
+ in current_location_table := loct; self cs
END
-(* Terminal symbols interpretation *)
+(** Terminal symbols interpretation *)
let is_ident_not_keyword s =
- match s.[0] with
- | 'a'..'z' | 'A'..'Z' | '_' -> not (is_keyword s)
- | _ -> false
+ is_ident s && not (is_keyword s)
let is_number s =
let rec aux i =
@@ -613,6 +657,6 @@ let strip s =
let terminal s =
let s = strip s in
if s = "" then failwith "empty token";
- if is_ident_not_keyword s then ("IDENT", s)
- else if is_number s then ("INT", s)
- else ("", s)
+ if is_ident_not_keyword s then IDENT s
+ else if is_number s then INT s
+ else KEYWORD s