Diffstat (limited to 'parsing/cLexer.ml4')
-rw-r--r-- | parsing/cLexer.ml4 | 121
1 file changed, 59 insertions, 62 deletions
diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4
index 181c4b7fd..79771f3f6 100644
--- a/parsing/cLexer.ml4
+++ b/parsing/cLexer.ml4
@@ -69,6 +69,15 @@ let ttree_remove ttree str =
   in
   remove ttree 0
 
+let ttree_elements ttree =
+  let rec elts tt accu =
+    let accu = match tt.node with
+    | None -> accu
+    | Some s -> CString.Set.add s accu
+    in
+    CharMap.fold (fun _ tt accu -> elts tt accu) tt.branch accu
+  in
+  elts ttree CString.Set.empty
 
 (* Errors occurring while lexing (explained as "Lexer error: ...") *)
 
@@ -101,14 +110,6 @@ module Error = struct
 end
 open Error
 
-let current_file = ref ""
-
-let get_current_file () =
-  !current_file
-
-let set_current_file ~fname =
-  current_file := fname
-
 let err loc str = Loc.raise (Compat.to_coqloc loc) (Error.E str)
 
 let bad_token str = raise (Error.E (Bad_token str))
@@ -120,11 +121,6 @@ type token_kind =
   | AsciiChar
   | EmptyStream
 
-let error_unsupported_unicode_character loc n unicode cs =
-  let bp = Stream.count cs in
-  let loc = set_loc_pos loc bp (bp+n) in
-  err loc (UnsupportedUnicode unicode)
-
 let error_utf8 loc cs =
   let bp = Stream.count cs in
   Stream.junk cs; (* consume the char to avoid read it and fail again *)
@@ -174,14 +170,12 @@ let lookup_utf8_tail loc c cs =
             (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
         | _ -> error_utf8 loc cs
       in
-      try Unicode.classify unicode, n
-      with Unicode.Unsupported ->
-        njunk n cs; error_unsupported_unicode_character loc n unicode cs
+      Utf8Token (Unicode.classify unicode, n)
 
 let lookup_utf8 loc cs =
   match Stream.peek cs with
     | Some ('\x00'..'\x7F') -> AsciiChar
    | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail loc c cs)
    | Some ('\x80'..'\xFF' as c) -> lookup_utf8_tail loc c cs
     | None -> EmptyStream
 
 let unlocated f x = f x
@@ -199,17 +193,6 @@ let check_keyword str =
   in
   loop_symb (Stream.of_string str)
 
-let warn_unparsable_keyword =
-  CWarnings.create ~name:"unparsable-keyword" ~category:"parsing"
-    (fun (s,unicode) ->
-      strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x \
-                              which will not be parsable." s unicode))
-
-let check_keyword_to_add s =
-  try check_keyword s
-  with Error.E (UnsupportedUnicode unicode) ->
-    warn_unparsable_keyword (s,unicode)
-
 let check_ident str =
   let rec loop_id intail = parser
     | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] ->
@@ -240,13 +223,15 @@ let is_keyword s =
 let add_keyword str =
   if not (is_keyword str) then
     begin
-      check_keyword_to_add str;
+      check_keyword str;
       token_tree := ttree_add !token_tree str
     end
 
 let remove_keyword str =
   token_tree := ttree_remove !token_tree str
 
+let keywords () = ttree_elements !token_tree
+
 (* Freeze and unfreeze the state of the lexer *)
 
 type frozen_t = ttree
@@ -270,6 +255,12 @@ let get_buff len = String.sub !buff 0 len
 
 (* The classical lexer: idents, numbers, quoted strings, comments *)
 
+let warn_unrecognized_unicode =
+  CWarnings.create ~name:"unrecognized-unicode" ~category:"parsing"
+    (fun (u,id) ->
+      strbrk (Printf.sprintf "Not considering unicode character \"%s\" of unknown \
+        lexical status as part of identifier \"%s\"." u id))
+
 let rec ident_tail loc len = parser
   | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] ->
       ident_tail loc (store len c) s
@@ -277,6 +268,10 @@ let rec ident_tail loc len = parser
       match lookup_utf8 loc s with
       | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) ->
           ident_tail loc (nstore n len s) s
+      | Utf8Token (Unicode.Unknown, n) ->
+          let id = get_buff len in
+          let u = String.concat "" (List.map (String.make 1) (Stream.npeek n s)) in
+          warn_unrecognized_unicode ~loc:!@loc (u,id); len
       | _ -> len
 
 let rec number len = parser
@@ -334,6 +329,9 @@ let rec string loc ~comm_level bp len = parser
 (* Hook for exporting comment into xml theory files *)
 let (f_xml_output_comment, xml_output_comment) = Hook.make ~default:ignore ()
 
+(* To associate locations to a file name *)
+let current_file = ref None
+
 (* Utilities for comments in beautify *)
 let comment_begin = ref None
 let comm_loc bp = match !comment_begin with
@@ -354,16 +352,20 @@ let rec split_comments comacc acc pos = function
 
 let extract_comments pos = split_comments [] [] pos !comments
 
-type comments_state = int option * string * bool * ((int * int) * string) list
-let restore_comments_state (o,s,b,c) =
+(* The state of the lexer visible from outside *)
+type lexer_state = int option * string * bool * ((int * int) * string) list * string option
+
+let init_lexer_state f = (None,"",true,[],f)
+let set_lexer_state (o,s,b,c,f) =
   comment_begin := o;
   Buffer.clear current_comment;
   Buffer.add_string current_comment s;
   between_commands := b;
-  comments := c
-let default_comments_state = (None,"",true,[])
-let comments_state () =
-  let s = (!comment_begin, Buffer.contents current_comment, !between_commands, !comments) in
-  restore_comments_state default_comments_state; s
+  comments := c;
+  current_file := f
+let release_lexer_state () =
+  (!comment_begin, Buffer.contents current_comment, !between_commands, !comments, !current_file)
+let drop_lexer_state () =
+  set_lexer_state (init_lexer_state None)
 
 let real_push_char c = Buffer.add_char current_comment c
 
@@ -391,7 +393,7 @@ let comment_stop ep =
   if !Flags.xml_export && Buffer.length current_comment > 0 &&
     (!between_commands || not(null_comment current_s)) then
       Hook.get f_xml_output_comment current_s;
-  (if Flags.do_beautify() && Buffer.length current_comment > 0 &&
+  (if !Flags.beautify && Buffer.length current_comment > 0 &&
     (!between_commands || not(null_comment current_s)) then
     let bp = match !comment_begin with
         Some bp -> bp
@@ -408,7 +410,7 @@ let comment_stop ep =
 
 (* Does not unescape!!! *)
 let rec comm_string loc bp = parser
-  | [< ''"' >] ep -> push_string "\""; loc
+  | [< ''"' >] -> push_string "\""; loc
   | [< ''\\'; loc =
          (parser [< ' ('"' | '\\' as c) >] ->
            let () = match c with
@@ -438,7 +440,7 @@ let rec comment loc bp = parser bp2
       let loc =
         (* In beautify mode, the lexing differs between strings in comments and
            regular strings (e.g. escaping). It seems wrong. *)
-        if Flags.do_beautify() then (push_string"\""; comm_string loc bp2 s)
+        if !Flags.beautify then (push_string"\""; comm_string loc bp2 s)
         else fst (string loc ~comm_level:(Some 0) bp2 0 s)
       in
       comment loc bp s
@@ -511,20 +513,19 @@ let process_chars loc bp c cs =
       let loc = set_loc_pos loc bp ep' in
       err loc Undefined_token
 
-let token_of_special c s = match c with
-  | '.' -> FIELD s
-  | _ -> assert false
+(* Parse what follows a dot *)
 
-(* Parse what follows a dot / a dollar *)
-
-let parse_after_special loc c bp =
+let parse_after_dot loc c bp =
   parser
-  | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail loc (store 0 d) >] ->
-      token_of_special c (get_buff len)
+  | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail loc (store 0 d); s >] ->
+      let field = get_buff len in
+      (try find_keyword loc ("."^field) s with Not_found -> FIELD field)
   | [< s >] ->
       match lookup_utf8 loc s with
      | Utf8Token (Unicode.Letter, n) ->
-          token_of_special c (get_buff (ident_tail loc (nstore n 0 s) s))
+          let len = ident_tail loc (nstore n 0 s) s in
+          let field = get_buff len in
+          (try find_keyword loc ("."^field) s with Not_found -> FIELD field)
      | AsciiChar | Utf8Token _ | EmptyStream ->
          fst (process_chars loc bp c s)
 
 (* Parse what follows a question mark *)
@@ -552,7 +553,7 @@ let rec next_token loc = parser bp
       comm_loc bp; push_char c; next_token (bump_loc_line loc ep) s
   | [< '' ' | '\t' | '\r' as c; s >] ->
       comm_loc bp; push_char c; next_token loc s
-  | [< ''.' as c; t = parse_after_special loc c bp; s >] ep ->
+  | [< ''.' as c; t = parse_after_dot loc c bp; s >] ep ->
      comment_stop bp;
      (* We enforce that "." should either be part of a larger keyword,
         for instance ".(", or followed by a blank or eof. *)
@@ -572,7 +573,7 @@ let rec next_token loc = parser bp
      comment_stop bp; between_commands := new_between_commands; t
  | [< ''?'; s >] ep ->
      let t = parse_after_qmark loc bp s in
-     comment_stop bp; (t, set_loc_pos loc ep bp)
+     comment_stop bp; (t, set_loc_pos loc bp ep)
  | [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
       len = ident_tail loc (store 0 c); s >] ep ->
      let id = get_buff len in
@@ -592,6 +593,12 @@ let rec next_token loc = parser bp
            let loc = comment loc bp s in next_token loc s
        | [< t = process_chars loc bp c >] -> comment_stop bp; t >] ->
      t
+  | [< ' ('{' | '}' as c); s >] ep ->
+      let t,new_between_commands =
+        if !between_commands then (KEYWORD (String.make 1 c), set_loc_pos loc bp ep), true
+        else process_chars loc bp c s, false
+      in
+      comment_stop bp; between_commands := new_between_commands; t
  | [< s >] ->
      match lookup_utf8 loc s with
      | Utf8Token (Unicode.Letter, n) ->
@@ -600,11 +607,9 @@ let rec next_token loc = parser bp
          let ep = Stream.count s in
          comment_stop bp;
          (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
-      | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart), _) ->
+      | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart | Unicode.Unknown), _) ->
          let t = process_chars loc bp (Stream.next s) s in
-         let new_between_commands = match t with
-           (KEYWORD ("{"|"}"),_) -> !between_commands | _ -> false in
-         comment_stop bp; between_commands := new_between_commands; t
+         comment_stop bp; t
      | EmptyStream ->
          comment_stop bp; (EOI, set_loc_pos loc bp (bp+1))
 
@@ -626,12 +631,6 @@ let loct_func loct i =
 
 let loct_add loct i loc = Hashtbl.add loct i loc
 
-let current_location_table = ref (loct_create ())
-
-type location_table = (int, Compat.CompatLoc.t) Hashtbl.t
-let location_table () = !current_location_table
-let restore_location_table t = current_location_table := t
-
 (** {6 The lexer of Coq} *)
 
 (** Note: removing a token.
@@ -669,7 +668,6 @@ let func cs =
          cur_loc := Compat.after loc;
          loct_add loct i loc; Some tok)
   in
-  current_location_table := loct;
   (ts, loct_func loct)
 
 let lexer = {
@@ -706,7 +704,6 @@ end
 let mk () =
   let loct = loct_create () in
   let cur_loc = ref (Compat.make_loc !current_file 1 0 0 0) in
-  current_location_table := loct;
  let rec self init_loc (* FIXME *) = parser i
    [< (tok, loc) = next_token !cur_loc; s >] ->
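
For orientation, a minimal usage sketch of the lexer-state API introduced by this diff, assuming CLexer exports init_lexer_state, set_lexer_state, release_lexer_state, drop_lexer_state and keywords as defined above, and that CString.Set follows OCaml's Set.S interface. The helpers with_lexer_state and dump_keywords are hypothetical callers, not code from this commit.

(* Hypothetical caller: run [f] with a fresh lexer state bound to [fname],
   then restore whatever state was active before.  Per the diff,
   [init_lexer_state] takes the optional file name stored in [current_file]. *)
let with_lexer_state fname f =
  let saved = CLexer.release_lexer_state () in
  CLexer.set_lexer_state (CLexer.init_lexer_state (Some fname));
  try
    let result = f () in
    CLexer.set_lexer_state saved;
    result
  with e ->
    CLexer.set_lexer_state saved;
    raise e

(* Hypothetical use of the new [keywords] accessor, which returns the
   keyword table as a set built by [ttree_elements]. *)
let dump_keywords () =
  CString.Set.iter print_endline (CLexer.keywords ())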