From 7cfc4e5146be5666419451bdd516f1f3f264d24a Mon Sep 17 00:00:00 2001 From: Enrico Tassi Date: Sun, 25 Jan 2015 14:42:51 +0100 Subject: Imported Upstream version 8.5~beta1+dfsg --- ide/coq_lex.mll | 174 +++++++++++++------------------------------------------- 1 file changed, 38 insertions(+), 136 deletions(-) (limited to 'ide/coq_lex.mll') diff --git a/ide/coq_lex.mll b/ide/coq_lex.mll index 1de102d5..e333c0b2 100644 --- a/ide/coq_lex.mll +++ b/ide/coq_lex.mll @@ -1,161 +1,57 @@ (************************************************************************) (* v * The Coq Proof Assistant / The Coq Development Team *) -(* Hashtbl.add h s Keyword) one_word_commands; - List.iter (fun s -> Hashtbl.add h s Declaration) one_word_declarations; - List.iter (fun s -> Hashtbl.add h s ProofDeclaration) proof_declarations; - List.iter (fun s -> Hashtbl.add h s Qed) proof_ends; - List.iter (fun s -> Hashtbl.add h' s Keyword) constr_keywords; - (fun initial id -> Hashtbl.find (if initial then h else h') id) - exception Unterminated - let here f lexbuf = f (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) + let utf8_adjust = ref 0 + let utf8_lexeme_start lexbuf = + Lexing.lexeme_start lexbuf - !utf8_adjust } -let space = - [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *) +let space = [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *) -let firstchar = - ['$' 'A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'] -let identchar = - ['$' 'A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] -let ident = firstchar identchar* - -let undotted_sep = [ '{' '}' '-' '+' '*' ] +let undotted_sep = '{' | '}' | '-'+ | '+'+ | '*'+ let dot_sep = '.' (space | eof) -let multiword_declaration = - "Module" (space+ "Type")? -| "Program" space+ ident -| "Existing" space+ "Instance" "s"? -| "Canonical" space+ "Structure" - -let locality = (space+ "Local")? - -let multiword_command = - ("Uns" | "S")" et" (space+ ident)* -| (("Open" | "Close") locality | "Bind" | " Delimit" ) - space+ "Scope" -| (("Reserved" space+)? "Notation" | "Infix") locality space+ -| "Next" space+ "Obligation" -| "Solve" space+ "Obligations" -| "Require" space+ ("Import"|"Export")? -| "Hint" locality space+ ident -| "Reset" (space+ "Initial")? -| "Tactic" space+ "Notation" -| "Implicit" space+ "Type" "s"? -| "Combined" space+ "Scheme" -| "Extraction" space+ (("Language" space+ ("Ocaml"|"Haskell"|"Scheme"|"Toplevel"))| - ("Library"|"Inline"|"NoInline"|"Blacklist")) -| "Recursive" space+ "Extraction" (space+ "Library")? -| ("Print"|"Reset") space+ "Extraction" space+ ("Inline"|"Blacklist") -| "Extract" space+ (("Inlined" space+) "Constant"| "Inductive") -| "Typeclasses" space+ ("eauto" | "Transparent" | "Opaque") -| ("Generalizable" space+) ("All" | "No")? "Variable" "s"? - -(* At least still missing: "Inline" + decl, variants of "Identity - Coercion", variants of Print, Add, ... *) +let utf8_extra_byte = [ '\x80' - '\xBF' ] rule coq_string = parse | "\"\"" { coq_string lexbuf } - | "\"" { Lexing.lexeme_end lexbuf } - | eof { Lexing.lexeme_end lexbuf } + | "\"" { () } + | eof { () } + | utf8_extra_byte { incr utf8_adjust; coq_string lexbuf } | _ { coq_string lexbuf } and comment = parse - | "(*" { ignore (comment lexbuf); comment lexbuf } - | "\"" { ignore (coq_string lexbuf); comment lexbuf } - | "*)" { (true, Lexing.lexeme_start lexbuf + 2) } - | eof { (false, Lexing.lexeme_end lexbuf) } + | "(*" { let _ = comment lexbuf in comment lexbuf } + | "\"" { let () = coq_string lexbuf in comment lexbuf } + | "*)" { Some (utf8_lexeme_start lexbuf) } + | eof { None } + | utf8_extra_byte { incr utf8_adjust; comment lexbuf } | _ { comment lexbuf } +(** NB : [mkiter] should be called on increasing offsets *) + and sentence initial stamp = parse | "(*" { - let comm_start = Lexing.lexeme_start lexbuf in - let trully_terminated,comm_end = comment lexbuf in - stamp comm_start comm_end Comment; - if not trully_terminated then raise Unterminated; - (* A comment alone is a sentence. - A comment in a sentence doesn't terminate the sentence. - Note: comm_end is the first position _after_ the comment, - as required when tagging a zone, hence the -1 to locate the - ")" terminating the comment. - *) - if initial then comm_end - 1 else sentence false stamp lexbuf + match comment lexbuf with + | None -> raise Unterminated + | Some comm_last -> + stamp comm_last Tags.Script.comment; + sentence initial stamp lexbuf } | "\"" { - let str_start = Lexing.lexeme_start lexbuf in - let str_end = coq_string lexbuf in - stamp str_start str_end String; - sentence false stamp lexbuf - } - | multiword_declaration { - if initial then here stamp lexbuf Declaration; + let () = coq_string lexbuf in sentence false stamp lexbuf } - | multiword_command { - if initial then here stamp lexbuf Keyword; - sentence false stamp lexbuf - } - | ident as id { - (try here stamp lexbuf (tag_of_ident initial id) with Not_found -> ()); - sentence false stamp lexbuf } | ".." { (* We must have a particular rule for parsing "..", where no dot is a terminator, even if we have a blank afterwards @@ -164,32 +60,38 @@ and sentence initial stamp = parse special case, where the third dot is a terminator. *) sentence false stamp lexbuf } - | dot_sep { Lexing.lexeme_start lexbuf } (* The usual "." terminator *) + | dot_sep { + (* The usual "." terminator *) + stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence; + sentence true stamp lexbuf + } | undotted_sep { (* Separators like { or } and bullets * - + are only active at the start of a sentence *) - if initial then Lexing.lexeme_start lexbuf - else sentence false stamp lexbuf + if initial then stamp (utf8_lexeme_start lexbuf + String.length (Lexing.lexeme lexbuf) - 1) Tags.Script.sentence; + sentence initial stamp lexbuf } | space+ { (* Parsing spaces is the only situation preserving initiality *) sentence initial stamp lexbuf } + | utf8_extra_byte { incr utf8_adjust; sentence false stamp lexbuf } + | eof { if initial then () else raise Unterminated } | _ { (* Any other characters *) sentence false stamp lexbuf } - | eof { raise Unterminated } { - (** Parse a sentence in string [slice], tagging relevant parts with - function [stamp], and returning the position of the first - sentence delimitor (either "." or "{" or "}" or the end of a comment). - It will raise [Unterminated] when no end of sentence is found. + (** Parse sentences in string [slice], tagging last characters + of sentences with the [stamp] function. + It will raise [Unterminated] if [slice] ends with an unfinished + sentence. *) - let delimit_sentence stamp slice = + let delimit_sentences stamp slice = + utf8_adjust := 0; sentence true stamp (Lexing.from_string slice) } -- cgit v1.2.3