From 9cdd77d5774e9a73563c4d4870ea2de8225353d9 Mon Sep 17 00:00:00 2001 From: letouzey Date: Tue, 11 Dec 2012 07:46:08 +0000 Subject: Coq_lex: direct accounting of utf8 extra bytes in offsets We directly produce in Coq_lex a utf8 char offset instead of a byte offset, by counting the utf8 extra byte during the lexing. This way, no need anymore for converting later with complex byte_offset_to_char_offset. git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@16059 85f007b7-540e-0410-9357-904b9bb8a0f7 --- ide/coq_lex.mll | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'ide/coq_lex.mll') diff --git a/ide/coq_lex.mll b/ide/coq_lex.mll index c1f75b6ff..d1d7c6629 100644 --- a/ide/coq_lex.mll +++ b/ide/coq_lex.mll @@ -8,26 +8,34 @@ { exception Unterminated + + let utf8_adjust = ref 0 + + let utf8_lexeme_start lexbuf = + Lexing.lexeme_start lexbuf - !utf8_adjust } -let space = - [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *) +let space = [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *) let undotted_sep = [ '{' '}' '-' '+' '*' ] let dot_sep = '.' (space | eof) +let utf8_extra_byte = [ '\x80' - '\xBF' ] + rule coq_string = parse | "\"\"" { coq_string lexbuf } | "\"" { () } | eof { () } + | utf8_extra_byte { incr utf8_adjust; coq_string lexbuf } | _ { coq_string lexbuf } and comment = parse | "(*" { let _ = comment lexbuf in comment lexbuf } | "\"" { let () = coq_string lexbuf in comment lexbuf } - | "*)" { Some (Lexing.lexeme_start lexbuf + 1) } + | "*)" { Some (utf8_lexeme_start lexbuf + 1) } | eof { None } + | utf8_extra_byte { incr utf8_adjust; comment lexbuf } | _ { comment lexbuf } (** NB : [mkiter] should be called on increasing offsets *) @@ -57,24 +65,25 @@ and sentence initial stamp = parse } | dot_sep { (* The usual "." terminator *) - stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence; + stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence; sentence true stamp lexbuf } | undotted_sep { (* Separators like { or } and bullets * - + are only active at the start of a sentence *) - if initial then stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence; + if initial then stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence; sentence initial stamp lexbuf } | space+ { (* Parsing spaces is the only situation preserving initiality *) sentence initial stamp lexbuf } + | utf8_extra_byte { incr utf8_adjust; sentence false stamp lexbuf } + | eof { if initial then () else raise Unterminated } | _ { (* Any other characters *) sentence false stamp lexbuf } - | eof { if initial then () else raise Unterminated } { @@ -85,6 +94,7 @@ and sentence initial stamp = parse *) let delimit_sentences stamp slice = + utf8_adjust := 0; sentence true stamp (Lexing.from_string slice) } -- cgit v1.2.3