Coq_lex: direct accounting of utf8 extra bytes in offsets

We directly produce in Coq_lex a utf8 char offset instead of a byte offset, by counting the utf8 extra byte during the lexing. This way, no need anymore for converting later with complex byte_offset_to_char_offset. git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@16059 85f007b7-540e-0410-9357-904b9bb8a0f7
author: letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7> 2012-12-11 07:46:08 +0000
committer: letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7> 2012-12-11 07:46:08 +0000
commit: 9cdd77d5774e9a73563c4d4870ea2de8225353d9 (patch)
tree: e3dea01f467ec7b9f62587c53f069961884c14c6 /ide/coq_lex.mll
parent: b0ab21a5a94b09170ff60cd4a405afa5287d7867 (diff)
1 files changed, 16 insertions, 6 deletions
diff --git a/ide/coq_lex.mll b/ide/coq_lex.mll
index c1f75b6ff..d1d7c6629 100644
--- a/ide/coq_lex.mll
+++ b/ide/coq_lex.mll
@@ -8,26 +8,34 @@
 
 {
   exception Unterminated
+
+  let utf8_adjust = ref 0
+
+  let utf8_lexeme_start lexbuf =
+    Lexing.lexeme_start lexbuf - !utf8_adjust
 }
 
-let space =
-  [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *)
+let space = [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *)
 
 let undotted_sep = [ '{' '}' '-' '+' '*' ]
 
 let dot_sep = '.' (space | eof)
 
+let utf8_extra_byte = [ '\x80' - '\xBF' ]
+
 rule coq_string = parse
   | "\"\"" { coq_string lexbuf }
   | "\"" { () }
   | eof { () }
+  | utf8_extra_byte { incr utf8_adjust; coq_string lexbuf }
   | _ { coq_string lexbuf }
 
 and comment = parse
   | "(*" { let _  = comment lexbuf in comment lexbuf }
   | "\"" { let () = coq_string lexbuf in comment lexbuf }
-  | "*)" { Some (Lexing.lexeme_start lexbuf + 1) }
+  | "*)" { Some (utf8_lexeme_start lexbuf + 1) }
   | eof { None }
+  | utf8_extra_byte { incr utf8_adjust; comment lexbuf }
   | _ { comment lexbuf }
 
 (** NB : [mkiter] should be called on increasing offsets *)
@@ -57,24 +65,25 @@ and sentence initial stamp = parse
     }
   | dot_sep {
       (* The usual "." terminator *)
-      stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence;
+      stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence;
       sentence true stamp lexbuf
     }
   | undotted_sep {
       (* Separators like { or } and bullets * - + are only active
 	 at the start of a sentence *)
-      if initial then stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence;
+      if initial then stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence;
       sentence initial stamp lexbuf
     }
   | space+ {
        (* Parsing spaces is the only situation preserving initiality *)
        sentence initial stamp lexbuf
     }
+  | utf8_extra_byte { incr utf8_adjust; sentence false stamp lexbuf }
+  | eof { if initial then () else raise Unterminated }
   | _ {
       (* Any other characters *)
       sentence false stamp lexbuf
     }
-  | eof { if initial then () else raise Unterminated }
 
 {
 
@@ -85,6 +94,7 @@ and sentence initial stamp = parse
   *)
 
   let delimit_sentences stamp slice =
+    utf8_adjust := 0;
     sentence true stamp (Lexing.from_string slice)
 
 }
author	letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7>	2012-12-11 07:46:08 +0000
committer	letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7>	2012-12-11 07:46:08 +0000
commit	9cdd77d5774e9a73563c4d4870ea2de8225353d9 (patch)
tree	e3dea01f467ec7b9f62587c53f069961884c14c6 /ide/coq_lex.mll
parent	b0ab21a5a94b09170ff60cd4a405afa5287d7867 (diff)