3 files changed, 19 insertions, 32 deletions
diff --git a/ide/coq_lex.mll b/ide/coq_lex.mll
index c1f75b6ff..d1d7c6629 100644
--- a/ide/coq_lex.mll
+++ b/ide/coq_lex.mll
@@ -8,26 +8,34 @@
 
 {
   exception Unterminated
+
+  let utf8_adjust = ref 0
+
+  let utf8_lexeme_start lexbuf =
+    Lexing.lexeme_start lexbuf - !utf8_adjust
 }
 
-let space =
-  [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *)
+let space = [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *)
 
 let undotted_sep = [ '{' '}' '-' '+' '*' ]
 
 let dot_sep = '.' (space | eof)
 
+let utf8_extra_byte = [ '\x80' - '\xBF' ]
+
 rule coq_string = parse
   | "\"\"" { coq_string lexbuf }
   | "\"" { () }
   | eof { () }
+  | utf8_extra_byte { incr utf8_adjust; coq_string lexbuf }
   | _ { coq_string lexbuf }
 
 and comment = parse
   | "(*" { let _  = comment lexbuf in comment lexbuf }
   | "\"" { let () = coq_string lexbuf in comment lexbuf }
-  | "*)" { Some (Lexing.lexeme_start lexbuf + 1) }
+  | "*)" { Some (utf8_lexeme_start lexbuf + 1) }
   | eof { None }
+  | utf8_extra_byte { incr utf8_adjust; comment lexbuf }
   | _ { comment lexbuf }
 
 (** NB : [mkiter] should be called on increasing offsets *)
@@ -57,24 +65,25 @@ and sentence initial stamp = parse
     }
   | dot_sep {
       (* The usual "." terminator *)
-      stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence;
+      stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence;
       sentence true stamp lexbuf
     }
   | undotted_sep {
       (* Separators like { or } and bullets * - + are only active
 	 at the start of a sentence *)
-      if initial then stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence;
+      if initial then stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence;
       sentence initial stamp lexbuf
     }
   | space+ {
        (* Parsing spaces is the only situation preserving initiality *)
        sentence initial stamp lexbuf
     }
+  | utf8_extra_byte { incr utf8_adjust; sentence false stamp lexbuf }
+  | eof { if initial then () else raise Unterminated }
   | _ {
       (* Any other characters *)
       sentence false stamp lexbuf
     }
-  | eof { if initial then () else raise Unterminated }
 
 {
 
@@ -85,6 +94,7 @@ and sentence initial stamp = parse
   *)
 
   let delimit_sentences stamp slice =
+    utf8_adjust := 0;
     sentence true stamp (Lexing.from_string slice)
 
 }
diff --git a/ide/ideutils.ml b/ide/ideutils.ml
index ed07135b5..e1c57074b 100644
--- a/ide/ideutils.ml
+++ b/ide/ideutils.ml
@@ -44,7 +44,6 @@ let set_location = ref  (function s -> failwith "not ready")
 (** A utf8 char is either a single byte (ascii char, 0xxxxxxx)
     or multi-byte (with a leading byte 11xxxxxx and extra bytes 10xxxxxx) *)
 
-let is_char_start c = ((Char.code c) lsr 6 <> 2)
 let is_extra_byte c = ((Char.code c) lsr 6 = 2)
 
 (** For a string buffer that may contain utf8 chars,
@@ -60,24 +59,6 @@ let byte_offset_to_char_offset s byte_offset =
   done;
   byte_offset - !extra_bytes
 
-(** For multiple offset conversions in a long buffer,
-    we proceed incrementally by storing last known positions.
-    Offsets should be asked in increasing order
-   and correspond to char-starting byte. *)
-
-let incremental_byte_offset_to_char_offset s =
-  let bytes = ref 0
-  and chars = ref 0
-  and len = String.length s
-  in
-  fun byte_offset ->
-    assert (!bytes <= byte_offset);
-    for i = !bytes + 1 to byte_offset do
-      if i >= len || is_char_start s.[i] then incr chars
-    done;
-    bytes := byte_offset;
-    !chars
-
 let print_id id =
   Minilib.log ("GOT sig id :"^(string_of_int (Obj.magic id)))
 
diff --git a/ide/sentence.ml b/ide/sentence.ml
index f6bc99207..01add490e 100644
--- a/ide/sentence.ml
+++ b/ide/sentence.ml
@@ -16,14 +16,10 @@
 let split_slice_lax (buffer:GText.buffer) start stop =
   List.iter (buffer#remove_tag ~start ~stop) Tags.Script.all;
   let slice = buffer#get_text ~start ~stop () in
-  let mkiter =
-    (* caveat : partial application with effects *)
-    let convert = Ideutils.byte_offset_to_char_offset slice in
-    fun off -> start#forward_chars (convert off)
-  in
   let apply_tag off tag =
-    let start = mkiter off in
-    buffer#apply_tag ~start ~stop:start#forward_char tag
+    (* off is now a utf8-compliant char offset, cf Coq_lex.utf8_adjust *)
+    let iter = start#forward_chars off in
+    buffer#apply_tag ~start:iter ~stop:iter#forward_char tag
   in
   Coq_lex.delimit_sentences apply_tag slice