3 files changed, 70 insertions, 47 deletions
diff --git a/ide/coq_lex.mll b/ide/coq_lex.mll
index 2f3763880..c1f75b6ff 100644
--- a/ide/coq_lex.mll
+++ b/ide/coq_lex.mll
@@ -24,27 +24,28 @@ rule coq_string = parse
   | _ { coq_string lexbuf }
 
 and comment = parse
-  | "(*" { ignore (comment lexbuf); comment lexbuf }
+  | "(*" { let _  = comment lexbuf in comment lexbuf }
   | "\"" { let () = coq_string lexbuf in comment lexbuf }
-  | "*)" { (true, Lexing.lexeme_start lexbuf + 2) }
-  | eof { (false, Lexing.lexeme_end lexbuf) }
+  | "*)" { Some (Lexing.lexeme_start lexbuf + 1) }
+  | eof { None }
   | _ { comment lexbuf }
 
-and sentence initial = parse
+(** NB : [mkiter] should be called on increasing offsets *)
+
+and sentence initial stamp = parse
   | "(*" {
-      let trully_terminated,comm_end = comment lexbuf in
-      if not trully_terminated then raise Unterminated;
-      (* A comment alone is a sentence.
-	 A comment in a sentence doesn't terminate the sentence.
-         Note: comm_end is the first position _after_ the comment,
-	 as required when tagging a zone, hence the -1 to locate the
-	 ")" terminating the comment.
-      *)
-      if initial then true, comm_end - 1 else sentence false lexbuf
+      match comment lexbuf with
+        | None -> raise Unterminated
+        | Some comm_last ->
+          (* A comment alone is a sentence.
+	     A comment in a sentence doesn't terminate the sentence.
+             Note: comm_end is the position of the comment final ')' *)
+          if initial then stamp comm_last Tags.Script.comment_sentence;
+          sentence initial stamp lexbuf
     }
   | "\"" {
       let () = coq_string lexbuf in
-      sentence false lexbuf
+      sentence false stamp lexbuf
     }
   | ".." {
       (* We must have a particular rule for parsing "..", where no dot
@@ -52,34 +53,38 @@ and sentence initial = parse
 	 (cf. for instance the syntax for recursive notation).
 	 This rule and the following one also allow to treat the "..."
 	 special case, where the third dot is a terminator. *)
-      sentence false lexbuf
+      sentence false stamp lexbuf
+    }
+  | dot_sep {
+      (* The usual "." terminator *)
+      stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence;
+      sentence true stamp lexbuf
     }
-  | dot_sep { false, Lexing.lexeme_start lexbuf } (* The usual "." terminator *)
   | undotted_sep {
       (* Separators like { or } and bullets * - + are only active
 	 at the start of a sentence *)
-      if initial then false, Lexing.lexeme_start lexbuf
-      else sentence false lexbuf
+      if initial then stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence;
+      sentence initial stamp lexbuf
     }
   | space+ {
        (* Parsing spaces is the only situation preserving initiality *)
-       sentence initial lexbuf
+       sentence initial stamp lexbuf
     }
   | _ {
       (* Any other characters *)
-      sentence false lexbuf
+      sentence false stamp lexbuf
     }
-  | eof { raise Unterminated }
+  | eof { if initial then () else raise Unterminated }
 
 {
 
-  (** Parse a sentence in string [slice], tagging relevant parts with
-      function [stamp], and returning the position of the first
-      sentence delimitor (either "." or "{" or "}" or the end of a comment).
-      It will raise [Unterminated] when no end of sentence is found.
+  (** Parse sentences in string [slice], tagging last characters
+      of sentences with the [stamp] function.
+      It will raise [Unterminated] if [slice] ends with an unfinished
+      sentence.
   *)
 
-  let delimit_sentence slice =
-    sentence true (Lexing.from_string slice)
+  let delimit_sentences stamp slice =
+    sentence true stamp (Lexing.from_string slice)
 
 }
diff --git a/ide/coqide.ml b/ide/coqide.ml
index de04e0891..9dfbf2f2e 100644
--- a/ide/coqide.ml
+++ b/ide/coqide.ml
@@ -218,27 +218,20 @@ let get_current_word () =
     May raise [Coq_lex.Unterminated] when the zone ends with
     an unterminated sentence. *)
 
-let split_slice_lax (buffer: GText.buffer) from upto =
-  buffer#remove_tag ~start:from ~stop:upto Tags.Script.comment_sentence;
-  buffer#remove_tag ~start:from ~stop:upto Tags.Script.sentence;
-  let slice = buffer#get_text ~start:from ~stop:upto () in
-  let rec split_substring str =
-    let off_conv = byte_offset_to_char_offset str in
-    let slice_len = String.length str in
-    let is_comment,end_off = Coq_lex.delimit_sentence str in
-    let start = from#forward_chars (off_conv end_off) in
-    let stop = start#forward_char in
-    let tag =
-      if is_comment then Tags.Script.comment_sentence else Tags.Script.sentence
-    in
-    buffer#apply_tag ~start ~stop tag;
-    let next = end_off + 1 in
-    if next < slice_len then begin
-      ignore (from#nocopy#forward_chars (off_conv next));
-      split_substring (String.sub str next (slice_len - next))
-    end
+let split_slice_lax (buffer: GText.buffer) start stop =
+  buffer#remove_tag ~start ~stop Tags.Script.comment_sentence;
+  buffer#remove_tag ~start ~stop Tags.Script.sentence;
+  let slice = buffer#get_text ~start ~stop () in
+  let mkiter =
+    (* caveat : partial application with effects *)
+    let convert = byte_offset_to_char_offset slice in
+    fun off -> start#forward_chars (convert off)
+  in
+  let apply_tag off tag =
+    let start = mkiter off in
+    buffer#apply_tag ~start ~stop:start#forward_char tag
   in
-  split_substring slice
+  Coq_lex.delimit_sentences apply_tag slice
 
 (** Searching forward and backward a position fulfilling some condition *)
 
diff --git a/ide/ideutils.ml b/ide/ideutils.ml
index e5d5ebaa8..ae6db1a7a 100644
--- a/ide/ideutils.ml
+++ b/ide/ideutils.ml
@@ -32,8 +32,15 @@ let get_insert input_buffer = input_buffer#get_iter_at_mark `INSERT
 (** A utf8 char is either a single byte (ascii char, 0xxxxxxx)
     or multi-byte (with a leading byte 11xxxxxx and extra bytes 10xxxxxx) *)
 
+let is_char_start c = ((Char.code c) lsr 6 <> 2)
 let is_extra_byte c = ((Char.code c) lsr 6 = 2)
 
+(** For a string buffer that may contain utf8 chars,
+    we convert a byte offset into a char offset
+    by only counting char-starting bytes.
+    Normally the string buffer starts with a char-starting byte
+    (buffer produced by a [#get_text]) *)
+
 let byte_offset_to_char_offset s byte_offset =
   let extra_bytes = ref 0 in
   for i = 0 to min byte_offset (String.length s - 1) do
@@ -41,6 +48,24 @@ let byte_offset_to_char_offset s byte_offset =
   done;
   byte_offset - !extra_bytes
 
+(** For multiple offset conversions in a long buffer,
+    we proceed incrementally by storing last known positions.
+    Offsets should be asked in increasing order
+   and correspond to char-starting byte. *)
+
+let incremental_byte_offset_to_char_offset s =
+  let bytes = ref 0
+  and chars = ref 0
+  and len = String.length s
+  in
+  fun byte_offset ->
+    assert (!bytes <= byte_offset);
+    for i = !bytes + 1 to byte_offset do
+      if i >= len || is_char_start s.[i] then incr chars
+    done;
+    bytes := byte_offset;
+    !chars
+
 let print_id id =
   Minilib.log ("GOT sig id :"^(string_of_int (Obj.magic id)))