diff options
-rw-r--r-- | ide/coq_lex.mll | 22 | ||||
-rw-r--r-- | ide/ideutils.ml | 19 | ||||
-rw-r--r-- | ide/sentence.ml | 10 |
3 files changed, 19 insertions, 32 deletions
diff --git a/ide/coq_lex.mll b/ide/coq_lex.mll index c1f75b6ff..d1d7c6629 100644 --- a/ide/coq_lex.mll +++ b/ide/coq_lex.mll @@ -8,26 +8,34 @@ { exception Unterminated + + let utf8_adjust = ref 0 + + let utf8_lexeme_start lexbuf = + Lexing.lexeme_start lexbuf - !utf8_adjust } -let space = - [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *) +let space = [' ' '\n' '\r' '\t' '\012'] (* '\012' is form-feed *) let undotted_sep = [ '{' '}' '-' '+' '*' ] let dot_sep = '.' (space | eof) +let utf8_extra_byte = [ '\x80' - '\xBF' ] + rule coq_string = parse | "\"\"" { coq_string lexbuf } | "\"" { () } | eof { () } + | utf8_extra_byte { incr utf8_adjust; coq_string lexbuf } | _ { coq_string lexbuf } and comment = parse | "(*" { let _ = comment lexbuf in comment lexbuf } | "\"" { let () = coq_string lexbuf in comment lexbuf } - | "*)" { Some (Lexing.lexeme_start lexbuf + 1) } + | "*)" { Some (utf8_lexeme_start lexbuf + 1) } | eof { None } + | utf8_extra_byte { incr utf8_adjust; comment lexbuf } | _ { comment lexbuf } (** NB : [mkiter] should be called on increasing offsets *) @@ -57,24 +65,25 @@ and sentence initial stamp = parse } | dot_sep { (* The usual "." terminator *) - stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence; + stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence; sentence true stamp lexbuf } | undotted_sep { (* Separators like { or } and bullets * - + are only active at the start of a sentence *) - if initial then stamp (Lexing.lexeme_start lexbuf) Tags.Script.sentence; + if initial then stamp (utf8_lexeme_start lexbuf) Tags.Script.sentence; sentence initial stamp lexbuf } | space+ { (* Parsing spaces is the only situation preserving initiality *) sentence initial stamp lexbuf } + | utf8_extra_byte { incr utf8_adjust; sentence false stamp lexbuf } + | eof { if initial then () else raise Unterminated } | _ { (* Any other characters *) sentence false stamp lexbuf } - | eof { if initial then () else raise Unterminated } { @@ -85,6 +94,7 @@ and sentence initial stamp = parse *) let delimit_sentences stamp slice = + utf8_adjust := 0; sentence true stamp (Lexing.from_string slice) } diff --git a/ide/ideutils.ml b/ide/ideutils.ml index ed07135b5..e1c57074b 100644 --- a/ide/ideutils.ml +++ b/ide/ideutils.ml @@ -44,7 +44,6 @@ let set_location = ref (function s -> failwith "not ready") (** A utf8 char is either a single byte (ascii char, 0xxxxxxx) or multi-byte (with a leading byte 11xxxxxx and extra bytes 10xxxxxx) *) -let is_char_start c = ((Char.code c) lsr 6 <> 2) let is_extra_byte c = ((Char.code c) lsr 6 = 2) (** For a string buffer that may contain utf8 chars, @@ -60,24 +59,6 @@ let byte_offset_to_char_offset s byte_offset = done; byte_offset - !extra_bytes -(** For multiple offset conversions in a long buffer, - we proceed incrementally by storing last known positions. - Offsets should be asked in increasing order - and correspond to char-starting byte. *) - -let incremental_byte_offset_to_char_offset s = - let bytes = ref 0 - and chars = ref 0 - and len = String.length s - in - fun byte_offset -> - assert (!bytes <= byte_offset); - for i = !bytes + 1 to byte_offset do - if i >= len || is_char_start s.[i] then incr chars - done; - bytes := byte_offset; - !chars - let print_id id = Minilib.log ("GOT sig id :"^(string_of_int (Obj.magic id))) diff --git a/ide/sentence.ml b/ide/sentence.ml index f6bc99207..01add490e 100644 --- a/ide/sentence.ml +++ b/ide/sentence.ml @@ -16,14 +16,10 @@ let split_slice_lax (buffer:GText.buffer) start stop = List.iter (buffer#remove_tag ~start ~stop) Tags.Script.all; let slice = buffer#get_text ~start ~stop () in - let mkiter = - (* caveat : partial application with effects *) - let convert = Ideutils.byte_offset_to_char_offset slice in - fun off -> start#forward_chars (convert off) - in let apply_tag off tag = - let start = mkiter off in - buffer#apply_tag ~start ~stop:start#forward_char tag + (* off is now a utf8-compliant char offset, cf Coq_lex.utf8_adjust *) + let iter = start#forward_chars off in + buffer#apply_tag ~start:iter ~stop:iter#forward_char tag in Coq_lex.delimit_sentences apply_tag slice |