diff options
author | miquel <miquel@85f007b7-540e-0410-9357-904b9bb8a0f7> | 2000-10-19 15:46:44 +0000 |
---|---|---|
committer | miquel <miquel@85f007b7-540e-0410-9357-904b9bb8a0f7> | 2000-10-19 15:46:44 +0000 |
commit | d3e2f28966bb6d3e5766b3da2ee333cdc3438b13 (patch) | |
tree | 5e8510d56de255594dbc10bc5dfaebe07ba89aba /lib | |
parent | 2ae228a7cac719e4fc949fe1f14115f68e93353f (diff) |
Use UTF-8 as default encoding for computing length of strings in pretty
print boxes. Full compatibility with *pure* ascii, an quasi full
compatibility with latin1 encoding.
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@730 85f007b7-540e-0410-9357-904b9bb8a0f7
Diffstat (limited to 'lib')
-rw-r--r-- | lib/pp.ml | 37 |
1 files changed, 36 insertions, 1 deletions
@@ -44,9 +44,44 @@ type 'a ppdir_token = type std_ppcmds = (int*string) ppcmd_token Stream.t type 'a ppdirs = 'a ppdir_token Stream.t +(* Compute length of an UTF-8 encoded string + Rem 1 : utf8_length <= String.length (equal if pure ascii) + Rem 2 : if used for an iso8859_1 encoded string, the result is + wrong in very rare cases. Such a wrong case corresponds to any + sequence of a character in range 192..253 immediately followed by a + character in range 128..191 (typical case in french is "déçu" which + is counted 3 instead of 4); then no real harm to use always + utf8_length even if using an iso8859_1 encoding *) + +let utf8_length s = + let len = String.length s + and cnt = ref 0 + and nc = ref 0 + and p = ref 0 in + while !p < len do + begin + match s.[!p] with + | '\000'..'\127' -> nc := 0 (* ascii char *) + | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *) + | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) + | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) + | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) + | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) + | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) + | '\254'..'\255' -> nc := 0 (* invalid byte *) + end ; + incr p ; + while !p < len && !nc > 0 do + match s.[!p] with + | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc + | _ (* not a continuation byte *) -> nc := 0 + done ; + incr cnt + done ; + !cnt (* formatting commands *) -let sTR s = Ppcmd_print (String.length s,s) +let sTR s = Ppcmd_print (utf8_length s,s) let sTRas (i,s) = Ppcmd_print (i,s) let bRK (a,b) = Ppcmd_print_break (a,b) let tBRK (a,b) = Ppcmd_print_tbreak (a,b) |