1 files changed, 106 insertions, 16 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 1765e93d..ced5e258 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -8,9 +8,7 @@
 
 (** Unicode utilities *)
 
-type status = Letter | IdentPart | Symbol
-
-exception Unsupported
+type status = Letter | IdentPart | Symbol | Unknown
 
 (* The following table stores classes of Unicode characters that
    are used by the lexer. There are 3 different classes so 2 bits are
@@ -18,7 +16,7 @@ exception Unsupported
    to simplify the masking process. (This choice seems to be a good
    trade-off between speed and space after some benchmarks.) *)
 
-(* A 256ko table, initially filled with zeros. *)
+(* A 256 KiB table, initially filled with zeros. *)
 let table = Array.make (1 lsl 17) 0
 
 (* Associate a 2-bit pattern to each status at position [i].
@@ -29,6 +27,7 @@ let mask i = function
   | Letter    -> 1 lsl ((i land 7) lsl 1) (* 01 *)
   | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
   | Symbol    -> 3 lsl ((i land 7) lsl 1) (* 11 *)
+  | Unknown   -> 0 lsl ((i land 7) lsl 1) (* 00 *)
 
 (* Helper to reset 2 bits in a word. *)
 let reset_mask i =
@@ -55,7 +54,7 @@ let lookup x =
     if      v = 1 then Letter
     else if v = 2 then IdentPart
     else if v = 3 then Symbol
-    else raise Unsupported
+    else Unknown
 
 (* [classify] discriminates between 3 different kinds of
    symbols based on the standard unicode classification (extracted from
@@ -147,6 +146,11 @@ let utf8_of_unicode n =
       s
     end
 
+(* If [s] is some UTF-8 encoded string
+   and [i] is a position of some UTF-8 character within [s]
+   then [next_utf8 s i] returns [(j,n)] where:
+   - [j] indicates the position of the next UTF-8 character
+   - [n] represents the UTF-8 character at index [i] *)
 let next_utf8 s i =
   let err () = invalid_arg "utf8" in
   let l = String.length s - i in
@@ -168,6 +172,13 @@ let next_utf8 s i =
        (c land 0x3F) lsl 6 + (d land 0x3F)
   else err ()
 
+let is_utf8 s =
+  let rec check i =
+    let (off, _) = next_utf8 s i in
+    check (i + off)
+  in
+  try check 0 with End_of_input -> true | Invalid_argument _ -> false
+
 (* Check the well-formedness of an identifier *)
 
 let initial_refutation j n s =
@@ -203,7 +214,6 @@ let ident_refutation s =
         |x -> x
   with
   | End_of_input -> Some (true,"The empty string is not an identifier.")
-  | Unsupported -> Some (true,s^": unsupported character in utf8 sequence.")
   | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
 
 let lowercase_unicode =
@@ -228,14 +238,94 @@ let is_basic_ascii s =
   !ok
 
 let ascii_of_ident s =
-  if is_basic_ascii s then s else
-    let i = ref 0 and out = ref "" in
-    begin try while true do
+  let len = String.length s in
+  let has_UU i =
+    i+2 < len && s.[i]='_' && s.[i+1]='U' && s.[i+2]='U'
+  in
+  let i = ref 0 in
+  while !i < len && Char.code s.[!i] < 128 && not (has_UU !i) do
+    incr i
+  done;
+  if !i = len then s else
+    let out = Buffer.create (2*len) in
+    Buffer.add_substring out s 0 !i;
+    while !i < len do
       let j, n = next_utf8 s !i in
-      out :=
-        if n >= 128
-        then Printf.sprintf "%s__U%04x_" !out n
-        else Printf.sprintf "%s%c" !out s.[!i];
-      i := !i + j
-    done with End_of_input -> () end;
-    !out
+      if n >= 128 then
+        (Printf.bprintf out "_UU%04x_" n; i := !i + j)
+      else if has_UU !i then
+        (Buffer.add_string out "_UUU"; i := !i + 3)
+      else
+        (Buffer.add_char out s.[!i]; incr i)
+    done;
+    Buffer.contents out
+
+(* Compute length of an UTF-8 encoded string
+   Rem 1 : utf8_length <= String.length (equal if pure ascii)
+   Rem 2 : if used for an iso8859_1 encoded string, the result is
+   wrong in very rare cases. Such a wrong case corresponds to any
+   sequence of a character in range 192..253 immediately followed by a
+   character in range 128..191 (typical case in french is "déçu" which
+   is counted 3 instead of 4); then no real harm to use always
+   utf8_length even if using an iso8859_1 encoding *)
+
+(** FIXME: duplicate code with Pp *)
+
+let utf8_length s =
+  let len = String.length s
+  and cnt = ref 0
+  and nc = ref 0
+  and p = ref 0 in
+  while !p < len do
+    begin
+      match s.[!p] with
+      | '\000'..'\127' -> nc := 0 (* ascii char *)
+      | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+      | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+      | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+      | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+      | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
+      | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
+      | '\254'..'\255' -> nc := 0 (* invalid byte *)
+    end ;
+    incr p ;
+    while !p < len && !nc > 0 do
+      match s.[!p] with
+      | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+      | _ (* not a continuation byte *) -> nc := 0
+    done ;
+    incr cnt
+  done ;
+  !cnt
+
+(* Variant of String.sub for UTF8 character positions *)
+let utf8_sub s start_u len_u =
+  let len_b = String.length s
+  and end_u = start_u + len_u
+  and cnt = ref 0
+  and nc = ref 0
+  and p = ref 0 in
+  let start_b = ref len_b in
+  while !p < len_b && !cnt < end_u do
+    if !cnt <= start_u then start_b := !p ;
+    begin
+      match s.[!p] with
+      | '\000'..'\127' -> nc := 0 (* ascii char *)
+      | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+      |	'\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+      |	'\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+      |	'\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+      |	'\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
+      |	'\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
+      |	'\254'..'\255' -> nc := 0 (* invalid byte *)
+    end ;
+    incr p ;
+    while !p < len_b && !nc > 0 do
+      match s.[!p] with
+      |	'\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+      |	_ (* not a continuation byte *) -> nc := 0
+    done ;
+    incr cnt
+  done ;
+  let end_b = !p in
+  String.sub s !start_b (end_b - !start_b)