aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Hugo Herbelin <Hugo.Herbelin@inria.fr>2017-09-11 09:27:50 +0200
committerGravatar Hugo Herbelin <Hugo.Herbelin@inria.fr>2017-09-13 18:59:32 +0200
commit240c8bffaa788669cf3135c95d067cc7b11b5da1 (patch)
tree2a4cf2e33bc08e760fb2fe4b93e7a0f6b6c706ac
parente88dfedd99a84e9e375f3583be6fd1de3de36c72 (diff)
Adding a function to escape strings with non-utf8 characters.
-rw-r--r--lib/unicode.ml35
-rw-r--r--lib/unicode.mli3
2 files changed, 37 insertions, 1 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 959ccaf73..a7132f62f 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -163,6 +163,39 @@ let is_utf8 s =
in
try check 0 with End_of_input -> true | Invalid_argument _ -> false
+(* Escape string if it contains non-utf8 characters *)
+
+let escaped_non_utf8 s =
+ let mk_escape x = Printf.sprintf "%%%X" x in
+ let buff = Buffer.create (String.length s * 3) in
+ let rec process_trailing_aux i j =
+ if i = j then i else
+ match String.unsafe_get s i with
+ | '\128'..'\191' -> process_trailing_aux (i+1) j
+ | _ -> i in
+ let process_trailing i n =
+ let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in
+ (if j = i+n then
+ Buffer.add_string buff (String.sub s i n)
+ else
+ let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in
+ Buffer.add_string buff (String.concat "" (Array.to_list v)));
+ j in
+ let rec process i =
+ if i >= String.length s then Buffer.contents buff else
+ let c = String.unsafe_get s i in
+ match c with
+ | '\000'..'\127' -> Buffer.add_char buff c; process (i+1)
+ | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1)
+ | '\192'..'\223' -> process (process_trailing i 2)
+ | '\224'..'\239' -> process (process_trailing i 3)
+ | '\240'..'\247' -> process (process_trailing i 4)
+ in
+ process 0
+
+let escaped_if_non_utf8 s =
+ if is_utf8 s then s else escaped_non_utf8 s
+
(* Check the well-formedness of an identifier *)
let initial_refutation j n s =
@@ -198,7 +231,7 @@ let ident_refutation s =
|x -> x
with
| End_of_input -> Some (true,"The empty string is not an identifier.")
- | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
+ | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.")
let lowercase_unicode =
let tree = Segmenttree.make Unicodetable.to_lower in
diff --git a/lib/unicode.mli b/lib/unicode.mli
index c7d742480..b8e7c33ad 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -40,3 +40,6 @@ val utf8_length : string -> int
(** Variant of {!String.sub} for UTF-8 strings. *)
val utf8_sub : string -> int -> int -> string
+
+(** Return a "%XX"-escaped string if it contains non UTF-8 characters. *)
+val escaped_if_non_utf8 : string -> string