aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib
diff options
context:
space:
mode:
authorGravatar Maxime Dénès <mail@maximedenes.fr>2017-10-05 18:49:12 +0200
committerGravatar Maxime Dénès <mail@maximedenes.fr>2017-10-05 18:49:12 +0200
commit73620255a7a62703a89026336f97a4bc8a913afd (patch)
tree0b28d62d27b3fe6e94a07845ca8900cc9b3c9a71 /lib
parentf78a39499ef210456dd96ecb9896874c9ce300ae (diff)
parenta47a249e8c31dece9b816bc49d6c5a2aa50fb21b (diff)
Merge PR #1041: Miscellaneous fixes about UTF-8 (including a fix to BZ#5715 to escape non-UTF-8 file names)
Diffstat (limited to 'lib')
-rw-r--r--lib/system.ml2
-rw-r--r--lib/unicode.ml43
-rw-r--r--lib/unicode.mli3
3 files changed, 40 insertions, 8 deletions
diff --git a/lib/system.ml b/lib/system.ml
index 12eacf2ea..0b64b237d 100644
--- a/lib/system.ml
+++ b/lib/system.ml
@@ -52,7 +52,7 @@ let dirmap = ref StrMap.empty
let make_dir_table dir =
let filter_dotfiles s f = if f.[0] = '.' then s else StrSet.add f s in
- Array.fold_left filter_dotfiles StrSet.empty (readdir dir)
+ Array.fold_left filter_dotfiles StrSet.empty (Sys.readdir dir)
let exists_in_dir_respecting_case dir bf =
let cache_dir dir =
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 959ccaf73..8eb2eb45d 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -163,6 +163,39 @@ let is_utf8 s =
in
try check 0 with End_of_input -> true | Invalid_argument _ -> false
+(* Escape string if it contains non-utf8 characters *)
+
+let escaped_non_utf8 s =
+ let mk_escape x = Printf.sprintf "%%%X" x in
+ let buff = Buffer.create (String.length s * 3) in
+ let rec process_trailing_aux i j =
+ if i = j then i else
+ match String.unsafe_get s i with
+ | '\128'..'\191' -> process_trailing_aux (i+1) j
+ | _ -> i in
+ let process_trailing i n =
+ let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in
+ (if j = i+n then
+ Buffer.add_string buff (String.sub s i n)
+ else
+ let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in
+ Buffer.add_string buff (String.concat "" (Array.to_list v)));
+ j in
+ let rec process i =
+ if i >= String.length s then Buffer.contents buff else
+ let c = String.unsafe_get s i in
+ match c with
+ | '\000'..'\127' -> Buffer.add_char buff c; process (i+1)
+ | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1)
+ | '\192'..'\223' -> process (process_trailing i 2)
+ | '\224'..'\239' -> process (process_trailing i 3)
+ | '\240'..'\247' -> process (process_trailing i 4)
+ in
+ process 0
+
+let escaped_if_non_utf8 s =
+ if is_utf8 s then s else escaped_non_utf8 s
+
(* Check the well-formedness of an identifier *)
let initial_refutation j n s =
@@ -198,7 +231,7 @@ let ident_refutation s =
|x -> x
with
| End_of_input -> Some (true,"The empty string is not an identifier.")
- | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
+ | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.")
let lowercase_unicode =
let tree = Segmenttree.make Unicodetable.to_lower in
@@ -268,9 +301,7 @@ let utf8_length s =
| '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
| '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
| '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
- | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
- | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
- | '\254'..'\255' -> nc := 0 (* invalid byte *)
+ | '\248'..'\255' -> nc := 0 (* invalid byte *)
end ;
incr p ;
while !p < len && !nc > 0 do
@@ -299,9 +330,7 @@ let utf8_sub s start_u len_u =
| '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
| '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
| '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
- | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
- | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
- | '\254'..'\255' -> nc := 0 (* invalid byte *)
+ | '\248'..'\255' -> nc := 0 (* invalid byte *)
end ;
incr p ;
while !p < len_b && !nc > 0 do
diff --git a/lib/unicode.mli b/lib/unicode.mli
index c7d742480..b8e7c33ad 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -40,3 +40,6 @@ val utf8_length : string -> int
(** Variant of {!String.sub} for UTF-8 strings. *)
val utf8_sub : string -> int -> int -> string
+
+(** Return a "%XX"-escaped string if it contains non UTF-8 characters. *)
+val escaped_if_non_utf8 : string -> string