diff options
author | Maxime Dénès <mail@maximedenes.fr> | 2017-10-05 18:49:12 +0200 |
---|---|---|
committer | Maxime Dénès <mail@maximedenes.fr> | 2017-10-05 18:49:12 +0200 |
commit | 73620255a7a62703a89026336f97a4bc8a913afd (patch) | |
tree | 0b28d62d27b3fe6e94a07845ca8900cc9b3c9a71 /lib | |
parent | f78a39499ef210456dd96ecb9896874c9ce300ae (diff) | |
parent | a47a249e8c31dece9b816bc49d6c5a2aa50fb21b (diff) |
Merge PR #1041: Miscellaneous fixes about UTF-8 (including a fix to BZ#5715 to escape non-UTF-8 file names)
Diffstat (limited to 'lib')
-rw-r--r-- | lib/system.ml | 2 | ||||
-rw-r--r-- | lib/unicode.ml | 43 | ||||
-rw-r--r-- | lib/unicode.mli | 3 |
3 files changed, 40 insertions, 8 deletions
diff --git a/lib/system.ml b/lib/system.ml index 12eacf2ea..0b64b237d 100644 --- a/lib/system.ml +++ b/lib/system.ml @@ -52,7 +52,7 @@ let dirmap = ref StrMap.empty let make_dir_table dir = let filter_dotfiles s f = if f.[0] = '.' then s else StrSet.add f s in - Array.fold_left filter_dotfiles StrSet.empty (readdir dir) + Array.fold_left filter_dotfiles StrSet.empty (Sys.readdir dir) let exists_in_dir_respecting_case dir bf = let cache_dir dir = diff --git a/lib/unicode.ml b/lib/unicode.ml index 959ccaf73..8eb2eb45d 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -163,6 +163,39 @@ let is_utf8 s = in try check 0 with End_of_input -> true | Invalid_argument _ -> false +(* Escape string if it contains non-utf8 characters *) + +let escaped_non_utf8 s = + let mk_escape x = Printf.sprintf "%%%X" x in + let buff = Buffer.create (String.length s * 3) in + let rec process_trailing_aux i j = + if i = j then i else + match String.unsafe_get s i with + | '\128'..'\191' -> process_trailing_aux (i+1) j + | _ -> i in + let process_trailing i n = + let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in + (if j = i+n then + Buffer.add_string buff (String.sub s i n) + else + let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in + Buffer.add_string buff (String.concat "" (Array.to_list v))); + j in + let rec process i = + if i >= String.length s then Buffer.contents buff else + let c = String.unsafe_get s i in + match c with + | '\000'..'\127' -> Buffer.add_char buff c; process (i+1) + | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1) + | '\192'..'\223' -> process (process_trailing i 2) + | '\224'..'\239' -> process (process_trailing i 3) + | '\240'..'\247' -> process (process_trailing i 4) + in + process 0 + +let escaped_if_non_utf8 s = + if is_utf8 s then s else escaped_non_utf8 s + (* Check the well-formedness of an identifier *) let initial_refutation j n s = @@ -198,7 +231,7 @@ let ident_refutation s = |x -> x with | End_of_input -> Some (true,"The empty string is not an identifier.") - | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.") + | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.") let lowercase_unicode = let tree = Segmenttree.make Unicodetable.to_lower in @@ -268,9 +301,7 @@ let utf8_length s = | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) - | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) - | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) - | '\254'..'\255' -> nc := 0 (* invalid byte *) + | '\248'..'\255' -> nc := 0 (* invalid byte *) end ; incr p ; while !p < len && !nc > 0 do @@ -299,9 +330,7 @@ let utf8_sub s start_u len_u = | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) - | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) - | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) - | '\254'..'\255' -> nc := 0 (* invalid byte *) + | '\248'..'\255' -> nc := 0 (* invalid byte *) end ; incr p ; while !p < len_b && !nc > 0 do diff --git a/lib/unicode.mli b/lib/unicode.mli index c7d742480..b8e7c33ad 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -40,3 +40,6 @@ val utf8_length : string -> int (** Variant of {!String.sub} for UTF-8 strings. *) val utf8_sub : string -> int -> int -> string + +(** Return a "%XX"-escaped string if it contains non UTF-8 characters. *) +val escaped_if_non_utf8 : string -> string |