From 240c8bffaa788669cf3135c95d067cc7b11b5da1 Mon Sep 17 00:00:00 2001 From: Hugo Herbelin Date: Mon, 11 Sep 2017 09:27:50 +0200 Subject: Adding a function to escape strings with non-utf8 characters. --- lib/unicode.ml | 35 ++++++++++++++++++++++++++++++++++- lib/unicode.mli | 3 +++ 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/unicode.ml b/lib/unicode.ml index 959ccaf73..a7132f62f 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -163,6 +163,39 @@ let is_utf8 s = in try check 0 with End_of_input -> true | Invalid_argument _ -> false +(* Escape string if it contains non-utf8 characters *) + +let escaped_non_utf8 s = + let mk_escape x = Printf.sprintf "%%%X" x in + let buff = Buffer.create (String.length s * 3) in + let rec process_trailing_aux i j = + if i = j then i else + match String.unsafe_get s i with + | '\128'..'\191' -> process_trailing_aux (i+1) j + | _ -> i in + let process_trailing i n = + let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in + (if j = i+n then + Buffer.add_string buff (String.sub s i n) + else + let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in + Buffer.add_string buff (String.concat "" (Array.to_list v))); + j in + let rec process i = + if i >= String.length s then Buffer.contents buff else + let c = String.unsafe_get s i in + match c with + | '\000'..'\127' -> Buffer.add_char buff c; process (i+1) + | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1) + | '\192'..'\223' -> process (process_trailing i 2) + | '\224'..'\239' -> process (process_trailing i 3) + | '\240'..'\247' -> process (process_trailing i 4) + in + process 0 + +let escaped_if_non_utf8 s = + if is_utf8 s then s else escaped_non_utf8 s + (* Check the well-formedness of an identifier *) let initial_refutation j n s = @@ -198,7 +231,7 @@ let ident_refutation s = |x -> x with | End_of_input -> Some (true,"The empty string is not an identifier.") - | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.") + | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.") let lowercase_unicode = let tree = Segmenttree.make Unicodetable.to_lower in diff --git a/lib/unicode.mli b/lib/unicode.mli index c7d742480..b8e7c33ad 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -40,3 +40,6 @@ val utf8_length : string -> int (** Variant of {!String.sub} for UTF-8 strings. *) val utf8_sub : string -> int -> int -> string + +(** Return a "%XX"-escaped string if it contains non UTF-8 characters. *) +val escaped_if_non_utf8 : string -> string -- cgit v1.2.3 From 2938fceb50b71d4784d6d718021c505c00196f50 Mon Sep 17 00:00:00 2001 From: Hugo Herbelin Date: Tue, 12 Sep 2017 17:15:06 +0200 Subject: Complying more precisely to unicode standard. In particular, checking that it is at most 4 bytes. --- lib/unicode.ml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/unicode.ml b/lib/unicode.ml index a7132f62f..8eb2eb45d 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -301,9 +301,7 @@ let utf8_length s = | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) - | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) - | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) - | '\254'..'\255' -> nc := 0 (* invalid byte *) + | '\248'..'\255' -> nc := 0 (* invalid byte *) end ; incr p ; while !p < len && !nc > 0 do @@ -332,9 +330,7 @@ let utf8_sub s start_u len_u = | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) - | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) - | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) - | '\254'..'\255' -> nc := 0 (* invalid byte *) + | '\248'..'\255' -> nc := 0 (* invalid byte *) end ; incr p ; while !p < len_b && !nc > 0 do -- cgit v1.2.3 From e9cd8ef01fe5b9ffc3e85984d0f9bc410a56f8ea Mon Sep 17 00:00:00 2001 From: Hugo Herbelin Date: Mon, 11 Sep 2017 16:22:51 +0200 Subject: A possible fix for BZ#5715 (escape non-utf8 win32 file names). On Win32, Sys.readdir translates the file names to the charset of the local "code page", which may be not compatible with utf8. Warnings referring to these names can be generated. These warnings may be sent to CoqIDE. To ensure a utf8 compliant communication, we escape non-utf8 file names under win32. In the CoqIDE/Coq communication, Glib.IO.read_chars expects an utf8-encoding and raises otherwise a Glib.Error "Invalid byte sequence in conversion input". This fixes bug #5715 (Hangul characters not recognized in file names) but this does not solve the case of an operating system mounting a file system with a different coding convention than the default one, i.e. unicode using "Normalization Form Canonical Decomposition" in UTF-8 for HFS+ on MacOS X, no encoding for ext3/ext4 on Linux, (non-normalized?) UTF-16 for NTFS on Windows. --- lib/system.ml | 2 +- vernac/mltop.ml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/system.ml b/lib/system.ml index 12eacf2ea..0b64b237d 100644 --- a/lib/system.ml +++ b/lib/system.ml @@ -52,7 +52,7 @@ let dirmap = ref StrMap.empty let make_dir_table dir = let filter_dotfiles s f = if f.[0] = '.' then s else StrSet.add f s in - Array.fold_left filter_dotfiles StrSet.empty (readdir dir) + Array.fold_left filter_dotfiles StrSet.empty (Sys.readdir dir) let exists_in_dir_respecting_case dir bf = let cache_dir dir = diff --git a/vernac/mltop.ml b/vernac/mltop.ml index e8a0ba3dd..8ec85688c 100644 --- a/vernac/mltop.ml +++ b/vernac/mltop.ml @@ -175,6 +175,7 @@ let warn_cannot_use_directory = let convert_string d = try Names.Id.of_string d with UserError _ -> + let d = Unicode.escaped_if_non_utf8 d in warn_cannot_use_directory d; raise Exit -- cgit v1.2.3