Merge PR #1041: Miscellaneous fixes about UTF-8 (including a fix to BZ#5715 to escape non-UTF-8 file names)

author: Maxime Dénès <mail@maximedenes.fr> 2017-10-05 18:49:12 +0200
committer: Maxime Dénès <mail@maximedenes.fr> 2017-10-05 18:49:12 +0200
commit: 73620255a7a62703a89026336f97a4bc8a913afd (patch)
tree: 0b28d62d27b3fe6e94a07845ca8900cc9b3c9a71
parent: f78a39499ef210456dd96ecb9896874c9ce300ae (diff)
parent: a47a249e8c31dece9b816bc49d6c5a2aa50fb21b (diff)
11 files changed, 94 insertions, 50 deletions
diff --git a/Makefile.build b/Makefile.build
index 10d314f0d..ecaaccaaf 100644
--- a/Makefile.build
+++ b/Makefile.build
@@ -429,6 +429,7 @@ tools: $(TOOLS) $(OCAMLLIBDEP) $(COQDEPBOOT)
 # is being built.
 
 COQDEPBOOTSRC := lib/minisys.cmo \
+ lib/segmenttree.cmo lib/unicodetable.cmo lib/unicode.cmo \
  tools/coqdep_lexer.cmo tools/coqdep_common.cmo tools/coqdep_boot.cmo
 
 tools/coqdep_lexer.cmo : tools/coqdep_lexer.cmi
@@ -449,6 +450,7 @@ $(OCAMLLIBDEP): $(call bestobj, tools/ocamllibdep.cmo)
 # The full coqdep (unused by this build, but distributed by make install)
 
 COQDEPCMO:=lib/clib.cma lib/cErrors.cmo lib/cWarnings.cmo lib/minisys.cmo \
+  lib/segmenttree.cmo lib/unicodetable.cmo lib/unicode.cmo \
   lib/system.cmo tools/coqdep_lexer.cmo tools/coqdep_common.cmo \
   tools/coqdep.cmo
 
diff --git a/doc/refman/RefMan-com.tex b/doc/refman/RefMan-com.tex
index 9790111f1..892c9931b 100644
--- a/doc/refman/RefMan-com.tex
+++ b/doc/refman/RefMan-com.tex
@@ -107,6 +107,15 @@ The following command-line options are recognized by the commands {\tt
   recursively available from {\Coq} using absolute names (extending
   the {\dirpath} prefix) (see Section~\ref{LongNames}).
 
+  Note that only those subdirectories and files which obey the lexical
+  conventions of what is an {\ident} (see Section~\ref{lexical})
+  are taken into account. Conversely, the underlying file systems or
+  operating systems may be more restrictive than {\Coq}. While Linux's
+  ext4 file system supports any {\Coq} recursive layout
+  (within the limit of 255 bytes per file name), the default on NTFS
+  (Windows) or HFS+ (MacOS X) file systems is on the contrary to
+  disallow two files differing only in the case in the same directory.
+
   \SeeAlso Section~\ref{Libraries}.
 
 \item[{\tt -R} {\em directory} {\dirpath}]\ %
diff --git a/lib/system.ml b/lib/system.ml
index 12eacf2ea..0b64b237d 100644
--- a/lib/system.ml
+++ b/lib/system.ml
@@ -52,7 +52,7 @@ let dirmap = ref StrMap.empty
 
 let make_dir_table dir =
   let filter_dotfiles s f = if f.[0] = '.' then s else StrSet.add f s in
-  Array.fold_left filter_dotfiles StrSet.empty (readdir dir)
+  Array.fold_left filter_dotfiles StrSet.empty (Sys.readdir dir)
 
 let exists_in_dir_respecting_case dir bf =
   let cache_dir dir =
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 959ccaf73..8eb2eb45d 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -163,6 +163,39 @@ let is_utf8 s =
   in
   try check 0 with End_of_input -> true | Invalid_argument _ -> false
 
+(* Escape string if it contains non-utf8 characters *)
+
+let escaped_non_utf8 s =
+  let mk_escape x = Printf.sprintf "%%%X" x in
+  let buff = Buffer.create (String.length s * 3) in
+  let rec process_trailing_aux i j =
+    if i = j then i else
+      match String.unsafe_get s i with
+      | '\128'..'\191' -> process_trailing_aux (i+1) j
+      | _ -> i in
+  let process_trailing i n =
+    let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in
+    (if j = i+n then
+      Buffer.add_string buff (String.sub s i n)
+    else
+      let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in
+      Buffer.add_string buff (String.concat "" (Array.to_list v)));
+    j in
+  let rec process i =
+    if i >= String.length s then Buffer.contents buff else
+      let c = String.unsafe_get s i in
+      match c with
+      | '\000'..'\127' -> Buffer.add_char buff c; process (i+1)
+      | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1)
+      | '\192'..'\223' -> process (process_trailing i 2)
+      | '\224'..'\239' -> process (process_trailing i 3)
+      | '\240'..'\247' -> process (process_trailing i 4)
+  in
+  process 0
+
+let escaped_if_non_utf8 s =
+  if is_utf8 s then s else escaped_non_utf8 s
+
 (* Check the well-formedness of an identifier *)
 
 let initial_refutation j n s =
@@ -198,7 +231,7 @@ let ident_refutation s =
         |x -> x
   with
   | End_of_input -> Some (true,"The empty string is not an identifier.")
-  | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
+  | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.")
 
 let lowercase_unicode =
   let tree = Segmenttree.make Unicodetable.to_lower in
@@ -268,9 +301,7 @@ let utf8_length s =
       | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
       | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
       | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
-      | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
-      | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
-      | '\254'..'\255' -> nc := 0 (* invalid byte *)
+      | '\248'..'\255' -> nc := 0 (* invalid byte *)
     end ;
     incr p ;
     while !p < len && !nc > 0 do
@@ -299,9 +330,7 @@ let utf8_sub s start_u len_u =
       |	'\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
       |	'\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
       |	'\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
-      |	'\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
-      |	'\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
-      |	'\254'..'\255' -> nc := 0 (* invalid byte *)
+      |	'\248'..'\255' -> nc := 0 (* invalid byte *)
     end ;
     incr p ;
     while !p < len_b && !nc > 0 do
diff --git a/lib/unicode.mli b/lib/unicode.mli
index c7d742480..b8e7c33ad 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -40,3 +40,6 @@ val utf8_length : string -> int
 
 (** Variant of {!String.sub} for UTF-8 strings. *)
 val utf8_sub : string -> int -> int -> string
+
+(** Return a "%XX"-escaped string if it contains non UTF-8 characters. *)
+val escaped_if_non_utf8 : string -> string
diff --git a/library/library.ml b/library/library.ml
index 28afa054e..1da2c591d 100644
--- a/library/library.ml
+++ b/library/library.ml
@@ -620,25 +620,6 @@ let check_coq_overwriting p id =
       (str "Cannot build module " ++ pr_dirpath p ++ str "." ++ pr_id id ++ str "." ++ spc () ++
       str "it starts with prefix \"Coq\" which is reserved for the Coq library.")
 
-(* Verifies that a string starts by a letter and do not contain
-   others caracters than letters, digits, or `_` *)
-
-let check_module_name s =
-  let msg c =
-    strbrk "Invalid module name: " ++ str s ++ strbrk " character " ++
-    (if c = '\'' then str "\"'\"" else (str "'" ++ str (String.make 1 c) ++ str "'")) ++
-    strbrk " is not allowed in module names\n"
-  in
-  let err c = user_err  (msg c) in
-  match String.get s 0 with
-    | 'a' .. 'z' | 'A' .. 'Z' ->
-        for i = 1 to (String.length s)-1 do
-          match String.get s i with
-            | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_'  -> ()
-            | c -> err c
-        done
-    | c -> err c
-
 let start_library fo =
   let ldir0 =
     try
@@ -648,7 +629,6 @@ let start_library fo =
   in
   let file = Filename.chop_extension (Filename.basename fo) in
   let id = Id.of_string file in
-  check_module_name file;
   check_coq_overwriting ldir0 id;
   let ldir = add_dirpath_suffix ldir0 id in
   Declaremods.start_library ldir;
diff --git a/test-suite/misc/deps-utf8.sh b/test-suite/misc/deps-utf8.sh
new file mode 100755
index 000000000..13e264c09
--- /dev/null
+++ b/test-suite/misc/deps-utf8.sh
@@ -0,0 +1,17 @@
+# Check reading directories matching non pure ascii idents
+# See bug #5715 (utf-8 working on macos X and linux)
+# Windows is still not compliant
+a=`uname`
+if [ "$a" = "Darwin" -o "$a" = "Linux" ]; then
+rm -f misc/deps/théorèmes/*.v
+tmpoutput=`mktemp /tmp/coqcheck.XXXXXX`
+$coqc -R misc/deps AlphaBêta misc/deps/αβ/γδ.v
+R=$?
+$coqtop -R misc/deps AlphaBêta -load-vernac-source misc/deps/αβ/εζ.v
+S=$?
+if [ $R = 0 -a $S = 0 ]; then
+    exit 0
+else
+    exit 1
+fi
+fi
diff --git a/test-suite/misc/deps/αβ/γδ.v b/test-suite/misc/deps/αβ/γδ.v
new file mode 100644
index 000000000..f43a2d657
--- /dev/null
+++ b/test-suite/misc/deps/αβ/γδ.v
@@ -0,0 +1,4 @@
+Theorem simple : forall A, A -> A.
+Proof.
+auto.
+Qed.
diff --git a/test-suite/misc/deps/αβ/εζ.v b/test-suite/misc/deps/αβ/εζ.v
new file mode 100644
index 000000000..e7fd25c0d
--- /dev/null
+++ b/test-suite/misc/deps/αβ/εζ.v
@@ -0,0 +1 @@
+Require Import γδ.
diff --git a/tools/coqdep_lexer.mll b/tools/coqdep_lexer.mll
index 8eeb59898..564e20d0e 100644
--- a/tools/coqdep_lexer.mll
+++ b/tools/coqdep_lexer.mll
@@ -25,8 +25,6 @@
   exception Fin_fichier
   exception Syntax_error of int*int
 
-  let field_name s = String.sub s 1 (String.length s - 1)
-
   let unquote_string s =
     String.sub s 1 (String.length s - 2)
 
@@ -40,6 +38,18 @@
   let syntax_error lexbuf =
     raise (Syntax_error (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf))
 
+  let check_valid lexbuf s =
+    match Unicode.ident_refutation s with
+    | None -> s
+    | Some _ -> syntax_error lexbuf
+
+  let get_ident lexbuf =
+    let s = Lexing.lexeme lexbuf in check_valid lexbuf s
+
+  let get_field_name lexbuf =
+    let s = Lexing.lexeme lexbuf in
+    check_valid lexbuf (String.sub s 1 (String.length s - 1))
+
   [@@@ocaml.warning "-3"]       (* String.uncapitalize_ascii since 4.03.0 GPR#124 *)
   let uncapitalize = String.uncapitalize
   [@@@ocaml.warning "+3"]
@@ -52,20 +62,8 @@ let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
 let caml_up_ident = uppercase identchar*
 let caml_low_ident = lowercase identchar*
 
-let coq_firstchar =
-  (* This is only an approximation, refer to lib/util.ml for correct def *)
-  ['A'-'Z' 'a'-'z' '_'] |
-  (* superscript 1 *)
-  '\194' '\185' |
-  (* utf-8 latin 1 supplement *)
-  '\195' ['\128'-'\150'] | '\195' ['\152'-'\182'] | '\195' ['\184'-'\191'] |
-  (* utf-8 letters *)
-  '\206' (['\145'-'\161'] | ['\163'-'\187'])
-  '\226' ('\130' [ '\128'-'\137' ] (* subscripts *)
-    | '\129' [ '\176'-'\187' ] (* superscripts *)
-    | '\132' ['\128'-'\191'] | '\133' ['\128'-'\143'])
-let coq_identchar = coq_firstchar | ['\'' '0'-'9']
-let coq_ident = coq_firstchar coq_identchar*
+(* This is an overapproximation, we check correctness afterwards *)
+let coq_ident = ['A'-'Z' 'a'-'z' '_' '\128'-'\255'] ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9' '\128'-'\255']*
 let coq_field = '.' coq_ident
 
 let dot = '.' ( space+ | eof)
@@ -102,7 +100,7 @@ and from_rule = parse
   | space+
       { from_rule lexbuf }
   | coq_ident
-      { let from = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+      { let from = coq_qual_id_tail [get_ident lexbuf] lexbuf in
         consume_require (Some from) lexbuf }
   | eof
       { syntax_error lexbuf }
@@ -241,7 +239,7 @@ and load_file = parse
 	parse_dot lexbuf;
 	Load (unquote_vfile_string s) }
   | coq_ident
-      { let s = lexeme lexbuf in skip_to_dot lexbuf; Load s }
+      { let s = get_ident lexbuf in skip_to_dot lexbuf; Load s }
   | eof
       { syntax_error lexbuf }
   | _
@@ -253,7 +251,7 @@ and require_file from = parse
   | space+
       { require_file from lexbuf }
   | coq_ident
-      { let name = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+      { let name = coq_qual_id_tail [get_ident lexbuf] lexbuf in
         let qid = coq_qual_id_list [name] lexbuf in
         parse_dot lexbuf;
         Require (from, qid) }
@@ -278,7 +276,7 @@ and coq_qual_id = parse
   | space+
       { coq_qual_id lexbuf }
   | coq_ident
-      { coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf }
+      { coq_qual_id_tail [get_ident lexbuf] lexbuf }
   | _
       { syntax_error lexbuf }
 
@@ -288,7 +286,7 @@ and coq_qual_id_tail module_name = parse
   | space+
       { coq_qual_id_tail module_name lexbuf }
   | coq_field
-      { coq_qual_id_tail (field_name (Lexing.lexeme lexbuf) :: module_name) lexbuf }
+      { coq_qual_id_tail (get_field_name lexbuf :: module_name) lexbuf }
   | eof
       { syntax_error lexbuf }
   | _
@@ -301,7 +299,7 @@ and coq_qual_id_list module_names = parse
   | space+
       { coq_qual_id_list module_names lexbuf }
   | coq_ident
-      { let name = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+      { let name = coq_qual_id_tail [get_ident lexbuf] lexbuf in
         coq_qual_id_list (name :: module_names) lexbuf
       }
   | eof
diff --git a/vernac/mltop.ml b/vernac/mltop.ml
index 1edbd1a85..d3de10235 100644
--- a/vernac/mltop.ml
+++ b/vernac/mltop.ml
@@ -174,6 +174,7 @@ let warn_cannot_use_directory =
 let convert_string d =
   try Names.Id.of_string d
   with UserError _ ->
+    let d = Unicode.escaped_if_non_utf8 d in
     warn_cannot_use_directory d;
     raise Exit
author	Maxime Dénès <mail@maximedenes.fr>	2017-10-05 18:49:12 +0200
committer	Maxime Dénès <mail@maximedenes.fr>	2017-10-05 18:49:12 +0200
commit	73620255a7a62703a89026336f97a4bc8a913afd (patch)
tree	0b28d62d27b3fe6e94a07845ca8900cc9b3c9a71
parent	f78a39499ef210456dd96ecb9896874c9ce300ae (diff)
parent	a47a249e8c31dece9b816bc49d6c5a2aa50fb21b (diff)