From 712c0f124da2322dc58b012a110894329360581d Mon Sep 17 00:00:00 2001
From: Hugo Herbelin <Hugo.Herbelin@inria.fr>
Date: Mon, 11 Sep 2017 16:04:32 +0200
Subject: Removing now useless former fix to #3333 (check valid module names).

The fix is not anymore needed after Id.of_string was made strict
(5b218f87). This allows to support the whole official syntax of
identifiers and not just the alpha-numerical ones (without quote).
---
 library/library.ml | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/library/library.ml b/library/library.ml
index 28afa054e..1da2c591d 100644
--- a/library/library.ml
+++ b/library/library.ml
@@ -620,25 +620,6 @@ let check_coq_overwriting p id =
       (str "Cannot build module " ++ pr_dirpath p ++ str "." ++ pr_id id ++ str "." ++ spc () ++
       str "it starts with prefix \"Coq\" which is reserved for the Coq library.")
 
-(* Verifies that a string starts by a letter and do not contain
-   others caracters than letters, digits, or `_` *)
-
-let check_module_name s =
-  let msg c =
-    strbrk "Invalid module name: " ++ str s ++ strbrk " character " ++
-    (if c = '\'' then str "\"'\"" else (str "'" ++ str (String.make 1 c) ++ str "'")) ++
-    strbrk " is not allowed in module names\n"
-  in
-  let err c = user_err  (msg c) in
-  match String.get s 0 with
-    | 'a' .. 'z' | 'A' .. 'Z' ->
-        for i = 1 to (String.length s)-1 do
-          match String.get s i with
-            | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_'  -> ()
-            | c -> err c
-        done
-    | c -> err c
-
 let start_library fo =
   let ldir0 =
     try
@@ -648,7 +629,6 @@ let start_library fo =
   in
   let file = Filename.chop_extension (Filename.basename fo) in
   let id = Id.of_string file in
-  check_module_name file;
   check_coq_overwriting ldir0 id;
   let ldir = add_dirpath_suffix ldir0 id in
   Declaremods.start_library ldir;
-- 
cgit v1.2.3


From e88dfedd99a84e9e375f3583be6fd1de3de36c72 Mon Sep 17 00:00:00 2001
From: Hugo Herbelin <Hugo.Herbelin@inria.fr>
Date: Mon, 11 Sep 2017 20:45:02 +0200
Subject: Supporting library names in utf8 in coqdep.

---
 Makefile.build         |  2 ++
 tools/coqdep_lexer.mll | 42 ++++++++++++++++++++----------------------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/Makefile.build b/Makefile.build
index 26a40c6cc..db821d485 100644
--- a/Makefile.build
+++ b/Makefile.build
@@ -429,6 +429,7 @@ tools: $(TOOLS) $(OCAMLLIBDEP) $(COQDEPBOOT)
 # is being built.
 
 COQDEPBOOTSRC := lib/minisys.cmo \
+ lib/segmenttree.cmo lib/unicodetable.cmo lib/unicode.cmo \
  tools/coqdep_lexer.cmo tools/coqdep_common.cmo tools/coqdep_boot.cmo
 
 tools/coqdep_lexer.cmo : tools/coqdep_lexer.cmi
@@ -449,6 +450,7 @@ $(OCAMLLIBDEP): $(call bestobj, tools/ocamllibdep.cmo)
 # The full coqdep (unused by this build, but distributed by make install)
 
 COQDEPCMO:=lib/clib.cma lib/cErrors.cmo lib/cWarnings.cmo lib/minisys.cmo \
+  lib/segmenttree.cmo lib/unicodetable.cmo lib/unicode.cmo \
   lib/system.cmo tools/coqdep_lexer.cmo tools/coqdep_common.cmo \
   tools/coqdep.cmo
 
diff --git a/tools/coqdep_lexer.mll b/tools/coqdep_lexer.mll
index 8eeb59898..564e20d0e 100644
--- a/tools/coqdep_lexer.mll
+++ b/tools/coqdep_lexer.mll
@@ -25,8 +25,6 @@
   exception Fin_fichier
   exception Syntax_error of int*int
 
-  let field_name s = String.sub s 1 (String.length s - 1)
-
   let unquote_string s =
     String.sub s 1 (String.length s - 2)
 
@@ -40,6 +38,18 @@
   let syntax_error lexbuf =
     raise (Syntax_error (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf))
 
+  let check_valid lexbuf s =
+    match Unicode.ident_refutation s with
+    | None -> s
+    | Some _ -> syntax_error lexbuf
+
+  let get_ident lexbuf =
+    let s = Lexing.lexeme lexbuf in check_valid lexbuf s
+
+  let get_field_name lexbuf =
+    let s = Lexing.lexeme lexbuf in
+    check_valid lexbuf (String.sub s 1 (String.length s - 1))
+
   [@@@ocaml.warning "-3"]       (* String.uncapitalize_ascii since 4.03.0 GPR#124 *)
   let uncapitalize = String.uncapitalize
   [@@@ocaml.warning "+3"]
@@ -52,20 +62,8 @@ let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
 let caml_up_ident = uppercase identchar*
 let caml_low_ident = lowercase identchar*
 
-let coq_firstchar =
-  (* This is only an approximation, refer to lib/util.ml for correct def *)
-  ['A'-'Z' 'a'-'z' '_'] |
-  (* superscript 1 *)
-  '\194' '\185' |
-  (* utf-8 latin 1 supplement *)
-  '\195' ['\128'-'\150'] | '\195' ['\152'-'\182'] | '\195' ['\184'-'\191'] |
-  (* utf-8 letters *)
-  '\206' (['\145'-'\161'] | ['\163'-'\187'])
-  '\226' ('\130' [ '\128'-'\137' ] (* subscripts *)
-    | '\129' [ '\176'-'\187' ] (* superscripts *)
-    | '\132' ['\128'-'\191'] | '\133' ['\128'-'\143'])
-let coq_identchar = coq_firstchar | ['\'' '0'-'9']
-let coq_ident = coq_firstchar coq_identchar*
+(* This is an overapproximation, we check correctness afterwards *)
+let coq_ident = ['A'-'Z' 'a'-'z' '_' '\128'-'\255'] ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9' '\128'-'\255']*
 let coq_field = '.' coq_ident
 
 let dot = '.' ( space+ | eof)
@@ -102,7 +100,7 @@ and from_rule = parse
   | space+
       { from_rule lexbuf }
   | coq_ident
-      { let from = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+      { let from = coq_qual_id_tail [get_ident lexbuf] lexbuf in
         consume_require (Some from) lexbuf }
   | eof
       { syntax_error lexbuf }
@@ -241,7 +239,7 @@ and load_file = parse
 	parse_dot lexbuf;
 	Load (unquote_vfile_string s) }
   | coq_ident
-      { let s = lexeme lexbuf in skip_to_dot lexbuf; Load s }
+      { let s = get_ident lexbuf in skip_to_dot lexbuf; Load s }
   | eof
       { syntax_error lexbuf }
   | _
@@ -253,7 +251,7 @@ and require_file from = parse
   | space+
       { require_file from lexbuf }
   | coq_ident
-      { let name = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+      { let name = coq_qual_id_tail [get_ident lexbuf] lexbuf in
         let qid = coq_qual_id_list [name] lexbuf in
         parse_dot lexbuf;
         Require (from, qid) }
@@ -278,7 +276,7 @@ and coq_qual_id = parse
   | space+
       { coq_qual_id lexbuf }
   | coq_ident
-      { coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf }
+      { coq_qual_id_tail [get_ident lexbuf] lexbuf }
   | _
       { syntax_error lexbuf }
 
@@ -288,7 +286,7 @@ and coq_qual_id_tail module_name = parse
   | space+
       { coq_qual_id_tail module_name lexbuf }
   | coq_field
-      { coq_qual_id_tail (field_name (Lexing.lexeme lexbuf) :: module_name) lexbuf }
+      { coq_qual_id_tail (get_field_name lexbuf :: module_name) lexbuf }
   | eof
       { syntax_error lexbuf }
   | _
@@ -301,7 +299,7 @@ and coq_qual_id_list module_names = parse
   | space+
       { coq_qual_id_list module_names lexbuf }
   | coq_ident
-      { let name = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+      { let name = coq_qual_id_tail [get_ident lexbuf] lexbuf in
         coq_qual_id_list (name :: module_names) lexbuf
       }
   | eof
-- 
cgit v1.2.3


From 240c8bffaa788669cf3135c95d067cc7b11b5da1 Mon Sep 17 00:00:00 2001
From: Hugo Herbelin <Hugo.Herbelin@inria.fr>
Date: Mon, 11 Sep 2017 09:27:50 +0200
Subject: Adding a function to escape strings with non-utf8 characters.

---
 lib/unicode.ml  | 35 ++++++++++++++++++++++++++++++++++-
 lib/unicode.mli |  3 +++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/lib/unicode.ml b/lib/unicode.ml
index 959ccaf73..a7132f62f 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -163,6 +163,39 @@ let is_utf8 s =
   in
   try check 0 with End_of_input -> true | Invalid_argument _ -> false
 
+(* Escape string if it contains non-utf8 characters *)
+
+let escaped_non_utf8 s =
+  let mk_escape x = Printf.sprintf "%%%X" x in
+  let buff = Buffer.create (String.length s * 3) in
+  let rec process_trailing_aux i j =
+    if i = j then i else
+      match String.unsafe_get s i with
+      | '\128'..'\191' -> process_trailing_aux (i+1) j
+      | _ -> i in
+  let process_trailing i n =
+    let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in
+    (if j = i+n then
+      Buffer.add_string buff (String.sub s i n)
+    else
+      let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in
+      Buffer.add_string buff (String.concat "" (Array.to_list v)));
+    j in
+  let rec process i =
+    if i >= String.length s then Buffer.contents buff else
+      let c = String.unsafe_get s i in
+      match c with
+      | '\000'..'\127' -> Buffer.add_char buff c; process (i+1)
+      | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1)
+      | '\192'..'\223' -> process (process_trailing i 2)
+      | '\224'..'\239' -> process (process_trailing i 3)
+      | '\240'..'\247' -> process (process_trailing i 4)
+  in
+  process 0
+
+let escaped_if_non_utf8 s =
+  if is_utf8 s then s else escaped_non_utf8 s
+
 (* Check the well-formedness of an identifier *)
 
 let initial_refutation j n s =
@@ -198,7 +231,7 @@ let ident_refutation s =
         |x -> x
   with
   | End_of_input -> Some (true,"The empty string is not an identifier.")
-  | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
+  | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.")
 
 let lowercase_unicode =
   let tree = Segmenttree.make Unicodetable.to_lower in
diff --git a/lib/unicode.mli b/lib/unicode.mli
index c7d742480..b8e7c33ad 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -40,3 +40,6 @@ val utf8_length : string -> int
 
 (** Variant of {!String.sub} for UTF-8 strings. *)
 val utf8_sub : string -> int -> int -> string
+
+(** Return a "%XX"-escaped string if it contains non UTF-8 characters. *)
+val escaped_if_non_utf8 : string -> string
-- 
cgit v1.2.3


From 2938fceb50b71d4784d6d718021c505c00196f50 Mon Sep 17 00:00:00 2001
From: Hugo Herbelin <Hugo.Herbelin@inria.fr>
Date: Tue, 12 Sep 2017 17:15:06 +0200
Subject: Complying more precisely to unicode standard.

In particular, checking that it is at most 4 bytes.
---
 lib/unicode.ml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/lib/unicode.ml b/lib/unicode.ml
index a7132f62f..8eb2eb45d 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -301,9 +301,7 @@ let utf8_length s =
       | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
       | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
       | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
-      | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
-      | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
-      | '\254'..'\255' -> nc := 0 (* invalid byte *)
+      | '\248'..'\255' -> nc := 0 (* invalid byte *)
     end ;
     incr p ;
     while !p < len && !nc > 0 do
@@ -332,9 +330,7 @@ let utf8_sub s start_u len_u =
       |	'\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
       |	'\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
       |	'\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
-      |	'\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
-      |	'\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
-      |	'\254'..'\255' -> nc := 0 (* invalid byte *)
+      |	'\248'..'\255' -> nc := 0 (* invalid byte *)
     end ;
     incr p ;
     while !p < len_b && !nc > 0 do
-- 
cgit v1.2.3


From e9cd8ef01fe5b9ffc3e85984d0f9bc410a56f8ea Mon Sep 17 00:00:00 2001
From: Hugo Herbelin <Hugo.Herbelin@inria.fr>
Date: Mon, 11 Sep 2017 16:22:51 +0200
Subject: A possible fix for BZ#5715 (escape non-utf8 win32 file names).

On Win32, Sys.readdir translates the file names to the charset of the
local "code page", which may be not compatible with utf8.

Warnings referring to these names can be generated. These warnings may
be sent to CoqIDE. To ensure a utf8 compliant communication, we escape
non-utf8 file names under win32.

In the CoqIDE/Coq communication, Glib.IO.read_chars expects an
utf8-encoding and raises otherwise a Glib.Error "Invalid byte sequence in
conversion input".

This fixes bug #5715 (Hangul characters not recognized in file names)
but this does not solve the case of an operating system mounting a
file system with a different coding convention than the default one,
i.e. unicode using "Normalization Form Canonical Decomposition" in
UTF-8 for HFS+ on MacOS X, no encoding for ext3/ext4 on Linux,
(non-normalized?) UTF-16 for NTFS on Windows.
---
 lib/system.ml   | 2 +-
 vernac/mltop.ml | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/system.ml b/lib/system.ml
index 12eacf2ea..0b64b237d 100644
--- a/lib/system.ml
+++ b/lib/system.ml
@@ -52,7 +52,7 @@ let dirmap = ref StrMap.empty
 
 let make_dir_table dir =
   let filter_dotfiles s f = if f.[0] = '.' then s else StrSet.add f s in
-  Array.fold_left filter_dotfiles StrSet.empty (readdir dir)
+  Array.fold_left filter_dotfiles StrSet.empty (Sys.readdir dir)
 
 let exists_in_dir_respecting_case dir bf =
   let cache_dir dir =
diff --git a/vernac/mltop.ml b/vernac/mltop.ml
index e8a0ba3dd..8ec85688c 100644
--- a/vernac/mltop.ml
+++ b/vernac/mltop.ml
@@ -175,6 +175,7 @@ let warn_cannot_use_directory =
 let convert_string d =
   try Names.Id.of_string d
   with UserError _ ->
+    let d = Unicode.escaped_if_non_utf8 d in
     warn_cannot_use_directory d;
     raise Exit
 
-- 
cgit v1.2.3


From 98e2b048aa5419ac3d075e1d3f2499cb816fe92e Mon Sep 17 00:00:00 2001
From: Hugo Herbelin <Hugo.Herbelin@inria.fr>
Date: Mon, 11 Sep 2017 16:32:39 +0200
Subject: Adding a test for utf8 characters in directory names.

---
 test-suite/misc/deps-utf8.sh                            | 17 +++++++++++++++++
 .../misc/deps/\316\261\316\262/\316\263\316\264.v"      |  4 ++++
 .../misc/deps/\316\261\316\262/\316\265\316\266.v"      |  1 +
 3 files changed, 22 insertions(+)
 create mode 100755 test-suite/misc/deps-utf8.sh
 create mode 100644 "test-suite/misc/deps/\316\261\316\262/\316\263\316\264.v"
 create mode 100644 "test-suite/misc/deps/\316\261\316\262/\316\265\316\266.v"

diff --git a/test-suite/misc/deps-utf8.sh b/test-suite/misc/deps-utf8.sh
new file mode 100755
index 000000000..13e264c09
--- /dev/null
+++ b/test-suite/misc/deps-utf8.sh
@@ -0,0 +1,17 @@
+# Check reading directories matching non pure ascii idents
+# See bug #5715 (utf-8 working on macos X and linux)
+# Windows is still not compliant
+a=`uname`
+if [ "$a" = "Darwin" -o "$a" = "Linux" ]; then
+rm -f misc/deps/théorèmes/*.v
+tmpoutput=`mktemp /tmp/coqcheck.XXXXXX`
+$coqc -R misc/deps AlphaBêta misc/deps/αβ/γδ.v
+R=$?
+$coqtop -R misc/deps AlphaBêta -load-vernac-source misc/deps/αβ/εζ.v
+S=$?
+if [ $R = 0 -a $S = 0 ]; then
+    exit 0
+else
+    exit 1
+fi
+fi
diff --git "a/test-suite/misc/deps/\316\261\316\262/\316\263\316\264.v" "b/test-suite/misc/deps/\316\261\316\262/\316\263\316\264.v"
new file mode 100644
index 000000000..f43a2d657
--- /dev/null
+++ "b/test-suite/misc/deps/\316\261\316\262/\316\263\316\264.v"
@@ -0,0 +1,4 @@
+Theorem simple : forall A, A -> A.
+Proof.
+auto.
+Qed.
diff --git "a/test-suite/misc/deps/\316\261\316\262/\316\265\316\266.v" "b/test-suite/misc/deps/\316\261\316\262/\316\265\316\266.v"
new file mode 100644
index 000000000..e7fd25c0d
--- /dev/null
+++ "b/test-suite/misc/deps/\316\261\316\262/\316\265\316\266.v"
@@ -0,0 +1 @@
+Require Import γδ.
-- 
cgit v1.2.3


From a47a249e8c31dece9b816bc49d6c5a2aa50fb21b Mon Sep 17 00:00:00 2001
From: Hugo Herbelin <Hugo.Herbelin@inria.fr>
Date: Tue, 12 Sep 2017 20:06:35 +0200
Subject: Reference manual: A few words about the file system constraints for
 -Q/-R.

---
 doc/refman/RefMan-com.tex | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/doc/refman/RefMan-com.tex b/doc/refman/RefMan-com.tex
index 45230fb6e..036fc9368 100644
--- a/doc/refman/RefMan-com.tex
+++ b/doc/refman/RefMan-com.tex
@@ -106,6 +106,15 @@ The following command-line options are recognized by the commands {\tt
   recursively available from {\Coq} using absolute names (extending
   the {\dirpath} prefix) (see Section~\ref{LongNames}).
 
+  Note that only those subdirectories and files which obey the lexical
+  conventions of what is an {\ident} (see Section~\ref{lexical})
+  are taken into account. Conversely, the underlying file systems or
+  operating systems may be more restrictive than {\Coq}. While Linux's
+  ext4 file system supports any {\Coq} recursive layout
+  (within the limit of 255 bytes per file name), the default on NTFS
+  (Windows) or HFS+ (MacOS X) file systems is on the contrary to
+  disallow two files differing only in the case in the same directory.
+
   \SeeAlso Section~\ref{Libraries}.
 
 \item[{\tt -R} {\em directory} {\dirpath}]\ %
-- 
cgit v1.2.3