aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Maxime Dénès <mail@maximedenes.fr>2017-10-05 18:49:12 +0200
committerGravatar Maxime Dénès <mail@maximedenes.fr>2017-10-05 18:49:12 +0200
commit73620255a7a62703a89026336f97a4bc8a913afd (patch)
tree0b28d62d27b3fe6e94a07845ca8900cc9b3c9a71
parentf78a39499ef210456dd96ecb9896874c9ce300ae (diff)
parenta47a249e8c31dece9b816bc49d6c5a2aa50fb21b (diff)
Merge PR #1041: Miscellaneous fixes about UTF-8 (including a fix to BZ#5715 to escape non-UTF-8 file names)
-rw-r--r--Makefile.build2
-rw-r--r--doc/refman/RefMan-com.tex9
-rw-r--r--lib/system.ml2
-rw-r--r--lib/unicode.ml43
-rw-r--r--lib/unicode.mli3
-rw-r--r--library/library.ml20
-rwxr-xr-xtest-suite/misc/deps-utf8.sh17
-rw-r--r--test-suite/misc/deps/αβ/γδ.v4
-rw-r--r--test-suite/misc/deps/αβ/εζ.v1
-rw-r--r--tools/coqdep_lexer.mll42
-rw-r--r--vernac/mltop.ml1
11 files changed, 94 insertions, 50 deletions
diff --git a/Makefile.build b/Makefile.build
index 10d314f0d..ecaaccaaf 100644
--- a/Makefile.build
+++ b/Makefile.build
@@ -429,6 +429,7 @@ tools: $(TOOLS) $(OCAMLLIBDEP) $(COQDEPBOOT)
# is being built.
COQDEPBOOTSRC := lib/minisys.cmo \
+ lib/segmenttree.cmo lib/unicodetable.cmo lib/unicode.cmo \
tools/coqdep_lexer.cmo tools/coqdep_common.cmo tools/coqdep_boot.cmo
tools/coqdep_lexer.cmo : tools/coqdep_lexer.cmi
@@ -449,6 +450,7 @@ $(OCAMLLIBDEP): $(call bestobj, tools/ocamllibdep.cmo)
# The full coqdep (unused by this build, but distributed by make install)
COQDEPCMO:=lib/clib.cma lib/cErrors.cmo lib/cWarnings.cmo lib/minisys.cmo \
+ lib/segmenttree.cmo lib/unicodetable.cmo lib/unicode.cmo \
lib/system.cmo tools/coqdep_lexer.cmo tools/coqdep_common.cmo \
tools/coqdep.cmo
diff --git a/doc/refman/RefMan-com.tex b/doc/refman/RefMan-com.tex
index 9790111f1..892c9931b 100644
--- a/doc/refman/RefMan-com.tex
+++ b/doc/refman/RefMan-com.tex
@@ -107,6 +107,15 @@ The following command-line options are recognized by the commands {\tt
recursively available from {\Coq} using absolute names (extending
the {\dirpath} prefix) (see Section~\ref{LongNames}).
+ Note that only those subdirectories and files which obey the lexical
+ conventions of what is an {\ident} (see Section~\ref{lexical})
+ are taken into account. Conversely, the underlying file systems or
+ operating systems may be more restrictive than {\Coq}. While Linux's
+ ext4 file system supports any {\Coq} recursive layout
+ (within the limit of 255 bytes per file name), the default on NTFS
+ (Windows) or HFS+ (MacOS X) file systems is on the contrary to
+ disallow two files differing only in the case in the same directory.
+
\SeeAlso Section~\ref{Libraries}.
\item[{\tt -R} {\em directory} {\dirpath}]\ %
diff --git a/lib/system.ml b/lib/system.ml
index 12eacf2ea..0b64b237d 100644
--- a/lib/system.ml
+++ b/lib/system.ml
@@ -52,7 +52,7 @@ let dirmap = ref StrMap.empty
let make_dir_table dir =
let filter_dotfiles s f = if f.[0] = '.' then s else StrSet.add f s in
- Array.fold_left filter_dotfiles StrSet.empty (readdir dir)
+ Array.fold_left filter_dotfiles StrSet.empty (Sys.readdir dir)
let exists_in_dir_respecting_case dir bf =
let cache_dir dir =
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 959ccaf73..8eb2eb45d 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -163,6 +163,39 @@ let is_utf8 s =
in
try check 0 with End_of_input -> true | Invalid_argument _ -> false
+(* Escape string if it contains non-utf8 characters *)
+
+let escaped_non_utf8 s =
+ let mk_escape x = Printf.sprintf "%%%X" x in
+ let buff = Buffer.create (String.length s * 3) in
+ let rec process_trailing_aux i j =
+ if i = j then i else
+ match String.unsafe_get s i with
+ | '\128'..'\191' -> process_trailing_aux (i+1) j
+ | _ -> i in
+ let process_trailing i n =
+ let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in
+ (if j = i+n then
+ Buffer.add_string buff (String.sub s i n)
+ else
+ let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in
+ Buffer.add_string buff (String.concat "" (Array.to_list v)));
+ j in
+ let rec process i =
+ if i >= String.length s then Buffer.contents buff else
+ let c = String.unsafe_get s i in
+ match c with
+ | '\000'..'\127' -> Buffer.add_char buff c; process (i+1)
+ | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1)
+ | '\192'..'\223' -> process (process_trailing i 2)
+ | '\224'..'\239' -> process (process_trailing i 3)
+ | '\240'..'\247' -> process (process_trailing i 4)
+ in
+ process 0
+
+let escaped_if_non_utf8 s =
+ if is_utf8 s then s else escaped_non_utf8 s
+
(* Check the well-formedness of an identifier *)
let initial_refutation j n s =
@@ -198,7 +231,7 @@ let ident_refutation s =
|x -> x
with
| End_of_input -> Some (true,"The empty string is not an identifier.")
- | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
+ | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.")
let lowercase_unicode =
let tree = Segmenttree.make Unicodetable.to_lower in
@@ -268,9 +301,7 @@ let utf8_length s =
| '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
| '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
| '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
- | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
- | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
- | '\254'..'\255' -> nc := 0 (* invalid byte *)
+ | '\248'..'\255' -> nc := 0 (* invalid byte *)
end ;
incr p ;
while !p < len && !nc > 0 do
@@ -299,9 +330,7 @@ let utf8_sub s start_u len_u =
| '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
| '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
| '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
- | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
- | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
- | '\254'..'\255' -> nc := 0 (* invalid byte *)
+ | '\248'..'\255' -> nc := 0 (* invalid byte *)
end ;
incr p ;
while !p < len_b && !nc > 0 do
diff --git a/lib/unicode.mli b/lib/unicode.mli
index c7d742480..b8e7c33ad 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -40,3 +40,6 @@ val utf8_length : string -> int
(** Variant of {!String.sub} for UTF-8 strings. *)
val utf8_sub : string -> int -> int -> string
+
+(** Return a "%XX"-escaped string if it contains non UTF-8 characters. *)
+val escaped_if_non_utf8 : string -> string
diff --git a/library/library.ml b/library/library.ml
index 28afa054e..1da2c591d 100644
--- a/library/library.ml
+++ b/library/library.ml
@@ -620,25 +620,6 @@ let check_coq_overwriting p id =
(str "Cannot build module " ++ pr_dirpath p ++ str "." ++ pr_id id ++ str "." ++ spc () ++
str "it starts with prefix \"Coq\" which is reserved for the Coq library.")
-(* Verifies that a string starts by a letter and do not contain
- others caracters than letters, digits, or `_` *)
-
-let check_module_name s =
- let msg c =
- strbrk "Invalid module name: " ++ str s ++ strbrk " character " ++
- (if c = '\'' then str "\"'\"" else (str "'" ++ str (String.make 1 c) ++ str "'")) ++
- strbrk " is not allowed in module names\n"
- in
- let err c = user_err (msg c) in
- match String.get s 0 with
- | 'a' .. 'z' | 'A' .. 'Z' ->
- for i = 1 to (String.length s)-1 do
- match String.get s i with
- | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> ()
- | c -> err c
- done
- | c -> err c
-
let start_library fo =
let ldir0 =
try
@@ -648,7 +629,6 @@ let start_library fo =
in
let file = Filename.chop_extension (Filename.basename fo) in
let id = Id.of_string file in
- check_module_name file;
check_coq_overwriting ldir0 id;
let ldir = add_dirpath_suffix ldir0 id in
Declaremods.start_library ldir;
diff --git a/test-suite/misc/deps-utf8.sh b/test-suite/misc/deps-utf8.sh
new file mode 100755
index 000000000..13e264c09
--- /dev/null
+++ b/test-suite/misc/deps-utf8.sh
@@ -0,0 +1,17 @@
+# Check reading directories matching non pure ascii idents
+# See bug #5715 (utf-8 working on macos X and linux)
+# Windows is still not compliant
+a=`uname`
+if [ "$a" = "Darwin" -o "$a" = "Linux" ]; then
+rm -f misc/deps/théorèmes/*.v
+tmpoutput=`mktemp /tmp/coqcheck.XXXXXX`
+$coqc -R misc/deps AlphaBêta misc/deps/αβ/γδ.v
+R=$?
+$coqtop -R misc/deps AlphaBêta -load-vernac-source misc/deps/αβ/εζ.v
+S=$?
+if [ $R = 0 -a $S = 0 ]; then
+ exit 0
+else
+ exit 1
+fi
+fi
diff --git a/test-suite/misc/deps/αβ/γδ.v b/test-suite/misc/deps/αβ/γδ.v
new file mode 100644
index 000000000..f43a2d657
--- /dev/null
+++ b/test-suite/misc/deps/αβ/γδ.v
@@ -0,0 +1,4 @@
+Theorem simple : forall A, A -> A.
+Proof.
+auto.
+Qed.
diff --git a/test-suite/misc/deps/αβ/εζ.v b/test-suite/misc/deps/αβ/εζ.v
new file mode 100644
index 000000000..e7fd25c0d
--- /dev/null
+++ b/test-suite/misc/deps/αβ/εζ.v
@@ -0,0 +1 @@
+Require Import γδ.
diff --git a/tools/coqdep_lexer.mll b/tools/coqdep_lexer.mll
index 8eeb59898..564e20d0e 100644
--- a/tools/coqdep_lexer.mll
+++ b/tools/coqdep_lexer.mll
@@ -25,8 +25,6 @@
exception Fin_fichier
exception Syntax_error of int*int
- let field_name s = String.sub s 1 (String.length s - 1)
-
let unquote_string s =
String.sub s 1 (String.length s - 2)
@@ -40,6 +38,18 @@
let syntax_error lexbuf =
raise (Syntax_error (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf))
+ let check_valid lexbuf s =
+ match Unicode.ident_refutation s with
+ | None -> s
+ | Some _ -> syntax_error lexbuf
+
+ let get_ident lexbuf =
+ let s = Lexing.lexeme lexbuf in check_valid lexbuf s
+
+ let get_field_name lexbuf =
+ let s = Lexing.lexeme lexbuf in
+ check_valid lexbuf (String.sub s 1 (String.length s - 1))
+
[@@@ocaml.warning "-3"] (* String.uncapitalize_ascii since 4.03.0 GPR#124 *)
let uncapitalize = String.uncapitalize
[@@@ocaml.warning "+3"]
@@ -52,20 +62,8 @@ let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
let caml_up_ident = uppercase identchar*
let caml_low_ident = lowercase identchar*
-let coq_firstchar =
- (* This is only an approximation, refer to lib/util.ml for correct def *)
- ['A'-'Z' 'a'-'z' '_'] |
- (* superscript 1 *)
- '\194' '\185' |
- (* utf-8 latin 1 supplement *)
- '\195' ['\128'-'\150'] | '\195' ['\152'-'\182'] | '\195' ['\184'-'\191'] |
- (* utf-8 letters *)
- '\206' (['\145'-'\161'] | ['\163'-'\187'])
- '\226' ('\130' [ '\128'-'\137' ] (* subscripts *)
- | '\129' [ '\176'-'\187' ] (* superscripts *)
- | '\132' ['\128'-'\191'] | '\133' ['\128'-'\143'])
-let coq_identchar = coq_firstchar | ['\'' '0'-'9']
-let coq_ident = coq_firstchar coq_identchar*
+(* This is an overapproximation, we check correctness afterwards *)
+let coq_ident = ['A'-'Z' 'a'-'z' '_' '\128'-'\255'] ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9' '\128'-'\255']*
let coq_field = '.' coq_ident
let dot = '.' ( space+ | eof)
@@ -102,7 +100,7 @@ and from_rule = parse
| space+
{ from_rule lexbuf }
| coq_ident
- { let from = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+ { let from = coq_qual_id_tail [get_ident lexbuf] lexbuf in
consume_require (Some from) lexbuf }
| eof
{ syntax_error lexbuf }
@@ -241,7 +239,7 @@ and load_file = parse
parse_dot lexbuf;
Load (unquote_vfile_string s) }
| coq_ident
- { let s = lexeme lexbuf in skip_to_dot lexbuf; Load s }
+ { let s = get_ident lexbuf in skip_to_dot lexbuf; Load s }
| eof
{ syntax_error lexbuf }
| _
@@ -253,7 +251,7 @@ and require_file from = parse
| space+
{ require_file from lexbuf }
| coq_ident
- { let name = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+ { let name = coq_qual_id_tail [get_ident lexbuf] lexbuf in
let qid = coq_qual_id_list [name] lexbuf in
parse_dot lexbuf;
Require (from, qid) }
@@ -278,7 +276,7 @@ and coq_qual_id = parse
| space+
{ coq_qual_id lexbuf }
| coq_ident
- { coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf }
+ { coq_qual_id_tail [get_ident lexbuf] lexbuf }
| _
{ syntax_error lexbuf }
@@ -288,7 +286,7 @@ and coq_qual_id_tail module_name = parse
| space+
{ coq_qual_id_tail module_name lexbuf }
| coq_field
- { coq_qual_id_tail (field_name (Lexing.lexeme lexbuf) :: module_name) lexbuf }
+ { coq_qual_id_tail (get_field_name lexbuf :: module_name) lexbuf }
| eof
{ syntax_error lexbuf }
| _
@@ -301,7 +299,7 @@ and coq_qual_id_list module_names = parse
| space+
{ coq_qual_id_list module_names lexbuf }
| coq_ident
- { let name = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in
+ { let name = coq_qual_id_tail [get_ident lexbuf] lexbuf in
coq_qual_id_list (name :: module_names) lexbuf
}
| eof
diff --git a/vernac/mltop.ml b/vernac/mltop.ml
index 1edbd1a85..d3de10235 100644
--- a/vernac/mltop.ml
+++ b/vernac/mltop.ml
@@ -174,6 +174,7 @@ let warn_cannot_use_directory =
let convert_string d =
try Names.Id.of_string d
with UserError _ ->
+ let d = Unicode.escaped_if_non_utf8 d in
warn_cannot_use_directory d;
raise Exit