author    herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7>  2010-04-06 18:55:36 +0000
committer herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7>  2010-04-06 18:55:36 +0000
commit    7672bf2bdb941fd487e6c4a0e7deff1143bff680 (patch)
tree      68d2aee88883fb880b3d83a4edf60ad2626f939f
parent    3ace40d1b3230f34e7d6a1961eb1fc1f972f57c9 (diff)
New model for user-driven translation of tokens in coqdoc
In the initial model (prior to r11432), coqdoc parsed letters and symbolic characters separately and was therefore unable to translate tokens mixing letters and symbolic characters, such as "\in" or "=_h". Revision 11432 extended the definition of translatable tokens to allow letters in them, with the benefit of supporting "\in" or "=_h", but it added the constraint that tokens be correctly separated by spaces: an expression such as "x : nat" would otherwise be split into "x" and ":nat", leading to a failure to recognize "nat" as a proper reference.

The new model gives up defining a lexical category of tokens and instead uses a dynamically extensible sublexer similar to the one used in the Coq lexer. The new model works even if tokens are not separated by spaces in the source file, and it thus solves problems such as the one mentioned in bug #2273 (failure to link C in "`(!C)"). However, it imposes a stronger discipline on the lexing rules in cpretty.mll, because the characters that may eventually contribute to the application of a printing rule are buffered in the sublexer, and no output is allowed to occur until the buffer of the sublexer is flushed. The theoretical overhead due to the intermediate use of buffers is apparently less than 5%. More details on the token cutting discipline can be found in the new file tokens.mli.

Incidentally, this fixes two small problems with notation links in LaTeX: the escaping of characters in LaTeX labels is made more robust, so that different notations are less likely to get the same label name, and highlighting only the first '"' of a notation definition is avoided (for lack of a better highlighting strategy for now).

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@12905 85f007b7-540e-0410-9357-904b9bb8a0f7
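As a minimal sketch (outside the patch itself), the dictionary side of the new model can be pictured as follows, using only the functions declared in the new tokens.mli below: a printing rule is registered in the token tree and later looked up as a whole token, with no requirement that it be surrounded by spaces in the source.

(* Illustrative sketch only -- not part of the patch.  It exercises the
   dictionary API added in tools/coqdoc/tokens.ml below. *)
let () =
  (* Register one printing rule for the mixed letter/symbol token "=_h". *)
  Tokens.token_tree :=
    Tokens.ttree_add Tokens.empty_ttree "=_h" "\\ensuremath{=_{h}}";
  (* [Tokens.translate] looks up a whole string in the current dictionary. *)
  match Tokens.translate "=_h" with
  | Some tex -> print_endline tex            (* prints \ensuremath{=_{h}} *)
  | None -> print_endline "no printing rule registered"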
-rw-r--r--  Makefile.common             |   2
-rw-r--r--  test-suite/coqdoc/links.v   |   2
-rw-r--r--  tools/coqdoc/cpretty.mll    | 160
-rw-r--r--  tools/coqdoc/main.ml        |   1
-rw-r--r--  tools/coqdoc/output.ml      | 349
-rw-r--r--  tools/coqdoc/output.mli     |   3
-rw-r--r--  tools/coqdoc/tokens.ml      | 171
-rw-r--r--  tools/coqdoc/tokens.mli     |  78
8 files changed, 540 insertions, 226 deletions
diff --git a/Makefile.common b/Makefile.common
index c4169009b..e3a94f56d 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -249,7 +249,7 @@ DEBUGPRINTERS:=dev/top_printers.cmo dev/vm_printers.cmo dev/printers.cma
COQDEPCMO:=$(COQENVCMO) tools/coqdep_lexer.cmo tools/coqdep_common.cmo tools/coqdep.cmo
COQDOCCMO:=$(CONFIG) $(addprefix tools/coqdoc/, \
- cdglobals.cmo alpha.cmo index.cmo output.cmo cpretty.cmo main.cmo )
+ cdglobals.cmo alpha.cmo index.cmo tokens.cmo output.cmo cpretty.cmo main.cmo )
###########################################################################
# vo files
diff --git a/test-suite/coqdoc/links.v b/test-suite/coqdoc/links.v
index 16028fe17..581029bdd 100644
--- a/test-suite/coqdoc/links.v
+++ b/test-suite/coqdoc/links.v
@@ -3,7 +3,7 @@
- symbols should not be inlined in string g
- links to both kinds of notations in a' should work to the right notation
- with utf8 option, forall must be unicode
-- spliting between symbols and ident should be correct in a' and c
+- splitting between symbols and ident should be correct in a' and c
- ".." should be rendered correctly
*)
diff --git a/tools/coqdoc/cpretty.mll b/tools/coqdoc/cpretty.mll
index af8bac2c1..b70053173 100644
--- a/tools/coqdoc/cpretty.mll
+++ b/tools/coqdoc/cpretty.mll
@@ -18,7 +18,7 @@
(* A function that emulates Lexing.new_line (which does not exist in OCaml < 3.11.0) *)
let new_line lexbuf =
let pos = lexbuf.lex_curr_p in
- lexbuf.lex_curr_p <- { pos with
+ lexbuf.lex_curr_p <- { pos with
pos_lnum = pos.pos_lnum + 1;
pos_bol = pos.pos_cnum }
@@ -40,6 +40,12 @@
in
count 0 0
+ let remove_newline s =
+ let n = String.length s in
+ let rec count i = if i == n || s.[i] <> '\n' then i else count (i + 1) in
+ let i = count 0 in
+ i, String.sub s i (n - i)
+
let count_dashes s =
let c = ref 0 in
for i = 0 to String.length s - 1 do if s.[i] = '-' then incr c done;
@@ -76,7 +82,7 @@
let start_emph () = in_emph := true; Output.start_emph ()
let stop_emph () = if !in_emph then (Output.stop_emph (); in_emph := false)
-
+
let backtrack lexbuf = lexbuf.lex_curr_pos <- lexbuf.lex_start_pos;
lexbuf.lex_curr_p <- lexbuf.lex_start_p
@@ -247,8 +253,6 @@
else
String.sub s 1 (String.length s - 3)
- let symbol lexbuf s = Output.symbol s (lexeme_start lexbuf)
-
let output_indented_keyword s lexbuf =
let nbsp,isp = count_spaces s in
Output.indentation nbsp;
@@ -269,7 +273,7 @@ let firstchar =
'\194' '\185' |
(* utf-8 latin 1 supplement *)
'\195' ['\128'-'\191'] |
- (* utf-8 letterlike symbols *)
+ (* utf-8 letterlike symbols *)
(* '\206' ([ '\145' - '\183'] | '\187') | *)
(* '\xCF' [ '\x00' - '\xCE' ] | *)
(* utf-8 letterlike symbols *)
@@ -284,37 +288,12 @@ let pfx_id = (id '.')*
let identifier =
id | pfx_id id
-let utf8_multibyte =
- [ '\xC0'-'\xDF' ] _
- | [ '\xE0'-'\xEF' ] _ _
- | [ '\xF0'-'\xF7' ] _ _ _
-
(* This misses unicode stuff, and it adds "[" and "]". It's only an
approximation of idents - used for detecting whether an underscore
is part of an identifier or meant to indicate emphasis *)
-let nonidentchar =
- [^ 'A'-'Z' 'a'-'z' '_' '[' ']'
- (* iso 8859-1 accents *)
- '\192'-'\214' '\216'-'\246' '\248'-'\255'
- '\'' '0'-'9' '@' ]
-
-let symbolchar_symbol_no_brackets =
- ['!' '$' '%' '&' '*' '+' ',' '^' '#'
- '\\' '/' '-' '<' '>' '|' ':' '?' '=' '~' ] |
- (* utf-8 symbols *)
- '\226' ['\134'-'\143' '\152'-'\155' '\164'-'\165' '\168'-'\171'] _
-let symbolchar_no_brackets = symbolchar_symbol_no_brackets |
- [ '@' '{' '}' '(' ')' 'A'-'Z' 'a'-'z' '_']
-let symbolchar = symbolchar_no_brackets | '[' | ']'
-let token_no_brackets = symbolchar_symbol_no_brackets symbolchar_no_brackets*
-let token = symbolchar_symbol_no_brackets symbolchar* | '[' [^ '[' ']' ':']* ']'
-let printing_token = (token | id)+
-
-(* tokens with balanced brackets *)
-let token_brackets =
- ( token_no_brackets ('[' token_no_brackets? ']')*
- | token_no_brackets? ('[' token_no_brackets? ']')+ )
- token_no_brackets?
+let nonidentchar = [^ 'A'-'Z' 'a'-'z' '_' '[' ']' '\'' '0'-'9' '@' ]
+
+let printing_token = [^ ' ' '\t']*
let thm_token =
"Theorem"
@@ -454,8 +433,8 @@ let end_verb = "(*" space* "end" space+ "verb" space* "*)"
rule coq_bol = parse
| space* nl+
- { if not (!in_proof <> None && (!Cdglobals.gallina || !Cdglobals.light))
- then Output.empty_line_of_code ();
+ { if not (!in_proof <> None && (!Cdglobals.gallina || !Cdglobals.light))
+ then Output.empty_line_of_code ();
coq_bol lexbuf }
| space* "(**" space_nl
{ Output.end_coq (); Output.start_doc ();
@@ -538,7 +517,7 @@ rule coq_bol = parse
ignore (comment lexbuf);
coq_bol lexbuf }
| space* "(**" space+ "remove" space+ "printing" space+
- (identifier | token) space* "*)"
+ printing_token space* "*)"
{ remove_printing_token (lexeme lexbuf);
coq_bol lexbuf }
| space* "(**" space+ "remove" space+ "printing" space+
@@ -591,7 +570,18 @@ and coq = parse
else coq lexbuf
}
| nl+ space* "]]"
- { if not !formatted then begin symbol lexbuf (lexeme lexbuf); coq lexbuf end }
+ { if not !formatted then
+ begin
+ (* Isn't this an anomaly *)
+ let s = lexeme lexbuf in
+ let nlsp,s = remove_newline s in
+ let nbsp,isp = count_spaces s in
+ Output.indentation nbsp;
+ let loc = lexeme_start lexbuf + isp + nlsp in
+ Output.sublexer ']' loc;
+ Output.sublexer ']' (loc+1);
+ coq lexbuf
+ end }
| eof
{ () }
| gallina_kw_to_hide
@@ -891,22 +881,27 @@ and verbatim = parse
and escaped_coq = parse
| "]"
{ decr brackets;
- if !brackets > 0 then begin Output.char ']'; escaped_coq lexbuf end }
+ if !brackets > 0 then
+ (Output.sublexer ']' (lexeme_start lexbuf); escaped_coq lexbuf)
+ else Tokens.flush_sublexer () }
| "["
- { incr brackets; Output.char '['; escaped_coq lexbuf }
+ { incr brackets;
+ Output.sublexer '[' (lexeme_start lexbuf); escaped_coq lexbuf }
| "(*"
- { comment_level := 1; ignore (comment lexbuf); escaped_coq lexbuf }
+ { Tokens.flush_sublexer (); comment_level := 1;
+ ignore (comment lexbuf); escaped_coq lexbuf }
| "*)"
{ (* likely to be a syntax error: we escape *) backtrack lexbuf }
| eof
- { () }
- | token_brackets
- { let s = lexeme lexbuf in
- symbol lexbuf s; escaped_coq lexbuf }
+ { Tokens.flush_sublexer () }
| (identifier '.')* identifier
{ Output.ident (lexeme lexbuf) (lexeme_start lexbuf); escaped_coq lexbuf }
+ | space
+ { Tokens.flush_sublexer(); Output.char (lexeme_char lexbuf 0);
+ escaped_coq lexbuf }
| _
- { Output.char (lexeme_char lexbuf 0); escaped_coq lexbuf }
+ { Output.sublexer (lexeme_char lexbuf 0) (lexeme_start lexbuf);
+ escaped_coq lexbuf }
(*s Coq "Comments" command. *)
@@ -1004,11 +999,18 @@ and body_bol = parse
| _ { backtrack lexbuf; Output.indentation 0; body lexbuf }
and body = parse
- | nl {Output.line_break(); new_line lexbuf; body_bol lexbuf}
+ | nl {Tokens.flush_sublexer(); Output.line_break(); new_line lexbuf; body_bol lexbuf}
| nl+ space* "]]" space* nl
- { if not !formatted then
+ { Tokens.flush_sublexer();
+ if not !formatted then
begin
- symbol lexbuf (lexeme lexbuf);
+ let s = lexeme lexbuf in
+ let nlsp,s = remove_newline s in
+ let _,isp = count_spaces s in
+ let loc = lexeme_start lexbuf + nlsp + isp in
+ Output.sublexer ']' loc;
+ Output.sublexer ']' (loc+1);
+ Tokens.flush_sublexer();
body lexbuf
end
else
@@ -1017,9 +1019,14 @@ and body = parse
true
end }
| "]]" space* nl
- { if not !formatted then
+ { Tokens.flush_sublexer();
+ if not !formatted then
begin
- symbol lexbuf (lexeme lexbuf);
+ let loc = lexeme_start lexbuf in
+ Output.sublexer ']' loc;
+ Output.sublexer ']' (loc+1);
+ Tokens.flush_sublexer();
+ Output.line_break();
body lexbuf
end
else
@@ -1027,12 +1034,12 @@ and body = parse
Output.paragraph ();
true
end }
- | eof { false }
+ | eof { Tokens.flush_sublexer(); false }
| '.' space* nl | '.' space* eof
- { Output.char '.'; Output.line_break();
+ { Tokens.flush_sublexer(); Output.char '.'; Output.line_break();
if not !formatted then true else body_bol lexbuf }
| '.' space* nl "]]" space* nl
- { Output.char '.';
+ { Tokens.flush_sublexer(); Output.char '.';
if not !formatted then
begin
eprintf "Error: stray ]] at %d\n" (lexeme_start lexbuf);
@@ -1045,50 +1052,46 @@ and body = parse
true
end
}
- | '.' space+ { Output.char '.'; Output.char ' ';
+ | '.' space+
+ { Tokens.flush_sublexer(); Output.char '.'; Output.char ' ';
if not !formatted then false else body lexbuf }
| "(**" space_nl
- { Output.end_coq (); Output.start_doc ();
+ { Tokens.flush_sublexer(); Output.end_coq (); Output.start_doc ();
let eol = doc_bol lexbuf in
Output.end_doc (); Output.start_coq ();
if eol then body_bol lexbuf else body lexbuf }
- | "(*" { comment_level := 1;
+ | "(*" { Tokens.flush_sublexer(); comment_level := 1;
if !Cdglobals.parse_comments then Output.start_comment ();
let eol = comment lexbuf in
if eol
then begin if not !Cdglobals.parse_comments then Output.line_break(); body_bol lexbuf end
else body lexbuf }
| "where" space*
- { let s = lexeme lexbuf in
- Output.ident s (lexeme_start lexbuf);
+ { Tokens.flush_sublexer();
+ Output.ident (lexeme lexbuf) (lexeme_start lexbuf);
start_notation_string lexbuf }
| identifier
- { let s = lexeme lexbuf in
- Output.ident s (lexeme_start lexbuf);
- body lexbuf }
- | token_no_brackets
- { let s = lexeme lexbuf in
- symbol lexbuf s; body lexbuf }
+ { Tokens.flush_sublexer();
+ Output.ident (lexeme lexbuf) (lexeme_start lexbuf);
+ body lexbuf }
| ".."
- { Output.char '.'; Output.char '.';
+ { Tokens.flush_sublexer(); Output.char '.'; Output.char '.';
body lexbuf }
- | '"'
- { Output.char '"';
+ | '"'
+ { Tokens.flush_sublexer(); Output.char '"';
string lexbuf;
body lexbuf }
-
- | utf8_multibyte
- { let c = lexeme lexbuf in
- symbol lexbuf c;
+ | space
+ { Tokens.flush_sublexer(); Output.char (lexeme_char lexbuf 0);
body lexbuf }
| _ { let c = lexeme_char lexbuf 0 in
- Output.char c;
- body lexbuf }
+ Output.sublexer c (lexeme_start lexbuf);
+ body lexbuf }
and start_notation_string = parse
| '"' (* a true notation *)
- { symbol lexbuf "\"";
+ { Output.sublexer '"' (lexeme_start lexbuf);
notation_string lexbuf;
body lexbuf }
| _ (* an abbreviation *)
@@ -1099,13 +1102,9 @@ and notation_string = parse
{ Output.char '"'; Output.char '"'; (* Unlikely! *)
notation_string lexbuf }
| '"'
- { Output.char '"' }
- | token
- { let s = lexeme lexbuf in
- symbol lexbuf s;
- notation_string lexbuf }
+ { Tokens.flush_sublexer(); Output.char '"' }
| _ { let c = lexeme_char lexbuf 0 in
- Output.char c;
+ Output.sublexer c (lexeme_start lexbuf);
notation_string lexbuf }
and string = parse
@@ -1164,6 +1163,7 @@ and st_subtitle = parse
let c = open_in f in
let lb = from_channel c in
(Index.current_library := m;
+ Output.initialize ();
Output.start_module ();
Output.start_coq (); coq_bol lb; Output.end_coq ();
close_in c)
diff --git a/tools/coqdoc/main.ml b/tools/coqdoc/main.ml
index b52c28ff2..67c63865a 100644
--- a/tools/coqdoc/main.ml
+++ b/tools/coqdoc/main.ml
@@ -359,7 +359,6 @@ let parse () =
add_file (what_file f); parse_rec rem
in
parse_rec (List.tl (Array.to_list Sys.argv));
- Output.initialize ();
List.rev !files
diff --git a/tools/coqdoc/output.ml b/tools/coqdoc/output.ml
index 168a28f20..0f36b775c 100644
--- a/tools/coqdoc/output.ml
+++ b/tools/coqdoc/output.ml
@@ -96,22 +96,32 @@ let set_module m sub = current_module := (m,sub);
let item_level = ref 0
let in_doc = ref false
-(*s Customized pretty-print *)
-
-let token_pp = Hashtbl.create 97
-
-let add_printing_token = Hashtbl.replace token_pp
-
-let find_printing_token tok =
- try Hashtbl.find token_pp tok with Not_found -> None, None
-
-let remove_printing_token = Hashtbl.remove token_pp
-
-(* predefined pretty-prints *)
-let initialize () =
+(*s Customized and predefined pretty-print *)
+
+let initialize_texmacs () =
+ let ensuremath x = sprintf "<with|mode|math|\\<%s\\>>" x in
+ List.fold_right (fun (s,t) tt -> Tokens.ttree_add tt s t)
+ [ "*", ensuremath "times";
+ "->", ensuremath "rightarrow";
+ "<-", ensuremath "leftarrow";
+ "<->", ensuremath "leftrightarrow";
+ "=>", ensuremath "Rightarrow";
+ "<=", ensuremath "le";
+ ">=", ensuremath "ge";
+ "<>", ensuremath "noteq";
+ "~", ensuremath "lnot";
+ "/\\", ensuremath "land";
+ "\\/", ensuremath "lor";
+ "|-", ensuremath "vdash"
+ ] Tokens.empty_ttree
+
+let token_tree_texmacs = lazy (initialize_texmacs ())
+
+let initialize_tex_html () =
let if_utf8 = if !Cdglobals.utf8 then fun x -> Some x else fun _ -> None in
- List.iter
- (fun (s,l,l') -> Hashtbl.add token_pp s (Some l, l'))
+ List.fold_right (fun (s,l,l') (tt,tt') ->
+ (Tokens.ttree_add tt s l,
+ match l' with None -> tt' | Some l' -> Tokens.ttree_add tt' s l'))
[ "*" , "\\ensuremath{\\times}", if_utf8 "×";
"|", "\\ensuremath{|}", None;
"->", "\\ensuremath{\\rightarrow}", if_utf8 "→";
@@ -132,7 +142,20 @@ let initialize () =
"Π", "\\ensuremath{\\Pi}", if_utf8 "Π";
"λ", "\\ensuremath{\\lambda}", if_utf8 "λ";
(* "fun", "\\ensuremath{\\lambda}" ? *)
- ]
+ ] (Tokens.empty_ttree,Tokens.empty_ttree)
+
+let token_tree_latex = lazy (fst (initialize_tex_html ()))
+let token_tree_html = lazy (snd (initialize_tex_html ()))
+
+let add_printing_token s (t1,t2) =
+ match
+ (match !Cdglobals.target_language with LaTeX -> t1 | HTML -> t2 | _ -> None)
+ with
+ | None -> ()
+ | Some t -> Tokens.token_tree := Tokens.ttree_add !Tokens.token_tree s t
+
+let remove_printing_token s =
+ Tokens.token_tree := Tokens.ttree_remove !Tokens.token_tree s
(*s Table of contents *)
@@ -187,6 +210,8 @@ module Latex = struct
printf "\\end{document}\n"
end
+ (*s Latex low-level translation *)
+
let nbsp () = output_char '~'
let char c = match c with
@@ -200,10 +225,13 @@ module Latex = struct
output_char c
let label_char c = match c with
- | '\\' | '$' | '#' | '%' | '&' | '{' | '}' | '_'
- | '^' | '~' -> ()
- | _ ->
- output_char c
+ | '_' -> output_char ' '
+ | '\\' | '$' | '#' | '%' | '&' | '{' | '}'
+ | '^' | '~' -> printf "x%X" (Char.code c)
+ | _ -> if c >= '\x80' then printf "x%X" (Char.code c) else output_char c
+
+ let label_ident s =
+ for i = 0 to String.length s - 1 do label_char s.[i] done
let latex_char = output_char
let latex_string = output_string
@@ -211,11 +239,27 @@ module Latex = struct
let html_char _ = ()
let html_string _ = ()
- let raw_ident s =
- for i = 0 to String.length s - 1 do char s.[i] done
-
- let label_ident s =
- for i = 0 to String.length s - 1 do label_char s.[i] done
+ (*s Latex char escaping *)
+
+ let escaped =
+ let buff = Buffer.create 5 in
+ fun s ->
+ Buffer.clear buff;
+ for i = 0 to String.length s - 1 do
+ match s.[i] with
+ | '\\' ->
+ Buffer.add_string buff "\\symbol{92}"
+ | '$' | '#' | '%' | '&' | '{' | '}' | '_' as c ->
+ Buffer.add_char buff '\\'; Buffer.add_char buff c
+ | '^' | '~' as c ->
+ Buffer.add_char buff '\\'; Buffer.add_char buff c;
+ Buffer.add_string buff "{}"
+ | c ->
+ Buffer.add_char buff c
+ done;
+ Buffer.contents buff
+
+ (*s Latex reference and symbol translation *)
let start_module () =
let ln = !lib_name in
@@ -224,9 +268,7 @@ module Latex = struct
label_ident (get_module false);
printf "}{";
if ln <> "" then printf "%s " ln;
- printf "}{";
- raw_ident (get_module true);
- printf "}\n\n"
+ printf "}{%s}\n\n" (escaped (get_module true))
end
let start_latex_math () = output_char '$'
@@ -244,35 +286,33 @@ module Latex = struct
let space = 0.5 *. (float n) in
printf "\\coqdocindent{%2.2fem}\n" space
- let with_latex_printing f tok =
- try
- (match Hashtbl.find token_pp tok with
- | Some s, _ -> output_string s
- | _ -> f tok)
- with Not_found ->
- f tok
-
let module_ref m s =
- printf "\\moduleid{%s}{" m; raw_ident s; printf "}"
+ printf "\\moduleid{%s}{%s}" m (escaped s)
let ident_ref m fid typ s =
let id = if fid <> "" then (m ^ "." ^ fid) else m in
match find_module m with
- | Local ->
- if typ = Variable then (printf "\\coqdoc%s{" (type_name typ); raw_ident s; printf "}")
+ | Local ->
+ if typ = Variable then
+ printf "\\coqdoc%s{%s}" (type_name typ) s
else
- (printf "\\coqref{"; label_ident id; printf "}{\\coqdoc%s{" (type_name typ);
- raw_ident s; printf "}}")
+ (printf "\\coqref{"; label_ident id;
+ printf "}{\\coqdoc%s{%s}}" (type_name typ) s)
| External m when !externals ->
- printf "\\coqexternalref{"; raw_ident m; printf "}{";
- label_ident fid; printf "}{\\coqdoc%s{" (type_name typ); raw_ident s; printf "}}"
+ printf "\\coqexternalref{"; label_ident fid;
+ printf "}{%s}{\\coqdoc%s{%s}}" (escaped m) (type_name typ) s
| External _ | Unknown ->
- (* printf "\\coqref{"; label_ident id; printf "}{" *)
- printf "\\coqdoc%s{" (type_name typ); raw_ident s; printf "}"
+ printf "\\coqdoc%s{%s}" (type_name typ) s
let defref m id ty s =
- printf "\\coqdef{"; label_ident (m ^ "." ^ id); printf "}{"; raw_ident s; printf "}{";
- printf "\\coqdoc%s{" (type_name ty); raw_ident s; printf "}}"
+ if ty <> Notation then
+ (printf "\\coqdef{"; label_ident (m ^ "." ^ id);
+ printf "}{%s}{\\coqdoc%s{%s}}" s (type_name ty) s)
+ else
+ (* Glob file still not able to say the exact extent of the definition *)
+ (* so we currently renounce to highlight the notation location *)
+ (printf "\\coqdef{"; label_ident (m ^ "." ^ id);
+ printf "}{%s}{%s}" s s)
let reference s = function
| Def (fullid,typ) ->
@@ -282,31 +322,60 @@ module Latex = struct
| Ref (m,fullid,typ) ->
ident_ref m fullid typ s
| Mod _ ->
- printf "\\coqdocvar{"; raw_ident s; printf "}"
+ printf "\\coqdocvar{%s}" (escaped s)
+
+ (*s The sublexer buffers symbol characters and attached
+ uninterpreted ident and try to apply special translation such as,
+ predefined, translation "->" to "\ensuremath{\rightarrow}" or,
+ virtually, a user-level translation from "=_h" to "\ensuremath{=_{h}}" *)
+
+ let output_sublexer_string doescape issymbchar tag s =
+ let s = if doescape then escaped s else s in
+ match tag with
+ | Some ref -> reference s ref
+ | None -> if issymbchar then output_string s else printf "\\coqdocvar{%s}" s
+
+ let sublexer c loc =
+ let tag =
+ try Some (Index.find (get_module false) loc) with Not_found -> None
+ in
+ Tokens.output_tagged_symbol_char tag c
+
+ let initialize () =
+ Tokens.token_tree := Lazy.force token_tree_latex;
+ Tokens.outfun := output_sublexer_string
+
+ (*s Interpreting ident with fallback on sublexer if unknown ident *)
+
+ let translate s =
+ match Tokens.translate s with Some s -> s | None -> escaped s
let ident s loc =
try
- reference s (Index.find (get_module false) loc)
+ let tag = Index.find (get_module false) loc in
+ reference (translate s) tag
with Not_found ->
if is_tactic s then
- (printf "\\coqdoctac{"; raw_ident s; printf "}")
+ printf "\\coqdoctac{%s}" (translate s)
else if is_keyword s then
- (printf "\\coqdockw{"; raw_ident s; printf "}")
+ printf "\\coqdockw{%s}" (translate s)
else if !Cdglobals.interpolate && !in_doc (* always a var otherwise *)
then
- try reference s (Index.find_string (get_module false) s)
- with _ -> (printf "\\coqdocvar{"; raw_ident s; printf "}")
- else (printf "\\coqdocvar{"; raw_ident s; printf "}")
+ try
+ let tag = Index.find_string (get_module false) s in
+ reference (translate s) tag
+ with _ -> Tokens.output_tagged_ident_string s
+ else Tokens.output_tagged_ident_string s
let ident s l =
if !in_title then (
printf "\\texorpdfstring{\\protect";
- with_latex_printing (fun s -> ident s l) s;
- printf "}{"; raw_ident s; printf "}")
+ ident s l;
+ printf "}{%s}" (translate s))
else
- with_latex_printing (fun s -> ident s l) s
+ ident s l
- let symbol s l = with_latex_printing raw_ident s
+ (*s Translating structure *)
let proofbox () = printf "\\ensuremath{\\Box}"
@@ -330,8 +399,6 @@ module Latex = struct
let end_doc () = in_doc := false; stop_item ()
- let comment c = char c
-
(* This is broken if we are in math mode, but coqdoc currently isn't
tracking that *)
let start_emph () = printf "\\textit{"
@@ -460,7 +527,21 @@ module Html = struct
| '&' -> printf "&amp;"
| c -> output_char c
- let raw_ident s = for i = 0 to String.length s - 1 do char s.[i] done
+ let raw_string s =
+ for i = 0 to String.length s - 1 do char s.[i] done
+
+ let escaped =
+ let buff = Buffer.create 5 in
+ fun s ->
+ Buffer.clear buff;
+ for i = 0 to String.length s - 1 do
+ match s.[i] with
+ | '<' -> Buffer.add_string buff "&lt;"
+ | '>' -> Buffer.add_string buff "&gt;"
+ | '&' -> Buffer.add_string buff "&amp;"
+ | c -> Buffer.add_char buff c
+ done;
+ Buffer.contents buff
let latex_char _ = ()
let latex_string _ = ()
@@ -477,78 +558,72 @@ module Html = struct
let module_ref m s =
match find_module m with
| Local ->
- printf "<a class=\"modref\" href=\"%s.html\">" m; raw_ident s; printf "</a>"
+ printf "<a class=\"modref\" href=\"%s.html\">%s</a>" m s
| External m when !externals ->
- printf "<a class=\"modref\" href=\"%s.html\">" m; raw_ident s; printf "</a>"
+ printf "<a class=\"modref\" href=\"%s.html\">%s</a>" m s
| External _ | Unknown ->
- raw_ident s
+ output_string s
let ident_ref m fid typ s =
match find_module m with
| Local ->
printf "<a class=\"idref\" href=\"%s.html#%s\">" m fid;
- printf "<span class=\"id\" type=\"%s\">" typ;
- raw_ident s;
- printf "</span></a>"
+ printf "<span class=\"id\" type=\"%s\">%s</span></a>" typ s
| External m when !externals ->
- printf "<a class=\"idref\" href=\"%s.html#%s\">" m fid;
- printf "<span class=\"id\" type=\"%s\">" typ;
- raw_ident s; printf "</span></a>"
+ printf "<a class=\"idref\" href=\"%s.html#%s\">" m fid;
+ printf "<span class=\"id\" type=\"%s\">%s</span></a>" typ s
| External _ | Unknown ->
- printf "<span class=\"id\" type=\"%s\">" typ; raw_ident s; printf "</span>"
+ printf "<span class=\"id\" type=\"%s\">%s</span>" typ s
let reference s r =
match r with
| Def (fullid,ty) ->
- printf "<a name=\"%s\">" fullid;
- printf "<span class=\"id\" type=\"%s\">" (type_name ty);
- raw_ident s; printf "</span></a>"
+ printf "<a name=\"%s\">" fullid;
+ printf "<span class=\"id\" type=\"%s\">%s</span></a>" (type_name ty) s
| Mod (m,s') when s = s' ->
- module_ref m s
+ module_ref m s
| Ref (m,fullid,ty) ->
- ident_ref m fullid (type_name ty) s
+ ident_ref m fullid (type_name ty) s
| Mod _ ->
- printf "<span class=\"id\" type=\"mod\">"; raw_ident s ; printf "</span>"
-
- let ident s loc =
- if is_keyword s then begin
- printf "<span class=\"id\" type=\"keyword\">";
- raw_ident s;
- printf "</span>"
- end else
- begin
- try reference s (Index.find (get_module false) loc)
- with Not_found ->
- if is_tactic s then
- (printf "<span class=\"id\" type=\"tactic\">"; raw_ident s; printf "</span>")
- else
- if !Cdglobals.interpolate && !in_doc (* always a var otherwise *)
- then
- try reference s (Index.find_string (get_module false) s)
- with _ ->
- (printf "<span class=\"id\" type=\"var\">"; raw_ident s ; printf "</span>")
- else (printf "<span class=\"id\" type=\"var\">"; raw_ident s ; printf "</span>")
- end
-
- let with_html_printing f tok loc =
- try
- (match Hashtbl.find token_pp tok with
- | _, Some s ->
- (try reference s (Index.find (get_module false) loc)
- with Not_found -> output_string s)
- | _ -> f tok loc)
- with Not_found ->
- f tok loc
+ printf "<span class=\"id\" type=\"mod\">%s</span>" s
+
+ let output_sublexer_string doescape issymbchar tag s =
+ let s = if doescape then escaped s else s in
+ match tag with
+ | Some ref -> reference s ref
+ | None ->
+ if issymbchar then output_string s
+ else printf "<span class=\"id\" type=\"var\">%s</span>" s
+
+ let sublexer c loc =
+ let tag =
+ try Some (Index.find (get_module false) loc) with Not_found -> None
+ in
+ Tokens.output_tagged_symbol_char tag c
- let ident s l =
- with_html_printing ident s l
+ let initialize () =
+ Tokens.token_tree := Lazy.force token_tree_html;
+ Tokens.outfun := output_sublexer_string
- let raw_symbol s loc =
- try reference s (Index.find (get_module false) loc)
- with Not_found -> raw_ident s
+ let translate s =
+ match Tokens.translate s with Some s -> s | None -> escaped s
- let symbol s l =
- with_html_printing raw_symbol s l
+ let ident s loc =
+ if is_keyword s then begin
+ printf "<span class=\"id\" type=\"keyword\">%s</span>" (translate s)
+ end else begin
+ try reference (translate s) (Index.find (get_module false) loc)
+ with Not_found ->
+ if is_tactic s then
+ printf "<span class=\"id\" type=\"tactic\">%s</span>" (translate s)
+ else
+ if !Cdglobals.interpolate && !in_doc (* always a var otherwise *)
+ then
+ try reference (translate s) (Index.find_string (get_module false) s)
+ with _ -> Tokens.output_tagged_ident_string s
+ else
+ Tokens.output_tagged_ident_string s
+ end
let proofbox () = printf "<font size=-2>&#9744;</font>"
@@ -803,24 +878,15 @@ module TeXmacs = struct
let ident s _ = if !in_doc then ident_true s else raw_ident s
- let symbol_true s =
- let ensuremath x = printf "<with|mode|math|\\<%s\\>>" x in
- match s with
- | "*" -> ensuremath "times"
- | "->" -> ensuremath "rightarrow"
- | "<-" -> ensuremath "leftarrow"
- | "<->" ->ensuremath "leftrightarrow"
- | "=>" -> ensuremath "Rightarrow"
- | "<=" -> ensuremath "le"
- | ">=" -> ensuremath "ge"
- | "<>" -> ensuremath "noteq"
- | "~" -> ensuremath "lnot"
- | "/\\" -> ensuremath "land"
- | "\\/" -> ensuremath "lor"
- | "|-" -> ensuremath "vdash"
- | s -> raw_ident s
-
- let symbol s _ = if !in_doc then symbol_true s else raw_ident s
+ let output_sublexer_string doescape issymbchar tag s =
+ if doescape then raw_ident s else output_string s
+
+ let sublexer c l =
+ if !in_doc then Tokens.output_tagged_symbol_char None c else char c
+
+ let initialize () =
+ Tokens.token_tree := Lazy.force token_tree_texmacs;
+ Tokens.outfun := output_sublexer_string
let proofbox () = printf "QED"
@@ -897,7 +963,7 @@ module TeXmacs = struct
end
-(*s LaTeX output *)
+(*s Raw output *)
module Raw = struct
@@ -909,12 +975,6 @@ module Raw = struct
let char = output_char
- let label_char c = match c with
- | '\\' | '$' | '#' | '%' | '&' | '{' | '}' | '_'
- | '^' | '~' -> ()
- | _ ->
- output_char c
-
let latex_char = output_char
let latex_string = output_string
@@ -939,7 +999,11 @@ module Raw = struct
let ident s loc = raw_ident s
- let symbol s loc = raw_ident s
+ let sublexer c l = char c
+
+ let initialize () =
+ Tokens.token_tree := Tokens.empty_ttree;
+ Tokens.outfun := (fun _ _ _ _ -> failwith "Useless")
let proofbox () = printf "[]"
@@ -1027,7 +1091,7 @@ let end_inline_coq =
select Latex.end_inline_coq Html.end_inline_coq TeXmacs.end_inline_coq Raw.end_inline_coq
let start_inline_coq_block =
- select Latex.start_inline_coq_block Html.start_inline_coq_block
+ select Latex.start_inline_coq_block Html.start_inline_coq_block
TeXmacs.start_inline_coq_block Raw.start_inline_coq_block
let end_inline_coq_block =
select Latex.end_inline_coq_block Html.end_inline_coq_block TeXmacs.end_inline_coq_block Raw.end_inline_coq_block
@@ -1047,7 +1111,8 @@ let rule = select Latex.rule Html.rule TeXmacs.rule Raw.rule
let nbsp = select Latex.nbsp Html.nbsp TeXmacs.nbsp Raw.nbsp
let char = select Latex.char Html.char TeXmacs.char Raw.char
let ident = select Latex.ident Html.ident TeXmacs.ident Raw.ident
-let symbol = select Latex.symbol Html.symbol TeXmacs.symbol Raw.symbol
+let sublexer = select Latex.sublexer Html.sublexer TeXmacs.sublexer Raw.sublexer
+let initialize = select Latex.initialize Html.initialize TeXmacs.initialize Raw.initialize
let proofbox = select Latex.proofbox Html.proofbox TeXmacs.proofbox Raw.proofbox
diff --git a/tools/coqdoc/output.mli b/tools/coqdoc/output.mli
index 1ccbac2f9..d836f6b39 100644
--- a/tools/coqdoc/output.mli
+++ b/tools/coqdoc/output.mli
@@ -63,7 +63,8 @@ val rule : unit -> unit
val nbsp : unit -> unit
val char : char -> unit
val ident : string -> loc -> unit
-val symbol : string -> loc -> unit
+val sublexer : char -> loc -> unit
+val initialize : unit -> unit
val proofbox : unit -> unit
diff --git a/tools/coqdoc/tokens.ml b/tools/coqdoc/tokens.ml
new file mode 100644
index 000000000..7de3ad80d
--- /dev/null
+++ b/tools/coqdoc/tokens.ml
@@ -0,0 +1,171 @@
+(************************************************************************)
+(* v * The Coq Proof Assistant / The Coq Development Team *)
+(* <O___,, * CNRS-Ecole Polytechnique-INRIA Futurs-Universite Paris Sud *)
+(* \VV/ **************************************************************)
+(* // * This file is distributed under the terms of the *)
+(* * GNU Lesser General Public License Version 2.1 *)
+(************************************************************************)
+
+(* Application of printing rules based on a dictionary specific to the
+ target language *)
+
+open Cdglobals
+
+(*s Dictionaries: trees annotated with string options, each node being a map
+ from chars to dictionaries (the subtrees). A trie, in other words.
+
+ (code copied from parsing/lexer.ml4 for the use of coqdoc, Apr 2010)
+*)
+
+module CharMap = Map.Make (struct type t = char let compare = compare end)
+
+type ttree = {
+ node : string option;
+ branch : ttree CharMap.t }
+
+let empty_ttree = { node = None; branch = CharMap.empty }
+
+let ttree_add ttree str translated =
+ let rec insert tt i =
+ if i == String.length str then
+ {node = Some translated; branch = tt.branch}
+ else
+ let c = str.[i] in
+ let br =
+ match try Some (CharMap.find c tt.branch) with Not_found -> None with
+ | Some tt' ->
+ CharMap.add c (insert tt' (i + 1)) (CharMap.remove c tt.branch)
+ | None ->
+ let tt' = {node = None; branch = CharMap.empty} in
+ CharMap.add c (insert tt' (i + 1)) tt.branch
+ in
+ { node = tt.node; branch = br }
+ in
+ insert ttree 0
+
+(* Removes a string from a dictionary: returns an equal dictionary
+ if the word not present. *)
+let ttree_remove ttree str =
+ let rec remove tt i =
+ if i == String.length str then
+ {node = None; branch = tt.branch}
+ else
+ let c = str.[i] in
+ let br =
+ match try Some (CharMap.find c tt.branch) with Not_found -> None with
+ | Some tt' ->
+ CharMap.add c (remove tt' (i + 1)) (CharMap.remove c tt.branch)
+ | None -> tt.branch
+ in
+ { node = tt.node; branch = br }
+ in
+ remove ttree 0
+
+let ttree_descend ttree c = CharMap.find c ttree.branch
+
+let ttree_find ttree str =
+ let rec proc_rec tt i =
+ if i == String.length str then tt
+ else proc_rec (CharMap.find str.[i] tt.branch) (i+1)
+ in
+ proc_rec ttree 0
+
+(*s Parameters of the translation automaton *)
+
+type out_function = bool -> bool -> Index.index_entry option -> string -> unit
+
+let token_tree = ref empty_ttree
+let outfun = ref (fun _ _ _ _ -> failwith "outfun not initialized")
+
+(*s Translation automaton *)
+
+let buff = Buffer.create 4
+
+let flush_buffer was_symbolchar tag tok =
+ let hastr = String.length tok <> 0 in
+ if hastr then !outfun false was_symbolchar tag tok;
+ if Buffer.length buff <> 0 then
+ !outfun true (if hastr then not was_symbolchar else was_symbolchar)
+ tag (Buffer.contents buff);
+ Buffer.clear buff
+
+type sublexer_state =
+ | Neutral
+ | Buffering of bool * Index.index_entry option * string * ttree
+
+let translation_state = ref Neutral
+
+let buffer_char is_symbolchar ctag c =
+ let rec aux = function
+ | Neutral ->
+ restart_buffering ()
+ | Buffering (was_symbolchar,tag,translated,tt) ->
+ if tag <> ctag then
+ (* A strong tag comes from Coq; if different Coq tags *)
+ (* hence, we don't try to see the chars as part of a single token *)
+ let translated =
+ match tt.node with
+ | Some tok -> Buffer.clear buff; tok
+ | None -> translated in
+ flush_buffer was_symbolchar tag translated;
+ restart_buffering ()
+ else
+ begin
+ (* If we change the category of characters (symbol vs ident) *)
+ (* we accept this as a possible token cut point and remember the *)
+ (* translated token up to that point *)
+ let translated =
+ if is_symbolchar <> was_symbolchar then
+ match tt.node with
+ | Some tok -> Buffer.clear buff; tok
+ | None -> translated
+ else translated in
+ (* We try to make a significant token from the current *)
+ (* buffer and the new character *)
+ try
+ let tt = ttree_descend tt c in
+ Buffer.add_char buff c;
+ Buffering (is_symbolchar,ctag,translated,tt)
+ with Not_found ->
+ (* No existing translation for the given set of chars *)
+ if is_symbolchar <> was_symbolchar then
+ (* If we changed the category of character read, we accept it *)
+ (* as a possible cut point and restart looking for a translation *)
+ (flush_buffer was_symbolchar tag translated;
+ restart_buffering ())
+ else
+ (* If we did not change the category of character read, we do *)
+ (* not want to cut arbitrarily in the middle of the sequence of *)
+ (* symbol characters or identifier characters *)
+ (Buffer.add_char buff c;
+ Buffering (is_symbolchar,tag,translated,empty_ttree))
+ end
+
+ and restart_buffering () =
+ let tt = try ttree_descend !token_tree c with Not_found -> empty_ttree in
+ Buffer.add_char buff c;
+ Buffering (is_symbolchar,ctag,"",tt)
+
+ in
+ translation_state := aux !translation_state
+
+let output_tagged_ident_string s =
+ for i = 0 to String.length s - 1 do buffer_char false None s.[i] done
+
+let output_tagged_symbol_char tag c =
+ buffer_char true tag c
+
+let flush_sublexer () =
+ match !translation_state with
+ | Neutral -> ()
+ | Buffering (was_symbolchar,tag,translated,tt) ->
+ let translated =
+ match tt.node with
+ | Some tok -> Buffer.clear buff; tok
+ | None -> translated in
+ flush_buffer was_symbolchar tag translated;
+ translation_state := Neutral
+
+(* Translation not using the automaton *)
+let translate s =
+ try (ttree_find !token_tree s).node with Not_found -> None
diff --git a/tools/coqdoc/tokens.mli b/tools/coqdoc/tokens.mli
new file mode 100644
index 000000000..4e53108ac
--- /dev/null
+++ b/tools/coqdoc/tokens.mli
@@ -0,0 +1,78 @@
+(************************************************************************)
+(* v * The Coq Proof Assistant / The Coq Development Team *)
+(* <O___,, * CNRS-Ecole Polytechnique-INRIA Futurs-Universite Paris Sud *)
+(* \VV/ **************************************************************)
+(* // * This file is distributed under the terms of the *)
+(* * GNU Lesser General Public License Version 2.1 *)
+(************************************************************************)
+
+(* Type of dictionaries *)
+
+type ttree
+
+val empty_ttree : ttree
+
+(* Add a string with some translation in dictionary *)
+val ttree_add : ttree -> string -> string -> ttree
+
+(* Remove a translation from a dictionary: returns an equal dictionary
+ if the word not present *)
+val ttree_remove : ttree -> string -> ttree
+
+(* Translate a string *)
+val translate : string -> string option
+
+(* Sublexer automaton *)
+
+(* The sublexer buffers the chars it receives; if after some time, it
+ recognizes that a sequence of chars has a translation in the
+ current dictionary, it replaces the buffer by the translation *)
+
+(* Received chars can come with a "tag" (usually made from
+ informations from the globalization file). A sequence of chars can
+ be considered a word only, if all chars have the same "tag". Rules
+ for cutting words are the following:
+
+ - in a sequence like "**" where * is in the dictionary but not **,
+ "**" is not translated; otherwise said, to be translated, a sequence
+ must not be surrounded by other symbol-like chars
+
+ - in a sequence like "<>_h*", where <>_h is in the dictionary, the
+ translation is done because the switch from a letter to a symbol char
+ is an acceptable cutting point
+
+ - in a sequence like "<>_ha", where <>_h is in the dictionary, the
+ translation is not done because it is considered that h and a are
+ not separable (however, if h and a have different tags, and h has
+ the same tags as <, > and _, the translation happens)
+
+ - in a sequence like "<>_ha", where <> but not <>_h is in the
+ dictionary, the translation is done for <> and _ha is considered
+ independently because the switch from a symbol char to a letter
+ is considered to be an acceptable cutting point
+
+ - the longest-word rule applies: if both <> and <>_h are in the
+ dictionary, "<>_h" is one word and gets translated
+*)
+
+(* Warning: do not output anything on output channel inbetween a call
+ to [output_tagged_*] and [flush_sublexer]!! *)
+
+type out_function =
+ bool (* needs escape *) ->
+ bool (* it is a symbol, not a pure ident *) ->
+ Index.index_entry option (* the index type of the token if any *) ->
+ string -> unit
+
+(* This must be initialized before calling the sublexer *)
+val token_tree : ttree ref
+val outfun : out_function ref
+
+(* Process an ident part that might be a symbol part *)
+val output_tagged_ident_string : string -> unit
+
+(* Process a non-ident char (possibly equipped with a tag) *)
+val output_tagged_symbol_char : Index.index_entry option -> char -> unit
+
+(* Flush the buffered content of the lexer using [outfun] *)
+val flush_sublexer : unit -> unit
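
For illustration only (and not part of this patch), a minimal sketch of how an output backend drives the sublexer declared above: it installs an out_function and a token tree, feeds characters one by one, and nothing is printed until the flush.

(* Illustrative sketch only -- not part of the patch. *)
let () =
  (* A toy out_function: ignore escaping and index tags, just print the
     (possibly translated) chunk handed back by the sublexer. *)
  Tokens.outfun := (fun _doescape _issymbchar _tag s -> print_string s);
  (* One printing rule: "->" is rendered as an arrow macro. *)
  Tokens.token_tree :=
    Tokens.ttree_add Tokens.empty_ttree "->" "\\ensuremath{\\rightarrow}";
  (* Feed "x->y" character by character; no output happens before the flush. *)
  Tokens.output_tagged_ident_string "x";
  Tokens.output_tagged_symbol_char None '-';
  Tokens.output_tagged_symbol_char None '>';
  Tokens.output_tagged_ident_string "y";
  Tokens.flush_sublexer ();     (* expected output: x\ensuremath{\rightarrow}y *)
  print_newline ()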