1 files changed, 78 insertions, 0 deletions
diff --git a/tools/coqdoc/tokens.mli b/tools/coqdoc/tokens.mli
new file mode 100644
index 00000000..a85e75c4
--- /dev/null
+++ b/tools/coqdoc/tokens.mli
@@ -0,0 +1,78 @@
+(************************************************************************)
+(*  v      *   The Coq Proof Assistant  /  The Coq Development Team     *)
+(* <O___,, * CNRS-Ecole Polytechnique-INRIA Futurs-Universite Paris Sud *)
+(*   \VV/  **************************************************************)
+(*    //   *      This file is distributed under the terms of the       *)
+(*         *       GNU Lesser General Public License Version 2.1        *)
+(************************************************************************)
+
+(* Type of dictionaries *)
+
+type ttree
+
+val empty_ttree : ttree
+
+(* Add a string with some translation in dictionary *)
+val ttree_add : ttree -> string -> string -> ttree
+
+(* Remove a translation from a dictionary: returns an equal dictionary
+   if the word not present *)
+val ttree_remove : ttree -> string -> ttree
+
+(* Translate a string *)
+val translate : string -> string option
+
+(* Sublexer automaton *)
+
+(* The sublexer buffers the chars it receives; if after some time, it
+   recognizes that a sequence of chars has a translation in the
+   current dictionary, it replaces the buffer by the translation *)
+
+(* Received chars can come with a "tag" (usually made from
+   informations from the globalization file). A sequence of chars can
+   be considered a word only, if all chars have the same "tag". Rules
+   for cutting words are the following:
+
+   - in a sequence like "**" where * is in the dictionary but not **,
+     "**" is not translated; otherwise said, to be translated, a sequence
+     must not be surrounded by other symbol-like chars
+
+   - in a sequence like "<>_h*", where <>_h is in the dictionary, the
+     translation is done because the switch from a letter to a symbol char
+     is an acceptable cutting point
+
+   - in a sequence like "<>_ha", where <>_h is in the dictionary, the
+     translation is not done because it is considered that h and a are
+     not separable (however, if h and a have different tags, and h has
+     the same tags as <, > and _, the translation happens)
+
+   - in a sequence like "<>_ha", where <> but not <>_h is in the
+     dictionary, the translation is done for <> and _ha is considered
+     independently because  the switch from a symbol char to a letter
+     is considered to be an acceptable cutting point
+
+   - the longest-word rule applies: if both <> and <>_h are in the
+     dictionary, "<>_h" is one word and gets translated
+*)
+
+(* Warning: do not output anything on output channel inbetween a call
+   to [output_tagged_*] and [flush_sublexer]!! *)
+
+type out_function =
+  bool (* needs escape *) ->
+  bool (* it is a symbol, not a pure ident *) ->
+  Index.index_entry option (* the index type of the token if any *) ->
+  string -> unit
+
+(* This must be initialized before calling the sublexer *)
+val token_tree : ttree ref ref
+val outfun : out_function ref
+
+(* Process an ident part that might be a symbol part *)
+val output_tagged_ident_string : string -> unit
+
+(* Process a non-ident char (possibly equipped with a tag) *)
+val output_tagged_symbol_char : Index.index_entry option -> char -> unit
+
+(* Flush the buffered content of the lexer using [outfun] *)
+val flush_sublexer : unit -> unit