summaryrefslogtreecommitdiff
path: root/contrib/interface/line_parser.ml4
blob: b566935123689bbc6021f3484b55f9785a41fe6b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
(* line-oriented Syntactic analyser for a Coq parser *)
(* This parser expects a very small number of commands, each given on a complete
line.  Some of these commands are then followed by a text fragment terminated
by a precise keyword, which is also expected to appear alone on a line. *)

(* The main parsing loop procedure is "parser_loop", given at the end of this
file.  It read lines one by one and checks whether they can be parsed using
a very simple parser.  This very simple parser uses a lexer, which is also given
in this file.  

The lexical analyser:
 There are only 5 sorts of tokens *)
type simple_tokens = Tspace | Tid of string | Tint of int | Tstring of string |
      Tlbracket | Trbracket;;

(* When recognizing identifiers or strings, the lexical analyser accumulates
   the characters in a buffer, using the command add_in_buff.  To recuperate
   the characters, one can use get_buff (this code was inspired by the
   code in src/meta/lexer.ml of Coq revision 6.1) *)
let add_in_buff,get_buff =
  let buff = ref (String.create 80) in
    (fun i x -> 
       let len = String.length !buff in
        if i >= len then (buff := !buff ^ (String.create len);());
         String.set !buff i x;
         succ i),
     (fun len -> String.sub !buff 0 len);;

(* Identifiers are [a-zA-Z_][.a-zA-Z0-9_]*.  When arriving here the first
   character has already been recognized. *)
let rec ident len = parser
  [<''_' | '.' | 'a'..'z' | 'A'..'Z' | '0'..'9' as c; s >] ->
    ident (add_in_buff len c) s
| [< >] -> let str = get_buff len in Tid(str);;

(* While recognizing integers, one constructs directly the integer value.
   The ascii code of '0' is important for this. *)
let code0 = Char.code '0';;

let get_digit c =  Char.code c - code0;;

(* Integers are [0-9]*
   The variable intval is the integer value of the text that has already
   been recognized.  As for identifiers, the first character has already been
   recognized. *)

let rec parse_int intval = parser
  [< ''0'..'9' as c ; i=parse_int (10 * intval + get_digit c)>] -> i
| [< >] -> Tint intval;;
      
(* The string lexer is borrowed from the string parser of Coq V6.1 
   This may be a problem if convention have changed in Coq,
   However this parser is only used to recognize file names which should
   not contain too many special characters *)

let rec spec_char = parser
  [< ''n' >] -> '\n' 
| [< ''t' >] -> '\t'
| [< ''b' >] -> '\008' 
| [< ''r' >] -> '\013'
| [< ''0'..'9' as c; v= (spec1 (get_digit c)) >] ->
    Char.chr v
| [< 'x >] -> x

and spec1 v = parser
  [< ''0'..'9' as c; s >] -> spec1 (10*v+(get_digit c)) s
| [< >] -> v
;;

(* This is the actual string lexical analyser.  Strings are
   QUOT([^QUOT\\]|\\[0-9]*|\\[^0-9])QUOT (the word QUOT is used
   to represents double quotation characters, that cannot be used
   freely, even inside comments. *)

let rec string len = parser
  [< ''"' >] -> len
| [<''\\' ;
      len = (parser [< ''\n' >] -> len
             | [< c=spec_char >] -> add_in_buff len c);
      s >] -> string len s
| [< 'x; s >] -> string (add_in_buff len x) s;;

(* The lexical analyser repeats the recognized given by next_token:
   spaces and tabulations are ignored, identifiers, integers,
   strings, opening and closing square brackets.  Lexical errors are
   ignored ! *)
let rec next_token = parser count
  [< '' ' | '\t'; tok = next_token >] -> tok
| [< ''_' | 'a'..'z' | 'A'..'Z' as c;i = (ident (add_in_buff 0 c))>] -> i
| [< ''0'..'9' as c ; i = (parse_int (get_digit c))>] -> i
| [< ''"' ; len = (string 0) >] -> Tstring (get_buff len)
| [< ''[' >] -> Tlbracket
| [< '']' >] -> Trbracket
| [< '_ ; x = next_token >] -> x;;

(* A very simple lexical analyser to recognize a integer value behind 
   blank characters *)

let rec next_int = parser count
  [< '' ' | '\t'; v = next_int >] -> v
| [< ''0'..'9' as c; i = (parse_int (get_digit c))>] ->
     (match i with
        Tint n -> n
     |  _ -> failwith "unexpected branch in next_int");;

(* This is the actual lexical analyser, implemented as a function on a stream.
   It will be used with the Stream.from primitive to construct a function
   of type char Stream.t -> simple_token option Stream.t *)
let token_stream cs _ =
 try  let tok = next_token cs in
      Some tok
 with Stream.Failure -> None;;

(* Two of the actions of the parser request that one reads the rest of
   the input up to a specific string stop_string.  This is done
   with a function that transform the input_channel into a pair of
   char Stream.t, reading from the input_channel all the lines to
   the stop_string first. *)


let rec gather_strings stop_string input_channel =
   let buff = input_line input_channel in
   if buff = stop_string then
     []
   else
     (buff::(gather_strings stop_string input_channel));;


(* the result of this function is supposed to be used in a Stream.from
   construction. *)

let line_list_to_stream string_list =
   let count = ref 0 in
   let buff = ref "" in
   let reserve = ref string_list in
   let current_length = ref 0 in
   (fun i -> if (i - !count) >= !current_length then
               begin
                  count := !count + !current_length + 1;
                  match !reserve with
		  | [] -> None
                  | s1::rest -> 
                    begin
                      buff := s1;
                      current_length := String.length !buff;
		      reserve := rest;
                      Some '\n'
                    end
               end
             else
               Some(String.get !buff (i - !count)));;
 

(* In older revisions of this file you would find a function that
   does line oriented breakdown of the input channel without resorting to
   a list of lines.  However, the need for the list of line appeared when
   we wanted to have  a channel and a list of strings describing the same
   data, one for regular parsing and the other for error recovery. *)

let channel_to_stream_and_string_list stop_string input_channel =
   let string_list = gather_strings stop_string input_channel in
   (line_list_to_stream string_list, string_list);;

let flush_until_end_of_stream char_stream =
 Stream.iter (function _ -> ()) char_stream;;

(* There are only 5 kinds of lines recognized by our little parser.
   Unrecognized lines are ignored. *)
type parser_request =
  | PRINT_VERSION
  | PARSE_STRING of string
      (* parse_string <int> [<ident>] then text and && END--OF--DATA *)
  | QUIET_PARSE_STRING
      (* quiet_parse_string then text and && END--OF--DATA *)
  | PARSE_FILE of string
      (* parse_file <int> <string> *)
  | ADD_PATH of string
      (* add_path <int> <string> *)
  | ADD_REC_PATH of string * string
      (* add_rec_path <int> <string> <ident> *)
  | LOAD_SYNTAX of string
      (* load_syntax_file <int> <ident> *)
  | GARBAGE
;;

(* The procedure parser_loop should never terminate while the input_channel is
   not closed. This procedure receives the functions called for each sentence
   as arguments.  Thus the code is completely independent from the Coq sources. *)
let parser_loop functions input_channel =
  let print_version_action,
      parse_string_action,
      quiet_parse_string_action,
      parse_file_action,
      add_path_action,
      add_rec_path_action,
      load_syntax_action = functions in
  let rec parser_loop_rec input_channel =
  (let line = input_line input_channel in
  let reqid, parser_request = 
      try 
      (match Stream.from (token_stream (Stream.of_string line)) with
        parser
        | [< 'Tid "print_version" >] ->
                     0, PRINT_VERSION
        | [< 'Tid "parse_string" ; 'Tint reqid ; 'Tlbracket ;
             'Tid phylum ; 'Trbracket >] 
                  -> reqid,PARSE_STRING phylum
        | [< 'Tid "quiet_parse_string" >] ->
                     0,QUIET_PARSE_STRING
        | [< 'Tid "parse_file" ; 'Tint reqid ; 'Tstring fname >] ->
                     reqid, PARSE_FILE fname
        | [< 'Tid "add_rec_path"; 'Tint reqid ; 'Tstring directory ;  'Tid alias >]
            -> reqid, ADD_REC_PATH(directory, alias)
        | [< 'Tid "add_path"; 'Tint reqid ; 'Tstring directory >]
            -> reqid, ADD_PATH directory
        | [< 'Tid "load_syntax_file"; 'Tint reqid; 'Tid module_name >] ->
               reqid, LOAD_SYNTAX module_name
	| [< 'Tid "quit_parser" >] -> raise End_of_file
        | [< >] -> 0, GARBAGE)
       with
         Stream.Failure | Stream.Error _ -> 0,GARBAGE in
   match parser_request with
     PRINT_VERSION -> print_version_action ()
   | PARSE_STRING phylum ->
     let regular_stream, string_list =
        channel_to_stream_and_string_list "&& END--OF--DATA" input_channel in
      parse_string_action reqid phylum (Stream.from regular_stream)
        string_list;()
   | QUIET_PARSE_STRING ->
     let regular_stream, string_list =
        channel_to_stream_and_string_list "&& END--OF--DATA" input_channel in
       quiet_parse_string_action
        (Stream.from regular_stream);()
   | PARSE_FILE file_name ->
        parse_file_action reqid file_name
   | ADD_PATH path -> add_path_action reqid path
   | ADD_REC_PATH(path, alias) -> add_rec_path_action reqid path alias
   | LOAD_SYNTAX syn -> load_syntax_action reqid syn
   | GARBAGE -> ());
   parser_loop_rec input_channel in
   parser_loop_rec input_channel;;