diff options
Diffstat (limited to 'lib/xml_lexer.mll')
-rw-r--r-- | lib/xml_lexer.mll | 299 |
1 files changed, 299 insertions, 0 deletions
diff --git a/lib/xml_lexer.mll b/lib/xml_lexer.mll new file mode 100644 index 00000000..2be4bf98 --- /dev/null +++ b/lib/xml_lexer.mll @@ -0,0 +1,299 @@ +{(* + * Xml Light, an small Xml parser/printer with DTD support. + * Copyright (C) 2003 Nicolas Cannasse (ncannasse@motion-twin.com) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + *) + +open Lexing + +type error = + | EUnterminatedComment + | EUnterminatedString + | EIdentExpected + | ECloseExpected + | ENodeExpected + | EAttributeNameExpected + | EAttributeValueExpected + | EUnterminatedEntity + +exception Error of error + +type pos = int * int * int * int + +type token = + | Tag of string * (string * string) list * bool + | PCData of string + | Endtag of string + | Eof + +let last_pos = ref 0 +and current_line = ref 0 +and current_line_start = ref 0 + +let tmp = Buffer.create 200 + +let idents = Hashtbl.create 0 + +let _ = begin + Hashtbl.add idents "gt;" ">"; + Hashtbl.add idents "lt;" "<"; + Hashtbl.add idents "amp;" "&"; + Hashtbl.add idents "apos;" "'"; + Hashtbl.add idents "quot;" "\""; +end + +let init lexbuf = + current_line := 1; + current_line_start := lexeme_start lexbuf; + last_pos := !current_line_start + +let close lexbuf = + Buffer.reset tmp + +let pos lexbuf = + !current_line , !current_line_start , + !last_pos , + lexeme_start lexbuf + +let restore (cl,cls,lp,_) = + current_line := cl; + current_line_start := cls; + last_pos := lp + +let newline lexbuf = + incr current_line; + last_pos := lexeme_end lexbuf; + current_line_start := !last_pos + +let error lexbuf e = + last_pos := lexeme_start lexbuf; + raise (Error e) + +} + +let newline = ['\n'] +let break = ['\r'] +let space = [' ' '\t'] +let identchar = ['A'-'Z' 'a'-'z' '_' '0'-'9' ':' '-'] +let entitychar = ['A'-'Z' 'a'-'z'] +let pcchar = [^ '\r' '\n' '<' '>' '&'] + +rule token = parse + | newline | (newline break) | break + { + newline lexbuf; + PCData "\n" + } + | "<!--" + { + last_pos := lexeme_start lexbuf; + comment lexbuf; + token lexbuf + } + | "<?" + { + last_pos := lexeme_start lexbuf; + header lexbuf; + token lexbuf; + } + | '<' space* '/' space* + { + last_pos := lexeme_start lexbuf; + let tag = ident_name lexbuf in + ignore_spaces lexbuf; + close_tag lexbuf; + Endtag tag + } + | '<' space* + { + last_pos := lexeme_start lexbuf; + let tag = ident_name lexbuf in + ignore_spaces lexbuf; + let attribs, closed = attributes lexbuf in + Tag(tag, attribs, closed) + } + | "&#" + { + last_pos := lexeme_start lexbuf; + Buffer.reset tmp; + Buffer.add_string tmp (lexeme lexbuf); + PCData (pcdata lexbuf) + } + | '&' + { + last_pos := lexeme_start lexbuf; + Buffer.reset tmp; + Buffer.add_string tmp (entity lexbuf); + PCData (pcdata lexbuf) + } + | pcchar+ + { + last_pos := lexeme_start lexbuf; + Buffer.reset tmp; + Buffer.add_string tmp (lexeme lexbuf); + PCData (pcdata lexbuf) + } + | eof { Eof } + | _ + { error lexbuf ENodeExpected } + +and ignore_spaces = parse + | newline | (newline break) | break + { + newline lexbuf; + ignore_spaces lexbuf + } + | space + + { ignore_spaces lexbuf } + | "" + { () } + +and comment = parse + | newline | (newline break) | break + { + newline lexbuf; + comment lexbuf + } + | "-->" + { () } + | eof + { raise (Error EUnterminatedComment) } + | _ + { comment lexbuf } + +and header = parse + | newline | (newline break) | break + { + newline lexbuf; + header lexbuf + } + | "?>" + { () } + | eof + { error lexbuf ECloseExpected } + | _ + { header lexbuf } + +and pcdata = parse + | pcchar+ + { + Buffer.add_string tmp (lexeme lexbuf); + pcdata lexbuf + } + | "&#" + { + Buffer.add_string tmp (lexeme lexbuf); + pcdata lexbuf; + } + | '&' + { + Buffer.add_string tmp (entity lexbuf); + pcdata lexbuf + } + | "" + { Buffer.contents tmp } + +and entity = parse + | entitychar+ ';' + { + let ident = lexeme lexbuf in + try + Hashtbl.find idents (String.lowercase ident) + with + Not_found -> "&" ^ ident + } + | _ | eof + { raise (Error EUnterminatedEntity) } + +and ident_name = parse + | identchar+ + { lexeme lexbuf } + | _ | eof + { error lexbuf EIdentExpected } + +and close_tag = parse + | '>' + { () } + | _ | eof + { error lexbuf ECloseExpected } + +and attributes = parse + | '>' + { [], false } + | "/>" + { [], true } + | "" (* do not read a char ! *) + { + let key = attribute lexbuf in + let data = attribute_data lexbuf in + ignore_spaces lexbuf; + let others, closed = attributes lexbuf in + (key, data) :: others, closed + } + +and attribute = parse + | identchar+ + { lexeme lexbuf } + | _ | eof + { error lexbuf EAttributeNameExpected } + +and attribute_data = parse + | space* '=' space* '"' + { + Buffer.reset tmp; + last_pos := lexeme_end lexbuf; + dq_string lexbuf + } + | space* '=' space* '\'' + { + Buffer.reset tmp; + last_pos := lexeme_end lexbuf; + q_string lexbuf + } + | _ | eof + { error lexbuf EAttributeValueExpected } + +and dq_string = parse + | '"' + { Buffer.contents tmp } + | '\\' [ '"' '\\' ] + { + Buffer.add_char tmp (lexeme_char lexbuf 1); + dq_string lexbuf + } + | eof + { raise (Error EUnterminatedString) } + | _ + { + Buffer.add_char tmp (lexeme_char lexbuf 0); + dq_string lexbuf + } + +and q_string = parse + | '\'' + { Buffer.contents tmp } + | '\\' [ '\'' '\\' ] + { + Buffer.add_char tmp (lexeme_char lexbuf 1); + q_string lexbuf + } + | eof + { raise (Error EUnterminatedString) } + | _ + { + Buffer.add_char tmp (lexeme_char lexbuf 0); + q_string lexbuf + } |