summaryrefslogtreecommitdiff
path: root/lib/xml_lexer.mll
diff options
context:
space:
mode:
Diffstat (limited to 'lib/xml_lexer.mll')
-rw-r--r--lib/xml_lexer.mll299
1 files changed, 299 insertions, 0 deletions
diff --git a/lib/xml_lexer.mll b/lib/xml_lexer.mll
new file mode 100644
index 00000000..2be4bf98
--- /dev/null
+++ b/lib/xml_lexer.mll
@@ -0,0 +1,299 @@
+{(*
+ * Xml Light, an small Xml parser/printer with DTD support.
+ * Copyright (C) 2003 Nicolas Cannasse (ncannasse@motion-twin.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *)
+
+open Lexing
+
+type error =
+ | EUnterminatedComment
+ | EUnterminatedString
+ | EIdentExpected
+ | ECloseExpected
+ | ENodeExpected
+ | EAttributeNameExpected
+ | EAttributeValueExpected
+ | EUnterminatedEntity
+
+exception Error of error
+
+type pos = int * int * int * int
+
+type token =
+ | Tag of string * (string * string) list * bool
+ | PCData of string
+ | Endtag of string
+ | Eof
+
+let last_pos = ref 0
+and current_line = ref 0
+and current_line_start = ref 0
+
+let tmp = Buffer.create 200
+
+let idents = Hashtbl.create 0
+
+let _ = begin
+ Hashtbl.add idents "gt;" ">";
+ Hashtbl.add idents "lt;" "<";
+ Hashtbl.add idents "amp;" "&";
+ Hashtbl.add idents "apos;" "'";
+ Hashtbl.add idents "quot;" "\"";
+end
+
+let init lexbuf =
+ current_line := 1;
+ current_line_start := lexeme_start lexbuf;
+ last_pos := !current_line_start
+
+let close lexbuf =
+ Buffer.reset tmp
+
+let pos lexbuf =
+ !current_line , !current_line_start ,
+ !last_pos ,
+ lexeme_start lexbuf
+
+let restore (cl,cls,lp,_) =
+ current_line := cl;
+ current_line_start := cls;
+ last_pos := lp
+
+let newline lexbuf =
+ incr current_line;
+ last_pos := lexeme_end lexbuf;
+ current_line_start := !last_pos
+
+let error lexbuf e =
+ last_pos := lexeme_start lexbuf;
+ raise (Error e)
+
+}
+
+let newline = ['\n']
+let break = ['\r']
+let space = [' ' '\t']
+let identchar = ['A'-'Z' 'a'-'z' '_' '0'-'9' ':' '-']
+let entitychar = ['A'-'Z' 'a'-'z']
+let pcchar = [^ '\r' '\n' '<' '>' '&']
+
+rule token = parse
+ | newline | (newline break) | break
+ {
+ newline lexbuf;
+ PCData "\n"
+ }
+ | "<!--"
+ {
+ last_pos := lexeme_start lexbuf;
+ comment lexbuf;
+ token lexbuf
+ }
+ | "<?"
+ {
+ last_pos := lexeme_start lexbuf;
+ header lexbuf;
+ token lexbuf;
+ }
+ | '<' space* '/' space*
+ {
+ last_pos := lexeme_start lexbuf;
+ let tag = ident_name lexbuf in
+ ignore_spaces lexbuf;
+ close_tag lexbuf;
+ Endtag tag
+ }
+ | '<' space*
+ {
+ last_pos := lexeme_start lexbuf;
+ let tag = ident_name lexbuf in
+ ignore_spaces lexbuf;
+ let attribs, closed = attributes lexbuf in
+ Tag(tag, attribs, closed)
+ }
+ | "&#"
+ {
+ last_pos := lexeme_start lexbuf;
+ Buffer.reset tmp;
+ Buffer.add_string tmp (lexeme lexbuf);
+ PCData (pcdata lexbuf)
+ }
+ | '&'
+ {
+ last_pos := lexeme_start lexbuf;
+ Buffer.reset tmp;
+ Buffer.add_string tmp (entity lexbuf);
+ PCData (pcdata lexbuf)
+ }
+ | pcchar+
+ {
+ last_pos := lexeme_start lexbuf;
+ Buffer.reset tmp;
+ Buffer.add_string tmp (lexeme lexbuf);
+ PCData (pcdata lexbuf)
+ }
+ | eof { Eof }
+ | _
+ { error lexbuf ENodeExpected }
+
+and ignore_spaces = parse
+ | newline | (newline break) | break
+ {
+ newline lexbuf;
+ ignore_spaces lexbuf
+ }
+ | space +
+ { ignore_spaces lexbuf }
+ | ""
+ { () }
+
+and comment = parse
+ | newline | (newline break) | break
+ {
+ newline lexbuf;
+ comment lexbuf
+ }
+ | "-->"
+ { () }
+ | eof
+ { raise (Error EUnterminatedComment) }
+ | _
+ { comment lexbuf }
+
+and header = parse
+ | newline | (newline break) | break
+ {
+ newline lexbuf;
+ header lexbuf
+ }
+ | "?>"
+ { () }
+ | eof
+ { error lexbuf ECloseExpected }
+ | _
+ { header lexbuf }
+
+and pcdata = parse
+ | pcchar+
+ {
+ Buffer.add_string tmp (lexeme lexbuf);
+ pcdata lexbuf
+ }
+ | "&#"
+ {
+ Buffer.add_string tmp (lexeme lexbuf);
+ pcdata lexbuf;
+ }
+ | '&'
+ {
+ Buffer.add_string tmp (entity lexbuf);
+ pcdata lexbuf
+ }
+ | ""
+ { Buffer.contents tmp }
+
+and entity = parse
+ | entitychar+ ';'
+ {
+ let ident = lexeme lexbuf in
+ try
+ Hashtbl.find idents (String.lowercase ident)
+ with
+ Not_found -> "&" ^ ident
+ }
+ | _ | eof
+ { raise (Error EUnterminatedEntity) }
+
+and ident_name = parse
+ | identchar+
+ { lexeme lexbuf }
+ | _ | eof
+ { error lexbuf EIdentExpected }
+
+and close_tag = parse
+ | '>'
+ { () }
+ | _ | eof
+ { error lexbuf ECloseExpected }
+
+and attributes = parse
+ | '>'
+ { [], false }
+ | "/>"
+ { [], true }
+ | "" (* do not read a char ! *)
+ {
+ let key = attribute lexbuf in
+ let data = attribute_data lexbuf in
+ ignore_spaces lexbuf;
+ let others, closed = attributes lexbuf in
+ (key, data) :: others, closed
+ }
+
+and attribute = parse
+ | identchar+
+ { lexeme lexbuf }
+ | _ | eof
+ { error lexbuf EAttributeNameExpected }
+
+and attribute_data = parse
+ | space* '=' space* '"'
+ {
+ Buffer.reset tmp;
+ last_pos := lexeme_end lexbuf;
+ dq_string lexbuf
+ }
+ | space* '=' space* '\''
+ {
+ Buffer.reset tmp;
+ last_pos := lexeme_end lexbuf;
+ q_string lexbuf
+ }
+ | _ | eof
+ { error lexbuf EAttributeValueExpected }
+
+and dq_string = parse
+ | '"'
+ { Buffer.contents tmp }
+ | '\\' [ '"' '\\' ]
+ {
+ Buffer.add_char tmp (lexeme_char lexbuf 1);
+ dq_string lexbuf
+ }
+ | eof
+ { raise (Error EUnterminatedString) }
+ | _
+ {
+ Buffer.add_char tmp (lexeme_char lexbuf 0);
+ dq_string lexbuf
+ }
+
+and q_string = parse
+ | '\''
+ { Buffer.contents tmp }
+ | '\\' [ '\'' '\\' ]
+ {
+ Buffer.add_char tmp (lexeme_char lexbuf 1);
+ q_string lexbuf
+ }
+ | eof
+ { raise (Error EUnterminatedString) }
+ | _
+ {
+ Buffer.add_char tmp (lexeme_char lexbuf 0);
+ q_string lexbuf
+ }