From 16ca376bf4cca71f7e39ce6842f0371767b73df6 Mon Sep 17 00:00:00 2001 From: Regis-Gianas Date: Tue, 4 Nov 2014 11:38:56 +0100 Subject: Xml_lexer: Handle non-breakable spaces. --- lib/xml_lexer.mll | 419 +++++++++++++++++++++++++++--------------------------- 1 file changed, 210 insertions(+), 209 deletions(-) (limited to 'lib') diff --git a/lib/xml_lexer.mll b/lib/xml_lexer.mll index 5b06e720a..a33be9da7 100644 --- a/lib/xml_lexer.mll +++ b/lib/xml_lexer.mll @@ -20,24 +20,24 @@ open Lexing type error = - | EUnterminatedComment - | EUnterminatedString - | EIdentExpected - | ECloseExpected - | ENodeExpected - | EAttributeNameExpected - | EAttributeValueExpected - | EUnterminatedEntity + | EUnterminatedComment + | EUnterminatedString + | EIdentExpected + | ECloseExpected + | ENodeExpected + | EAttributeNameExpected + | EAttributeValueExpected + | EUnterminatedEntity exception Error of error type pos = int * int * int * int type token = - | Tag of string * (string * string) list * bool - | PCData of string - | Endtag of string - | Eof + | Tag of string * (string * string) list * bool + | PCData of string + | Endtag of string + | Eof let last_pos = ref 0 and current_line = ref 0 @@ -48,39 +48,40 @@ let tmp = Buffer.create 200 let idents = Hashtbl.create 0 let _ = begin - Hashtbl.add idents "gt;" ">"; - Hashtbl.add idents "lt;" "<"; - Hashtbl.add idents "amp;" "&"; - Hashtbl.add idents "apos;" "'"; - Hashtbl.add idents "quot;" "\""; + Hashtbl.add idents "nbsp;" " "; + Hashtbl.add idents "gt;" ">"; + Hashtbl.add idents "lt;" "<"; + Hashtbl.add idents "amp;" "&"; + Hashtbl.add idents "apos;" "'"; + Hashtbl.add idents "quot;" "\""; end let init lexbuf = - current_line := 1; - current_line_start := lexeme_start lexbuf; - last_pos := !current_line_start + current_line := 1; + current_line_start := lexeme_start lexbuf; + last_pos := !current_line_start let close lexbuf = - Buffer.reset tmp + Buffer.reset tmp let pos lexbuf = - !current_line , !current_line_start , - !last_pos , - lexeme_start lexbuf + !current_line , !current_line_start , + !last_pos , + lexeme_start lexbuf let restore (cl,cls,lp,_) = - current_line := cl; - current_line_start := cls; - last_pos := lp + current_line := cl; + current_line_start := cls; + last_pos := lp let newline lexbuf = - incr current_line; - last_pos := lexeme_end lexbuf; - current_line_start := !last_pos + incr current_line; + last_pos := lexeme_end lexbuf; + current_line_start := !last_pos let error lexbuf e = - last_pos := lexeme_start lexbuf; - raise (Error e) + last_pos := lexeme_start lexbuf; + raise (Error e) } @@ -92,100 +93,100 @@ let entitychar = ['A'-'Z' 'a'-'z'] let pcchar = [^ '\r' '\n' '<' '>' '&'] rule token = parse - | newline | (newline break) | break - { - newline lexbuf; + | newline | (newline break) | break + { + newline lexbuf; PCData "\n" - } - | "" - { () } - | eof - { raise (Error EUnterminatedComment) } - | _ - { comment lexbuf } + { + newline lexbuf; + comment lexbuf + } + | "-->" + { () } + | eof + { raise (Error EUnterminatedComment) } + | _ + { comment lexbuf } and header = parse | newline | (newline break) | break - { - newline lexbuf; - header lexbuf - } - | "?>" - { () } - | eof - { error lexbuf ECloseExpected } - | _ - { header lexbuf } + { + newline lexbuf; + header lexbuf + } + | "?>" + { () } + | eof + { error lexbuf ECloseExpected } + | _ + { header lexbuf } and pcdata = parse | newline | (newline break) | break @@ -194,112 +195,112 @@ and pcdata = parse newline lexbuf; pcdata lexbuf } - | pcchar+ - { - Buffer.add_string tmp (lexeme lexbuf); - pcdata lexbuf - } - | "&#" - { - Buffer.add_string tmp (lexeme lexbuf); - pcdata lexbuf; - } - | '&' - { - Buffer.add_string tmp (entity lexbuf); - pcdata lexbuf - } - | "" - { Buffer.contents tmp } + | pcchar+ + { + Buffer.add_string tmp (lexeme lexbuf); + pcdata lexbuf + } + | "&#" + { + Buffer.add_string tmp (lexeme lexbuf); + pcdata lexbuf; + } + | '&' + { + Buffer.add_string tmp (entity lexbuf); + pcdata lexbuf + } + | "" + { Buffer.contents tmp } and entity = parse - | entitychar+ ';' - { - let ident = lexeme lexbuf in - try - Hashtbl.find idents (String.lowercase ident) - with - Not_found -> "&" ^ ident - } - | _ | eof - { raise (Error EUnterminatedEntity) } + | entitychar+ ';' + { + let ident = lexeme lexbuf in + try + Hashtbl.find idents (String.lowercase ident) + with + Not_found -> "&" ^ ident + } + | _ | eof + { raise (Error EUnterminatedEntity) } and ident_name = parse - | identchar+ - { lexeme lexbuf } - | _ | eof - { error lexbuf EIdentExpected } + | identchar+ + { lexeme lexbuf } + | _ | eof + { error lexbuf EIdentExpected } and close_tag = parse - | '>' - { () } - | _ | eof - { error lexbuf ECloseExpected } + | '>' + { () } + | _ | eof + { error lexbuf ECloseExpected } and attributes = parse - | '>' - { [], false } - | "/>" - { [], true } - | "" (* do not read a char ! *) - { - let key = attribute lexbuf in - let data = attribute_data lexbuf in - ignore_spaces lexbuf; - let others, closed = attributes lexbuf in - (key, data) :: others, closed - } + | '>' + { [], false } + | "/>" + { [], true } + | "" (* do not read a char ! *) + { + let key = attribute lexbuf in + let data = attribute_data lexbuf in + ignore_spaces lexbuf; + let others, closed = attributes lexbuf in + (key, data) :: others, closed + } and attribute = parse - | identchar+ - { lexeme lexbuf } - | _ | eof - { error lexbuf EAttributeNameExpected } + | identchar+ + { lexeme lexbuf } + | _ | eof + { error lexbuf EAttributeNameExpected } and attribute_data = parse - | space* '=' space* '"' - { - Buffer.reset tmp; - last_pos := lexeme_end lexbuf; - dq_string lexbuf - } - | space* '=' space* '\'' - { - Buffer.reset tmp; - last_pos := lexeme_end lexbuf; - q_string lexbuf - } - | _ | eof - { error lexbuf EAttributeValueExpected } + | space* '=' space* '"' + { + Buffer.reset tmp; + last_pos := lexeme_end lexbuf; + dq_string lexbuf + } + | space* '=' space* '\'' + { + Buffer.reset tmp; + last_pos := lexeme_end lexbuf; + q_string lexbuf + } + | _ | eof + { error lexbuf EAttributeValueExpected } and dq_string = parse - | '"' - { Buffer.contents tmp } - | '\\' [ '"' '\\' ] - { - Buffer.add_char tmp (lexeme_char lexbuf 1); - dq_string lexbuf - } - | eof - { raise (Error EUnterminatedString) } - | _ - { - Buffer.add_char tmp (lexeme_char lexbuf 0); - dq_string lexbuf - } + | '"' + { Buffer.contents tmp } + | '\\' [ '"' '\\' ] + { + Buffer.add_char tmp (lexeme_char lexbuf 1); + dq_string lexbuf + } + | eof + { raise (Error EUnterminatedString) } + | _ + { + Buffer.add_char tmp (lexeme_char lexbuf 0); + dq_string lexbuf + } and q_string = parse - | '\'' - { Buffer.contents tmp } - | '\\' [ '\'' '\\' ] - { - Buffer.add_char tmp (lexeme_char lexbuf 1); - q_string lexbuf - } - | eof - { raise (Error EUnterminatedString) } - | _ - { - Buffer.add_char tmp (lexeme_char lexbuf 0); - q_string lexbuf - } + | '\'' + { Buffer.contents tmp } + | '\\' [ '\'' '\\' ] + { + Buffer.add_char tmp (lexeme_char lexbuf 1); + q_string lexbuf + } + | eof + { raise (Error EUnterminatedString) } + | _ + { + Buffer.add_char tmp (lexeme_char lexbuf 0); + q_string lexbuf + } -- cgit v1.2.3