From 16ca376bf4cca71f7e39ce6842f0371767b73df6 Mon Sep 17 00:00:00 2001
From: Regis-Gianas <yrg@pps.univ-paris-diderot.fr>
Date: Tue, 4 Nov 2014 11:38:56 +0100
Subject: Xml_lexer: Handle non-breakable spaces.

---
 lib/xml_lexer.mll | 419 +++++++++++++++++++++++++++---------------------------
 1 file changed, 210 insertions(+), 209 deletions(-)

(limited to 'lib')

diff --git a/lib/xml_lexer.mll b/lib/xml_lexer.mll
index 5b06e720a..a33be9da7 100644
--- a/lib/xml_lexer.mll
+++ b/lib/xml_lexer.mll
@@ -20,24 +20,24 @@
 open Lexing
 
 type error =
-	| EUnterminatedComment
-	| EUnterminatedString
-	| EIdentExpected
-	| ECloseExpected
-	| ENodeExpected
-	| EAttributeNameExpected
-	| EAttributeValueExpected
-	| EUnterminatedEntity
+        | EUnterminatedComment (* end of input reached by the comment rule before --> *)
+        | EUnterminatedString (* end of input reached inside a quoted attribute value *)
+        | EIdentExpected (* ident_name found no identchar where a tag name is required *)
+        | ECloseExpected (* close_tag did not find >, or the header rule hit end of input *)
+        | ENodeExpected (* fallback case of the token rule: input starts no known token *)
+        | EAttributeNameExpected (* attribute found no identchar where a name is required *)
+        | EAttributeValueExpected (* attribute name not followed by = and an opening quote *)
+        | EUnterminatedEntity (* text after & is not entitychar+ followed by ; *)
 
 exception Error of error
 
 type pos = int * int * int * int
 
 type token =
-	| Tag of string * (string * string) list * bool
-	| PCData of string
-	| Endtag of string
-	| Eof
+        | Tag of string * (string * string) list * bool (* tag name, (attribute name, value) pairs, true when the tag ended with /> *)
+        | PCData of string (* text content, with known entities already substituted *)
+        | Endtag of string (* name read after the opening < / sequence *)
+        | Eof (* end of input *)
 
 let last_pos = ref 0
 and current_line = ref 0
@@ -48,39 +48,40 @@ let tmp = Buffer.create 200
 let idents = Hashtbl.create 0
 
 let _ = begin
-	Hashtbl.add idents "gt;" ">";
-	Hashtbl.add idents "lt;" "<";
-	Hashtbl.add idents "amp;" "&";
-	Hashtbl.add idents "apos;" "'";
-	Hashtbl.add idents "quot;" "\"";
+        Hashtbl.add idents "nbsp;" " "; (* NOTE(review): the replacement looks like a plain space, not U+00A0 - confirm this is intended *)
+        Hashtbl.add idents "gt;" ">"; (* keys keep the trailing ; because the entity rule looks up its whole lexeme *)
+        Hashtbl.add idents "lt;" "<";
+        Hashtbl.add idents "amp;" "&";
+        Hashtbl.add idents "apos;" "'";
+        Hashtbl.add idents "quot;" "\"";
 end
 
 let init lexbuf =
-	current_line := 1;
-	current_line_start := lexeme_start lexbuf;
-	last_pos := !current_line_start
+        current_line := 1; (* resets position tracking: line numbers start at 1 *)
+        current_line_start := lexeme_start lexbuf; (* line start is taken from the current lexeme start offset *)
+        last_pos := !current_line_start (* last recorded position starts at the same offset *)
 
 let close lexbuf =
-	Buffer.reset tmp
+        Buffer.reset tmp (* empties the module-level tmp buffer; the lexbuf argument is not used *)
 
 let pos lexbuf =
-	!current_line ,	!current_line_start ,
-	!last_pos ,
-	lexeme_start lexbuf
+        !current_line , !current_line_start , (* snapshot: current line number, offset where that line starts, *)
+        !last_pos , (* last recorded position, *)
+        lexeme_start lexbuf (* and start offset of the current lexeme; the type pos alias has this shape *)
 
 let restore (cl,cls,lp,_) =
-	current_line := cl;
-	current_line_start := cls;
-	last_pos := lp
+        current_line := cl; (* writes back the first three fields of a tuple shaped like the result of pos *)
+        current_line_start := cls;
+        last_pos := lp (* the fourth field is ignored *)
 
 let newline lexbuf =
-	incr current_line;
-	last_pos := lexeme_end lexbuf;
-	current_line_start := !last_pos
+        incr current_line; (* called by the rules each time a line break has been matched *)
+        last_pos := lexeme_end lexbuf; (* the new line begins right after the matched line break *)
+        current_line_start := !last_pos
 
 let error lexbuf e =
-	last_pos := lexeme_start lexbuf;
-	raise (Error e)
+        last_pos := lexeme_start lexbuf; (* record where the offending lexeme starts before raising *)
+        raise (Error e) (* never returns normally *)
 
 }
 
@@ -92,100 +93,100 @@ let entitychar = ['A'-'Z' 'a'-'z']
 let pcchar = [^ '\r' '\n' '<' '>' '&']
 
 rule token = parse
-	| newline | (newline break) | break
-		{
-			newline lexbuf;
+        | newline | (newline break) | break (* any matched line break is counted and returned as a one-newline PCData *)
+                {
+                        newline lexbuf;
                         PCData "\n"
-		}
-	| "<!--"
-		{
-			last_pos := lexeme_start lexbuf;
-			comment lexbuf;
-			token lexbuf
-		}
-	| "<?"
-		{
-			last_pos := lexeme_start lexbuf;
-			header lexbuf;
-			token lexbuf;
-		}
-	| '<' space* '/' space*
-		{
-			last_pos := lexeme_start lexbuf;
-			let tag = ident_name lexbuf in
-			ignore_spaces lexbuf;
-			close_tag lexbuf;
-			Endtag tag
-		}
-	| '<' space*
-		{
-			last_pos := lexeme_start lexbuf;
-			let tag = ident_name lexbuf in
-			ignore_spaces lexbuf;
-			let attribs, closed = attributes lexbuf in
-			Tag(tag, attribs, closed)
-		}
-	| "&#"
-		{
-			last_pos := lexeme_start lexbuf;
-			Buffer.reset tmp;
-			Buffer.add_string tmp (lexeme lexbuf);
-			PCData (pcdata lexbuf)
-		}
-	| '&'
-		{
-			last_pos := lexeme_start lexbuf;
-			Buffer.reset tmp;
-			Buffer.add_string tmp (entity lexbuf);
-			PCData (pcdata lexbuf)
-		}
-	| pcchar+
-		{
-			last_pos := lexeme_start lexbuf;
-			Buffer.reset tmp;
-			Buffer.add_string tmp (lexeme lexbuf);
-			PCData (pcdata lexbuf)
-		}
-	| eof { Eof }
-	| _
-		{ error lexbuf ENodeExpected }
+                }
+        | "<!--" (* comment: consumed by the comment rule, no token is produced for it *)
+                {
+                        last_pos := lexeme_start lexbuf;
+                        comment lexbuf;
+                        token lexbuf (* resume with whatever follows the comment *)
+                }
+        | "<?" (* header: consumed up to ?> by the header rule, no token is produced for it *)
+                {
+                        last_pos := lexeme_start lexbuf;
+                        header lexbuf;
+                        token lexbuf;
+                }
+        | '<' space* '/' space* (* end tag: name, optional blanks, then a mandatory > *)
+                {
+                        last_pos := lexeme_start lexbuf;
+                        let tag = ident_name lexbuf in
+                        ignore_spaces lexbuf;
+                        close_tag lexbuf;
+                        Endtag tag
+                }
+        | '<' space* (* start tag: name followed by its attribute list *)
+                {
+                        last_pos := lexeme_start lexbuf;
+                        let tag = ident_name lexbuf in
+                        ignore_spaces lexbuf;
+                        let attribs, closed = attributes lexbuf in (* attributes also consumes the terminating > or /> *)
+                        Tag(tag, attribs, closed)
+                }
+        | "&#" (* these two characters are copied verbatim into the text; nothing is decoded here *)
+                {
+                        last_pos := lexeme_start lexbuf;
+                        Buffer.reset tmp;
+                        Buffer.add_string tmp (lexeme lexbuf);
+                        PCData (pcdata lexbuf)
+                }
+        | '&' (* entity: the text starts with whatever the entity rule returns *)
+                {
+                        last_pos := lexeme_start lexbuf;
+                        Buffer.reset tmp;
+                        Buffer.add_string tmp (entity lexbuf);
+                        PCData (pcdata lexbuf)
+                }
+        | pcchar+ (* plain text run *)
+                {
+                        last_pos := lexeme_start lexbuf;
+                        Buffer.reset tmp; (* tmp accumulates the text of this PCData token *)
+                        Buffer.add_string tmp (lexeme lexbuf);
+                        PCData (pcdata lexbuf) (* pcdata keeps appending to tmp and returns its contents *)
+                }
+        | eof { Eof }
+        | _ (* any character not handled by the cases above *)
+                { error lexbuf ENodeExpected }
 
 and ignore_spaces = parse
         | newline | (newline break) | break
-		{
-			newline lexbuf;
-			ignore_spaces lexbuf
-		}
-	| space +
-		{ ignore_spaces lexbuf }
-	| ""
-		{ () }
+                {
+                        newline lexbuf; (* line breaks are skipped too, but still counted *)
+                        ignore_spaces lexbuf
+                }
+        | space + (* skip a run of blanks and keep going *)
+                { ignore_spaces lexbuf }
+        | "" (* empty match: nothing left to skip, stop without consuming input *)
+                { () }
 
 and comment = parse
         | newline | (newline break) | break
-		{
-			newline lexbuf;
-			comment lexbuf
-		}
-	| "-->"
-		{ () }
-	| eof
-		{ raise (Error EUnterminatedComment) }
-	| _
-		{ comment lexbuf }
+                {
+                        newline lexbuf; (* keep line tracking up to date inside the comment *)
+                        comment lexbuf
+                }
+        | "-->" (* end of comment: return to the caller *)
+                { () }
+        | eof (* raised directly, so last_pos keeps the value set by the caller *)
+                { raise (Error EUnterminatedComment) }
+        | _ (* any other character is discarded *)
+                { comment lexbuf }
 
 and header = parse
         | newline | (newline break) | break
-		{
-			newline lexbuf;
-			header lexbuf
-		}
-	| "?>"
-		{ () }
-	| eof
-		{ error lexbuf ECloseExpected }
-	| _
-		{ header lexbuf }
+                {
+                        newline lexbuf; (* keep line tracking up to date inside the header *)
+                        header lexbuf
+                }
+        | "?>" (* end of header: return to the caller *)
+                { () }
+        | eof (* input ended before ?> was seen *)
+                { error lexbuf ECloseExpected }
+        | _ (* any other character is discarded *)
+                { header lexbuf }
 
 and pcdata = parse
         | newline | (newline break) | break
@@ -194,112 +195,112 @@ and pcdata = parse
                         newline lexbuf;
                         pcdata lexbuf
                 }
-	| pcchar+
-		{
-			Buffer.add_string tmp (lexeme lexbuf);
-			pcdata lexbuf
-		}
-	| "&#"
-		{
-			Buffer.add_string tmp (lexeme lexbuf);
-			pcdata lexbuf;
-		}
-	| '&'
-		{
-			Buffer.add_string tmp (entity lexbuf);
-			pcdata lexbuf
-		}
-	| ""
-		{ Buffer.contents tmp }
+        | pcchar+
+                {
+                        Buffer.add_string tmp (lexeme lexbuf);
+                        pcdata lexbuf
+                }
+        | "&#"
+                {
+                        Buffer.add_string tmp (lexeme lexbuf);
+                        pcdata lexbuf;
+                }
+        | '&'
+                {
+                        Buffer.add_string tmp (entity lexbuf);
+                        pcdata lexbuf
+                }
+        | ""
+                { Buffer.contents tmp }
 
 and entity = parse
-	| entitychar+ ';'
-		{
-			let ident = lexeme lexbuf in
-			try
-				Hashtbl.find idents (String.lowercase ident)
-			with
-				Not_found -> "&" ^ ident
-		}
-	| _ | eof
-		{ raise (Error EUnterminatedEntity) }
+        | entitychar+ ';' (* callers invoke this rule right after matching the & *)
+                {
+                        let ident = lexeme lexbuf in (* the lexeme includes the trailing ; *)
+                        try
+                                Hashtbl.find idents (String.lowercase ident) (* lookup ignores letter case *)
+                        with
+                                Not_found -> "&" ^ ident (* unknown entities are returned unchanged, & included *)
+                }
+        | _ | eof (* anything else after & is an error *)
+                { raise (Error EUnterminatedEntity) }
 
 and ident_name = parse
-	| identchar+
-		{ lexeme lexbuf }
-	| _ | eof
-		{ error lexbuf EIdentExpected }
+        | identchar+ (* reads a tag name and returns it as matched *)
+                { lexeme lexbuf }
+        | _ | eof (* no identchar at this position *)
+                { error lexbuf EIdentExpected }
 
 and close_tag = parse
-	| '>'
-		{ () }
-	| _ | eof
-		{ error lexbuf ECloseExpected }
+        | '>' (* consumes the > that terminates an end tag *)
+                { () }
+        | _ | eof (* anything else is rejected *)
+                { error lexbuf ECloseExpected }
 
 and attributes = parse
-	| '>'
-		{ [], false }
-	| "/>"
-		{ [], true }
-	| "" (* do not read a char ! *)
-		{
-			let key = attribute lexbuf in
-			let data = attribute_data lexbuf in
-			ignore_spaces lexbuf;
-			let others, closed = attributes lexbuf in
-			(key, data) :: others, closed
-		}
+        | '>' (* end of start tag: no more attributes, tag not self-closed *)
+                { [], false }
+        | "/>" (* end of start tag: no more attributes, tag self-closed *)
+                { [], true }
+        | "" (* do not read a char ! *)
+                {
+                        let key = attribute lexbuf in (* one name = value pair ... *)
+                        let data = attribute_data lexbuf in
+                        ignore_spaces lexbuf;
+                        let others, closed = attributes lexbuf in (* ... then recurse for the remaining ones *)
+                        (key, data) :: others, closed (* pairs are returned in source order *)
+                }
 
 and attribute = parse
-	| identchar+
-		{ lexeme lexbuf }
-	| _ | eof
-		{ error lexbuf EAttributeNameExpected }
+        | identchar+ (* reads an attribute name and returns it as matched *)
+                { lexeme lexbuf }
+        | _ | eof (* no identchar at this position *)
+                { error lexbuf EAttributeNameExpected }
 
 and attribute_data = parse
-	| space* '=' space* '"'
-		{
-			Buffer.reset tmp;
-			last_pos := lexeme_end lexbuf;
-			dq_string lexbuf
-		}
-	| space* '=' space* '\''
-		{
-			Buffer.reset tmp;
-			last_pos := lexeme_end lexbuf;
-			q_string lexbuf
-		}
-	| _ | eof
-		{ error lexbuf EAttributeValueExpected }
+        | space* '=' space* '"' (* value delimited by double quotes *)
+                {
+                        Buffer.reset tmp; (* tmp accumulates the value text *)
+                        last_pos := lexeme_end lexbuf; (* position just after the opening quote *)
+                        dq_string lexbuf
+                }
+        | space* '=' space* '\'' (* value delimited by single quotes *)
+                {
+                        Buffer.reset tmp;
+                        last_pos := lexeme_end lexbuf;
+                        q_string lexbuf
+                }
+        | _ | eof (* no = followed by an opening quote at this position *)
+                { error lexbuf EAttributeValueExpected }
 
 and dq_string = parse
-	| '"'
-		{ Buffer.contents tmp }
-	| '\\' [ '"' '\\' ]
-		{
-			Buffer.add_char tmp (lexeme_char lexbuf 1);
-			dq_string lexbuf
-		}
-	| eof
-		{ raise (Error EUnterminatedString) }
-	| _
-		{ 
-			Buffer.add_char tmp (lexeme_char lexbuf 0);
-			dq_string lexbuf
-		}
+        | '"' (* closing quote: return the accumulated value *)
+                { Buffer.contents tmp }
+        | '\\' [ '"' '\\' ] (* backslash escape of the delimiter or of a backslash *)
+                {
+                        Buffer.add_char tmp (lexeme_char lexbuf 1); (* keep only the escaped character *)
+                        dq_string lexbuf
+                }
+        | eof (* input ended before the closing quote *)
+                { raise (Error EUnterminatedString) }
+        | _ (* any other character is copied as-is; entities are not expanded here *)
+                {
+                        Buffer.add_char tmp (lexeme_char lexbuf 0);
+                        dq_string lexbuf
+                }
 
 and q_string = parse
-	| '\''
-		{ Buffer.contents tmp }
-	| '\\' [ '\'' '\\' ]
-		{
-			Buffer.add_char tmp (lexeme_char lexbuf 1);
-			q_string lexbuf
-		}
-	| eof
-		{ raise (Error EUnterminatedString) }
-	| _
-		{ 
-			Buffer.add_char tmp (lexeme_char lexbuf 0);
-			q_string lexbuf
-		}
+        | '\'' (* closing quote: return the accumulated value *)
+                { Buffer.contents tmp }
+        | '\\' [ '\'' '\\' ] (* backslash escape of the delimiter or of a backslash *)
+                {
+                        Buffer.add_char tmp (lexeme_char lexbuf 1); (* keep only the escaped character *)
+                        q_string lexbuf
+                }
+        | eof (* input ended before the closing quote *)
+                { raise (Error EUnterminatedString) }
+        | _ (* any other character is copied as-is; entities are not expanded here *)
+                {
+                        Buffer.add_char tmp (lexeme_char lexbuf 0);
+                        q_string lexbuf
+                }
-- 
cgit v1.2.3