From c58220701ada0ccca85cd290a57b3e2277537b75 Mon Sep 17 00:00:00 2001 From: "Hoa V. DINH" Date: Tue, 21 Oct 2014 20:50:13 -0700 Subject: Implement unit tests for message builder and message parsers --- unittest/data/parser/input/mbox/jwz/144 | 857 ++++++++++++++++++++++++++++++++ 1 file changed, 857 insertions(+) create mode 100644 unittest/data/parser/input/mbox/jwz/144 (limited to 'unittest/data/parser/input/mbox/jwz/144') diff --git a/unittest/data/parser/input/mbox/jwz/144 b/unittest/data/parser/input/mbox/jwz/144 new file mode 100644 index 00000000..56e36af3 --- /dev/null +++ b/unittest/data/parser/input/mbox/jwz/144 @@ -0,0 +1,857 @@ +Return-Path: +Received: from nevada.bellcore.com by greenbush.bellcore.com (4.1/4.7) + id for nsb; Mon, 8 Jun 92 15:57:08 EDT +Received: by nevada.bellcore.com (4.1/4.7) + id for nsb@greenbush; Mon, 8 Jun 92 15:57:01 EDT +Received: from Messages.8.5.N.CUILIB.3.45.SNAP.NOT.LINKED.nevada.galaxy.sun4.41 + via MS.5.6.nevada.galaxy.sun4_41; + Mon, 8 Jun 1992 15:56:59 -0400 (EDT) +Message-Id: <0eAvi=C0M2U=0U4XZI@thumper.bellcore.com> +Date: Mon, 8 Jun 1992 15:56:59 -0400 (EDT) +From: Darren New +X-Andrew-Message-Size: 8538+4 +Mime-Version: 1.0 +Content-Type: multipart/alternative; + boundary="Interpart.Boundary.keAvi9O0M2U=AU4XRw" +To: Nathaniel Borenstein +Subject: Fwd: revised MIME architecture +References: <9206062231.AA23867@pixel.convex.com> + +> THIS IS A MESSAGE IN 'MIME' FORMAT. Your mail reader does not support MIME. +> Please read the first section, which is plain text, and ignore the rest. + +--Interpart.Boundary.keAvi9O0M2U=AU4XRw +Content-type: text/plain; charset=US-ASCII + + +I am on the WWW mailing list and got this interesting tidbit of which I +thought you should be aware... + -- Darren + + + +Return-Path: +Received: from thumper.bellcore.com by nevada.bellcore.com (4.1/4.7) + id for dnew; Mon, 8 Jun 92 15:50:28 EDT +Received: from udel.edu (louie.udel.edu) by thumper.bellcore.com (4.1/4.7) + id for dnew@nevada; Mon, 8 Jun 92 15:50:25 EDT +Received: from snow-white.ee.udel.edu by louie.udel.edu id ah01113; + 8 Jun 92 15:45 EDT +Received: from louie.udel.edu by snow-white.ee.udel.edu id aa01403; + 8 Jun 92 18:34 GMT +Received: from CEARN.cern.ch by pucc.Princeton.EDU (IBM VM SMTP V2R2) + with BSMTP id 0017; Sat, 06 Jun 92 18:56:24 EDT +Received: from CEARN by CEARN.cern.ch (Mailer R2.07B) with BSMTP id 7873; Sun, + 07 Jun 92 00:35:28 SET +Received: from dxmint.cern.ch by CEARN.cern.ch (IBM VM SMTP V2R1) with TCP; + Sun, 07 Jun 92 00:35:21 SET +Received: by dxmint.cern.ch (dxcern) (5.57/3.14) + id AA02301; Sun, 7 Jun 92 00:34:37 +0200 +Received: from dxmint.cern.ch by nxoc01.cern.ch (NeXT-1.0 (From Sendmail + 5.52)/NeXT-2.0) + id AA08529; Sun, 7 Jun 92 00:34:17 MET DST +Received: by dxmint.cern.ch (dxcern) (5.57/3.14) + id AA02281; Sun, 7 Jun 92 00:32:19 +0200 +Received: from pixel.convex.com by convex.convex.com (5.64/1.35) + id AA04364; Sat, 6 Jun 92 17:32:01 -0500 +Received: from localhost by pixel.convex.com (5.64/1.28) + id AA23867; Sat, 6 Jun 92 17:31:59 -0500 +Message-Id: <9206062231.AA23867@pixel.convex.com> +To: www-interest@nxoc01.cern.ch +Subject: revised MIME architecture +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary=cut-here +Date: Sat, 06 Jun 92 17:31:58 CDT +From: Dan Connolly +Resent-Date: Mon, 8 Jun 92 18:34:59 GMT +Resent-From: new@ee.udel.edu +Resent-To: dnew@thumper.bellcore.com + +In an earlier message, I proposed we make the W3 project +interoperate with MIME systems. I made the mistake +of using existing names for formats and types that +don't yet exist. + +I'd like to make a more organized transition to MIME +interoperability. + +First, we define some types for existing web servers +and documents. + +X-HTTP is an access-type for message/external-body body +parts to access existing W3 servers. +Additional parameters include host, port, path, and anchor. + +X-HTML is a subtype of text for existing W3 documents. + +So the next part of this message is an HTML document expressed +as a MIME external-body message. + + +[An Andrew ToolKit view (mailobjv) was included here, but could not be +displayed.]Then we address limitations in the existing format with two +new types: + +In order to encapsulate multimedia objects in web nodes, +we define X-HYPERTEXT to be a subtype of the multipart body type. +The first part of a multipart/X-HYPERTEXT is the content of the hypertext. +The other parts are multimedia attachments and links to other documents. + +The user agent (WWW client) displays the first part and allows the +user to choose attachments and/or links. The attachments and links +will be displayed in addition to or in place of the original content. + +Then, in order to formalize the structure of hypertext parts, +we define X-SGML to be a subtype of text. The body of an X-SGML part must +be a complete SGML document. The user agent (WWW client) will resolve +external entities (such as the DTD and the mutlimedia attachments). + +So here's a multimedia web node expressed as MIME body part: + + +[An Andrew ToolKit view (mailobjv) was included here, but could not be +displayed.] +[An Andrew ToolKit view (mailobjv) was included here, but could not be +displayed.] +[An Andrew ToolKit view (mailobjv) was included here, but could not be +displayed.]And here's the DTD for WEB-NODE documents: + + + + + + + + + + + + + + + + + +"> + + + +"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +And here's a perl script to convert an HTML document +into a multipart/X-HYPERTEXT MIME body part: + +#!/usr/local/bin/perl + +$boundary = "attachment"; +print "Content-Type: multipart/X-HYPERTEXT; boundary=$boundary¥n¥n"; + +print "--$boundary¥n"; +print "Content-Type: text/SGML¥n¥n"; + +print "; # read whole file +$_ = join('', @html); +$out = ''; + +sub fix_anchor{ + local($name, $href, $type); + + # What exactly is the syntax of an SGML attribute value? + while(s/^(¥w+)¥s*=¥s*((¥"[^¥"]*¥")|([^¥s>]+))¥s*//){ + local($v) = ($3 || $4); + local($a) = $1; + $href = $v if $a =‾ /^href$/i; + $name = $v if $a =‾ /^name$/i; + $type = $v if $a =‾ /^type$/i; + } + s/[^>]*>//; + + $out .= "//){ + $out .= ""; + }elsif(s/^H(¥d)>//){ + local($n) = $1; + while($n<=$header){ $out .= ""; $header--; } + while($n>$header){ $out .= "
"; $header++; } + $out .= ""; + }else{ + $out .= '<'; + } +} + +$out .= $_; + +foreach(keys %anchor){ + local($ent) = $anchor{$_}; + + print "¥n"; +} + +print "]>¥n", $out; + +foreach(keys %anchor){ + local($access_type); + + print "¥n¥n--$boundary¥n"; + print "Content-id: $_¥n"; + print "Content-type: message/external-body¥n"; + + $access_type = $1 if s/^(¥w+)://; + + if(s/#([^#]+)$//){ + print "¥t;x-anchor=¥"$1¥"¥n"; + } + + if($access_type =‾ /file/i){ + if(&hostport){ + ¶m('access-type', "ANON-FTP"); + }else{ + ¶m('access-type', 'LOCAL-FILE'); + } + ¶m('name', $_); + + print "¥nContent-Type: application/octet-stream¥n¥n"; + }elsif($access_type =‾ /http/i){ + ¶m('access-type', 'X-HTTP'); + &hostport; + &unescape; + ¶m('name', $_); + + print "¥nContent-Type: text/X-HTML¥n¥n"; + }elsif($access_type =‾ /news/i){ + ¶m('access-type', 'X-NEWS'); + &unescape; + if(/@/){ + ¶m('message-id', $_); + }else{ + ¶m('group', $_); + } + + print "¥nContent-Type: message¥n¥n"; + + }elsif($access_type =‾ /telnet/i){ + ¶m('access-type', 'x-telnet'); + &unescape; + ¶m('user', $1) if s/^(.*)@//; + ¶m('port', $1) if s/:(.*)$//; + ¶m('site', $_); + + print "¥nContent-Type: X-TELNET¥n¥n"; + + }elsif($access_type =‾ /gopher/i){ + ¶m('access-type', 'x-gopher'); + &hostport; + ¶m('type', $1) if s-^/(¥d+)/--; + &unescape; + ¶m('selector', $_); + + print "¥nContent-Type: @@@@¥n¥n"; + + }elsif($access_type =‾ /wais/i){ + ¶m('access-type', 'x-wais'); + &hostport; + if(m-^/-){ + ¶m('type', $1) if s-^/(¥w+)--; + ¶m('size', $1) if s-^/(¥d+)--; + &unescape; + ¶m('path', $_); + }else{ + &unescape; + ¶m('words', $1) if /¥?(.*)/; + } + + $type = "image/$type" if $type =‾ /gif|tiff/i; + $type = "application/postscript" if $type =‾ /PS/i; + + print "¥nContent-Type: $type¥n¥n"; + + }elsif($access_type eq ""){ + ¶m('access-type', 'x-relative'); + &unescape; + ¶m('name', $_); + + print "¥nContent-Type: message¥n¥n"; + }else{ + warn "unknown access type: $access_type in $_"; + } +} + +print "--$boundary--¥n"; + +sub unescape{ + s/%(¥w¥w)/sprintf("%c",hex($1))/ge; +} + +sub param{ + local($p, $v) = @_; + # quote tspecials in parameter values + $v = '"'.$v.'"' if $v =‾ m-[¥s()<>@,;:¥¥¥"¥/¥[¥]?¥.=]-; + print "¥t;$p=$v¥n"; +} + +sub hostport{ + if(s-//([^:/]+)--){ + ¶m('host', $1); + ¶m('port', $1) if s/:(¥d+)//; + 1; + }else{ + 0; + } +} + + +--Interpart.Boundary.keAvi9O0M2U=AU4XRw +Content-Type: multipart/mixed; + boundary="Alternative.Boundary.keAvi9O0M2U=8U4XMt" + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: text/richtext; charset=US-ASCII +Content-Transfer-Encoding: quoted-printable + + +I am on the WWW mailing list and got this interesting tidbit of which I though= +t you should be aware... + -- Darren + + + +Return-Path: new@ee.udel.edu> +Received: from thumper.bellcore.com by nevada.bellcore.com (4.1/4.7) + id AA08378> for dnew; Mon, 8 Jun 92 15:50:28 EDT +Received: from udel.edu (louie.udel.edu) by thumper.bellcore.com (4.1/4.7)= + + id AA01533> for dnew@nevada; Mon, 8 Jun 92 15:50:25 EDT +Received: from snow-white.ee.udel.edu by louie.udel.edu id ah01113; + 8 Jun 92 15:45 EDT +Received: from louie.udel.edu by snow-white.ee.udel.edu id aa01403; + 8 Jun 92 18:34 GMT +Received: from CEARN.cern.ch by pucc.Princeton.EDU (IBM VM SMTP V2R2) + with BSMTP id 0017; Sat, 06 Jun 92 18:56:24 EDT +Received: from CEARN by CEARN.cern.ch (Mailer R2.07B) with BSMTP id 7873; Sun,= + + 07 Jun 92 00:35:28 SET +Received: from dxmint.cern.ch by CEARN.cern.ch (IBM VM SMTP V2R1) with TCP; + Sun, 07 Jun 92 00:35:21 SET +Received: by dxmint.cern.ch (dxcern) (5.57/3.14) + id AA02301; Sun, 7 Jun 92 00:34:37 +0200 +Received: from dxmint.cern.ch by nxoc01.cern.ch (NeXT-1.0 (From Sendmail= + + 5.52)/NeXT-2.0) + id AA08529; Sun, 7 Jun 92 00:34:17 MET DST +Received: by dxmint.cern.ch (dxcern) (5.57/3.14) + id AA02281; Sun, 7 Jun 92 00:32:19 +0200 +Received: from pixel.convex.com by convex.convex.com (5.64/1.35) + id AA04364; Sat, 6 Jun 92 17:32:01 -0500 +Received: from localhost by pixel.convex.com (5.64/1.28) + id AA23867; Sat, 6 Jun 92 17:31:59 -0500 +Message-Id: 9206062231.AA23867@pixel.convex.com> +To: www-interest@nxoc01.cern.ch +Subject: revised MIME architecture +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary=3Dcut-here +Date: Sat, 06 Jun 92 17:31:58 CDT +From: Dan Connolly connolly@pixel.convex.com> +Resent-Date: Mon, 8 Jun 92 18:34:59 GMT +Resent-From: new@ee.udel.edu +Resent-To: dnew@thumper.bellcore.com + +In an earlier message, I proposed we make the W3 project +interoperate with MIME systems. I made the mistake +of using existing names for formats and types that +don't yet exist. + +I'd like to make a more organized transition to MIME +interoperability. + +First, we define some types for existing web servers +and documents. + +X-HTTP is an access-type for message/external-body body +parts to access existing W3 servers. +Additional parameters include host, port, path, and anchor. + +X-HTML is a subtype of text for existing W3 documents. + +So the next part of this message is an HTML document expressed +as a MIME external-body message. + + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: message/external-body; + access-type="X-HTTP"; + host="info.cern.ch"; + port="2784"; + path="/hypertext/WWW/TheProject.html" +Content-Description: Object of type 'message/external-body; + +Content-type: text/X-HTML + + + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: text/richtext; charset=US-ASCII +Content-Transfer-Encoding: quoted-printable + + +Then we address limitations in the existing format with two +new types: + +In order to encapsulate multimedia objects in web nodes, +we define X-HYPERTEXT to be a subtype of the multipart body type. +The first part of a multipart/X-HYPERTEXT is the content of the hypertext.= + +The other parts are multimedia attachments and links to other documents. + +The user agent (WWW client) displays the first part and allows the +user to choose attachments and/or links. The attachments and links +will be displayed in addition to or in place of the original content. + +Then, in order to formalize the structure of hypertext parts, +we define X-SGML to be a subtype of text. The body of an X-SGML part must +be a complete SGML document. The user agent (WWW client) will resolve +external entities (such as the DTD and the mutlimedia attachments). + +So here's a multimedia web node expressed as MIME body part: + + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: text/SGML +Content-Description: Object of type 'text/SGML +Content-Transfer-Encoding: quoted-printable + + + +]> +Sample mutlimedia web node +

Old features

+Here's a link to some info at cern: + cern stuff +

New features

+Here's a picture: + + + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: text/richtext; charset=US-ASCII +Content-Transfer-Encoding: quoted-printable + + + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: message/external-body ; + access-type="X-HTTP "; + host="info.cern.ch"; + name="/hypertext/WWW/TheProject.html" +Content-Description: Object of type 'message/external-body + +Content-Type: text/X-HTML + + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: text/richtext; charset=US-ASCII +Content-Transfer-Encoding: quoted-printable + + + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: image/gif +Content-Description: Object of type 'image/gif +Content-Transfer-Encoding: base64 + +R0lGODdhdQAvAIQAAL9/v3+ff39/f/+/f/+ff/9/f3+fv///v39/v//fv/+/v/+fv/9/v7+/ +f7+ff79/f//////f/7/fv7+/v7+fvwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +ACwAAAAAdQAvAAAF/uARHQkkiuMoQmZUnixUvkfNHjKbivM75riEMIa7xYiQXYvH/CVIMuhz +5gK6nsuabUYawqamcCQJvOGKQZnz9oues9rge/wOy4asWdQpj6r1EWNVRSVKT3RLYXWEbIot +jjw6J21jZEdaOycleVCTQYc9PXQ0nSMvXC4qg46QPzA4bWdKNXpvtFpPK4g+d4W9vUIKXHk7 +p0m2KSYJV0JrejmNrJp7dkLNwtbLPKmXLc3DUd8JCuTl2MzZh37ariuHnmVAUCZFpndXRi48 +2MvLKdauikWyBsyauXIUFEwgN65cQyFSfHhyZ+RFHlaPSFCEJgtWM45hapUhEWjIpn4N/oUp +VEBhAAUCAwgsaKCA5sIJFG4qGLOpkx0Vyqz4PHkCVtBOIm040iNxEDeSmnz463dlwoKFORU4 +WKlw5oKtE6Z4wYelip0iZ6GdFIO27TGeOVyFzNiOjj01Upb0c5gym06rDVbhQRrU0MW5i5Ag +MiJtLq83hfLeeZTr3x9xKJlhU9il54rPSNLY8JkmIxpCtogo4ZiGhpF5RSMXY4oys8FxFFIB +HC3nSzXVae1AE8721cUaldrClT0pRw/nk9/ppjrB6gCNASNLYmURdOq0trQfOU4GeVBIqV85 ++SeqC3aDCgjEPzbxVEd7HefA+u72LO/XZ40CHVyGrOFJMTxd/rENVgoM8E5ZjyTiVizHAFiG +RcGJIdx+QEGiiEVS4SWRP51dRoICCwxwlR9u5GJcLvRU5OGGMrYSIy0WMreDXVTkJ6IK7EBE +RUMyOcCDdG8hh9YmErIhmlJoIMaYDq/JlVhHWeiD1xc7bpPNSgkJ0weM2mmkDGmJ/FQljegh +AxlbUgKyhWXNHPlgkAmkqNUC0WmixYXdGKfknyKxiKFbUe6SRkC2UHRGT2Bcho5VFDjQkgPt +VTFIJLck8ligH+boKYBbvIYlPCGGs9wU+qRCQaUvvYqpfVxwOlx3WWLnnRngNdYTIrKMtFQW +JgVV7CEKTUCApcwye4U+AGWjV16H/oYWbJSjMZZtRo29WQsWRFVhEjsrESArrLAaKaQJKFpa +XVilZYtKKRk1VaFo5GXbh3429lBUbMoksJC7lprrwAMUPGBpOuM4gJO7K42DGT1AFdWhLONI +AA8MHu7nJKOZKOkGxaxilVPBsCr8KsJ9yVRpdczilFN1yQ4k7pH2CPHuTVigGa9cuHbHcRyP +WuTCw+mmvLLCCDfoNLqWNoA0sy/Di9SJEryrCtIUNEABROAx2jHQzrkYEooviSnwAklbirAD +cK+csDkGc02wrEizw0N1FFw1E82VKuQ13z2j5uG8cqiyhdksQV3pV0knXCnCABwMt8IDKDv1 +yefGTfXL/sHgxrnMr5KeU+kJ/Qfjf1Iq2U9nOLBkaeSOU90sAJI33XnblRpAQQAUCHDu6TjJ +7O7LXb/ssAMzvUwOLl5gzJAxKOnE8OxUK9xsyg4AgPADlTsgQPcBPDCAVo6nW3nllTpggO1J +TzA7uvQTD6uYLqCoQOzk8L3QMhErHur6Foy2+Q5q7Gtf+QLgAAR072DB413cHvC+CiLMgQAA +nqXeF7m4tc948nscTqRmDpqhDXtVW96r+gY/k1EtAOH7HaxgKIAHIIB93useBR8APqXNTng6 +lJzjfGcp4B1wfkjcnfzmF0JYTcBrJxPg/CQoK/g9MGEG+F7CPNe98TXwAeUD/iMYPRc8ZgmP +gb9DIdyKWEXgyfBzn6Pf5wY3Oyh2MH27U58G35i0ytkQbgCoIQQF4L1Aeq+MwXsb7oRnQAo4 +EIu4S5oR4WjF4U0RfrWrIhwrlcGEZfCQ32ug+PwYSB0igIfiCyMiEfa7DKrsbZy8ovvWeMZ0 +MYuIllwhCtu3Sypu8FwMrBwG3Za9B76NgoAU4xdPaUOVCW98fsQiGWdJQ08iIJadFGbc0IjG +9qXxm7ubpi7f98AsUqBy5oTbI2uITMkdLIuBBKP3Tmk5dhLScq7M4vuaCUMdhg+Z6mQjGGG1 +PsmFz3dunJ3ChIcw4T2SlQzEYOX6Wal+ds+Bf6TA/vu8Vz6GRpOH5fsiIeUpgGdCkGl/dFv5 +3nfKLNqQoeVjX0oxSk7fKSyHNJVVJyf4PoqykX3Q/N4+J+c9kK7vAeMjZPlOGc8dEpKdBzvl ++FoaVXkmbHyT46A8F7rTg17wpuaUXBYP9smLihJ3DwRl+XL3O6YlMosRjar4PGlP8fEwizW0 +5x+dejBBjlQAwEPqU+PJUbSCT56n3KKlxoewiKqskwpzIAQP1lD33bStZbzgR78HPsZWTpB7 +DWQikeq9GkoVqZ1N7D1hyEOGXm6WMFWqDi8HQ3MeM4ZbJSbu5DlUhY21sgAY61zHh9cyltK0 +zcwrakva2uX+0ZVVHaU8/jsaSGYmN4el/aIB/OhMZDbznAEFIjQvV6lhljV86wviTQfpSaYx +NpFF7SxqmancZlZVnmSdnDq3ODkHCg+90y2lJzXoRQwmDIMMdSA92SdLoJLPrPoEpPscqdLK +PhW1RWXnXW+KRmO6koKHPKkiiXlK1qZ1h/784hbXy1/wVVim+nzlA8mrxdemNWHr5eFV7dpM +t57UdkB2p051O1nyNfam2luwVcUYQe2prFI1HKtMNRpZAPzzqmjl6EKv+sca4ni0hy3yiitV +ADIr9sprbB8pYfjlTt7ze/1cMgTZ7FJWTtbFUrZcA10JZaTqcKzDPCxyUVliPx/WXMtiAHkn +KchJNbO1sf/87YjLqsXLehmVS0urA0yMSmEyjZQPZOxhF2rovuZWACEAADs= + + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt +Content-type: text/richtext; charset=US-ASCII +Content-Transfer-Encoding: quoted-printable + + +And here's the DTD for WEB-NODE documents: + +!-- This DTD was produced by DeveGram on Tue Jun 2 18:58:16 1992 --> +!-- and hand-edited by connolly@convex.com --> + +!-- Parameter Entities --> + +!-- Terminal symbols --> + +!ENTITY % words "#PCDATA" > + +!-- Non-ELEMENT symbols --> + +!ENTITY % inline "%words | A" > +!ENTITY % text "%inline | P | IMAGE" > +!ENTITY % heading "H1|H2|H3|H4|H5|H6" > + +!ENTITY lt ""> +!ENTITY gt ">"> +!ENTITY amp "&"> + +!ENTITY lt. ""> +!ENTITY gt. ">"> +!ENTITY amp. "&"> + +!-- Document structure --> + +!ELEMENT WEB-NODE O O (TITLE, NEXTID?, ISINDEX?, section+, ADDRESS?)>= + + +!ELEMENT TITLE - - (%inline)+> +!ELEMENT ADDRESS - - (%text)+> + +!ELEMENT NEXTID - O EMPTY > +!ATTLIST NEXTID N NUMBER #IMPLIED> + +!ELEMENT ISINDEX - O EMPTY > + + +!ELEMENT section O O ((%heading)?, + ( + %text | + section | + MENU | + UL | + OL | + DIR | + DL)+)> + +!ELEMENT (H1|H2|H3|H4|H5|H6) - - (%inline) > + +!ELEMENT P - O EMPTY -- paragraph SEPARATOR --> + +!ELEMENT IMAGE - O EMPTY> +!ATTLIST IMAGE ATTACHMENT ENTITY #REQUIRED> + +!ELEMENT A - - (%inline)+> +!ATTLIST A + NAME CDATA #IMPLIED + HREF ENTITY #IMPLIED + TYPE CDATA #IMPLIED --@@-- > + +!ELEMENT MENU - - (LI+)> + +!ELEMENT UL - - (LI+)> + +!ELEMENT OL - - (LI+)> + +!ELEMENT DIR - - (LI+)> + +!ELEMENT LI - O (%text)+> + +!ELEMENT DL - - ((DT, DD)+)> + +!ELEMENT DT - O (%inline)+> + +!ELEMENT DD - O (%text)+> + +And here's a perl script to convert an HTML document +into a multipart/X-HYPERTEXT MIME body part: + +#!/usr/local/bin/perl + +$boundary =3D "attachment"; +print "Content-Type: multipart/X-HYPERTEXT; boundary=3D$boundary¥n¥n"; + +print "--$boundary¥n"; +print "Content-Type: text/SGML¥n¥n"; + +print "!DOCTYPE WEB-NODE SYSTEM ¥n[¥n"; + +@html =3D >; # read whole file +$_ =3D join('', @html); +$out =3D ''; + +sub fix_anchor{ + local($name, $href, $type); + + # What exactly is the syntax of an SGML attribute value? + while(s/^(¥w+)¥s*=3D¥s*((¥"[^¥"]*¥")|([^¥s>]+))¥s*//){ + local($v) =3D ($3 || $4); + local($a) =3D $1; + $href =3D $v if $a =3D‾ /^href$/i; + $name =3D $v if $a =3D‾ /^name$/i; + $type =3D $v if $a =3D‾ /^type$/i; + } + s/[^>]*>//; + + $out .=3D "A"; + $out .=3D " NAME=3D¥"$name¥"" if $name ne ''; + $out .=3D " TYPE=3D¥"$type¥"" if $type ne ''; + if($href ne ''){ + if(!defined($anchor{$href})){ + $anchor{$href} =3D ++$anchor; + } + $out .=3D " HREF=3D" . $anchor{$href}; + } + $out .=3D ">"; +} + +$header =3D 0; +$anchor =3D "UDI000"; +while(//){ + $out .=3D $`; + $_ =3D $'; + if(s/^A¥s+//i){ + &fix_anchor; + }elsif(s/^NEXTID¥s+(¥d+)¥s*>//){ + $out .=3D "NEXTID N=3D$1>"; + }elsif(s/^H(¥d)>//){ + local($n) =3D $1; + while($n=3D$header){ $out .=3D "/SECTION>"; $header--; } + while($n>$header){ $out .=3D "SECTION>"; $header++; } + $out .=3D "H$n>"; + }else{ + $out .=3D ''; + } +} + +$out .=3D $_; + +foreach(keys %anchor){ + local($ent) =3D $anchor{$_}; + + print "!ENTITY $ent SDATA ¥"$_¥">¥n"; +} + +print "]>¥n", $out; + +foreach(keys %anchor){ + local($access_type); + + print "¥n¥n--$boundary¥n"; + print "Content-id: $_¥n"; + print "Content-type: message/external-body¥n"; + + $access_type =3D $1 if s/^(¥w+)://; + + if(s/#([^#]+)$//){ + print "¥t;x-anchor=3D¥"$1¥"¥n"; + } + + if($access_type =3D‾ /file/i){ + if(&hostport){ + ¶m('access-type', "ANON-FTP"); + }else{ + ¶m('access-type', 'LOCAL-FILE'); + } + ¶m('name', $_); + + print "¥nContent-Type: application/octet-stream¥n¥n"; + }elsif($access_type =3D‾ /http/i){ + ¶m('access-type', 'X-HTTP'); + &hostport; + &unescape; + ¶m('name', $_); + + print "¥nContent-Type: text/X-HTML¥n¥n"; + }elsif($access_type =3D‾ /news/i){ + ¶m('access-type', 'X-NEWS'); + &unescape; + if(/@/){ + ¶m('message-id', $_); + }else{ + ¶m('group', $_); + } + + print "¥nContent-Type: message¥n¥n"; + + }elsif($access_type =3D‾ /telnet/i){ + ¶m('access-type', 'x-telnet'); + &unescape; + ¶m('user', $1) if s/^(.*)@//; + ¶m('port', $1) if s/:(.*)$//; + ¶m('site', $_); + + print "¥nContent-Type: X-TELNET¥n¥n"; + + }elsif($access_type =3D‾ /gopher/i){ + ¶m('access-type', 'x-gopher'); + &hostport; + ¶m('type', $1) if s-^/(¥d+)/--; + &unescape; + ¶m('selector', $_); + + print "¥nContent-Type: @@@@¥n¥n"; + + }elsif($access_type =3D‾ /wais/i){ + ¶m('access-type', 'x-wais'); + &hostport; + if(m-^/-){ + ¶m('type', $1) if s-^/(¥w+)--; + ¶m('size', $1) if s-^/(¥d+)--; + &unescape; + ¶m('path', $_); + }else{ + &unescape; + ¶m('words', $1) if /¥?(.*)/; + } + + $type =3D "image/$type" if $type =3D‾ /gif|tiff/i; + $type =3D "application/postscript" if $type =3D‾ /PS/i; + + print "¥nContent-Type: $type¥n¥n"; + + }elsif($access_type eq ""){ + ¶m('access-type', 'x-relative'); + &unescape; + ¶m('name', $_); + + print "¥nContent-Type: message¥n¥n"; + }else{ + warn "unknown access type: $access_type in $_"; + } +} + +print "--$boundary--¥n"; + +sub unescape{ + s/%(¥w¥w)/sprintf("%c",hex($1))/ge; +} + +sub param{ + local($p, $v) =3D @_; + # quote tspecials in parameter values + $v =3D '"'.$v.'"' if $v =3D‾ m-[¥s()>@,;:¥¥¥"¥/¥[¥]?¥.=3D]-; + print "¥t;$p=3D$v¥n"; +} + +sub hostport{ + if(s-//([^:/]+)--){ + ¶m('host', $1); + ¶m('port', $1) if s/:(¥d+)//; + 1; + }else{ + 0; + } +} + + +--Alternative.Boundary.keAvi9O0M2U=8U4XMt-- + +--Interpart.Boundary.keAvi9O0M2U=AU4XRw-- + -- cgit v1.2.3