aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core/basetypes
diff options
context:
space:
mode:
authorGravatar DINH Viet Hoa <dinh.viet.hoa@gmail.com>2013-02-04 22:17:59 -0800
committerGravatar DINH Viet Hoa <dinh.viet.hoa@gmail.com>2013-02-04 22:18:38 -0800
commit7437be6e661b83e5e3af85848c940f9fd2f533de (patch)
tree41139a74374d1d372a1b582500db19e8406b51a6 /src/core/basetypes
parent57ef6b75d7465a78c09ddbba07379625072077f5 (diff)
Implemented HTML cleaner string->cleanedHTMLString(). Fixed crash in flattenHTML().
Diffstat (limited to 'src/core/basetypes')
-rw-r--r--src/core/basetypes/MCHTMLCleaner.cc109
-rw-r--r--src/core/basetypes/MCHTMLCleaner.h23
-rw-r--r--src/core/basetypes/MCString.cc28
3 files changed, 147 insertions, 13 deletions
diff --git a/src/core/basetypes/MCHTMLCleaner.cc b/src/core/basetypes/MCHTMLCleaner.cc
new file mode 100644
index 00000000..d8c63613
--- /dev/null
+++ b/src/core/basetypes/MCHTMLCleaner.cc
@@ -0,0 +1,109 @@
+//
+// MCHTMLCleaner.cc
+// mailcore2
+//
+// Created by DINH Viêt Hoà on 2/3/13.
+// Copyright (c) 2013 MailCore. All rights reserved.
+//
+
+#include "MCHTMLCleaner.h"
+
+#include "MCString.h"
+#include "MCData.h"
+
+#include <tidy.h>
+#include <buffio.h>
+
+#include "MCUtils.h"
+#include "MCLog.h"
+
+using namespace mailcore;
+
+// Logs a failed libtidy step to stderr, followed by the diagnostics collected so far.
+// The diagnostics are only printed when errbuf->bp is non-NULL: the buffer pointer
+// stays NULL until tidy has appended something to it, and passing NULL to "%s" is
+// undefined behavior.
+static void reportTidyFailure(const char * step, int rc, const TidyBuffer * errbuf)
+{
+    fprintf(stderr, "error %s: %i\n", step, rc);
+    if (errbuf->bp != NULL) {
+        fprintf(stderr, "1:%s", reinterpret_cast<const char *>(errbuf->bp));
+    }
+}
+
+// Runs the given HTML through libtidy and returns the repaired document as XHTML.
+//
+// input:   HTML to clean. It is converted to UTF-8 before being handed to tidy.
+// returns: a String built from tidy's output. Failures of individual tidy steps are
+//          logged to stderr but are not fatal, since TidyForceOutput asks tidy to
+//          emit a document even when it reported errors. If tidy produced no output
+//          at all, an empty string is returned.
+String * HTMLCleaner::cleanHTML(String * input)
+{
+    TidyBuffer output;
+    TidyBuffer errbuf;
+    TidyBuffer docbuf;
+    int rc;
+
+    TidyDoc tdoc = tidyCreate();
+    tidyBufInit(&output);
+    tidyBufInit(&errbuf);
+    tidyBufInit(&docbuf);
+
+    // Copy the UTF-8 bytes of the input into docbuf, which tidy will parse.
+    Data * data = input->dataUsingEncoding("utf-8");
+    tidyBufAppend(&docbuf, data->bytes(), data->length());
+
+    tidyOptSetBool(tdoc, TidyXhtmlOut, yes);
+    tidySetCharEncoding(tdoc, "utf8");
+    tidyOptSetBool(tdoc, TidyForceOutput, yes);
+
+    // libtidy status codes: 0 = success, 1 = warnings, 2 = errors, < 0 = severe error.
+    // Anything other than 0 or 1 is reported; processing continues regardless.
+    rc = tidySetErrorBuffer(tdoc, &errbuf);
+    if ((rc > 1) || (rc < 0)) {
+        reportTidyFailure("tidySetErrorBuffer", rc, &errbuf);
+    }
+    rc = tidyParseBuffer(tdoc, &docbuf);
+    if ((rc > 1) || (rc < 0)) {
+        reportTidyFailure("tidyParseBuffer", rc, &errbuf);
+    }
+    rc = tidyCleanAndRepair(tdoc);
+    if ((rc > 1) || (rc < 0)) {
+        reportTidyFailure("tidyCleanAndRepair", rc, &errbuf);
+    }
+    rc = tidySaveBuffer(tdoc, &output);
+    if ((rc > 1) || (rc < 0)) {
+        reportTidyFailure("tidySaveBuffer", rc, &errbuf);
+    }
+
+    // output.bp is still NULL when tidy wrote nothing; do not hand NULL to the
+    // string constructor.
+    String * result;
+    if (output.bp != NULL) {
+        result = String::stringWithUTF8Characters(reinterpret_cast<const char *>(output.bp));
+    }
+    else {
+        result = String::stringWithUTF8Characters("");
+    }
+
+    // Release every tidy buffer, including docbuf, which holds the copy of the input.
+    tidyBufFree(&docbuf);
+    tidyBufFree(&output);
+    tidyBufFree(&errbuf);
+    tidyRelease(tdoc);
+
+    return result;
+}
diff --git a/src/core/basetypes/MCHTMLCleaner.h b/src/core/basetypes/MCHTMLCleaner.h
new file mode 100644
index 00000000..f4707872
--- /dev/null
+++ b/src/core/basetypes/MCHTMLCleaner.h
@@ -0,0 +1,23 @@
+//
+// MCHTMLCleaner.h
+// mailcore2
+//
+// Created by DINH Viêt Hoà on 2/3/13.
+// Copyright (c) 2013 MailCore. All rights reserved.
+//
+
+#ifndef __mailcore2__HTMLCleaner__
+#define __mailcore2__HTMLCleaner__
+
+#include <mailcore/MCString.h>
+
+namespace mailcore {
+
+ class HTMLCleaner { // Utility class: the only member declared here is a static method, so no instance is needed to use it.
+ public:
+ static String * cleanHTML(String * input); // Takes an HTML string and returns a cleaned-up string; implemented with libtidy in MCHTMLCleaner.cc.
+ };
+
+}
+
+#endif /* defined(__mailcore2__HTMLCleaner__) */
diff --git a/src/core/basetypes/MCString.cc b/src/core/basetypes/MCString.cc
index d57e70ed..d72d111d 100644
--- a/src/core/basetypes/MCString.cc
+++ b/src/core/basetypes/MCString.cc
@@ -20,6 +20,7 @@
#include "MCHashMap.h"
#include "MCAutoreleasePool.h"
#include "MCValue.h"
+#include "MCHTMLCleaner.h"
using namespace mailcore;
@@ -1510,17 +1511,19 @@ static void elementStarted(void * ctx, const xmlChar * name, const xmlChar ** at
pool = new AutoreleasePool();
attributes = dictionaryFromAttributes(atts);
style = (String *) attributes->objectForKey(MCSTR("style"));
- if (style->locationOfString(MCSTR("margin: 0.0px 0.0px 0.0px 0.0px;")) != -1) {
- hasSpacing = false;
- }
- else if (style->locationOfString(MCSTR("margin: 0px 0px 0px 0px;")) != -1) {
- hasSpacing = false;
- }
- else if (style->locationOfString(MCSTR("margin: 0.0px;")) != -1) {
- hasSpacing = false;
- }
- else if (style->locationOfString(MCSTR("margin: 0px;")) != -1) {
- hasSpacing = false;
+ if (style != NULL) {
+ if (style->locationOfString(MCSTR("margin: 0.0px 0.0px 0.0px 0.0px;")) != -1) {
+ hasSpacing = false;
+ }
+ else if (style->locationOfString(MCSTR("margin: 0px 0px 0px 0px;")) != -1) {
+ hasSpacing = false;
+ }
+ else if (style->locationOfString(MCSTR("margin: 0.0px;")) != -1) {
+ hasSpacing = false;
+ }
+ else if (style->locationOfString(MCSTR("margin: 0px;")) != -1) {
+ hasSpacing = false;
+ }
}
pool->release();
@@ -2010,8 +2013,7 @@ String * String::htmlEncodedString()
String * String::cleanedHTMLString()
{
-#warning implement HTML cleaning with tidy
- return (String *) copy()->autorelease();
+ return HTMLCleaner::cleanHTML(this); // Delegates to HTMLCleaner::cleanHTML, replacing the earlier placeholder that returned an autoreleased copy.
}
bool String::isEqualCaseInsensitive(String * otherString)