diff options
author | 2013-02-04 22:17:59 -0800 | |
---|---|---|
committer | 2013-02-04 22:18:38 -0800 | |
commit | 7437be6e661b83e5e3af85848c940f9fd2f533de (patch) | |
tree | 41139a74374d1d372a1b582500db19e8406b51a6 /src/core/basetypes | |
parent | 57ef6b75d7465a78c09ddbba07379625072077f5 (diff) |
Implemented HTML cleaner string->cleanedHTMLString(). Fixed crash in flattenHTML().
Diffstat (limited to 'src/core/basetypes')
-rw-r--r-- | src/core/basetypes/MCHTMLCleaner.cc | 109 | ||||
-rw-r--r-- | src/core/basetypes/MCHTMLCleaner.h | 23 | ||||
-rw-r--r-- | src/core/basetypes/MCString.cc | 28 |
3 files changed, 147 insertions, 13 deletions
```diff
diff --git a/src/core/basetypes/MCHTMLCleaner.cc b/src/core/basetypes/MCHTMLCleaner.cc
new file mode 100644
index 00000000..d8c63613
--- /dev/null
+++ b/src/core/basetypes/MCHTMLCleaner.cc
@@ -0,0 +1,109 @@
+//
+// HTMLCleaner.cpp
+// mailcore2
+//
+// Created by DINH Viêt Hoà on 2/3/13.
+// Copyright (c) 2013 MailCore. All rights reserved.
+//
+
+#include "MCHTMLCleaner.h"
+
+#include "MCString.h"
+#include "MCData.h"
+
+#include <tidy.h>
+#include <buffio.h>
+
+#include "MCUtils.h"
+#include "MCLog.h"
+
+using namespace mailcore;
+
+String * HTMLCleaner::cleanHTML(String * input)
+{
+    TidyBuffer output;
+    TidyBuffer errbuf;
+    TidyBuffer docbuf;
+    int rc;
+
+    TidyDoc tdoc = tidyCreate();
+    tidyBufInit(&output);
+    tidyBufInit(&errbuf);
+    tidyBufInit(&docbuf);
+
+    Data * data = input->dataUsingEncoding("utf-8");
+    tidyBufAppend(&docbuf, data->bytes(), data->length());
+
+    tidyOptSetBool(tdoc, TidyXhtmlOut, yes);
+    tidySetCharEncoding(tdoc, "utf8");
+    tidyOptSetBool(tdoc, TidyForceOutput, yes);
+    rc = tidySetErrorBuffer(tdoc, &errbuf);
+    if ((rc > 1) || (rc < 0)) {
+        fprintf(stderr, "error tidySetErrorBuffer: %i\n", rc);
+        fprintf(stderr, "1:%s", errbuf.bp);
+        //return NULL;
+    }
+    rc = tidyParseBuffer(tdoc, &docbuf);
+    //MCLog("%s", MCUTF8(input));
+    if ((rc > 1) || (rc < 0)) {
+        fprintf(stderr, "error tidyParseBuffer: %i\n", rc);
+        fprintf(stderr, "1:%s", errbuf.bp);
+        //return NULL;
+    }
+    rc = tidyCleanAndRepair(tdoc);
+    if ((rc > 1) || (rc < 0)) {
+        fprintf(stderr, "error tidyCleanAndRepair: %i\n", rc);
+        fprintf(stderr, "1:%s", errbuf.bp);
+        //return NULL;
+    }
+    rc = tidySaveBuffer(tdoc, &output);
+    if ((rc > 1) || (rc < 0)) {
+        fprintf(stderr, "error tidySaveBuffer: %i\n", rc);
+        fprintf(stderr, "1:%s", errbuf.bp);
+    }
+
+    String * result = String::stringWithUTF8Characters((const char *) output.bp);
+
+    tidyBufFree(&output);
+    tidyBufFree(&errbuf);
+    tidyRelease(tdoc);
+
+    return result;
+
+    /*
+    if ( ok ) {
+        rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics
+    }
+    if ( rc >= 0 ) {
+        rc = tidyParseString( tdoc, input ); // Parse the input
+    }
+    if ( rc >= 0 ) {
+        rc = tidyCleanAndRepair( tdoc ); // Tidy it up!
+    }
+    if ( rc >= 0 ) {
+        rc = tidyRunDiagnostics( tdoc ); // Kvetch
+    }
+    if ( rc > 1 ) { // If error, force output.
+        rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
+    }
+    if ( rc >= 0 ) {
+        rc = tidySaveBuffer( tdoc, &output ); // Pretty Print
+    }
+    */
+
+    /*
+    if ( rc >= 0 )
+    {
+        if ( rc > 0 )
+            printf( "\\nDiagnostics:\\n\\n\%s", errbuf.bp );
+        printf( "\\nAnd here is the result:\\n\\n\%s", output.bp );
+    }
+    else
+        printf( "A severe error (\%d) occurred.\\n", rc );
+
+    tidyBufFree( &output );
+    tidyBufFree( &errbuf );
+    tidyRelease( tdoc );
+    return rc;
+    */
+}
diff --git a/src/core/basetypes/MCHTMLCleaner.h b/src/core/basetypes/MCHTMLCleaner.h
new file mode 100644
index 00000000..f4707872
--- /dev/null
+++ b/src/core/basetypes/MCHTMLCleaner.h
@@ -0,0 +1,23 @@
+//
+// HTMLCleaner.h
+// mailcore2
+//
+// Created by DINH Viêt Hoà on 2/3/13.
+// Copyright (c) 2013 MailCore. All rights reserved.
+//
+
+#ifndef __mailcore2__HTMLCleaner__
+#define __mailcore2__HTMLCleaner__
+
+#include <mailcore/MCString.h>
+
+namespace mailcore {
+
+    class HTMLCleaner {
+    public:
+        static String * cleanHTML(String * input);
+    };
+
+}
+
+#endif /* defined(__mailcore2__HTMLCleaner__) */
diff --git a/src/core/basetypes/MCString.cc b/src/core/basetypes/MCString.cc
index d57e70ed..d72d111d 100644
--- a/src/core/basetypes/MCString.cc
+++ b/src/core/basetypes/MCString.cc
@@ -20,6 +20,7 @@
 #include "MCHashMap.h"
 #include "MCAutoreleasePool.h"
 #include "MCValue.h"
+#include "MCHTMLCleaner.h"
 
 using namespace mailcore;
 
@@ -1510,17 +1511,19 @@ static void elementStarted(void * ctx, const xmlChar * name, const xmlChar ** at
         pool = new AutoreleasePool();
         attributes = dictionaryFromAttributes(atts);
         style = (String *) attributes->objectForKey(MCSTR("style"));
-        if (style->locationOfString(MCSTR("margin: 0.0px 0.0px 0.0px 0.0px;")) != -1) {
-            hasSpacing = false;
-        }
-        else if (style->locationOfString(MCSTR("margin: 0px 0px 0px 0px;")) != -1) {
-            hasSpacing = false;
-        }
-        else if (style->locationOfString(MCSTR("margin: 0.0px;")) != -1) {
-            hasSpacing = false;
-        }
-        else if (style->locationOfString(MCSTR("margin: 0px;")) != -1) {
-            hasSpacing = false;
+        if (style != NULL) {
+            if (style->locationOfString(MCSTR("margin: 0.0px 0.0px 0.0px 0.0px;")) != -1) {
+                hasSpacing = false;
+            }
+            else if (style->locationOfString(MCSTR("margin: 0px 0px 0px 0px;")) != -1) {
+                hasSpacing = false;
+            }
+            else if (style->locationOfString(MCSTR("margin: 0.0px;")) != -1) {
+                hasSpacing = false;
+            }
+            else if (style->locationOfString(MCSTR("margin: 0px;")) != -1) {
+                hasSpacing = false;
+            }
         }
         pool->release();
 
@@ -2010,8 +2013,7 @@ String * String::htmlEncodedString()
 
 String * String::cleanedHTMLString()
 {
-#warning implement HTML cleaning with tidy
-    return (String *) copy()->autorelease();
+    return HTMLCleaner::cleanHTML(this);
 }
 
 bool String::isEqualCaseInsensitive(String * otherString)
```