From 66609a28b667de38a73130e5f6e67c2690eddfc9 Mon Sep 17 00:00:00 2001 From: Ray Chiang Date: Wed, 15 Nov 2017 11:13:36 +0800 Subject: HTML unescape using unichar array instead of bytes --- Foundation/GTMNSString+HTML.m | 35 ++++++++++++++++++++++++++--------- Foundation/GTMNSString+HTMLTest.m | 7 +++++-- 2 files changed, 31 insertions(+), 11 deletions(-) (limited to 'Foundation') diff --git a/Foundation/GTMNSString+HTML.m b/Foundation/GTMNSString+HTML.m index c35e760..168c094 100644 --- a/Foundation/GTMNSString+HTML.m +++ b/Foundation/GTMNSString+HTML.m @@ -486,29 +486,46 @@ static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) { NSScanner *scanner = [NSScanner scannerWithString:hexSequence]; unsigned value; if ([scanner scanHexInt:&value] && - value < INT_MAX && value > 0 && [scanner scanLocation] == length - 4) { - value = NSSwapHostIntToLittle(value); - NSString *charString = [[NSString alloc] initWithBytes:&value length:sizeof(value) encoding:NSUTF32LittleEndianStringEncoding]; - if (charString) { + if (value < USHRT_MAX) { + unichar uchar = (unichar)value; + NSString *charString = [NSString stringWithCharacters:&uchar length:1]; [finalString replaceCharactersInRange:escapeRange withString:charString]; + } else if (value >= 0x10000 && value <= 0x10FFFF) { + // code points in unicode supplementary planes + int subtractedValue = value - 0x10000; + unichar uchars[2]; + uchars[0] = 0xD800 + (subtractedValue >> 10); + uchars[1] = 0xDC00 + (subtractedValue & 0x3FF); + NSString *charString = [NSString stringWithCharacters:uchars length:2]; + if (charString) { + [finalString replaceCharactersInRange:escapeRange withString:charString]; + } } } - } else { // Decimal Sequences { NSString *numberSequence = [escapeString substringWithRange:NSMakeRange(2, length - 3)]; NSScanner *scanner = [NSScanner scannerWithString:numberSequence]; int value; if ([scanner scanInt:&value] && - value < INT_MAX && value > 0 && [scanner scanLocation] == length - 3) { - value = NSSwapHostIntToLittle(value); - NSString *charString = [[NSString alloc] initWithBytes:&value length:sizeof(value) encoding:NSUTF32LittleEndianStringEncoding]; - if (charString) { + if (value < USHRT_MAX) { + unichar uchar = (unichar)value; + NSString *charString = [NSString stringWithCharacters:&uchar length:1]; [finalString replaceCharactersInRange:escapeRange withString:charString]; + } else if (value >= 0x10000 && value <= 0x10FFFF) { + // code points in unicode supplementary planes + int subtractedValue = value - 0x10000; + unichar uchars[2]; + uchars[0] = 0xD800 + (subtractedValue >> 10); + uchars[1] = 0xDC00 + (subtractedValue & 0x3FF); + NSString *charString = [NSString stringWithCharacters:uchars length:2]; + if (charString) { + [finalString replaceCharactersInRange:escapeRange withString:charString]; + } } } } diff --git a/Foundation/GTMNSString+HTMLTest.m b/Foundation/GTMNSString+HTMLTest.m index f6fb362..7e85eca 100644 --- a/Foundation/GTMNSString+HTMLTest.m +++ b/Foundation/GTMNSString+HTMLTest.m @@ -226,8 +226,11 @@ XCTAssertEqualObjects([@"<this & that>" gtm_stringByUnescapingFromHTML], @"", @"HTML unescaping failed"); - XCTAssertEqualObjects([@"👍" gtm_stringByUnescapingFromHTML], - @"👍", @"HTML unescaping failed"); + XCTAssertEqualObjects([@"𐐷" gtm_stringByUnescapingFromHTML], + @"𐐷", @"HTML unescaping failed"); + + XCTAssertEqualObjects([@"𐐷" gtm_stringByUnescapingFromHTML], + @"𐐷", @"HTML unescaping failed"); } // testStringByUnescapingHTML -- cgit v1.2.3