//
//  GTMNSString+HTML.m
//  Dealing with NSStrings that contain HTML
//
//  Copyright 2006-2008 Google Inc.
//
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not
//  use this file except in compliance with the License.  You may obtain a copy
//  of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
//  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
//  License for the specific language governing permissions and limitations under
//  the License.
//

#import "GTMDefines.h"
#import "GTMNSString+HTML.h"

// One row of an escape table: the named entity (including the leading '&' and
// trailing ';') and the UTF-16 code unit it stands for.
typedef struct {
  NSString *escapeSequence;
  unichar uchar;
} HTMLEscapeMap;

// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// Ordered by uchar lowest to highest for bsearching
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },

  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};

// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// This is table A.2.2 Special Characters
// Ordered by uchar lowest to highest for bsearching
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};

// bsearch() comparator for the tables above. |ucharVoid| points at the unichar
// being looked up, |mapVoid| at a candidate HTMLEscapeMap row. Returns
// negative / zero / positive per the usual bsearch contract.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar *uchar = (const unichar*)ucharVoid;
  const HTMLEscapeMap *map = (const HTMLEscapeMap*)mapVoid;
  int val;
  if (*uchar > map->uchar) {
    val = 1;
  } else if (*uchar < map->uchar) {
    val = -1;
  } else {
    val = 0;
  }
  return val;
}

// Returns the UTF-16 string for a numeric character reference value, or nil
// if the value is zero or beyond U+10FFFF. Values in the surrogate range are
// passed through as single code units on purpose: older versions of the
// escaper emitted each half of a surrogate pair as its own reference, and
// those must still round-trip.
static NSString *StringFromHTMLCodePoint(uint32_t value) {
  if (value == 0 || value > 0x10FFFF) {
    return nil;
  }
  if (value <= 0xFFFF) {
    unichar uchar = (unichar)value;
    return [NSString stringWithCharacters:&uchar length:1];
  }
  // Code points in the Unicode supplementary planes need a surrogate pair.
  uint32_t subtractedValue = value - 0x10000;
  unichar uchars[2];
  uchars[0] = (unichar)(0xD800 + (subtractedValue >> 10));
  uchars[1] = (unichar)(0xDC00 + (subtractedValue & 0x3FF));
  return [NSString stringWithCharacters:uchars length:2];
}

// Given a candidate escape sequence (from '&' through ';' inclusive), returns
// the text it decodes to, or nil if it is not a recognized sequence and
// should be left untouched.
static NSString *ReplacementForHTMLEscape(NSString *escapeString) {
  NSUInteger length = [escapeString length];

  // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
  if (length <= 3 || length >= 11) {
    return nil;
  }

  if ([escapeString characterAtIndex:1] == '#') {
    unichar char2 = [escapeString characterAtIndex:2];
    if (char2 == 'x' || char2 == 'X') {
      // Hex escape sequences &#xa3;
      NSString *hexSequence =
        [escapeString substringWithRange:NSMakeRange(3, length - 4)];
      NSScanner *scanner = [NSScanner scannerWithString:hexSequence];
      unsigned value;
      // The scan must consume every digit, otherwise there is trailing junk.
      if ([scanner scanHexInt:&value] &&
          [scanner scanLocation] == length - 4) {
        return StringFromHTMLCodePoint(value);
      }
      return nil;
    }

    // Decimal Sequences &#123;
    NSString *numberSequence =
      [escapeString substringWithRange:NSMakeRange(2, length - 3)];
    NSScanner *scanner = [NSScanner scannerWithString:numberSequence];
    int value;
    if ([scanner scanInt:&value] &&
        value > 0 &&
        [scanner scanLocation] == length - 3) {
      return StringFromHTMLCodePoint((uint32_t)value);
    }
    return nil;
  }

  // "standard" sequences
  for (unsigned i = 0;
       i < sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap);
       ++i) {
    if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
      return [NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar
                                     length:1];
    }
  }
  return nil;
}

@implementation NSString (GTMNSStringHTMLAdditions)

// Shared implementation for the two escaping methods.
//
//  Args:
//    table: escape table sorted ascending by uchar
//    size: size of |table| in BYTES (i.e. sizeof(table)), not a row count
//    escapeUnicode: if YES, anything above 127 that has no named entity in
//                   |table| is written as a decimal numeric reference
//
//  Returns:
//    The escaped string. An empty receiver is returned as-is. Returns nil if
//    the character buffer cannot be obtained.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table
                                          ofSize:(NSUInteger)size
                                 escapingUnicode:(BOOL)escapeUnicode {
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }

  NSMutableString *finalString = [NSMutableString string];

  // this block is common between GTMNSString+HTML and GTMNSString+XML but
  // it's so short that it isn't really worth trying to share.
  const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
  if (!buffer) {
    // We want this buffer to be autoreleased.
    NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
    if (!data) {
      // COV_NF_START  - Memory fail case
      _GTMDevLog(@"couldn't alloc buffer");
      return nil;
      // COV_NF_END
    }
    [self getCharacters:[data mutableBytes] range:NSMakeRange(0, length)];
    buffer = [data bytes];
  }

  if (!buffer) {
    // COV_NF_START
    _GTMDevLog(@"Unable to allocate buffer");
    return nil;
    // COV_NF_END
  }

  const NSUInteger tableCount = size / sizeof(HTMLEscapeMap);

  // Characters that need no escaping are appended in runs straight out of
  // |buffer|; |runStart| is the index of the first not-yet-appended character.
  NSUInteger runStart = 0;
  for (NSUInteger i = 0; i < length; ++i) {
    unichar current = buffer[i];
    HTMLEscapeMap *val = bsearch(&current, table, tableCount,
                                 sizeof(HTMLEscapeMap), EscapeMapCompare);
    if (!val && !(escapeUnicode && current > 127)) {
      continue;
    }

    // Flush the pending run of plain characters.
    if (i > runStart) {
      CFStringAppendCharacters((CFMutableStringRef)finalString,
                               buffer + runStart,
                               i - runStart);
    }

    if (val) {
      [finalString appendString:val->escapeSequence];
    } else {
      _GTMDevAssert(escapeUnicode && current > 127, @"Illegal Character");
      uint32_t codePoint = current;
      // A numeric reference must name a whole code point, so fold a valid
      // surrogate pair into one value instead of emitting each half.
      if (current >= 0xD800 && current <= 0xDBFF && i + 1 < length) {
        unichar low = buffer[i + 1];
        if (low >= 0xDC00 && low <= 0xDFFF) {
          codePoint = 0x10000 +
                      (((uint32_t)current - 0xD800) << 10) +
                      ((uint32_t)low - 0xDC00);
          ++i;  // the low surrogate has been consumed as well
        }
      }
      [finalString appendFormat:@"&#%u;", (unsigned int)codePoint];
    }
    runStart = i + 1;
  }

  if (length > runStart) {
    CFStringAppendCharacters((CFMutableStringRef)finalString,
                             buffer + runStart,
                             length - runStart);
  }
  return finalString;
}

// Escapes only the XHTML "special characters" (table A.2.2); all other
// characters, including non-ASCII ones, are left as-is.
- (NSString *)gtm_stringByEscapingForHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                           ofSize:sizeof(gUnicodeHTMLEscapeMap)
                                  escapingUnicode:NO];
}

// Escapes using the full named-entity table and writes any remaining
// character above 127 as a decimal numeric reference, so the result is
// pure ASCII.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                           ofSize:sizeof(gAsciiHTMLEscapeMap)
                                  escapingUnicode:YES];
}

// Replaces named (&amp;), decimal (&#123;) and hex (&#xa3;) escape sequences
// with the characters they represent. Unrecognized or malformed sequences
// are left untouched. Returns the receiver itself when it contains no '&'.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange range = NSMakeRange(0, [self length]);
  NSRange subrange = [self rangeOfString:@"&"
                                 options:NSBackwardsSearch
                                   range:range];

  // if no ampersands, we've got a quick way out
  if (subrange.length == 0) return self;

  // The receiver is scanned back to front while |finalString| is edited.
  // Every edit lies at or after the current '&', so ranges computed against
  // the receiver stay valid for |finalString| as well.
  NSMutableString *finalString = [NSMutableString stringWithString:self];
  do {
    // Only look for the ';' between this '&' and the previously handled one.
    NSRange semiColonRange = NSMakeRange(subrange.location,
                                         NSMaxRange(range) - subrange.location);
    semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
    range = NSMakeRange(0, subrange.location);

    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location == NSNotFound) {
      continue;
    }

    NSRange escapeRange =
      NSMakeRange(subrange.location,
                  semiColonRange.location - subrange.location + 1);
    NSString *escapeString = [self substringWithRange:escapeRange];
    NSString *replacement = ReplacementForHTMLEscape(escapeString);
    if (replacement) {
      [finalString replaceCharactersInRange:escapeRange
                                 withString:replacement];
    }
  } while ((subrange = [self rangeOfString:@"&"
                                   options:NSBackwardsSearch
                                     range:range]).length != 0);
  return finalString;
}

@end