From 2a5219567634ab7ab74314ff3615132becadff4a Mon Sep 17 00:00:00 2001 From: thomasvl Date: Mon, 28 Jan 2008 20:19:42 +0000 Subject: initial drop of a few sources to start things out --- Foundation/GTMNSString+HTML.m | 514 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 514 insertions(+) create mode 100644 Foundation/GTMNSString+HTML.m (limited to 'Foundation/GTMNSString+HTML.m') diff --git a/Foundation/GTMNSString+HTML.m b/Foundation/GTMNSString+HTML.m new file mode 100644 index 0000000..f9e99dc --- /dev/null +++ b/Foundation/GTMNSString+HTML.m @@ -0,0 +1,514 @@ +// +// GTMNSString+HTML.m +// Dealing with NSStrings that contain HTML +// +// Copyright 2006-2008 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. +// + +#import "GTMNSString+HTML.h" + +typedef struct { + NSString *escapeSequence; + unichar uchar; +} HTMLEscapeMap; + +// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters +// Ordered by uchar lowest to highest for bsearching +static HTMLEscapeMap gAsciiHTMLEscapeMap[] = { + // A.2.2. Special characters + { @""", 34 }, + { @"&", 38 }, + { @"'", 39 }, + { @"<", 60 }, + { @">", 62 }, + + // A.2.1. Latin-1 characters + { @" ", 160 }, + { @"¡", 161 }, + { @"¢", 162 }, + { @"£", 163 }, + { @"¤", 164 }, + { @"¥", 165 }, + { @"¦", 166 }, + { @"§", 167 }, + { @"¨", 168 }, + { @"©", 169 }, + { @"ª", 170 }, + { @"«", 171 }, + { @"¬", 172 }, + { @"­", 173 }, + { @"®", 174 }, + { @"¯", 175 }, + { @"°", 176 }, + { @"±", 177 }, + { @"²", 178 }, + { @"³", 179 }, + { @"´", 180 }, + { @"µ", 181 }, + { @"¶", 182 }, + { @"·", 183 }, + { @"¸", 184 }, + { @"¹", 185 }, + { @"º", 186 }, + { @"»", 187 }, + { @"¼", 188 }, + { @"½", 189 }, + { @"¾", 190 }, + { @"¿", 191 }, + { @"À", 192 }, + { @"Á", 193 }, + { @"Â", 194 }, + { @"Ã", 195 }, + { @"Ä", 196 }, + { @"Å", 197 }, + { @"Æ", 198 }, + { @"Ç", 199 }, + { @"È", 200 }, + { @"É", 201 }, + { @"Ê", 202 }, + { @"Ë", 203 }, + { @"Ì", 204 }, + { @"Í", 205 }, + { @"Î", 206 }, + { @"Ï", 207 }, + { @"Ð", 208 }, + { @"Ñ", 209 }, + { @"Ò", 210 }, + { @"Ó", 211 }, + { @"Ô", 212 }, + { @"Õ", 213 }, + { @"Ö", 214 }, + { @"×", 215 }, + { @"Ø", 216 }, + { @"Ù", 217 }, + { @"Ú", 218 }, + { @"Û", 219 }, + { @"Ü", 220 }, + { @"Ý", 221 }, + { @"Þ", 222 }, + { @"ß", 223 }, + { @"à", 224 }, + { @"á", 225 }, + { @"â", 226 }, + { @"ã", 227 }, + { @"ä", 228 }, + { @"å", 229 }, + { @"æ", 230 }, + { @"ç", 231 }, + { @"è", 232 }, + { @"é", 233 }, + { @"ê", 234 }, + { @"ë", 235 }, + { @"ì", 236 }, + { @"í", 237 }, + { @"î", 238 }, + { @"ï", 239 }, + { @"ð", 240 }, + { @"ñ", 241 }, + { @"ò", 242 }, + { @"ó", 243 }, + { @"ô", 244 }, + { @"õ", 245 }, + { @"ö", 246 }, + { @"÷", 247 }, + { @"ø", 248 }, + { @"ù", 249 }, + { @"ú", 250 }, + { @"û", 251 }, + { @"ü", 252 }, + { @"ý", 253 }, + { @"þ", 254 }, + { @"ÿ", 255 }, + + // A.2.2. Special characters cont'd + { @"Œ", 338 }, + { @"œ", 339 }, + { @"Š", 352 }, + { @"š", 353 }, + { @"Ÿ", 376 }, + + // A.2.3. Symbols + { @"ƒ", 402 }, + + // A.2.2. Special characters cont'd + { @"ˆ", 710 }, + { @"˜", 732 }, + + // A.2.3. Symbols cont'd + { @"Α", 913 }, + { @"Β", 914 }, + { @"Γ", 915 }, + { @"Δ", 916 }, + { @"Ε", 917 }, + { @"Ζ", 918 }, + { @"Η", 919 }, + { @"Θ", 920 }, + { @"Ι", 921 }, + { @"Κ", 922 }, + { @"Λ", 923 }, + { @"Μ", 924 }, + { @"Ν", 925 }, + { @"Ξ", 926 }, + { @"Ο", 927 }, + { @"Π", 928 }, + { @"Ρ", 929 }, + { @"Σ", 931 }, + { @"Τ", 932 }, + { @"Υ", 933 }, + { @"Φ", 934 }, + { @"Χ", 935 }, + { @"Ψ", 936 }, + { @"Ω", 937 }, + { @"α", 945 }, + { @"β", 946 }, + { @"γ", 947 }, + { @"δ", 948 }, + { @"ε", 949 }, + { @"ζ", 950 }, + { @"η", 951 }, + { @"θ", 952 }, + { @"ι", 953 }, + { @"κ", 954 }, + { @"λ", 955 }, + { @"μ", 956 }, + { @"ν", 957 }, + { @"ξ", 958 }, + { @"ο", 959 }, + { @"π", 960 }, + { @"ρ", 961 }, + { @"ς", 962 }, + { @"σ", 963 }, + { @"τ", 964 }, + { @"υ", 965 }, + { @"φ", 966 }, + { @"χ", 967 }, + { @"ψ", 968 }, + { @"ω", 969 }, + { @"ϑ", 977 }, + { @"ϒ", 978 }, + { @"ϖ", 982 }, + + // A.2.2. Special characters cont'd + { @" ", 8194 }, + { @" ", 8195 }, + { @" ", 8201 }, + { @"‌", 8204 }, + { @"‍", 8205 }, + { @"‎", 8206 }, + { @"‏", 8207 }, + { @"–", 8211 }, + { @"—", 8212 }, + { @"‘", 8216 }, + { @"’", 8217 }, + { @"‚", 8218 }, + { @"“", 8220 }, + { @"”", 8221 }, + { @"„", 8222 }, + { @"†", 8224 }, + { @"‡", 8225 }, + // A.2.3. Symbols cont'd + { @"•", 8226 }, + { @"…", 8230 }, + + // A.2.2. Special characters cont'd + { @"‰", 8240 }, + + // A.2.3. Symbols cont'd + { @"′", 8242 }, + { @"″", 8243 }, + + // A.2.2. Special characters cont'd + { @"‹", 8249 }, + { @"›", 8250 }, + + // A.2.3. Symbols cont'd + { @"‾", 8254 }, + { @"⁄", 8260 }, + + // A.2.2. Special characters cont'd + { @"€", 8364 }, + + // A.2.3. Symbols cont'd + { @"ℑ", 8465 }, + { @"℘", 8472 }, + { @"ℜ", 8476 }, + { @"™", 8482 }, + { @"ℵ", 8501 }, + { @"←", 8592 }, + { @"↑", 8593 }, + { @"→", 8594 }, + { @"↓", 8595 }, + { @"↔", 8596 }, + { @"↵", 8629 }, + { @"⇐", 8656 }, + { @"⇑", 8657 }, + { @"⇒", 8658 }, + { @"⇓", 8659 }, + { @"⇔", 8660 }, + { @"∀", 8704 }, + { @"∂", 8706 }, + { @"∃", 8707 }, + { @"∅", 8709 }, + { @"∇", 8711 }, + { @"∈", 8712 }, + { @"∉", 8713 }, + { @"∋", 8715 }, + { @"∏", 8719 }, + { @"∑", 8721 }, + { @"−", 8722 }, + { @"∗", 8727 }, + { @"√", 8730 }, + { @"∝", 8733 }, + { @"∞", 8734 }, + { @"∠", 8736 }, + { @"∧", 8743 }, + { @"∨", 8744 }, + { @"∩", 8745 }, + { @"∪", 8746 }, + { @"∫", 8747 }, + { @"∴", 8756 }, + { @"∼", 8764 }, + { @"≅", 8773 }, + { @"≈", 8776 }, + { @"≠", 8800 }, + { @"≡", 8801 }, + { @"≤", 8804 }, + { @"≥", 8805 }, + { @"⊂", 8834 }, + { @"⊃", 8835 }, + { @"⊄", 8836 }, + { @"⊆", 8838 }, + { @"⊇", 8839 }, + { @"⊕", 8853 }, + { @"⊗", 8855 }, + { @"⊥", 8869 }, + { @"⋅", 8901 }, + { @"⌈", 8968 }, + { @"⌉", 8969 }, + { @"⌊", 8970 }, + { @"⌋", 8971 }, + { @"⟨", 9001 }, + { @"⟩", 9002 }, + { @"◊", 9674 }, + { @"♠", 9824 }, + { @"♣", 9827 }, + { @"♥", 9829 }, + { @"♦", 9830 } +}; + +// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters +// This is table A.2.2 Special Characters +static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = { + // C0 Controls and Basic Latin + { @""", 34 }, + { @"&", 38 }, + { @"'", 39 }, + { @"<", 60 }, + { @">", 62 }, + + // Latin Extended-A + { @"Œ", 338 }, + { @"œ", 339 }, + { @"Š", 352 }, + { @"š", 353 }, + { @"Ÿ", 376 }, + + // Spacing Modifier Letters + { @"ˆ", 710 }, + { @"˜", 732 }, + + // General Punctuation + { @" ", 8194 }, + { @" ", 8195 }, + { @" ", 8201 }, + { @"‌", 8204 }, + { @"‍", 8205 }, + { @"‎", 8206 }, + { @"‏", 8207 }, + { @"–", 8211 }, + { @"—", 8212 }, + { @"‘", 8216 }, + { @"’", 8217 }, + { @"‚", 8218 }, + { @"“", 8220 }, + { @"”", 8221 }, + { @"„", 8222 }, + { @"†", 8224 }, + { @"‡", 8225 }, + { @"‰", 8240 }, + { @"‹", 8249 }, + { @"›", 8250 }, + { @"€", 8364 }, +}; + + +// Utility function for Bsearching table above +static int escapeMapCompare(const void *ucharVoid, const void *mapVoid) { + unichar *uchar = (unichar*)ucharVoid; + HTMLEscapeMap *map = (HTMLEscapeMap*)mapVoid; + int val; + if (*uchar > map->uchar) { + val = 1; + } else if (*uchar < map->uchar) { + val = -1; + } else { + val = 0; + } + return val; +} + +@implementation NSString (GTMNSStringHTMLAdditions) + +- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table + ofSize:(int)size + escapingUnicode:(BOOL)escapeUnicode { + NSMutableString *finalString = [NSMutableString string]; + int length = [self length]; + require_quiet(length != 0, cantConvertAnything); + + unichar *buffer = malloc(sizeof(unichar) * length); + require_action(buffer, cantAllocBuffer, finalString = nil); + unichar *buffer2 = malloc(sizeof(unichar) * length); + require_action(buffer2, cantAllocBuffer2, finalString = nil); + + [self getCharacters:buffer]; + int buffer2Length = 0; + + for (int i = 0; i < length; ++i) { + HTMLEscapeMap *val = bsearch(&buffer[i], table, + size / sizeof(HTMLEscapeMap), + sizeof(HTMLEscapeMap), escapeMapCompare); + if (val || (escapeUnicode && buffer[i] > 127)) { + if (buffer2Length) { + CFStringRef buffer2String = + CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault, + buffer2, buffer2Length, + kCFAllocatorNull); + require_action(buffer2String, cantCreateString, finalString = nil); + [finalString appendString:(NSString*)buffer2String]; + CFRelease(buffer2String); + buffer2Length = 0; + } + if (val) { + [finalString appendString:val->escapeSequence]; + } + else if (escapeUnicode && buffer[i] > 127) { + [finalString appendFormat:@"&#%d;", buffer[i]]; + } else { + // Should never get here. Assert in debug. + require_action(NO, cantCreateString, finalString = nil); + } + } else { + buffer2[buffer2Length] = buffer[i]; + buffer2Length += 1; + } + } + if (buffer2Length) { + CFStringRef buffer2String = + CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault, + buffer2, buffer2Length, + kCFAllocatorNull); + require_action(buffer2String, cantCreateString, finalString = nil); + [finalString appendString:(NSString*)buffer2String]; + CFRelease(buffer2String); + } +cantCreateString: + free(buffer2); +cantAllocBuffer2: + free(buffer); +cantAllocBuffer: +cantConvertAnything: + return finalString; +} + +- (NSString *)gtm_stringByEscapingForHTML { + return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap + ofSize:sizeof(gUnicodeHTMLEscapeMap) + escapingUnicode:NO]; +} // gtm_stringByEscapingHTML + +- (NSString *)gtm_stringByEscapingForAsciiHTML { + return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap + ofSize:sizeof(gAsciiHTMLEscapeMap) + escapingUnicode:YES]; +} // gtm_stringByEscapingAsciiHTML + +- (NSString *)gtm_stringByUnescapingFromHTML { + NSRange range = NSMakeRange(0, [self length]); + NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]; + + // if no ampersands, we've got a quick way out + if (subrange.length == 0) return self; + NSMutableString *finalString = [NSMutableString stringWithString:self]; + do { + NSRange semiColonRange = NSMakeRange(subrange.location, NSMaxRange(range) - subrange.location); + semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange]; + range = NSMakeRange(0, subrange.location); + // if we don't find a semicolon in the range, we don't have a sequence + if (semiColonRange.location == NSNotFound) { + continue; + } + NSRange escapeRange = NSMakeRange(subrange.location, semiColonRange.location - subrange.location + 1); + NSString *escapeString = [self substringWithRange:escapeRange]; + unsigned length = [escapeString length]; + // a squence must be longer than 3 (<) and less than 11 (ϑ) + if (length > 3 && length < 11) { + if ([escapeString characterAtIndex:1] == '#') { + unichar char2 = [escapeString characterAtIndex:2]; + if (char2 == 'x' || char2 == 'X') { + // Hex escape squences £ + NSString *hexSequence = [escapeString substringWithRange:NSMakeRange(3, length - 4)]; + NSScanner *scanner = [NSScanner scannerWithString:hexSequence]; + unsigned value; + if ([scanner scanHexInt:&value] && + value < USHRT_MAX && + value > 0 + && [scanner scanLocation] == length - 4) { + unichar uchar = value; + NSString *charString = [NSString stringWithCharacters:&uchar length:1]; + [finalString replaceCharactersInRange:escapeRange withString:charString]; + } + + } else { + // Decimal Sequences { + NSString *numberSequence = [escapeString substringWithRange:NSMakeRange(2, length - 3)]; + NSScanner *scanner = [NSScanner scannerWithString:numberSequence]; + int value; + if ([scanner scanInt:&value] && + value < USHRT_MAX && + value > 0 + && [scanner scanLocation] == length - 3) { + unichar uchar = value; + NSString *charString = [NSString stringWithCharacters:&uchar length:1]; + [finalString replaceCharactersInRange:escapeRange withString:charString]; + } + } + } else { + // "standard" sequences + for (unsigned i = 0; i < sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap); ++i) { + if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) { + [finalString replaceCharactersInRange:escapeRange withString:[NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar length:1]]; + break; + } + } + } + } + } while ((subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]).length != 0); + return finalString; +} // gtm_stringByUnescapingHTML + + + +@end -- cgit v1.2.3