From f38d4ee0e9157d2ac3c9a08fde37b9e19457fb0f Mon Sep 17 00:00:00 2001 From: thomasvl Date: Wed, 27 Feb 2008 03:54:04 +0000 Subject: Found and fixed a bug in the regex enumerators that was causing them to incorrectly walk a string when using '^' in an expression. Added GTMScriptRunner for spawning scripts. --- Foundation/GTMRegex.m | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 6 deletions(-) (limited to 'Foundation/GTMRegex.m') diff --git a/Foundation/GTMRegex.m b/Foundation/GTMRegex.m index c582b1e..d7900fa 100644 --- a/Foundation/GTMRegex.m +++ b/Foundation/GTMRegex.m @@ -6,9 +6,9 @@ // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the @@ -21,7 +21,7 @@ // This is the pattern to use for walking replacement text when doing // substitutions. // -// this pattern may look over escaped, but remember the compiler will consume +// This pattern may look over-escaped, but remember the compiler will consume // one layer of slashes, and then we have to escape the slashes for them to be // seen as we want in the pattern. static NSString *const kReplacementPattern = @@ -43,12 +43,14 @@ static NSString *const kReplacementPattern = GTMRegex *regex_; NSData *utf8StrBuf_; BOOL allSegments_; + BOOL treatStartOfNewSegmentAsBeginningOfString_; regoff_t curParseIndex_; regmatch_t *savedRegMatches_; } - (id)initWithRegex:(GTMRegex *)regex processString:(NSString *)str allSegments:(BOOL)allSegments; +- (void)treatStartOfNewSegmentAsBeginningOfString:(BOOL)yesNo; @end @interface GTMRegexStringSegment (PrivateMethods) @@ -263,9 +265,32 @@ static NSString *const kReplacementPattern = GTMRegex *replacementRegex = [GTMRegex regexWithPattern:kReplacementPattern options:kGTMRegexOptionSupressNewlineSupport]; +#ifdef DEBUG + if (!replacementRegex) + NSLog(@"failed to parse out replacement regex!!!"); +#endif + GTMRegexEnumerator *relacementEnumerator = + [[[GTMRegexEnumerator alloc] initWithRegex:replacementRegex + processString:replacementPattern + allSegments:YES] autorelease]; + // We turn on treatStartOfNewSegmentAsBeginningOfLine for this enumerator. + // As complex as kReplacementPattern is, it can't completely do what we want + // with the normal string walk. The problem is this, backreferences are a + // slash follow by a number ("\0"), but the replacement pattern might + // actually need to use backslashes (they have to be escaped). So if a + // replacement were "\\0", then there is no backreference, instead the + // replacement is a backslash and a zero. Generically this means an even + // number of backslashes are all escapes, and an odd are some number of + // literal backslashes followed by our backreference. Think of it as a "an + // odd number of slashes that comes after a non-backslash character." There + // is no way to rexpress this in re_format(7) extended expressions. Instead + // we look for a non-blackslash or string start followed by an optional even + // number of slashes followed by the backreference; and use the special + // flag; so after each match, we restart claiming it's the start of the + // string. (the problem match w/o this flag is a substition of "\2\1") + [relacementEnumerator treatStartOfNewSegmentAsBeginningOfString:YES]; // pull them all into an array so we can walk this as many times as needed. - replacements = - [[replacementRegex segmentEnumeratorForString:replacementPattern] allObjects]; + replacements = [relacementEnumerator allObjects]; if (!replacements) { NSLog(@"failed to create the replacements for subtituations"); return nil; @@ -413,6 +438,21 @@ static NSString *const kReplacementPattern = [super dealloc]; } +- (void)treatStartOfNewSegmentAsBeginningOfString:(BOOL)yesNo { + // The way regexec works, it assumes the first char it's looking at to the + // start of the string. In normal use, this makes sense; but in this case, + // we're going to walk the entry string splitting it up by our pattern. That + // means for the first call, it is the string start, but for all future calls, + // it is NOT the string start, so we will pass regexec the flag to let it + // know. However, (you knew that was coming), there are some cases where you + // actually want the each pass to be considered as the start of the string + // (usually the cases are where a pattern can't express what's needed w/o + // this). There is no really good way to explain this behavior w/o all this + // text and lot of examples, so for now this is not in the public api, and + // just here. (Hint: see what w/in this file uses this for why we have it) + treatStartOfNewSegmentAsBeginningOfString_ = yesNo; +} + - (id)nextObject { GTMRegexStringSegment *result = nil; @@ -446,11 +486,20 @@ static NSString *const kReplacementPattern = nextMatches[0].rm_so = curParseIndex_; nextMatches[0].rm_eo = [utf8StrBuf_ length]; + // figure out our flags + int flags = REG_STARTEND; + if ((!treatStartOfNewSegmentAsBeginningOfString_) && + (curParseIndex_ != 0)) { + // see -treatStartOfNewSegmentAsBeginningOfString: for why we have + // this check here. + flags |= REG_NOTBOL; + } + // call for the match if ([regex_ runRegexOnUTF8:[utf8StrBuf_ bytes] nmatch:([regex_ subPatternCount] + 1) pmatch:nextMatches - flags:REG_STARTEND]) { + flags:flags]) { // match if (allSegments_ && -- cgit v1.2.3