aboutsummaryrefslogtreecommitdiff
path: root/contexts/data/lib/closure-library/closure/goog/dom/annotate.js
blob: df5ca828354159a59c3944f80ca21eda7b23c842 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
// Copyright 2006 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview Methods for annotating occurrences of query terms in text or
 *   in a DOM tree. Adapted from Gmail code.
 *
 */

goog.provide('goog.dom.annotate');

goog.require('goog.array');
goog.require('goog.dom');
goog.require('goog.dom.NodeType');
goog.require('goog.string');


/**
 * Annotates every occurrence of the given search terms in the text nodes
 * found under {@code node}. Each hit is passed to {@code annotateFn} and the
 * HTML it returns replaces the matched text in the DOM.
 *
 * @param {Node} node  The DOM node whose subtree is searched.
 * @param {Array} terms  An array of [searchTerm, matchWholeWordOnly] tuples.
 *   matchWholeWordOnly is specified per term because some terms may be CJK
 *   while others are not. (For correctness, matchWholeWordOnly should always
 *   be false for CJK terms.).
 * @param {Function} annotateFn  A function that receives
 *   (1) the index of the term that was hit,
 *   (2) the HTML string (the matched search term) to be annotated,
 *   and returns the annotated term as an HTML string.
 * @param {*=} opt_ignoreCase  If truthy, matching is case-insensitive; the
 *   terms are lowercased here before the search starts.
 * @param {Array.<string>=} opt_classesToSkip  Nodes carrying one of these CSS
 *   class names are skipped, together with their descendants. Defaults to an
 *   empty list.
 * @param {number=} opt_maxMs  Time budget in milliseconds. Once it is used
 *   up, annotation stops and the function returns. Omitted or non-positive
 *   values mean there is no deadline.
 *
 * @return {boolean} Whether any terms were annotated.
 */
goog.dom.annotate.annotateTerms = function(node, terms, annotateFn,
                                           opt_ignoreCase,
                                           opt_classesToSkip,
                                           opt_maxMs) {
  var searchTerms = opt_ignoreCase ?
      goog.dom.annotate.lowercaseTerms_(terms) : terms;

  // A deadline of 0 tells the recursive helper that there is no time limit.
  var deadline = 0;
  if (opt_maxMs > 0) {
    deadline = goog.now() + opt_maxMs;
  }

  var classesToSkip = opt_classesToSkip || [];

  return goog.dom.annotate.annotateTermsInNode_(
      node, searchTerms, annotateFn, opt_ignoreCase, classesToSkip,
      deadline, 0);
};


/**
 * The maximum recursion depth allowed. Any DOM nodes deeper than this are
 * ignored: goog.dom.annotate.annotateTermsInNode_ returns false without
 * touching a node as soon as its recursion level is greater than this value
 * (the node passed to the initial call is level 0).
 * @type {number}
 * @private
 */
goog.dom.annotate.MAX_RECURSION_ = 200;


/**
 * Tag names of the elements whose descendants should not be affected by
 * annotation. The entries are compared against {@code node.tagName} as-is,
 * so they are written in upper case.
 * @type {Array.<string>}
 * @private
 */
goog.dom.annotate.NODES_TO_SKIP_ = ['SCRIPT', 'STYLE', 'TEXTAREA'];


/**
 * Recursive helper for goog.dom.annotate.annotateTerms. Text nodes are
 * searched and, when they contain a hit, replaced in their parent by the
 * nodes parsed from the annotated HTML. Other nodes are descended into unless
 * their tag name is listed in NODES_TO_SKIP_ or they carry one of
 * {@code classesToSkip}.
 *
 * @param {Node} node  A DOM node.
 * @param {Array} terms  An array of [searchTerm, matchWholeWordOnly] tuples.
 *     The matchWholeWordOnly value is a per-term attribute because some terms
 *     may be CJK, while others are not. (For correctness, matchWholeWordOnly
 *     should always be false for CJK terms.).
 * @param {Function} annotateFn  function(number, string) : string  A function
 *     that takes :
 *     (1) the number of the term that is "hit",
 *     (2) the HTML string (search term) to be annotated,
 *     and returns the annotated term as an HTML string.
 * @param {*} ignoreCase  Whether to ignore the case of the query terms
 *     when looking for matches. When truthy, the terms must already be
 *     lowercase.
 * @param {Array.<string>} classesToSkip  Nodes with one of these CSS class
 *     names will be skipped (as will their descendants).
 * @param {number} stopTime  Deadline for annotation operation (ignored if 0).
 * @param {number} recursionLevel  How deep this recursive call is; pass the
 *     value 0 in the initial call.
 * @return {boolean} Whether any terms were annotated.
 * @private
 */
goog.dom.annotate.annotateTermsInNode_ =
    function(node, terms, annotateFn, ignoreCase, classesToSkip,
             stopTime, recursionLevel) {
  // Give up once the deadline has passed or the tree is too deep.
  if ((stopTime > 0 && goog.now() >= stopTime) ||
      recursionLevel > goog.dom.annotate.MAX_RECURSION_) {
    return false;
  }

  var annotated = false;

  if (node.nodeType == goog.dom.NodeType.TEXT) {
    var parentNode = node.parentNode;
    // A detached text node has no parent to splice the replacement nodes
    // into, so there is nothing that can be annotated.
    if (!parentNode) {
      return false;
    }

    var html = goog.dom.annotate.helpAnnotateText_(node.nodeValue, terms,
                                                   annotateFn, ignoreCase);
    if (html != null) {
      // Replace the text with the annotated html. First we put the html into
      // a temporary node, to get its DOM structure. To avoid adding a wrapper
      // element as a side effect, we'll only actually use the temporary node's
      // children. The text itself was HTML-escaped by helpAnnotateText_; only
      // the output of annotateFn is inserted as raw markup.
      var tempNode = goog.dom.getOwnerDocument(node).createElement('SPAN');
      tempNode.innerHTML = html;

      var nodeToInsert;
      while ((nodeToInsert = tempNode.firstChild) != null) {
        // Each parentNode.insertBefore call removes the inserted node from
        // tempNode's list of children.
        parentNode.insertBefore(nodeToInsert, node);
      }

      parentNode.removeChild(node);
      annotated = true;
    }
  } else if (node.hasChildNodes() &&
             !goog.array.contains(goog.dom.annotate.NODES_TO_SKIP_,
                 node.tagName)) {
    // className is only a plain string on HTML elements. Documents and
    // document fragments have none, and SVG elements expose an
    // SVGAnimatedString whose current value is in baseVal. Calling split()
    // on those directly would throw.
    var rawClassName = node.className;
    if (rawClassName && typeof rawClassName.baseVal == 'string') {
      rawClassName = rawClassName.baseVal;
    }
    var classes = typeof rawClassName == 'string' ?
        rawClassName.split(/\s+/) : [];
    var skip = goog.array.some(classes, function(className) {
      return goog.array.contains(classesToSkip, className);
    });

    if (!skip) {
      var childLevel = recursionLevel + 1;
      var curNode = node.firstChild;
      while (curNode) {
        // Remember the sibling first: annotating curNode may replace it with
        // other nodes, which would invalidate curNode.nextSibling.
        var nextNode = curNode.nextSibling;
        var curNodeAnnotated = goog.dom.annotate.annotateTermsInNode_(
            curNode, terms, annotateFn, ignoreCase, classesToSkip,
            stopTime, childLevel);
        annotated = annotated || curNodeAnnotated;
        curNode = nextNode;
      }
    }
  }

  return annotated;
};


/**
 * Regular expression that matches non-word characters. It is used to test
 * the single characters just before and just after a candidate hit when a
 * term must match whole words only.
 *
 * Note that \W is the complement of [A-Za-z0-9_], so every non-ASCII
 * character (accented letters, CJK, ...) counts as a non-word character.
 *
 * Performance note: Testing a one-character string using this regex is as fast
 * as the equivalent string test ("a-zA-Z0-9_".indexOf(c) < 0), give or take a
 * few percent. (The regex is about 5% faster in IE 6 and about 4% slower in
 * Firefox 1.5.) If performance becomes critical, it may be better to convert
 * the character to a numerical char code and check whether it falls in the
 * word character ranges. A quick test suggests that could be 33% faster.
 *
 * @type {RegExp}
 * @private
 */
goog.dom.annotate.NONWORD_RE_ = /\W/;


/**
 * Produces annotated HTML for a piece of plain text. All occurrences of all
 * query terms are located, the supplied function is asked for the replacement
 * HTML of each occurrence, and the rest of the text is HTML-escaped.
 *
 * @param {string} text  The plain text to be searched.
 * @param {Array} terms  An array of
 *   [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
 *   matchWholeWordOnly is specified per term because some terms may be CJK
 *   while others are not. (For correctness, matchWholeWordOnly should always
 *   be false for CJK terms.).
 * @param {Function} annotateFn {function(number, string) : string} A function
 *   that receives
 *   (1) the index of the term that was hit,
 *   (2) the HTML string (the matched search term) to be annotated,
 *   and returns the annotated term as an HTML string.
 * @param {*=} opt_ignoreCase  If truthy, matching is case-insensitive; the
 *   terms are lowercased here before the search starts.
 * @return {?string} The HTML equivalent of {@code text} with terms
 *   annotated, or null if the text did not contain any of the terms.
 */
goog.dom.annotate.annotateText = function(text, terms, annotateFn,
                                          opt_ignoreCase) {
  // The helper expects already-lowercased terms when ignoring case.
  var searchTerms = opt_ignoreCase ?
      goog.dom.annotate.lowercaseTerms_(terms) : terms;

  return goog.dom.annotate.helpAnnotateText_(
      text, searchTerms, annotateFn, opt_ignoreCase);
};


/**
 * Annotates occurrences of query terms in plain text. This process consists of
 * identifying all occurrences of all query terms, calling a provided function
 * to get the appropriate replacement HTML for each occurrence, and
 * HTML-escaping all the text.
 *
 * Hits are emitted from left to right. When hits of different terms start at
 * the same position, the term with the lowest index wins; any hit that starts
 * inside an already emitted hit is discarded.
 *
 * @param {string} text  The plain text to be searched.
 * @param {Array} terms  An array of
 *   [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
 *   If {@code ignoreCase} is true, each search term must already be lowercase.
 *   The matchWholeWordOnly value is a per-term attribute because some terms
 *   may be CJK, while others are not. (For correctness, matchWholeWordOnly
 *   should always be false for CJK terms.).
 * @param {Function} annotateFn {function(number, string) : string} A function
 *   that takes
 *   (1) the number of the term that is "hit",
 *   (2) the HTML string (search term) to be annotated,
 *   and returns the annotated term as an HTML string.
 * @param {*} ignoreCase  Whether to ignore the case of the query terms
 *   when looking for matches.
 * @return {?string} The HTML equivalent of {@code text} with terms
 *   annotated, or null if the text did not contain any of the terms.
 * @private
 */
goog.dom.annotate.helpAnnotateText_ = function(text, terms, annotateFn,
                                               ignoreCase) {
  var hit = false;
  // NOTE(review): hit positions found in textToSearch are later used to slice
  // the original text. toLowerCase() changes the length of a few characters
  // (e.g. U+0130), in which case the positions no longer line up.
  var textToSearch = ignoreCase ? text.toLowerCase() : text;
  var textLen = textToSearch.length;
  var numTerms = terms.length;
  var i;

  // Each element will be an ascending array of hit positions for the term.
  var termHits = new Array(numTerms);

  // First collect all the hits of every term.
  for (i = 0; i < numTerms; i++) {
    var term = terms[i];
    var hits = [];
    var termText = term[0];
    if (termText != '') {
      var matchWholeWordOnly = term[1];
      var termLen = termText.length;
      var pos = 0;
      // Find each hit for this term and append it to hits.
      while (pos < textLen) {
        var hitPos = textToSearch.indexOf(termText, pos);
        if (hitPos == -1) {
          break;
        }
        var prevCharPos = hitPos - 1;
        var nextCharPos = hitPos + termLen;
        var accepted = !matchWholeWordOnly ||
            ((prevCharPos < 0 ||
              goog.dom.annotate.NONWORD_RE_.test(
                  textToSearch.charAt(prevCharPos))) &&
             (nextCharPos >= textLen ||
              goog.dom.annotate.NONWORD_RE_.test(
                  textToSearch.charAt(nextCharPos))));
        if (accepted) {
          hits.push(hitPos);
          hit = true;
          // Hits of the same term never overlap, so skip past this one.
          pos = hitPos + termLen;
        } else {
          // The candidate failed the whole-word test. A valid hit may still
          // start inside it (e.g. "ab ab" in "xab ab ab"), so only advance by
          // one character instead of skipping the whole candidate.
          pos = hitPos + 1;
        }
      }
    }
    termHits[i] = hits;
  }

  if (!hit) {
    return null;
  }

  var html = [];
  // End of the text that has been copied to html so far.
  var copiedUpTo = 0;
  // cursors[i] is the index in termHits[i] of the first unprocessed hit.
  var cursors = new Array(numTerms);
  for (i = 0; i < numTerms; i++) {
    cursors[i] = 0;
  }

  while (true) {
    // First determine which of the n terms is the next hit.
    var termIndexOfNextHit = -1;
    var posOfNextHit = -1;

    for (i = 0; i < numTerms; i++) {
      var candidates = termHits[i];
      var cursor = cursors[i];

      // Discard any hits embedded in the previous hit.
      while (cursor < candidates.length && candidates[cursor] < copiedUpTo) {
        cursor++;
      }
      cursors[i] = cursor;

      // Strict comparison keeps the lowest term index on ties.
      if (cursor < candidates.length &&
          (posOfNextHit < 0 || candidates[cursor] < posOfNextHit)) {
        termIndexOfNextHit = i;
        posOfNextHit = candidates[cursor];
      }
    }

    // Quit if there are no more hits.
    if (posOfNextHit < 0) {
      break;
    }

    // Mark the chosen hit as processed.
    cursors[termIndexOfNextHit]++;

    // Append everything from the end of the last hit up to this one.
    html.push(goog.string.htmlEscape(
        text.substring(copiedUpTo, posOfNextHit)));

    // Append the annotated term, using the original (not lowercased) text.
    var endOfHit = posOfNextHit + terms[termIndexOfNextHit][0].length;
    var termHtml = goog.string.htmlEscape(
        text.substring(posOfNextHit, endOfHit));
    html.push(annotateFn(termIndexOfNextHit, termHtml));

    copiedUpTo = endOfHit;
  }

  // Append everything after the last hit.
  html.push(goog.string.htmlEscape(text.substring(copiedUpTo)));
  return html.join('');
};


/**
 * Builds a copy of the term list in which every search term is lowercase.
 * The input array and its tuples are left unmodified; the matchWholeWordOnly
 * flag of each tuple is carried over unchanged.
 *
 * @param {Array} terms  An array of
 *   [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
 * @return {Array}  A new array of
 *   [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
 * @private
 */
goog.dom.annotate.lowercaseTerms_ = function(terms) {
  var count = terms.length;
  var lowered = [];
  for (var idx = 0; idx < count; idx++) {
    var tuple = terms[idx];
    var wholeWordOnly = tuple[1];
    lowered.push([tuple[0].toLowerCase(), wholeWordOnly]);
  }
  return lowered;
};