aboutsummaryrefslogtreecommitdiffhomepage
path: root/index.cc
diff options
context:
space:
mode:
authorGravatar Carl Worth <cworth@cworth.org>2009-10-28 15:41:42 -0700
committerGravatar Carl Worth <cworth@cworth.org>2009-10-28 15:41:42 -0700
commit56218ddbb4a72fdec534773f2bd4e85aec914ae9 (patch)
tree2942e5d8707cb626cb1fef385f231f297279a6c7 /index.cc
parentcfa228a3d4b300df3551e811028508d3de5cd81c (diff)
index: Don't bother indexing quoted portions of messages (and signatures).
Our old notmuch-index-message.cc code had this, but I originally left it out when adding indexing back in. I was concerned primarily with mistakenly detecting signature markers and omitting important text, (for example, I often do long lines of "----" as section separators). But now I see that there's a performance benefit to skipping the quotations, (about 120 files/sec. instead of 95 files/sec.). I mitigated the bogus signature checking by recognizing nothing other than the all-time classic "-- ".
Diffstat (limited to 'index.cc')
-rw-r--r--index.cc56
1 files changed, 55 insertions, 1 deletions
diff --git a/index.cc b/index.cc
index b51d2261..747a4e63 100644
--- a/index.cc
+++ b/index.cc
@@ -135,6 +135,60 @@ skip_re_in_subject (const char *subject)
return s;
}
+/* Given a string representing the body of a message, generate terms
+ * for it, (skipping quoted portions and signatures).
+ *
+ * This function is evil in that it modifies the string passed to it,
+ * (changing some newlines into '\0').
+ */
+static void
+_index_body_text (notmuch_message_t *message, char *body)
+{
+ char *line, *line_end, *next_line;
+
+ if (body == NULL)
+ return;
+
+ next_line = body;
+
+ while (1) {
+ line = next_line;
+ if (*line == '\0')
+ break;
+
+ next_line = strchr (line, '\n');
+ if (next_line == NULL) {
+ next_line = line + strlen (line);
+ }
+ line_end = next_line - 1;
+
+ /* Get to the next non-blank line. */
+ while (*next_line == '\n')
+ next_line++;
+
+ /* Skip blank lines. */
+ if (line_end < line)
+ continue;
+
+ /* Skip lines that are quotes. */
+ if (*line == '>')
+ continue;
+
+ /* Also skip lines introducing a quote on the next line. */
+ if (*line_end == ':' && *next_line == '>')
+ continue;
+
+ /* Finally, bail as soon as we see a signature. */
+ /* XXX: Should only do this if "near" the end of the message. */
+ if (strncmp (line, "-- ", 3) == 0)
+ break;
+
+ *(line_end + 1) = '\0';
+
+ _notmuch_message_gen_terms (message, NULL, line);
+ }
+}
+
/* Callback to generate terms for each mime part of a message. */
static void
_index_mime_part (notmuch_message_t *message,
@@ -207,7 +261,7 @@ _index_mime_part (notmuch_message_t *message,
g_byte_array_append (byte_array, (guint8 *) "\0", 1);
body = (char *) g_byte_array_free (byte_array, FALSE);
- _notmuch_message_gen_terms (message, NULL, body);
+ _index_body_text (message, body);
free (body);
}