diff options
Diffstat (limited to 'exampleData/ruleSets/language-processing/natural/nattest1.js')
-rw-r--r-- | exampleData/ruleSets/language-processing/natural/nattest1.js | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/exampleData/ruleSets/language-processing/natural/nattest1.js b/exampleData/ruleSets/language-processing/natural/nattest1.js new file mode 100644 index 0000000..bcb8712 --- /dev/null +++ b/exampleData/ruleSets/language-processing/natural/nattest1.js @@ -0,0 +1,66 @@ +exports.name = "Common words"; +exports.description = "Identifies rare word use (words not in the 1000 most common English word list)."; + +var natural = require('natural'); + +var stemmer = natural.PorterStemmer; +var tokenizer = new natural.TreebankWordTokenizer(); + +var getTextNodesIn = function (node, includeWhitespaceNodes) { + var textNodes = [], whitespace = /^\s*$/; + + function getTextNodes(node) { + if (node.nodeType == 3) { + if (includeWhitespaceNodes || !whitespace.test(node.nodeValue)) { + textNodes.push(node); + } + } else { + for (var i = 0, len = node.childNodes.length; i < len; ++i) { + getTextNodes(node.childNodes[i]); + } + } + } + + getTextNodes(node); + return textNodes; +}; + +var commonWord = function(word) { + return _.contains(words, word); +}; + +var isPunctuation = function(str) { + return _.contains(['&', '%', '(', ')', ';', ':', '.', ',', '"', "'", '`', '!', '?' ], str); +}; + + +var markWords = function(obj, report) { + + var mergeFn = function(obj, tok) { + if (commonWord(tok) || isPunctuation(tok) || _.isNumber(tok)) { + obj.append(tok + ' '); + } else { + // var newObj = "<span style='background-color: red'>"+tok+"</span> "); + var newObj = $("<span>"+tok+"</span> "); + obj.append(newObj); + report.error("The word '"+tok+"' is uncommon", newObj); + } + return obj; + }; + + var toks = tokenizer.tokenize(obj.text()); + var rawObj = $('<p></p>', {id: 'text'}); + var newObj = _.reduce(toks, mergeFn , rawObj); + + obj.replaceWith(newObj); +}; + +exports.rule = function(report) { + $5('body').each( + function(i){ + var nodes = getTextNodesIn($(this)); + _.map(nodes, function(n){ + markWords(n, report); + }); + }); +};
\ No newline at end of file |