""" PRE-PROCESSORS ============================================================================= Preprocessors work on source text before we start doing anything too complicated. """ import re import markdown HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:" HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX class Processor: def __init__(self, markdown_instance=None): if markdown_instance: self.markdown = markdown_instance class Preprocessor (Processor): """ Preprocessors are run after the text is broken into lines. Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document, modifies it as necessary and returns either the same pointer or a pointer to a new list. Preprocessors must extend markdown.Preprocessor. """ def run(self, lines): """ Each subclass of Preprocessor should override the `run` method, which takes the document as a list of strings split by newlines and returns the (possibly modified) list of lines. """ pass class HtmlStash: """ This class is used for stashing HTML objects that we extract in the beginning and replace with place-holders. """ def __init__ (self): """ Create a HtmlStash. """ self.html_counter = 0 # for counting inline html segments self.rawHtmlBlocks=[] def store(self, html, safe=False): """ Saves an HTML segment for later reinsertion. Returns a placeholder string that needs to be inserted into the document. Keyword arguments: * html: an html segment * safe: label an html segment as safe for safemode Returns : a placeholder string """ self.rawHtmlBlocks.append((html, safe)) placeholder = HTML_PLACEHOLDER % self.html_counter self.html_counter += 1 return placeholder def reset(self): self.html_counter = 0 self.rawHtmlBlocks = [] class HtmlBlockPreprocessor(Preprocessor): """Remove html blocks from the text and store them for later retrieval.""" right_tag_patterns = ["", "%s>"] def _get_left_tag(self, block): return block[1:].replace(">", " ", 1).split()[0].lower() def _get_right_tag(self, left_tag, block): for p in self.right_tag_patterns: tag = p % left_tag i = block.rfind(tag) if i > 2: return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) def _equal_tags(self, left_tag, right_tag): if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. return True if ("/" + left_tag) == right_tag: return True if (right_tag == "--" and left_tag == "--"): return True elif left_tag == right_tag[1:] \ and right_tag[0] != "<": return True else: return False def _is_oneliner(self, tag): return (tag in ['hr', 'hr/']) def run(self, lines): text = "\n".join(lines) new_blocks = [] text = text.split("\n\n") items = [] left_tag = '' right_tag = '' in_tag = False # flag while text: block = text[0] if block.startswith("\n"): block = block[1:] text = text[1:] if block.startswith("\n"): block = block[1:] if not in_tag: if block.startswith("<"): left_tag = self._get_left_tag(block) right_tag, data_index = self._get_right_tag(left_tag, block) if data_index < len(block): text.insert(0, block[data_index:]) block = block[:data_index] if not (markdown.isBlockLevel(left_tag) \ or block[1] in ["!", "?", "@", "%"]): new_blocks.append(block) continue if self._is_oneliner(left_tag): new_blocks.append(block.strip()) continue if block[1] == "!": # is a comment block left_tag = "--" right_tag, data_index = self._get_right_tag(left_tag, block) # keep checking conditions below and maybe just append if block.rstrip().endswith(">") \ and self._equal_tags(left_tag, right_tag): new_blocks.append( self.markdown.htmlStash.store(block.strip())) continue else: #if not block[1] == "!": # if is block level tag and is not complete if markdown.isBlockLevel(left_tag) or left_tag == "--" \ and not block.rstrip().endswith(">"): items.append(block.strip()) in_tag = True else: new_blocks.append( self.markdown.htmlStash.store(block.strip())) continue new_blocks.append(block) else: items.append(block.strip()) right_tag, data_index = self._get_right_tag(left_tag, block) if self._equal_tags(left_tag, right_tag): # if find closing tag in_tag = False new_blocks.append( self.markdown.htmlStash.store('\n\n'.join(items))) items = [] if items: new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) new_blocks.append('\n') new_text = "\n\n".join(new_blocks) return new_text.split("\n") class ReferencePreprocessor(Preprocessor): """ Remove reference definitions from text and store for later use. """ RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL) def run (self, lines): new_text = []; for line in lines: m = self.RE.match(line) if m: id = m.group(2).strip().lower() t = m.group(4).strip() # potential title if not t: self.markdown.references[id] = (m.group(3), t) elif (len(t) >= 2 and (t[0] == t[-1] == "\"" or t[0] == t[-1] == "\'" or (t[0] == "(" and t[-1] == ")") ) ): self.markdown.references[id] = (m.group(3), t[1:-1]) else: new_text.append(line) else: new_text.append(line) return new_text #+ "\n"