markdown/preprocessors.py - platform/external/markdown - Git at Google


 """
 PRE-PROCESSORS
 =============================================================================

 Preprocessors work on source text before we start doing anything too
 complicated.
 """

 import re
 import markdown

 # Prefix that starts every raw-HTML placeholder: markdown.STX followed by
 # a fixed marker string.
 HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
 # Placeholder template: the "%d" slot takes an integer (HtmlStash.store
 # fills it with its running counter) and markdown.ETX ends the placeholder.
 HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX

 class Processor:
     """Common base that can remember the Markdown instance it belongs to."""

     def __init__(self, markdown_instance=None):
         """Bind `markdown_instance` to ``self.markdown`` when it is truthy.

         A falsy argument (including the default ``None``) leaves the
         ``markdown`` attribute unset.
         """
         if not markdown_instance:
             return
         self.markdown = markdown_instance

 class Preprocessor (Processor):
     """
     Base class for preprocessors, which operate on the document once it
     has been broken into lines.

     A concrete preprocessor supplies a "run" method that receives the list
     of document lines, changes it as needed, and hands back either that
     same list or a newly built one.

     Preprocessors must extend markdown.Preprocessor.

     """
     def run(self, lines):
         """
         Hook that every subclass of Preprocessor should override.  `lines`
         is the document as a list of strings split by newlines; an override
         returns the (possibly modified) list of lines.  This base version
         does nothing and returns None.

         """
         return None

 class HtmlStash:
     """
     Keeps the HTML objects that are pulled out of the document early on;
     each stored object is represented in the text by a place-holder.
     """

     def __init__ (self):
         """ Start with an empty stash. """
         self.rawHtmlBlocks = []
         self.html_counter = 0 # number of html segments stored so far

     def store(self, html, safe=False):
         """
         Remember an HTML segment so it can be reinserted later, and hand
         back the placeholder string that must be put into the document
         in its place.

         Keyword arguments:

         * html: an html segment
         * safe: label an html segment as safe for safemode

         Returns : a placeholder string

         """
         index = self.html_counter
         self.rawHtmlBlocks.append((html, safe))
         token = HTML_PLACEHOLDER % index
         self.html_counter = index + 1
         return token

     def reset(self):
         """ Forget every stored segment and restart the counter at zero. """
         self.rawHtmlBlocks = []
         self.html_counter = 0


 class HtmlBlockPreprocessor(Preprocessor):
     """Remove html blocks from the text and store them for later retrieval.

     The document is cut into blocks at blank lines.  A block that opens
     with "<" and whose tag is block level (per ``markdown.isBlockLevel``)
     or whose second character is "!", "?", "@" or "%" is handed to
     ``self.markdown.htmlStash.store`` and replaced by the placeholder that
     call returns.  All other blocks are passed through unchanged.
     """

     # Templates tried, in order, when searching a block for its closing
     # tag; "%s" is filled with the opening tag name.
     right_tag_patterns = ["</%s>", "%s>"]

     def _get_left_tag(self, block):
         """Return the lower-cased name of the tag that opens `block`.

         The name is the first whitespace-delimited token after the leading
         character, with the first ">" treated as whitespace.  When there is
         no such token (for example in "<" or "<>") an empty string is
         returned instead of raising IndexError.
         """
         tokens = block[1:].replace(">", " ", 1).split()
         if not tokens:
             return ''
         return tokens[0].lower()

     def _get_right_tag(self, left_tag, block):
         """Find the closing tag for `left_tag` in `block`.

         Returns a ``(right_tag, data_index)`` tuple.  Each pattern in
         ``right_tag_patterns`` is searched from the right; for the first
         pattern found beyond index 2, `right_tag` is the matched text
         without leading "<" and trailing ">" characters.  If no pattern is
         found, `right_tag` is the lower-cased ``len(left_tag) + 1``
         characters that precede the last character of the right-stripped
         block, and `data_index` is ``len(block)``.
         """
         for pattern in self.right_tag_patterns:
             tag = pattern % left_tag
             i = block.rfind(tag)
             if i > 2:
                 # For the patterns above (one "%s" each) this index is the
                 # position just past the matched closing tag.
                 return tag.lstrip("<").rstrip(">"), i + len(pattern)-2 + len(left_tag)
         return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)

     def _equal_tags(self, left_tag, right_tag):
         """Return True when `right_tag` is accepted as closing `left_tag`.

         "div" and tags starting with "?", "@" or "%" are always accepted.
         Otherwise the pair matches when `right_tag` is "/" + `left_tag`,
         when both are "--", or when `right_tag` is `left_tag` preceded by
         a single character other than "<".  `left_tag` must be non-empty.
         """
         if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
             return True
         if ("/" + left_tag) == right_tag:
             return True
         if right_tag == "--" and left_tag == "--":
             return True
         return left_tag == right_tag[1:] and right_tag[0] != "<"

     def _is_oneliner(self, tag):
         """Return True for the tags ("hr", "hr/") that need no closing tag."""
         return (tag in ['hr', 'hr/'])

     def run(self, lines):
         """Replace raw HTML blocks in `lines` with stash placeholders.

         `lines` is the document as a list of strings.  The lines are
         joined, split into blocks at blank lines, and each block is either
         passed through or stored via ``self.markdown.htmlStash.store``.
         A block whose closing tag is not found in the block itself is
         accumulated together with the following blocks until the closing
         tag shows up or the document ends.  "hr" blocks stay in the text,
         stripped of surrounding whitespace.

         Returns the processed text as a list of lines.
         """
         # Work list held in reverse order so that taking the next block
         # (pop) and pushing back a remainder (append) are both O(1).
         pending = "\n".join(lines).split("\n\n")
         pending.reverse()
         new_blocks = []
         items = []       # blocks of an HTML element that is still open
         left_tag = ''
         right_tag = ''
         in_tag = False # flag

         while pending:
             block = pending.pop()
             # Blocks never contain an empty line, so at most one leading
             # newline can be present.
             if block.startswith("\n"):
                 block = block[1:]

             if not in_tag:
                 if block.startswith("<"):
                     left_tag = self._get_left_tag(block)
                     if not left_tag:
                         # "<" with no tag name after it is ordinary text.
                         new_blocks.append(block)
                         continue
                     right_tag, data_index = self._get_right_tag(left_tag, block)

                     if block[1] == "!":
                         # is a comment block
                         left_tag = "--"
                         right_tag, data_index = self._get_right_tag(left_tag, block)
                         # keep checking conditions below and maybe just append

                     if data_index < len(block) \
                         and markdown.isBlockLevel(left_tag):
                         # Text after the closing tag becomes the next block.
                         pending.append(block[data_index:])
                         block = block[:data_index]

                     if not (markdown.isBlockLevel(left_tag) \
                         or block[1] in ["!", "?", "@", "%"]):
                         new_blocks.append(block)
                         continue

                     if self._is_oneliner(left_tag):
                         new_blocks.append(block.strip())
                         continue

                     ends_with_gt = block.rstrip().endswith(">")
                     if ends_with_gt and self._equal_tags(left_tag, right_tag):
                         new_blocks.append(
                             self.markdown.htmlStash.store(block.strip()))
                         continue

                     # "and" binds tighter than "or": a block-level tag always
                     # opens a multi-block element here, a comment only when
                     # the block does not end with ">".
                     if markdown.isBlockLevel(left_tag) or (
                             left_tag == "--" and not ends_with_gt):
                         items.append(block.strip())
                         in_tag = True
                     else:
                         new_blocks.append(
                             self.markdown.htmlStash.store(block.strip()))
                     continue

                 new_blocks.append(block)

             else:
                 items.append(block.strip())

                 right_tag, _ = self._get_right_tag(left_tag, block)

                 if self._equal_tags(left_tag, right_tag):
                     # if find closing tag
                     in_tag = False
                     new_blocks.append(
                         self.markdown.htmlStash.store('\n\n'.join(items)))
                     items = []

         if items:
             # The document ended inside an open element: stash what we have.
             new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
             new_blocks.append('\n')

         new_text = "\n\n".join(new_blocks)
         return new_text.split("\n")


 class ReferencePreprocessor(Preprocessor):
     """ Strip reference definitions out of the text, recording each one. """

     RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)

     def run (self, lines):
         """
         Scan `lines` for reference definitions matching ``RE``.

         A matching line with no title, or with a title wrapped in double
         quotes, single quotes or parentheses, is recorded in
         ``self.markdown.references`` under its lower-cased id as a
         ``(url, title)`` tuple and left out of the result.  Every other
         line is kept.  Returns the list of kept lines.
         """
         kept = []
         for line in lines:
             match = self.RE.match(line)
             if match is None:
                 kept.append(line)
                 continue

             ref_id = match.group(2).strip().lower()
             title = match.group(4).strip()  # potential title
             if title:
                 wrapped = (title[0] == title[-1] == "\""
                            or title[0] == title[-1] == "\'"
                            or (title[0] == "(" and title[-1] == ")"))
                 if len(title) < 2 or not wrapped:
                     # Trailing text is not a valid title: not a reference.
                     kept.append(line)
                     continue
                 title = title[1:-1]
             self.markdown.references[ref_id] = (match.group(3), title)

         return kept

	"""
	PRE-PROCESSORS
	=============================================================================

	Preprocessors work on source text before we start doing anything too
	complicated.
	"""

	import re
	import markdown

	# Prefix that starts every raw-HTML placeholder: markdown.STX followed by
	# a fixed marker string.
	HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
	# Placeholder template: the "%d" slot takes an integer (HtmlStash.store
	# fills it with its running counter) and markdown.ETX ends the placeholder.
	HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX

	class Processor:
	    """Common base that can remember the Markdown instance it belongs to."""

	    def __init__(self, markdown_instance=None):
	        """Bind `markdown_instance` to ``self.markdown`` when it is truthy.

	        A falsy argument (including the default ``None``) leaves the
	        ``markdown`` attribute unset.
	        """
	        if markdown_instance:
	            self.markdown = markdown_instance

	class Preprocessor (Processor):
	    """
	    Preprocessors are run after the text is broken into lines.

	    Each preprocessor implements a "run" method that takes a pointer to a
	    list of lines of the document, modifies it as necessary and returns
	    either the same pointer or a pointer to a new list.

	    Preprocessors must extend markdown.Preprocessor.

	    """
	    def run(self, lines):
	        """
	        Each subclass of Preprocessor should override the `run` method, which
	        takes the document as a list of strings split by newlines and returns
	        the (possibly modified) list of lines.  This base version does
	        nothing and returns None.

	        """
	        pass

	class HtmlStash:
	    """
	    This class is used for stashing HTML objects that we extract
	    in the beginning and replace with place-holders.
	    """

	    def __init__ (self):
	        """ Create a HtmlStash. """
	        self.html_counter = 0 # for counting inline html segments
	        self.rawHtmlBlocks = []

	    def store(self, html, safe=False):
	        """
	        Saves an HTML segment for later reinsertion. Returns a
	        placeholder string that needs to be inserted into the
	        document.

	        Keyword arguments:

	        * html: an html segment
	        * safe: label an html segment as safe for safemode

	        Returns : a placeholder string

	        """
	        self.rawHtmlBlocks.append((html, safe))
	        placeholder = HTML_PLACEHOLDER % self.html_counter
	        self.html_counter += 1
	        return placeholder

	    def reset(self):
	        """ Forget every stored segment and restart the counter at zero. """
	        self.html_counter = 0
	        self.rawHtmlBlocks = []


	class HtmlBlockPreprocessor(Preprocessor):
	    """Remove html blocks from the text and store them for later retrieval.

	    The document is cut into blocks at blank lines.  A block that opens
	    with "<" and whose tag is block level (per ``markdown.isBlockLevel``)
	    or whose second character is "!", "?", "@" or "%" is handed to
	    ``self.markdown.htmlStash.store`` and replaced by the placeholder that
	    call returns.  All other blocks are passed through unchanged.
	    """

	    # Templates tried, in order, when searching a block for its closing
	    # tag; "%s" is filled with the opening tag name.
	    right_tag_patterns = ["</%s>", "%s>"]

	    def _get_left_tag(self, block):
	        """Return the lower-cased name of the tag that opens `block`.

	        The name is the first whitespace-delimited token after the leading
	        character, with the first ">" treated as whitespace.  When there is
	        no such token (for example in "<" or "<>") an empty string is
	        returned instead of raising IndexError.
	        """
	        tokens = block[1:].replace(">", " ", 1).split()
	        if not tokens:
	            return ''
	        return tokens[0].lower()

	    def _get_right_tag(self, left_tag, block):
	        """Find the closing tag for `left_tag` in `block`.

	        Returns a ``(right_tag, data_index)`` tuple.  Each pattern in
	        ``right_tag_patterns`` is searched from the right; for the first
	        pattern found beyond index 2, `right_tag` is the matched text
	        without leading "<" and trailing ">" characters.  If no pattern is
	        found, `right_tag` is the lower-cased ``len(left_tag) + 1``
	        characters that precede the last character of the right-stripped
	        block, and `data_index` is ``len(block)``.
	        """
	        for pattern in self.right_tag_patterns:
	            tag = pattern % left_tag
	            i = block.rfind(tag)
	            if i > 2:
	                # For the patterns above (one "%s" each) this index is the
	                # position just past the matched closing tag.
	                return tag.lstrip("<").rstrip(">"), i + len(pattern)-2 + len(left_tag)
	        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)

	    def _equal_tags(self, left_tag, right_tag):
	        """Return True when `right_tag` is accepted as closing `left_tag`.

	        "div" and tags starting with "?", "@" or "%" are always accepted.
	        Otherwise the pair matches when `right_tag` is "/" + `left_tag`,
	        when both are "--", or when `right_tag` is `left_tag` preceded by
	        a single character other than "<".  `left_tag` must be non-empty.
	        """
	        if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
	            return True
	        if ("/" + left_tag) == right_tag:
	            return True
	        if right_tag == "--" and left_tag == "--":
	            return True
	        return left_tag == right_tag[1:] and right_tag[0] != "<"

	    def _is_oneliner(self, tag):
	        """Return True for the tags ("hr", "hr/") that need no closing tag."""
	        return (tag in ['hr', 'hr/'])

	    def run(self, lines):
	        """Replace raw HTML blocks in `lines` with stash placeholders.

	        `lines` is the document as a list of strings.  The lines are
	        joined, split into blocks at blank lines, and each block is either
	        passed through or stored via ``self.markdown.htmlStash.store``.
	        A block whose closing tag is not found in the block itself is
	        accumulated together with the following blocks until the closing
	        tag shows up or the document ends.  "hr" blocks stay in the text,
	        stripped of surrounding whitespace.

	        Returns the processed text as a list of lines.
	        """
	        # Work list held in reverse order so that taking the next block
	        # (pop) and pushing back a remainder (append) are both O(1).
	        pending = "\n".join(lines).split("\n\n")
	        pending.reverse()
	        new_blocks = []
	        items = []       # blocks of an HTML element that is still open
	        left_tag = ''
	        right_tag = ''
	        in_tag = False # flag

	        while pending:
	            block = pending.pop()
	            # Blocks never contain an empty line, so at most one leading
	            # newline can be present.
	            if block.startswith("\n"):
	                block = block[1:]

	            if not in_tag:
	                if block.startswith("<"):
	                    left_tag = self._get_left_tag(block)
	                    if not left_tag:
	                        # "<" with no tag name after it is ordinary text.
	                        new_blocks.append(block)
	                        continue
	                    right_tag, data_index = self._get_right_tag(left_tag, block)

	                    if block[1] == "!":
	                        # is a comment block
	                        left_tag = "--"
	                        right_tag, data_index = self._get_right_tag(left_tag, block)
	                        # keep checking conditions below and maybe just append

	                    if data_index < len(block) \
	                        and markdown.isBlockLevel(left_tag):
	                        # Text after the closing tag becomes the next block.
	                        pending.append(block[data_index:])
	                        block = block[:data_index]

	                    if not (markdown.isBlockLevel(left_tag) \
	                        or block[1] in ["!", "?", "@", "%"]):
	                        new_blocks.append(block)
	                        continue

	                    if self._is_oneliner(left_tag):
	                        new_blocks.append(block.strip())
	                        continue

	                    ends_with_gt = block.rstrip().endswith(">")
	                    if ends_with_gt and self._equal_tags(left_tag, right_tag):
	                        new_blocks.append(
	                            self.markdown.htmlStash.store(block.strip()))
	                        continue

	                    # "and" binds tighter than "or": a block-level tag always
	                    # opens a multi-block element here, a comment only when
	                    # the block does not end with ">".
	                    if markdown.isBlockLevel(left_tag) or (
	                            left_tag == "--" and not ends_with_gt):
	                        items.append(block.strip())
	                        in_tag = True
	                    else:
	                        new_blocks.append(
	                            self.markdown.htmlStash.store(block.strip()))
	                    continue

	                new_blocks.append(block)

	            else:
	                items.append(block.strip())

	                right_tag, _ = self._get_right_tag(left_tag, block)

	                if self._equal_tags(left_tag, right_tag):
	                    # if find closing tag
	                    in_tag = False
	                    new_blocks.append(
	                        self.markdown.htmlStash.store('\n\n'.join(items)))
	                    items = []

	        if items:
	            # The document ended inside an open element: stash what we have.
	            new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
	            new_blocks.append('\n')

	        new_text = "\n\n".join(new_blocks)
	        return new_text.split("\n")


	class ReferencePreprocessor(Preprocessor):
	    """ Remove reference definitions from text and store for later use. """

	    # Up to three leading spaces, "[id]:", optional whitespace, the URL
	    # (everything up to the next space) and the remainder of the line as
	    # the potential title.
	    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)

	    def run (self, lines):
	        """
	        Scan `lines` for reference definitions matching ``RE``.

	        A matching line with no title, or with a title wrapped in double
	        quotes, single quotes or parentheses, is recorded in
	        ``self.markdown.references`` under its lower-cased id as a
	        ``(url, title)`` tuple and left out of the result.  Every other
	        line is kept.  Returns the list of kept lines.
	        """
	        new_text = []
	        for line in lines:
	            m = self.RE.match(line)
	            if not m:
	                new_text.append(line)
	                continue

	            ref_id = m.group(2).strip().lower()
	            t = m.group(4).strip() # potential title
	            if not t:
	                self.markdown.references[ref_id] = (m.group(3), t)
	            elif (len(t) >= 2
	                  and (t[0] == t[-1] == "\""
	                       or t[0] == t[-1] == "\'"
	                       or (t[0] == "(" and t[-1] == ")"))):
	                self.markdown.references[ref_id] = (m.group(3), t[1:-1])
	            else:
	                # Trailing text is not a valid title: not a reference.
	                new_text.append(line)

	        return new_text