Fixed #19237 -- Used HTML parser to strip tags (dc51ec8b) · Commits · Dom Sekotill / django

django/utils/html.py

+26 −2

Original line number	Diff line number	Diff line
		@@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy
		from django.utils import six
		from django.utils.text import normalize_newlines

		from .html_parser import HTMLParser


		# Configuration for urlize() function.
		TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
		WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')]
		@@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
		html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
		hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
		trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
		strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)


		def escape(text):
		@@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False):
		return '\n\n'.join(paras)
		linebreaks = allow_lazy(linebreaks, six.text_type)


		class MLStripper(HTMLParser):
		    """Parser that collects text and entity/character references.

		    No tag handlers are overridden here, so only the text runs and the
		    references passed to the handlers below end up in the output.
		    """

		    def __init__(self):
		        HTMLParser.__init__(self)
		        self.reset()
		        # Output fragments in arrival order; joined by get_data().
		        self.fed = []

		    def handle_data(self, d):
		        """Record a run of text content."""
		        self.fed.append(d)

		    def handle_entityref(self, name):
		        """Re-emit a named entity reference in its ``&name;`` form."""
		        self.fed.append('&%s;' % name)

		    def handle_charref(self, name):
		        """Re-emit a character reference in its ``&#name;`` form."""
		        self.fed.append('&#%s;' % name)

		    def get_data(self):
		        """Return everything collected so far as a single string."""
		        return ''.join(self.fed)

		def strip_tags(value):
		    """Returns the given HTML with all tags stripped.

		    The value is fed to an MLStripper and the text it collected is
		    returned. If closing the parser raises, whatever is left in the
		    parser's ``rawdata`` buffer is appended unchanged to the result.
		    """
		    stripper = MLStripper()
		    stripper.feed(value)
		    data = stripper.get_data()
		    try:
		        stripper.close()
		    except Exception:
		        # Best effort: keep the unparsed remainder verbatim rather than
		        # failing. NOTE(review): presumably raised for incomplete markup
		        # such as '<f'; the parser lives in .html_parser -- confirm there.
		        data += stripper.rawdata
		    return data
		strip_tags = allow_lazy(strip_tags)

		def remove_tags(html, tags):

tests/utils_tests/test_html.py

+6 −2

Original line number	Diff line number	Diff line
		@@ -5,6 +5,7 @@ import os

		from django.utils import html
		from django.utils._os import upath
		from django.utils.encoding import force_text
		from django.utils.unittest import TestCase


		@@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase):
		def test_strip_tags(self):
		f = html.strip_tags
		items = (
		('<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>',
		'See: &#39;&eacute; is an apostrophe followed by e acute'),
		('<adf>a', 'a'),
		('</adf>a', 'a'),
		('<asdf><asdf>e', 'e'),
		('<f', '<f'),
		('hi, <f x', 'hi, <f x'),
		('</fe', '</fe'),
		('<x>b<y>', 'b'),
		('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
		@@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase):
		for filename in ('strip_tags1.html', 'strip_tags2.txt'):
		path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename)
		with open(path, 'r') as fp:
		content = force_text(fp.read())
		start = datetime.now()
		stripped = html.strip_tags(fp.read())
		stripped = html.strip_tags(content)
		elapsed = datetime.now() - start
		self.assertEqual(elapsed.seconds, 0)
		self.assertIn("Please try again.", stripped)