Improved strip_tags and clarified documentation (6ca6c36f) · Commits · Dom Sekotill / django

django/utils/html.py

+27 −4

Original line number	Diff line number	Diff line
		@@ -118,7 +118,10 @@ linebreaks = allow_lazy(linebreaks, six.text_type)

		class MLStripper(HTMLParser):
		def __init__(self):
		if six.PY2:
		HTMLParser.__init__(self)
		else:
		HTMLParser.__init__(self, strict=False)
		self.reset()
		self.fed = []

		@@ -135,16 +138,36 @@ class MLStripper(HTMLParser):
		return ''.join(self.fed)


		def strip_tags(value):
		"""Returns the given HTML with all tags stripped."""
		def _strip_once(value):
		"""
		Internal tag stripping utility used by strip_tags.
		"""
		s = MLStripper()
		try:
		s.feed(value)
		s.close()
		except HTMLParseError:
		return value
		try:
		s.close()
		except (HTMLParseError, UnboundLocalError) as err:
		# UnboundLocalError because of http://bugs.python.org/issue17802
		# on Python 3.2, triggered by strict=False mode of HTMLParser
		return s.get_data() + s.rawdata
		else:
		return s.get_data()


		def strip_tags(value):
		    """
		    Return ``value`` with HTML tags removed on a best-effort basis.

		    ``_strip_once`` is applied repeatedly, so that tags which only appear
		    after a previous pass (e.g. ``<sc<!-- -->ript>``) get removed too.
		    The loop ends as soon as the text contains neither ``<`` nor ``>``,
		    or when a pass leaves the text unchanged.
		    """
		    while '<' in value or '>' in value:
		        stripped = _strip_once(value)
		        if stripped == value:
		            # Another pass would not find any further tags; stop here.
		            break
		        value = stripped
		    return value
		strip_tags = allow_lazy(strip_tags)

+11 −1

Original line number	Diff line number	Diff line
		@@ -1985,7 +1985,7 @@ If ``value`` is ``10``, the output will be ``1.000000E+01``.
		striptags
		^^^^^^^^^

		Strips all [X]HTML tags.
		Makes all possible efforts to strip all [X]HTML tags.

		For example::

		@@ -1994,6 +1994,16 @@ For example::
		If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``, the
		output will be ``"Joel is a slug"``.

		.. admonition:: No safety guarantee

		Note that ``striptags`` doesn't give any guarantee about its output being
		entirely HTML safe, particularly with invalid HTML input. So NEVER
		apply the ``safe`` filter to a ``striptags`` output.
		If you are looking for something more robust, you can use the ``bleach``
		Python library, notably its `clean`_ method.

		.. _clean: http://bleach.readthedocs.org/en/latest/clean.html

		.. templatefilter:: time

		time

+11 −5

Original line number	Diff line number	Diff line
		@@ -595,17 +595,23 @@ escaping HTML.

		.. function:: strip_tags(value)

		Removes anything that looks like an html tag from the string, that is
		anything contained within ``<>``.
		Tries to remove anything that looks like an HTML tag from the string, that
		is anything contained within ``<>``.
		Absolutely NO guarantee is provided about the resulting string being entirely
		HTML safe. So NEVER mark safe the result of a ``strip_tags`` call without
		escaping it first, for example with :func:`~django.utils.html.escape`.

		For example::

		strip_tags(value)

		If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``
		the return value will be ``"Joel is a slug"``. Note that ``strip_tags``
		result may still contain unsafe HTML content, so you might use
		:func:`~django.utils.html.escape` to make it a safe string.
		the return value will be ``"Joel is a slug"``.

		If you are looking for a more robust solution, take a look at the `bleach`_
		Python library.

		.. _bleach: https://pypi.python.org/pypi/bleach

		.. versionchanged:: 1.6

+2 −0

Original line number	Diff line number	Diff line
		@@ -80,6 +80,8 @@ class TestUtilsHtml(TestCase):
		('a<p a >b</p>c', 'abc'),
		('d<a:b c:d>e</p>f', 'def'),
		('<strong>foo</strong><a href="http://example.com">bar</a>', 'foobar'),
		('<sc<!-- -->ript>test<<!-- -->/script>', 'test'),
		('<script>alert()</script>&h', 'alert()&h'),
		)
		for value, output in items:
		self.check_output(f, value, output)