Commit 6ca6c36f authored by Claude Paroz's avatar Claude Paroz
Browse files

Improved strip_tags and clarified documentation

The fact that strip_tags cannot guarantee to really strip all
non-safe HTML content was not clear enough. Also see:
https://www.djangoproject.com/weblog/2014/mar/22/strip-tags-advisory/
parent aaa21102
Loading
Loading
Loading
Loading
+27 −4
Original line number Diff line number Diff line
@@ -118,7 +118,10 @@ linebreaks = allow_lazy(linebreaks, six.text_type)

class MLStripper(HTMLParser):
    def __init__(self):
        """
        Initialise the underlying HTMLParser and start with an empty ``fed``
        list, which is where this class accumulates its output.
        """
        # Python 2's HTMLParser is constructed without arguments; on other
        # versions the parser is explicitly asked for non-strict mode.
        parser_options = {} if six.PY2 else {'strict': False}
        HTMLParser.__init__(self, **parser_options)
        self.reset()
        self.fed = []

@@ -135,16 +138,36 @@ class MLStripper(HTMLParser):
        return ''.join(self.fed)


def strip_tags(value):
    """Returns the given HTML with all tags stripped."""
def _strip_once(value):
    """
    Internal tag stripping utility used by strip_tags.

    Runs ``value`` through a single MLStripper pass and returns the text the
    stripper collected. If the parser fails while being fed, ``value`` is
    returned unchanged. If it fails while being closed, the collected text is
    returned followed by whatever is still sitting in the parser's
    ``rawdata`` buffer.
    """
    s = MLStripper()
    try:
        # Only feed() belongs here: close() has its own, wider handler below.
        s.feed(value)
    except HTMLParseError:
        return value
    try:
        s.close()
    except (HTMLParseError, UnboundLocalError):
        # UnboundLocalError because of http://bugs.python.org/issue17802
        # on Python 3.2, triggered by strict=False mode of HTMLParser
        return s.get_data() + s.rawdata
    else:
        return s.get_data()


def strip_tags(value):
    """
    Returns the given HTML with all tags stripped, on a best-effort basis.

    ``_strip_once`` is applied repeatedly because removing one layer of
    markup can join the surrounding text into a new tag. Stripping stops as
    soon as the string contains neither ``<`` nor ``>``, or as soon as a pass
    fails to make the string shorter.

    The result is not guaranteed to be HTML safe; escape it before marking it
    as safe.
    """
    while '<' in value or '>' in value:
        new_value = _strip_once(value)
        if len(new_value) >= len(value):
            # _strip_once was not able to detect more tags. Comparing
            # lengths instead of testing for equality guarantees that the
            # loop terminates even if a pass hands back a different string
            # that is not shorter than its input.
            break
        value = new_value
    return value
# NOTE(review): the ``linebreaks`` wrapper shown earlier in this file passes
# ``six.text_type`` to allow_lazy as a second argument; nothing equivalent is
# passed here -- confirm that the omission is intentional.
strip_tags = allow_lazy(strip_tags)


+11 −1
Original line number Diff line number Diff line
@@ -1985,7 +1985,7 @@ If ``value`` is ``10``, the output will be ``1.000000E+01``.
striptags
^^^^^^^^^

Strips all [X]HTML tags.
Makes all possible efforts to strip all [X]HTML tags.

For example::

@@ -1994,6 +1994,16 @@ For example::
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``, the
output will be ``"Joel is a slug"``.

.. admonition:: No safety guarantee

    Note that ``striptags`` doesn't give any guarantee about its output being
    HTML safe, particularly with invalid HTML input. So **NEVER** apply the
    ``safe`` filter to ``striptags`` output.
    If you are looking for something more robust, you can use the ``bleach``
    Python library, notably its `clean`_ method.

.. _clean: http://bleach.readthedocs.org/en/latest/clean.html

.. templatefilter:: time

time
+11 −5
Original line number Diff line number Diff line
@@ -595,17 +595,23 @@ escaping HTML.

.. function:: strip_tags(value)

    Removes anything that looks like an html tag from the string, that is
    anything contained within ``<>``.
    Tries to remove anything that looks like an HTML tag from the string, that
    is anything contained within ``<>``.
    Absolutely NO guarantee is provided about the resulting string being
    HTML safe. So NEVER mark the result of a ``strip_tags`` call as safe without
    escaping it first, for example with :func:`~django.utils.html.escape`.

    For example::

        strip_tags(value)

    If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``
    the return value will be ``"Joel is a slug"``. Note that ``strip_tags``
    result may still contain unsafe HTML content, so you might use
    :func:`~django.utils.html.escape` to make it a safe string.
    the return value will be ``"Joel is a slug"``.

    If you are looking for a more robust solution, take a look at the `bleach`_
    Python library.

    .. _bleach: https://pypi.python.org/pypi/bleach

    .. versionchanged:: 1.6

+2 −0
Original line number Diff line number Diff line
@@ -80,6 +80,8 @@ class TestUtilsHtml(TestCase):
            ('a<p a >b</p>c', 'abc'),
            ('d<a:b c:d>e</p>f', 'def'),
            ('<strong>foo</strong><a href="http://example.com">bar</a>', 'foobar'),
            ('<sc<!-- -->ript>test<<!-- -->/script>', 'test'),
            ('<script>alert()</script>&h', 'alert()&h'),
        )
        for value, output in items:
            self.check_output(f, value, output)