Loading django/utils/html.py +26 −2 Original line number Diff line number Diff line Loading @@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy from django.utils import six from django.utils.text import normalize_newlines from .html_parser import HTMLParser # Configuration for urlize() function. TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)'] WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('<', '>')] Loading @@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+') html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z') strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE) def escape(text): Loading Loading @@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False): return '\n\n'.join(paras) linebreaks = allow_lazy(linebreaks, six.text_type) class MLStripper(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.reset() self.fed = [] def handle_data(self, d): self.fed.append(d) def handle_entityref(self, name): self.fed.append('&%s;' % name) def handle_charref(self, name): self.fed.append('&#%s;' % name) def get_data(self): return ''.join(self.fed) def strip_tags(value): """Returns the given HTML with all tags stripped.""" return strip_tags_re.sub('', force_text(value)) s = MLStripper() s.feed(value) data = s.get_data() try: res = s.close() except Exception as e: data += s.rawdata return data strip_tags = allow_lazy(strip_tags) def remove_tags(html, tags): Loading tests/utils_tests/test_html.py +6 −2 Original line number Diff line number Diff line Loading @@ -5,6 +5,7 @@ import os from django.utils import html from django.utils._os import upath from django.utils.encoding import force_text from django.utils.unittest import TestCase Loading Loading @@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase): def test_strip_tags(self): f = html.strip_tags items = ( ('<p>See: 'é is an apostrophe followed by e acute</p>', 'See: 'é is an apostrophe followed by e acute'), ('<adf>a', 'a'), ('</adf>a', 'a'), ('<asdf><asdf>e', 'e'), ('<f', '<f'), ('hi, <f x', 'hi, <f x'), ('</fe', '</fe'), ('<x>b<y>', 'b'), ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'), Loading @@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase): for filename in ('strip_tags1.html', 'strip_tags2.txt'): path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename) with open(path, 'r') as fp: content = force_text(fp.read()) start = datetime.now() stripped = html.strip_tags(fp.read()) stripped = html.strip_tags(content) elapsed = datetime.now() - start self.assertEqual(elapsed.seconds, 0) self.assertIn("Please try again.", stripped) Loading Loading
django/utils/html.py +26 −2 Original line number Diff line number Diff line Loading @@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy from django.utils import six from django.utils.text import normalize_newlines from .html_parser import HTMLParser # Configuration for urlize() function. TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)'] WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('<', '>')] Loading @@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+') html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z') strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE) def escape(text): Loading Loading @@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False): return '\n\n'.join(paras) linebreaks = allow_lazy(linebreaks, six.text_type) class MLStripper(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.reset() self.fed = [] def handle_data(self, d): self.fed.append(d) def handle_entityref(self, name): self.fed.append('&%s;' % name) def handle_charref(self, name): self.fed.append('&#%s;' % name) def get_data(self): return ''.join(self.fed) def strip_tags(value): """Returns the given HTML with all tags stripped.""" return strip_tags_re.sub('', force_text(value)) s = MLStripper() s.feed(value) data = s.get_data() try: res = s.close() except Exception as e: data += s.rawdata return data strip_tags = allow_lazy(strip_tags) def remove_tags(html, tags): Loading
tests/utils_tests/test_html.py +6 −2 Original line number Diff line number Diff line Loading @@ -5,6 +5,7 @@ import os from django.utils import html from django.utils._os import upath from django.utils.encoding import force_text from django.utils.unittest import TestCase Loading Loading @@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase): def test_strip_tags(self): f = html.strip_tags items = ( ('<p>See: 'é is an apostrophe followed by e acute</p>', 'See: 'é is an apostrophe followed by e acute'), ('<adf>a', 'a'), ('</adf>a', 'a'), ('<asdf><asdf>e', 'e'), ('<f', '<f'), ('hi, <f x', 'hi, <f x'), ('</fe', '</fe'), ('<x>b<y>', 'b'), ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'), Loading @@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase): for filename in ('strip_tags1.html', 'strip_tags2.txt'): path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename) with open(path, 'r') as fp: content = force_text(fp.read()) start = datetime.now() stripped = html.strip_tags(fp.read()) stripped = html.strip_tags(content) elapsed = datetime.now() - start self.assertEqual(elapsed.seconds, 0) self.assertIn("Please try again.", stripped) Loading