Commit f940e564 authored by Claude Paroz
Browse files

Fixed #20099 -- Eased subclassing of BrokenLinkEmailsMiddleware

Thanks Ram Rachum for the report and the initial patch, and Simon
Charette for the review.
parent 6de81d65
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -472,6 +472,7 @@ answer newbie questions, and generally made Django that much better:
    Jyrki Pulliainen <jyrki.pulliainen@gmail.com>
    Thejaswi Puthraya <thejaswi.puthraya@gmail.com>
    Johann Queuniet <johann.queuniet@adh.naellia.eu>
    Ram Rachum <ram@rachum.com>
    Jan Rademaker
    Michael Radziej <mir@noris.de>
    Laurent Rahuel <laurent.rahuel@gmail.com>
+15 −9
Original line number Diff line number Diff line
@@ -142,15 +142,17 @@ class BrokenLinkEmailsMiddleware(object):
            domain = request.get_host()
            path = request.get_full_path()
            referer = force_text(request.META.get('HTTP_REFERER', ''), errors='replace')
            is_internal = self.is_internal_request(domain, referer)
            is_not_search_engine = '?' not in referer
            is_ignorable = self.is_ignorable_404(path)
            if referer and (is_internal or is_not_search_engine) and not is_ignorable:

            if not self.is_ignorable_request(request, path, domain, referer):
                ua = request.META.get('HTTP_USER_AGENT', '<none>')
                ip = request.META.get('REMOTE_ADDR', '<none>')
                mail_managers(
                    "Broken %slink on %s" % (('INTERNAL ' if is_internal else ''), domain),
                    "Referrer: %s\nRequested URL: %s\nUser agent: %s\nIP address: %s\n" % (referer, path, ua, ip),
                    "Broken %slink on %s" % (
                        ('INTERNAL ' if self.is_internal_request(domain, referer) else ''),
                        domain
                    ),
                    "Referrer: %s\nRequested URL: %s\nUser agent: %s\n"
                    "IP address: %s\n" % (referer, path, ua, ip),
                    fail_silently=True)
        return response

@@ -159,10 +161,14 @@ class BrokenLinkEmailsMiddleware(object):
        Returns True if the referring URL is the same domain as the current request.
        """
        # Different subdomains are treated as different domains.
        return re.match("^https?://%s/" % re.escape(domain), referer)
        return bool(re.match("^https?://%s/" % re.escape(domain), referer))

    def is_ignorable_404(self, uri):
    def is_ignorable_request(self, request, uri, domain, referer):
        """
        Tell whether the site managers should *not* be notified about the
        given request. A True result means the request is ignored.

        ``request`` is not inspected by this implementation; subclasses
        overriding this method may use it for additional checks.
        """
        # A request with no referer is always ignored.
        if not referer:
            return True
        # An external referer containing '?' is identified as a search
        # engine source.
        if not self.is_internal_request(domain, referer) and '?' in referer:
            return True
        # Otherwise ignore only URIs matching one of the configured patterns.
        for ignorable in settings.IGNORABLE_404_URLS:
            if ignorable.search(uri):
                return True
        return False
+5 −0
Original line number Diff line number Diff line
@@ -98,6 +98,11 @@ crawlers often request::
(Note that these are regular expressions, so we put a backslash in front of
periods to escape them.)

If you'd like to customize the behavior of
:class:`django.middleware.common.BrokenLinkEmailsMiddleware` further (for
example, to ignore requests coming from web crawlers), you should subclass it
and override its methods.

.. seealso::

   404 errors are logged using the logging framework. By default, these log
+19 −0
Original line number Diff line number Diff line
@@ -326,6 +326,25 @@ class BrokenLinkEmailsMiddlewareTest(TestCase):
        BrokenLinkEmailsMiddleware().process_response(self.req, self.resp)
        self.assertEqual(len(mail.outbox), 1)

    def test_custom_request_checker(self):
        # A subclass may extend is_ignorable_request() with its own criteria;
        # here requests are additionally filtered by user agent.
        class UserAgentFilteringMiddleware(BrokenLinkEmailsMiddleware):
            ignored_user_agent_patterns = (
                re.compile(r'Spider.*'),
                re.compile(r'Robot.*'),
            )

            def is_ignorable_request(self, request, uri, domain, referer):
                """Run the inherited checks first, then match the user agent."""
                inherited = super(
                    UserAgentFilteringMiddleware, self
                ).is_ignorable_request(request, uri, domain, referer)
                if inherited:
                    return True
                agent = request.META['HTTP_USER_AGENT']
                for regex in self.ignored_user_agent_patterns:
                    if regex.search(agent):
                        return True
                return False

        self.req.META['HTTP_REFERER'] = '/another/url/'

        # The outbox accumulates across calls: the crawler-like agent sends
        # nothing, then the ordinary agent produces the first message.
        expectations = (
            ('Spider machine 3.4', 0),
            ('My user agent', 1),
        )
        for agent, expected_mails in expectations:
            self.req.META['HTTP_USER_AGENT'] = agent
            UserAgentFilteringMiddleware().process_response(self.req, self.resp)
            self.assertEqual(len(mail.outbox), expected_mails)

class ConditionalGetMiddlewareTest(TestCase):
    urls = 'middleware.cond_get_urls'