Commit badde8a7 authored by Malcolm Tredinnick
Browse files

Fixed #7793 -- Handle sitemaps with more than 50,000 URLs in them (by using
pagination). Patch from Julian Bez.

The docs patch here could probably do with some rewording.


git-svn-id: http://code.djangoproject.com/svn/django/trunk@8088 bcc190cf-cafb-0310-a4f2-bffc1f526a37
parent a26ba331
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -71,7 +71,7 @@ answer newbie questions, and generally made Django that much better:
    Esdras Beleza <linux@esdrasbeleza.com>
    Chris Bennett <chrisrbennett@yahoo.com>
    James Bennett
    Ben Godfrey <http://aftnn.org>
    Julian Bez
    Arvis Bickovskis <viestards.lists@gmail.com>
    Paul Bissex <http://e-scribe.com/>
    Simon Blanchard
@@ -166,6 +166,7 @@ answer newbie questions, and generally made Django that much better:
    glin@seznam.cz
    martin.glueck@gmail.com
    Artyom Gnilov <boobsd@gmail.com>
    Ben Godfrey <http://aftnn.org>
    GomoX <gomo@datafull.com>
    Guilherme Mesquita Gondim <semente@taurinus.org>
    Mario Gonzalez <gonzalemario@gmail.com>
+13 −3
Original line number Diff line number Diff line
from django.core import urlresolvers
from django.core import urlresolvers, paginator
import urllib

PING_URL = "http://www.google.com/webmasters/tools/ping"
@@ -34,6 +34,10 @@ def ping_google(sitemap_url=None, ping_url=PING_URL):
    urllib.urlopen("%s?%s" % (ping_url, params))

class Sitemap:
    # This limit is defined by Google. See the index documentation at
    # http://sitemaps.org/protocol.php#index.
    limit = 50000

    def __get(self, name, obj, default=None):
        try:
            attr = getattr(self, name)
@@ -49,11 +53,17 @@ class Sitemap:
    def location(self, obj):
        return obj.get_absolute_url()

    def get_urls(self):
    def _get_paginator(self):
        if not hasattr(self, "paginator"):
            self.paginator = paginator.Paginator(self.items(), self.limit)
        return self.paginator
    paginator = property(_get_paginator)

    def get_urls(self, page=1):
        from django.contrib.sites.models import Site
        current_site = Site.objects.get_current()
        urls = []
        for item in self.items():
        for item in self.paginator.page(page).object_list:
            loc = "http://%s%s" % (current_site.domain, self.__get('location', item))
            url_info = {
                'location':   loc,
+19 −5
Original line number Diff line number Diff line
@@ -3,14 +3,22 @@ from django.template import loader
from django.contrib.sites.models import Site
from django.core import urlresolvers
from django.utils.encoding import smart_str
from django.core.paginator import EmptyPage, PageNotAnInteger

def index(request, sitemaps):
    current_site = Site.objects.get_current()
    sites = []
    protocol = request.is_secure() and 'https' or 'http'
    for section in sitemaps.keys():
    for section, site in sitemaps.items():
        if callable(site):
            pages = site().paginator.num_pages
        else:
            pages = site.paginator.num_pages
        sitemap_url = urlresolvers.reverse('django.contrib.sitemaps.views.sitemap', kwargs={'section': section})
        sites.append('%s://%s%s' % (protocol, current_site.domain, sitemap_url))
        if pages > 1:
            for page in range(2, pages+1):
                sites.append('%s://%s%s?p=%s' % (protocol, current_site.domain, sitemap_url, page))
    xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites})
    return HttpResponse(xml, mimetype='application/xml')

@@ -22,10 +30,16 @@ def sitemap(request, sitemaps, section=None):
        maps.append(sitemaps[section])
    else:
        maps = sitemaps.values()
    page = request.GET.get("p", 1)
    for site in maps:
        try:
            if callable(site):
            urls.extend(site().get_urls())
                urls.extend(site().get_urls(page))
            else:
            urls.extend(site.get_urls())
                urls.extend(site.get_urls(page))
        except EmptyPage:
            raise Http404("Page %s empty" % page)
        except PageNotAnInteger:
            raise Http404("No page '%s'" % page)
    xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
    return HttpResponse(xml, mimetype='application/xml')
+4 −0
Original line number Diff line number Diff line
@@ -282,6 +282,10 @@ This will automatically generate a ``sitemap.xml`` file that references
both ``sitemap-flatpages.xml`` and ``sitemap-blog.xml``. The ``Sitemap``
classes and the ``sitemaps`` dict don't change at all.

If one of your sitemaps has more than 50,000 URLs, you should create an index
file. That sitemap will then be split into pages, and the index will list each
page.

Pinging Google
==============