Commit 662eea11 authored by Luke Plant's avatar Luke Plant
Browse files

Fixed #16937 - added `QuerySet.prefetch_related` to prefetch many related objects.

Many thanks to akaariai for lots of review and feedback, bug finding,
additional unit tests and performance testing.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@16930 bcc190cf-cafb-0310-a4f2-bffc1f526a37
parent d30fbf8b
Loading
Loading
Loading
Loading
+24 −10
Original line number Diff line number Diff line
@@ -225,11 +225,7 @@ class ReverseGenericRelatedObjectsDescriptor(object):
            content_type = content_type,
            content_type_field_name = self.field.content_type_field_name,
            object_id_field_name = self.field.object_id_field_name,
            core_filters = {
                '%s__pk' % self.field.content_type_field_name: content_type.id,
                '%s__exact' % self.field.object_id_field_name: instance._get_pk_val(),
            }

            prefetch_cache_name = self.field.attname,
        )

        return manager
@@ -250,12 +246,12 @@ def create_generic_related_manager(superclass):
    """

    class GenericRelatedObjectManager(superclass):
        def __init__(self, model=None, core_filters=None, instance=None, symmetrical=None,
        def __init__(self, model=None, instance=None, symmetrical=None,
                     source_col_name=None, target_col_name=None, content_type=None,
                     content_type_field_name=None, object_id_field_name=None):
                     content_type_field_name=None, object_id_field_name=None,
                     prefetch_cache_name=None):

            super(GenericRelatedObjectManager, self).__init__()
            self.core_filters = core_filters
            self.model = model
            self.content_type = content_type
            self.symmetrical = symmetrical
@@ -264,12 +260,30 @@ def create_generic_related_manager(superclass):
            self.target_col_name = target_col_name
            self.content_type_field_name = content_type_field_name
            self.object_id_field_name = object_id_field_name
            self.prefetch_cache_name = prefetch_cache_name
            self.pk_val = self.instance._get_pk_val()
            self.core_filters = {
                '%s__pk' % content_type_field_name: content_type.id,
                '%s__exact' % object_id_field_name: instance._get_pk_val(),
            }

        def get_query_set(self):
            """
            Returns the QuerySet stored on the instance's prefetch cache under
            ``prefetch_cache_name`` when there is one; otherwise builds a
            QuerySet on the read database restricted by ``core_filters``.
            """
            try:
                prefetched = self.instance._prefetched_objects_cache
                return prefetched[self.prefetch_cache_name]
            except (AttributeError, KeyError):
                # Nothing has been prefetched for this instance and relation,
                # so fall back to a filtered query.
                db = self._db or router.db_for_read(self.model, instance=self.instance)
                base_qs = super(GenericRelatedObjectManager, self).get_query_set()
                return base_qs.using(db).filter(**self.core_filters)

        def get_prefetch_query_set(self, instances):
            """
            Builds the bulk lookup used by prefetch_related for ``instances``.

            Returns a tuple of:
            (queryset of related objects for all of the passed in instances,
             attr of returned objects needed for matching,
             attr of passed in instances needed for matching)
            """
            db = self._db or router.db_for_read(self.model)
            content_type_lookup = '%s__pk' % self.content_type_field_name
            content_type_id = self.content_type.id
            object_id_lookup = '%s__in' % self.object_id_field_name
            instance_pks = [obj._get_pk_val() for obj in instances]
            query = {
                content_type_lookup: content_type_id,
                object_id_lookup: instance_pks,
            }
            base_qs = super(GenericRelatedObjectManager, self).get_query_set()
            return (base_qs.using(db).filter(**query), self.object_id_field_name, 'pk')

        def add(self, *objs):
            for obj in objs:
                if not isinstance(obj, self.model):
+60 −9
Original line number Diff line number Diff line
@@ -432,8 +432,22 @@ class ForeignRelatedObjectsDescriptor(object):
                self.model = rel_model

            def get_query_set(self):
                try:
                    return self.instance._prefetched_objects_cache[rel_field.related_query_name()]
                except (AttributeError, KeyError):
                    db = self._db or router.db_for_read(self.model, instance=self.instance)
                return super(RelatedManager, self).get_query_set().using(db).filter(**(self.core_filters))
                    return super(RelatedManager, self).get_query_set().using(db).filter(**self.core_filters)

            def get_prefetch_query_set(self, instances):
                """
                Build the single bulk query that prefetch_related needs in
                order to fetch the related objects of every object in
                ``instances``.

                Returns a tuple of (queryset, attribute name on the returned
                objects used for matching, attribute name on the passed in
                instances used for matching).
                """
                db = self._db or router.db_for_read(self.model)
                lookup = '%s__%s__in' % (rel_field.name, attname)
                values = [getattr(obj, attname) for obj in instances]
                base_qs = super(RelatedManager, self).get_query_set().using(db)
                qs = base_qs.filter(**{lookup: values})
                return (qs, rel_field.get_attname(), attname)

            def add(self, *objs):
                for obj in objs:
@@ -482,25 +496,60 @@ def create_many_related_manager(superclass, rel):
    """Creates a manager that subclasses 'superclass' (which is a Manager)
    and adds behavior for many-to-many related objects."""
    class ManyRelatedManager(superclass):
        def __init__(self, model=None, core_filters=None, instance=None, symmetrical=None,
        def __init__(self, model=None, query_field_name=None, instance=None, symmetrical=None,
                     source_field_name=None, target_field_name=None, reverse=False,
                     through=None):
                     through=None, prefetch_cache_name=None):
            super(ManyRelatedManager, self).__init__()
            self.model = model
            self.core_filters = core_filters
            self.query_field_name = query_field_name
            self.core_filters = {'%s__pk' % query_field_name: instance._get_pk_val()}
            self.instance = instance
            self.symmetrical = symmetrical
            self.source_field_name = source_field_name
            self.target_field_name = target_field_name
            self.reverse = reverse
            self.through = through
            self.prefetch_cache_name = prefetch_cache_name
            self._pk_val = self.instance.pk
            if self._pk_val is None:
                raise ValueError("%r instance needs to have a primary key value before a many-to-many relationship can be used." % instance.__class__.__name__)

        def get_query_set(self):
            try:
                return self.instance._prefetched_objects_cache[self.prefetch_cache_name]
            except (AttributeError, KeyError):
                db = self._db or router.db_for_read(self.instance.__class__, instance=self.instance)
            return super(ManyRelatedManager, self).get_query_set().using(db)._next_is_sticky().filter(**(self.core_filters))
                return super(ManyRelatedManager, self).get_query_set().using(db)._next_is_sticky().filter(**self.core_filters)

        def get_prefetch_query_set(self, instances):
            """
            Builds the bulk query used by prefetch_related for ``instances``.

            The return value is a 3-tuple:
            (queryset of self.model instances related to the passed in instances,
             name of the attr on the returned instances used for matching,
             name of the attr on the passed in instances used for matching)
            """
            from django.db import connections
            db = self._db or router.db_for_read(self.model)
            pk_lookup = '%s__pk__in' % self.query_field_name
            instance_pks = [obj._get_pk_val() for obj in instances]
            base_qs = super(ManyRelatedManager, self).get_query_set().using(db)
            qs = base_qs._next_is_sticky().filter(**{pk_lookup: instance_pks})

            # M2M: the query is annotated with the join table column that
            # points back at the primary model, so each returned object says
            # which primary object it belongs to. The join on the join table
            # is already there, so only the extra select has to be added.

            # The column is looked up on the 'through' model rather than
            # assumed, because with non-autocreated 'through' models we can't
            # assume we are dealing with PK values.
            through_opts = self.through._meta
            fk = through_opts.get_field(self.source_field_name)
            qn = connections[db].ops.quote_name
            source_col_sql = '%s.%s' % (qn(through_opts.db_table), qn(fk.column))
            qs = qs.extra(select={'_prefetch_related_val': source_col_sql})
            return (qs, '_prefetch_related_val', fk.rel.get_related_field().get_attname())

        # If the ManyToMany relation has an intermediary model,
        # the add and remove methods do not exist.
@@ -683,7 +732,8 @@ class ManyRelatedObjectsDescriptor(object):

        manager = self.related_manager_cls(
            model=rel_model,
            core_filters={'%s__pk' % self.related.field.name: instance._get_pk_val()},
            query_field_name=self.related.field.name,
            prefetch_cache_name=self.related.field.related_query_name(),
            instance=instance,
            symmetrical=False,
            source_field_name=self.related.field.m2m_reverse_field_name(),
@@ -739,7 +789,8 @@ class ReverseManyRelatedObjectsDescriptor(object):

        manager = self.related_manager_cls(
            model=self.field.rel.to,
            core_filters={'%s__pk' % self.field.related_query_name(): instance._get_pk_val()},
            query_field_name=self.field.related_query_name(),
            prefetch_cache_name=self.field.name,
            instance=instance,
            symmetrical=self.field.rel.symmetrical,
            source_field_name=self.field.m2m_field_name(),
+3 −0
Original line number Diff line number Diff line
@@ -172,6 +172,9 @@ class Manager(object):
    def select_related(self, *args, **kwargs):
        return self.get_query_set().select_related(*args, **kwargs)

    def prefetch_related(self, *args, **kwargs):
        """
        Proxies to ``prefetch_related()`` on this manager's QuerySet.
        """
        queryset = self.get_query_set()
        return queryset.prefetch_related(*args, **kwargs)

    def values(self, *args, **kwargs):
        return self.get_query_set().values(*args, **kwargs)

+179 −0
Original line number Diff line number Diff line
@@ -36,6 +36,8 @@ class QuerySet(object):
        self._iter = None
        self._sticky_filter = False
        self._for_write = False
        self._prefetch_related_lookups = []
        self._prefetch_done = False

    ########################
    # PYTHON MAGIC METHODS #
@@ -81,9 +83,17 @@ class QuerySet(object):
                self._result_cache = list(self.iterator())
        elif self._iter:
            self._result_cache.extend(self._iter)
        if self._prefetch_related_lookups and not self._prefetch_done:
            self._prefetch_related_objects()
        return len(self._result_cache)

    def __iter__(self):
        if self._prefetch_related_lookups and not self._prefetch_done:
            # We need all the results in order to be able to do the prefetch
            # in one go. To minimize code duplication, we use the __len__
            # code path which also forces this, and also does the prefetch
            len(self)

        if self._result_cache is None:
            self._iter = self.iterator()
            self._result_cache = []
@@ -106,6 +116,12 @@ class QuerySet(object):
                self._fill_cache()

    def __nonzero__(self):
        if self._prefetch_related_lookups and not self._prefetch_done:
            # We need all the results in order to be able to do the prefetch
            # in one go. To minimize code duplication, we use the __len__
            # code path which also forces this, and also does the prefetch
            len(self)

        if self._result_cache is not None:
            return bool(self._result_cache)
        try:
@@ -527,6 +543,11 @@ class QuerySet(object):
            return self.query.has_results(using=self.db)
        return bool(self._result_cache)

    def _prefetch_related_objects(self):
        """
        Runs the requested prefetch_related lookups against the result cache
        and records that the prefetch has been done.

        The result cache must already have been filled when this is called.
        """
        results, lookups = self._result_cache, self._prefetch_related_lookups
        prefetch_related_objects(results, lookups)
        self._prefetch_done = True

    ##################################################
    # PUBLIC METHODS THAT RETURN A QUERYSET SUBCLASS #
    ##################################################
@@ -650,6 +671,23 @@ class QuerySet(object):
            obj.query.max_depth = depth
        return obj

    def prefetch_related(self, *lookups):
        """
        Returns a new QuerySet instance that will prefetch the specified
        Many-To-One and Many-To-Many related objects when the QuerySet is
        evaluated.

        Calling prefetch_related() more than once appends to the list of
        lookups to prefetch. Calling prefetch_related(None) clears the list.
        """
        new_qs = self._clone()
        if lookups == (None,):
            # A lone None means "forget every lookup requested so far".
            new_qs._prefetch_related_lookups = []
            return new_qs
        new_qs._prefetch_related_lookups.extend(lookups)
        return new_qs

    def dup_select_related(self, other):
        """
        Copies the related selection status from the QuerySet 'other' to the
@@ -799,6 +837,7 @@ class QuerySet(object):
            query.filter_is_sticky = True
        c = klass(model=self.model, query=query, using=self._db)
        c._for_write = self._for_write
        c._prefetch_related_lookups = self._prefetch_related_lookups[:]
        c.__dict__.update(kwargs)
        if setup and hasattr(c, '_setup_query'):
            c._setup_query()
@@ -864,6 +903,7 @@ class QuerySet(object):
    # empty" result.
    value_annotation = True


class ValuesQuerySet(QuerySet):
    def __init__(self, *args, **kwargs):
        super(ValuesQuerySet, self).__init__(*args, **kwargs)
@@ -993,6 +1033,7 @@ class ValuesQuerySet(QuerySet):
                    % self.__class__.__name__)
        return self


class ValuesListQuerySet(ValuesQuerySet):
    def iterator(self):
        if self.flat and len(self._fields) == 1:
@@ -1502,6 +1543,7 @@ class RawQuerySet(object):
                self._model_fields[converter(column)] = field
        return self._model_fields


def insert_query(model, objs, fields, return_id=False, raw=False, using=None):
    """
    Inserts a new record for the given model. This provides an interface to
@@ -1511,3 +1553,140 @@ def insert_query(model, objs, fields, return_id=False, raw=False, using=None):
    query = sql.InsertQuery(model)
    query.insert_values(fields, objs, raw=raw)
    return query.get_compiler(using=using).execute_sql(return_id)


def prefetch_related_objects(result_cache, related_lookups):
    """
    Helper function for prefetch_related functionality.

    Populates the prefetched objects caches for a list of results
    from a QuerySet.

    ``result_cache`` is the list of objects to decorate. ``related_lookups``
    is an iterable of lookup strings such as 'foo__bar__baz'; it is copied
    into a local list, so the caller's sequence is not modified.

    Raises AttributeError if a component of a lookup cannot be found on the
    first object at that level, and ValueError if the last component of a
    lookup resolves to something without a ``get_prefetch_query_set``
    attribute.
    """
    from django.db.models.sql.constants import LOOKUP_SEP

    if len(result_cache) == 0:
        return # nothing to do

    # Only used for the ValueError message below.
    model = result_cache[0].__class__

    # We need to be able to dynamically add to the list of prefetch_related
    # lookups that we look up (see below).  So we need some book keeping to
    # ensure we don't do duplicate work.
    done_lookups = set() # set of lookups like foo__bar__baz
    done_queries = {}    # dictionary of things like 'foo__bar': [results]
    related_lookups = list(related_lookups)

    # We may expand related_lookups, so need a loop that allows for that.
    # Items appended to the list during the loop are visited too.
    for lookup in related_lookups:
        if lookup in done_lookups:
            # We've done exactly this already, skip the whole thing
            continue
        done_lookups.add(lookup)

        # Top level, the list of objects to decorate is the result cache
        # from the primary QuerySet. It won't be for deeper levels.
        obj_list = result_cache

        attrs = lookup.split(LOOKUP_SEP)
        for level, attr in enumerate(attrs):
            # Prepare main instances
            if len(obj_list) == 0:
                break

            good_objects = True
            for obj in obj_list:
                if not hasattr(obj, '_prefetched_objects_cache'):
                    try:
                        obj._prefetched_objects_cache = {}
                    except AttributeError:
                        # Must be in a QuerySet subclass that is not returning
                        # Model instances, either in Django or 3rd
                        # party. prefetch_related() doesn't make sense, so quit
                        # now.
                        good_objects = False
                        break
                else:
                    # We already did this list: the first object that already
                    # has a cache is taken to mean that the rest do too.
                    break
            if not good_objects:
                break

            # Descend down tree. Only the first object is inspected to find
            # out what kind of attribute this level is.
            try:
                rel_obj = getattr(obj_list[0], attr)
            except AttributeError:
                raise AttributeError("Cannot find '%s' on %s object, '%s' is an invalid "
                                     "parameter to prefetch_related()" %
                                     (attr, obj_list[0].__class__.__name__, lookup))

            can_prefetch = hasattr(rel_obj, 'get_prefetch_query_set')
            if level == len(attrs) - 1 and not can_prefetch:
                # Last one, this *must* resolve to a related manager.
                raise ValueError("'%s' does not resolve to a supported 'many related"
                                 " manager' for model %s - this is an invalid"
                                 " parameter to prefetch_related()."
                                 % (lookup, model.__name__))

            if can_prefetch:
                # Check we didn't do this already
                current_lookup = LOOKUP_SEP.join(attrs[0:level+1])
                if current_lookup in done_queries:
                    obj_list = done_queries[current_lookup]
                else:
                    relmanager = rel_obj
                    obj_list, additional_prl = prefetch_one_level(obj_list, relmanager, attr)
                    # Lookups reported back by prefetch_one_level are
                    # relative to this level, so prefix them with the path
                    # so far and queue them for the outer loop.
                    for f in additional_prl:
                        new_prl = LOOKUP_SEP.join([current_lookup, f])
                        related_lookups.append(new_prl)
                    done_queries[current_lookup] = obj_list
            else:
                # Assume we've got some singly related object. We replace
                # the current list of parent objects with that list.
                obj_list = [getattr(obj, attr) for obj in obj_list]

                # Filter out 'None' so that we can continue with nullable
                # relations.
                obj_list = [obj for obj in obj_list if obj is not None]


def prefetch_one_level(instances, relmanager, attname):
    """
    Helper function for prefetch_related_objects

    Does the prefetch for every object in ``instances`` through the manager
    ``relmanager``, and stores a QuerySet with a pre-filled result cache in
    each instance's ``_prefetched_objects_cache`` under ``attname``.

    Returns a tuple of (list of all the prefetched objects, list of extra
    prefetch_related lookups that were found on the QuerySet supplied by
    the manager and still have to be done).
    """
    prefetch_info = relmanager.get_prefetch_query_set(instances)
    rel_qs, rel_obj_attr, instance_attr = prefetch_info

    # The manager may itself have put prefetch_related lookups on the
    # QuerySet it handed back. Evaluating the QuerySet must not trigger
    # those; they are handed back to the caller to be merged into its own
    # work instead.
    additional_prl = getattr(rel_qs, '_prefetch_related_lookups', [])
    if additional_prl:
        # The manager should have given us a fresh instance, so there is no
        # need to clone: the internal attribute is reset directly, which is
        # cheaper than going through the public interface.
        rel_qs._prefetch_related_lookups = []

    all_related_objects = list(rel_qs)

    # Group the related objects by the value that ties them to an instance.
    grouped = {}
    for related in all_related_objects:
        grouped.setdefault(getattr(related, rel_obj_attr), []).append(related)

    for instance in instances:
        instance_qs = getattr(instance, attname).all()
        match_val = getattr(instance, instance_attr)
        instance_qs._result_cache = grouped.get(match_val, [])
        # This QuerySet must not run prefetch_related itself: its lookups
        # have been merged into the work the caller is doing.
        instance_qs._prefetch_done = True
        instance._prefetched_objects_cache[attname] = instance_qs
    return all_related_objects, additional_prl
+101 −2
Original line number Diff line number Diff line
@@ -571,8 +571,6 @@ can be useful in situations where you might want to pass in either a model
manager or a ``QuerySet`` and do further filtering on the result. After calling
``all()`` on either object, you'll definitely have a ``QuerySet`` to work with.

.. _select-related:

select_related
~~~~~~~~~~~~~~

@@ -690,6 +688,107 @@ is defined. Instead of specifying the field name, use the :attr:`related_name
A :class:`~django.db.models.OneToOneField` is not traversed in the reverse
direction if you are performing a depth-based ``select_related()`` call.

prefetch_related
~~~~~~~~~~~~~~~~

.. method:: prefetch_related(*lookups)

.. versionadded:: 1.4

Returns a ``QuerySet`` that will automatically retrieve, in a single batch,
related many-to-many and many-to-one objects for each of the specified lookups.

This is similar to ``select_related`` for the 'many related objects' case, but
note that ``prefetch_related`` causes a separate query to be issued for each set
of related objects that you request, unlike ``select_related`` which modifies
the original query with joins in order to get the related objects. With
``prefetch_related``, the additional queries are done as soon as the QuerySet
begins to be evaluated.

For example, suppose you have these models::

    class Topping(models.Model):
        name = models.CharField(max_length=30)

    class Pizza(models.Model):
        name = models.CharField(max_length=50)
        toppings = models.ManyToManyField(Topping)

        def __unicode__(self):
            return u"%s (%s)" % (self.name, u", ".join([topping.name
                                                        for topping in self.toppings.all()]))

and run this code::

    >>> Pizza.objects.all()
    [u"Hawaiian (ham, pineapple)", u"Seafood (prawns, smoked salmon)"...

The problem with this code is that it will run a query on the Toppings table for
**every** item in the Pizza ``QuerySet``.  Using ``prefetch_related``, this can
be reduced to just two queries:

    >>> Pizza.objects.all().prefetch_related('toppings')

All the relevant toppings will be fetched in a single query, and used to make
``QuerySets`` that have a pre-filled cache of the relevant results. These
``QuerySets`` are then used in the ``self.toppings.all()`` calls.

Please note that use of ``prefetch_related`` will mean that the additional
queries run will **always** be executed - even if you never use the related
objects - and it always fully populates the result cache on the primary
``QuerySet`` (which can sometimes be avoided in other cases).

Also remember that, as always with QuerySets, any subsequent chained methods
will ignore previously cached results, and retrieve data using a fresh database
query. So, if you write the following:

    >>> pizzas = Pizza.objects.prefetch_related('toppings')
    >>> [list(pizza.toppings.filter(spicy=True)) for pizza in pizzas]

...then the fact that ``pizza.toppings.all()`` has been prefetched will not help
you - in fact it hurts performance, since you have done a database query that
you haven't used. So use this feature with caution!

The lookups that must be supplied to this method can be any attributes on the
model instances which represent related queries that return multiple
objects. This includes attributes representing the 'many' side of ``ForeignKey``
relationships, forward and reverse ``ManyToManyField`` attributes, and also any
``GenericRelations``.

You can also use the normal join syntax to do related fields of related
fields. Suppose we have an additional model to the example above::

    class Restaurant(models.Model):
        pizzas = models.ManyToManyField(Pizza, related_name='restaurants')
        best_pizza = models.ForeignKey(Pizza, related_name='championed_by')

The following are all legal:

    >>> Restaurant.objects.prefetch_related('pizzas__toppings')

This will prefetch all pizzas belonging to restaurants, and all toppings
belonging to those pizzas. This will result in a total of 3 database queries -
one for the restaurants, one for the pizzas, and one for the toppings.

    >>> Restaurant.objects.select_related('best_pizza').prefetch_related('best_pizza__toppings')

This will fetch the best pizza and all the toppings for the best pizza for each
restaurant. This will be done in 2 database queries - one for the restaurants
and 'best pizzas' combined (achieved through use of ``select_related``), and one
for the toppings.

Chaining ``prefetch_related`` calls will accumulate the fields that should have
this behavior applied. To clear any ``prefetch_related`` behavior, pass ``None``
as a parameter::

   >>> non_prefetched = qs.prefetch_related(None)

One difference when using ``prefetch_related`` is that, in some circumstances,
objects created by a query can be shared between the different objects that they
are related to, i.e. a single Python model instance can appear at more than one
point in the tree of objects that are returned. Normally this behavior will not
be a problem, and will in fact save both memory and CPU time.

extra
~~~~~

Loading