Commit 29050ef9 authored by Malcolm Tredinnick
Browse files

Fixed #5420 -- Added support for delayed loading of model fields.

In extreme cases, some fields are expensive to load from the database
(e.g. GIS fields requiring conversion, or large text fields). This
commit adds defer() and only() methods to querysets that allow the
caller to specify which fields should not be loaded unless they are
accessed.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@10090 bcc190cf-cafb-0310-a4f2-bffc1f526a37
parent 96d5d434
Loading
Loading
Loading
Loading
+44 −1
Original line number Diff line number Diff line
@@ -12,7 +12,8 @@ import django.db.models.manager # Imported to register signal handler.
from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned, FieldError
from django.db.models.fields import AutoField, FieldDoesNotExist
from django.db.models.fields.related import OneToOneRel, ManyToOneRel, OneToOneField
from django.db.models.query import delete_objects, Q, CollectedObjects
from django.db.models.query import delete_objects, Q
from django.db.models.query_utils import CollectedObjects, DeferredAttribute
from django.db.models.options import Options
from django.db import connection, transaction, DatabaseError
from django.db.models import signals
@@ -235,6 +236,7 @@ class ModelBase(type):

class Model(object):
    __metaclass__ = ModelBase
    _deferred = False

    def __init__(self, *args, **kwargs):
        signals.pre_init.send(sender=self.__class__, args=args, kwargs=kwargs)
@@ -271,6 +273,13 @@ class Model(object):
        for field in fields_iter:
            is_related_object = False
            if kwargs:
                # This slightly odd construct is so that we can access any
                # data-descriptor object (DeferredAttribute) without triggering
                # its __get__ method.
                if (field.attname not in kwargs and
                        isinstance(self.__class__.__dict__.get(field.attname), DeferredAttribute)):
                    # This field will be populated on request.
                    continue
                if isinstance(field.rel, ManyToOneRel):
                    try:
                        # Assume object instance was passed in.
@@ -332,6 +341,31 @@ class Model(object):
    def __hash__(self):
        """Return the hash of the value produced by ``_get_pk_val()``."""
        pk_val = self._get_pk_val()
        return hash(pk_val)

    def __reduce__(self):
        """
        Pickling hook.

        Instances of ordinary model classes are handed to the default
        ``__reduce__`` implementation. Instances whose class has deferred
        fields belong to a dynamically created class, and only module-level
        classes can be pickled by the default path, so for those we return a
        reconstruction recipe: ``model_unpickle`` called with the model, the
        primary key value and the list of deferred attribute names, plus the
        instance ``__dict__`` as state.
        """
        if not self._deferred:
            return super(Model, self).__reduce__()
        state = self.__dict__
        class_attrs = self.__class__.__dict__
        deferred_names = []
        pk_val = None
        for field in self._meta.fields:
            # Go through the class __dict__ so that the descriptor object
            # itself is obtained rather than the result of its __get__.
            descriptor = class_attrs.get(field.attname)
            if not isinstance(descriptor, DeferredAttribute):
                continue
            deferred_names.append(field.attname)
            if pk_val is None:
                # Every DeferredAttribute on this class carries the same
                # pk_value and model, so reading them once is enough.
                pk_val = descriptor.pk_value
                model = descriptor.model_ref()
        return (model_unpickle, (model, pk_val, deferred_names), state)

    def _get_pk_val(self, meta=None):
        if not meta:
            meta = self._meta
@@ -591,6 +625,15 @@ def get_absolute_url(opts, func, self, *args, **kwargs):
class Empty(object):
    """A bare class that defines no attributes or behaviour of its own."""

def model_unpickle(model, pk_val, attrs):
    """
    Used to unpickle Model subclasses with deferred fields.

    Rebuilds the deferred class for ``model`` via ``deferred_class_factory``
    (passing ``pk_val`` and the deferred attribute names in ``attrs``) and
    returns a new, uninitialised instance of it; ``__init__`` is not called.
    """
    # Imported here rather than at module level, as in the original code.
    from django.db.models.query_utils import deferred_class_factory
    cls = deferred_class_factory(model, pk_val, attrs)
    return cls.__new__(cls)
# The attribute name the pickle machinery looks for is
# "__safe_for_unpickling__" (the previous spelling, "__safe_for_unpickle__",
# was never consulted).
model_unpickle.__safe_for_unpickling__ = True

if sys.version_info < (2, 5):
    # Prior to Python 2.5, Exception was an old-style class
    def subclass_exception(name, parent, unused):
+6 −0
Original line number Diff line number Diff line
@@ -167,6 +167,12 @@ class Manager(object):
    def reverse(self, *args, **kwargs):
        """Call ``reverse()`` on the queryset from ``get_query_set()``."""
        queryset = self.get_query_set()
        return queryset.reverse(*args, **kwargs)

    def defer(self, *args, **kwargs):
        """Call ``defer()`` on the queryset from ``get_query_set()``."""
        queryset = self.get_query_set()
        return queryset.defer(*args, **kwargs)

    def only(self, *args, **kwargs):
        """Call ``only()`` on the queryset from ``get_query_set()``."""
        queryset = self.get_query_set()
        return queryset.only(*args, **kwargs)

    def _insert(self, values, **kwargs):
        """
        Hand ``values`` and any keyword options to ``insert_query`` for this
        manager's model and return its result.
        """
        model = self.model
        return insert_query(model, values, **kwargs)

+6 −0
Original line number Diff line number Diff line
@@ -477,3 +477,9 @@ class Options(object):
            self._ordered_objects = objects
        return self._ordered_objects

    def pk_index(self):
        """
        Returns the position of the primary key field (self.pk) within the
        self.fields list.
        """
        field_list = self.fields
        return field_list.index(self.pk)
+87 −106
Original line number Diff line number Diff line
"""
The main QuerySet implementation. This provides the public API for the ORM.
"""

try:
    set
except NameError:
@@ -6,9 +10,8 @@ except NameError:
from django.db import connection, transaction, IntegrityError
from django.db.models.aggregates import Aggregate
from django.db.models.fields import DateField
from django.db.models.query_utils import Q, select_related_descend
from django.db.models.query_utils import Q, select_related_descend, CollectedObjects, CyclicDependency, deferred_class_factory
from django.db.models import signals, sql
from django.utils.datastructures import SortedDict


# Used to control how many objects are worked with at once in some cases (e.g.
@@ -22,102 +25,6 @@ REPR_OUTPUT_SIZE = 20
# Pull into this namespace for backwards compatibility.
EmptyResultSet = sql.EmptyResultSet


class CyclicDependency(Exception):
    """
    Signals that a collection of objects being processed (for instance when
    deleting multiple objects) contains a cyclic dependency.
    """


class CollectedObjects(object):
    """
    A container mapping each model to its collected objects (keyed by primary
    key), which also records, per parent model, the models reached through
    it.

    The deletion routines use this to work out the 'leaf' objects, which
    should be deleted first.
    """

    def __init__(self):
        # model -> SortedDict of {pk: obj}
        self.data = {}
        # parent model -> list of models reached through it
        self.children = {}

    def add(self, model, pk, obj, parent_model, nullable=False):
        """
        Stores an object in the container.

        Arguments:
        * model - the class of the object being added.
        * pk - the primary key.
        * obj - the object itself.
        * parent_model - the model of the parent object that this object was
          reached through.
        * nullable - should be True if this relation is nullable.

        Returns True if an entry for (model, pk) was already present and
        False otherwise.
        """
        instances = self.data.setdefault(model, SortedDict())
        already_present = pk in instances
        instances[pk] = obj
        if parent_model is None or nullable:
            # Nullable relationships are nulled out before deleting, so they
            # have no influence on the deletion order and are not recorded.
            return already_present
        self.children.setdefault(parent_model, []).append(model)
        return already_present

    def __contains__(self, key):
        return key in self.data

    def __getitem__(self, key):
        return self.data[key]

    def __nonzero__(self):
        if self.data:
            return True
        return False

    def iteritems(self):
        for model in self.ordered_keys():
            yield model, self[model]

    def items(self):
        return [pair for pair in self.iteritems()]

    def keys(self):
        return self.ordered_keys()

    def ordered_keys(self):
        """
        Returns the models in the order in which they should be processed,
        i.e. models with no outstanding dependencies come first.

        Raises CyclicDependency if a full pass over the models makes no
        progress.
        """
        resolved = SortedDict()
        # Begin with the models that have no children at all.
        pending = self.data.keys()
        total = len(pending)
        while len(resolved) < total:
            progressed = False
            for candidate in pending:
                if candidate in resolved:
                    continue
                blockers = [child
                        for child in self.children.setdefault(candidate, [])
                        if child not in resolved]
                if not blockers:
                    resolved[candidate] = None
                    progressed = True
            if not progressed:
                raise CyclicDependency(
                    "There is a cyclic dependency of items to be processed.")

        return resolved.keys()

    def unordered_keys(self):
        """
        Fallback for the case where there is a cyclic dependency but we don't
        care about it.
        """
        return self.data.keys()


class QuerySet(object):
    """
    Represents a lazy database lookup for a set of objects.
@@ -275,6 +182,11 @@ class QuerySet(object):
        extra_select = self.query.extra_select.keys()
        aggregate_select = self.query.aggregate_select.keys()

        only_load = self.query.get_loaded_field_names()
        if not fill_cache:
            fields = self.model._meta.fields
            pk_idx = self.model._meta.pk_index()

        index_start = len(extra_select)
        aggregate_start = index_start + len(self.model._meta.fields)

@@ -282,9 +194,30 @@ class QuerySet(object):
            if fill_cache:
                obj, _ = get_cached_row(self.model, row,
                            index_start, max_depth,
                            requested=requested, offset=len(aggregate_select))
                            requested=requested, offset=len(aggregate_select),
                            only_load=only_load)
            else:
                load_fields = only_load.get(self.model)
                if load_fields:
                    # Some fields have been deferred, so we have to initialise
                    # via keyword arguments.
                    row_data = row[index_start:aggregate_start]
                    pk_val = row_data[pk_idx]
                    skip = set()
                    init_list = []
                    for field in fields:
                        if field.name not in load_fields:
                            skip.add(field.attname)
                        else:
                            init_list.append(field.attname)
                    if skip:
                        model_cls = deferred_class_factory(self.model, pk_val,
                                skip)
                        obj = model_cls(**dict(zip(init_list, row_data)))
                    else:
                # omit aggregates in object creation
                        obj = self.model(*row[index_start:aggregate_start])
                else:
                    # Omit aggregates in object creation.
                    obj = self.model(*row[index_start:aggregate_start])

            for i, k in enumerate(extra_select):
@@ -655,6 +588,35 @@ class QuerySet(object):
        clone.query.standard_ordering = not clone.query.standard_ordering
        return clone

    def defer(self, *fields):
        """
        Returns a clone of this queryset on which loading of the given fields
        is postponed until they are accessed. The names are added to whatever
        is already deferred. As a special case, passing None as the sole
        argument clears all deferrals (None acts as a reset option).
        """
        clone = self._clone()
        query = clone.query
        if fields != (None,):
            query.add_deferred_loading(fields)
        else:
            query.clear_deferred_loading()
        return clone

    def only(self, *fields):
        """
        Essentially, the opposite of defer. Only the fields passed into this
        method and that are not already specified as deferred are loaded
        immediately when the queryset is evaluated.

        Returns a clone of this queryset. Raises TypeError if None is passed
        as the only argument.
        """
        # "fields" is always a tuple (it is collected by *fields), so it has
        # to be compared against a tuple; comparing against a list would
        # never match.
        if fields == (None,):
            # Can only pass None to defer(), not only(), as the reset option.
            # That won't stop people trying to do this, so let's be explicit.
            raise TypeError("Cannot pass None as an argument to only().")
        clone = self._clone()
        clone.query.add_immediate_loading(fields)
        return clone

    ###################
    # PRIVATE METHODS #
    ###################
@@ -757,6 +719,7 @@ class ValuesQuerySet(QuerySet):
        Called by the _clone() method after initializing the rest of the
        instance.
        """
        self.query.clear_deferred_loading()
        self.query.clear_select_fields()

        if self._fields:
@@ -847,9 +810,9 @@ class ValuesListQuerySet(ValuesQuerySet):
            for row in self.query.results_iter():
                yield tuple(row)
        else:
            # When extra(select=...) or an annotation is involved, the extra cols are
            # always at the start of the row, and we need to reorder the fields
            # to match the order in self._fields.
            # When extra(select=...) or an annotation is involved, the extra
            # cols are always at the start of the row, and we need to reorder
            # the fields to match the order in self._fields.
            extra_names = self.query.extra_select.keys()
            field_names = self.field_names
            aggregate_names = self.query.aggregate_select.keys()
@@ -884,6 +847,7 @@ class DateQuerySet(QuerySet):
        Called by the _clone() method after initializing the rest of the
        instance.
        """
        self.query.clear_deferred_loading()
        self.query = self.query.clone(klass=sql.DateQuery, setup=True)
        self.query.select = []
        field = self.model._meta.get_field(self._field_name, many_to_many=False)
@@ -935,7 +899,7 @@ class EmptyQuerySet(QuerySet):


def get_cached_row(klass, row, index_start, max_depth=0, cur_depth=0,
                   requested=None, offset=0):
                   requested=None, offset=0, only_load=None):
    """
    Helper function that recursively returns an object with the specified
    related attributes already populated.
@@ -950,6 +914,23 @@ def get_cached_row(klass, row, index_start, max_depth=0, cur_depth=0,
    if not [x for x in fields if x is not None]:
        # If we only have a list of Nones, there was not related object.
        obj = None
    else:
        load_fields = only_load and only_load.get(klass) or None
        if load_fields:
            # Handle deferred fields.
            skip = set()
            init_list = []
            pk_val = fields[klass._meta.pk_index()]
            for field in klass._meta.fields:
                if field.name not in load_fields:
                    skip.add(field.name)
                else:
                    init_list.append(field.attname)
            if skip:
                klass = deferred_class_factory(klass, pk_val, skip)
                obj = klass(**dict(zip(init_list, fields)))
            else:
                obj = klass(*fields)
        else:
            obj = klass(*fields)
    index_end += offset
+169 −2
Original line number Diff line number Diff line
"""
Various data structures used in query construction.

Factored out from django.db.models.query so that they can also be used by other
modules without getting into circular import difficulties.
Factored out from django.db.models.query to avoid making the main module very
large and/or so that they can be used by other modules without getting into
circular import difficulties.
"""

import weakref
from copy import deepcopy

from django.utils import tree
from django.utils.datastructures import SortedDict

try:
    sorted
except NameError:
    from django.utils.itercompat import sorted  # For Python 2.3.


class CyclicDependency(Exception):
    """
    Signals that a collection of objects being processed (for instance when
    deleting multiple objects) contains a cyclic dependency.
    """

class CollectedObjects(object):
    """
    A container mapping each model to its collected objects (keyed by primary
    key), which also records, per parent model, the models reached through
    it.

    The deletion routines use this to work out the 'leaf' objects, which
    should be deleted first.
    """

    def __init__(self):
        # model -> SortedDict of {pk: obj}
        self.data = {}
        # parent model -> list of models reached through it
        self.children = {}

    def add(self, model, pk, obj, parent_model, nullable=False):
        """
        Stores an object in the container.

        Arguments:
        * model - the class of the object being added.
        * pk - the primary key.
        * obj - the object itself.
        * parent_model - the model of the parent object that this object was
          reached through.
        * nullable - should be True if this relation is nullable.

        Returns True if an entry for (model, pk) was already present and
        False otherwise.
        """
        instances = self.data.setdefault(model, SortedDict())
        already_present = pk in instances
        instances[pk] = obj
        if parent_model is None or nullable:
            # Nullable relationships are nulled out before deleting, so they
            # have no influence on the deletion order and are not recorded.
            return already_present
        self.children.setdefault(parent_model, []).append(model)
        return already_present

    def __contains__(self, key):
        return key in self.data

    def __getitem__(self, key):
        return self.data[key]

    def __nonzero__(self):
        if self.data:
            return True
        return False

    def iteritems(self):
        for model in self.ordered_keys():
            yield model, self[model]

    def items(self):
        return [pair for pair in self.iteritems()]

    def keys(self):
        return self.ordered_keys()

    def ordered_keys(self):
        """
        Returns the models in the order in which they should be processed,
        i.e. models with no outstanding dependencies come first.

        Raises CyclicDependency if a full pass over the models makes no
        progress.
        """
        resolved = SortedDict()
        # Begin with the models that have no children at all.
        pending = self.data.keys()
        total = len(pending)
        while len(resolved) < total:
            progressed = False
            for candidate in pending:
                if candidate in resolved:
                    continue
                blockers = [child
                        for child in self.children.setdefault(candidate, [])
                        if child not in resolved]
                if not blockers:
                    resolved[candidate] = None
                    progressed = True
            if not progressed:
                raise CyclicDependency(
                    "There is a cyclic dependency of items to be processed.")

        return resolved.keys()

    def unordered_keys(self):
        """
        Fallback for the case where there is a cyclic dependency but we don't
        care about it.
        """
        return self.data.keys()

class QueryWrapper(object):
    """
@@ -51,6 +153,39 @@ class Q(tree.Node):
        obj.negate()
        return obj

class DeferredAttribute(object):
    """
    A wrapper for a deferred-loading field. When the value is read from this
    object the first time for a given instance, the query is executed.

    This is a data descriptor installed on a class, so one DeferredAttribute
    object is shared by every instance of that class. The loaded (or
    assigned) value is therefore kept in each instance's own ``__dict__``
    under ``field_name`` rather than on the descriptor, so that instances do
    not see each other's values.
    """
    def __init__(self, field_name, pk_value, model):
        """
        * field_name - name used both for the values_list() lookup and as the
          key in the instance ``__dict__``.
        * pk_value - primary key used to filter for the row to load from.
        * model - the model class to query; only a weak reference is kept.
        """
        self.field_name = field_name
        self.pk_value = pk_value
        self.model_ref = weakref.ref(model)
        # Kept so that code reading this attribute keeps working; it is no
        # longer updated, as the loaded state is now tracked per instance.
        self.loaded = False

    def __get__(self, instance, owner):
        """
        Retrieves and caches the value from the datastore on the first lookup
        for ``instance``. Returns the cached value.

        Returns None (without caching anything) if the model class referred
        to by ``model_ref`` no longer exists. Raises IndexError if the query
        returns no rows.
        """
        assert instance is not None
        data = instance.__dict__
        if self.field_name not in data:
            model = self.model_ref()
            if model is None:
                return None
            rows = model._base_manager.filter(pk=self.pk_value).values_list(
                    self.field_name, flat=True)
            data[self.field_name] = list(rows)[0]
        return data[self.field_name]

    def __set__(self, instance, value):
        """
        Deferred loading attributes can be set normally (which means there
        will never be a database lookup involved for that instance).
        """
        instance.__dict__[self.field_name] = value

def select_related_descend(field, restricted, requested):
    """
    Returns True if this field should be used to descend deeper for
@@ -67,3 +202,35 @@ def select_related_descend(field, restricted, requested):
    if not restricted and field.null:
        return False
    return True

# This function is needed because data descriptors must be defined on a class
# object, not an instance, to have any effect.

def deferred_class_factory(model, pk_value, attrs):
    """
    Builds and returns a subclass of "model" in which every name in "attrs"
    is replaced by a DeferredAttribute object. Each DeferredAttribute is
    given "pk_value", which ties it to a particular row of the model.
    """
    class Meta:
        proxy = True
        app_label = model._meta.app_label

    # The app_cache wants a unique name for each model, otherwise the new class
    # won't be created (we get an old one back). Therefore, the name is
    # derived from the passed in attrs. It's OK to reuse an old class if the
    # attrs are identical.
    ordered_attrs = list(attrs)
    ordered_attrs.sort()
    name = "%s_Deferred_%s" % (model.__name__, '_'.join(ordered_attrs))

    overrides = {}
    for attr in attrs:
        overrides[attr] = DeferredAttribute(attr, pk_value, model)
    # Set after the deferred attributes so that these keys always win.
    overrides.update({
        "Meta": Meta,
        "__module__": model.__module__,
        "_deferred": True,
    })
    return type(name, (model,), overrides)

# The above function is also used to unpickle model instances with deferred
# fields.
deferred_class_factory.__safe_for_unpickling__ = True
Loading