Commit 70679243 authored by Anssi Kääriäinen

Fixed #18702 -- Removed chunked reads from QuerySet iteration

parent ea9a0857
+29 −120
@@ -20,11 +20,6 @@ from django.utils.functional import partition
from django.utils import six
from django.utils import timezone

# Used to control how many objects are worked with at once in some cases (e.g.
# when deleting objects).
CHUNK_SIZE = 100
ITER_CHUNK_SIZE = CHUNK_SIZE

# The maximum number of items to display in a QuerySet.__repr__
REPR_OUTPUT_SIZE = 20

@@ -41,7 +36,6 @@ class QuerySet(object):
        self._db = using
        self.query = query or sql.Query(self.model)
        self._result_cache = None
        self._iter = None
        self._sticky_filter = False
        self._for_write = False
        self._prefetch_related_lookups = []
@@ -58,7 +52,7 @@ class QuerySet(object):
        """
        obj = self.__class__()
        for k, v in self.__dict__.items():
            if k in ('_iter','_result_cache'):
            if k == '_result_cache':
                obj.__dict__[k] = None
            else:
                obj.__dict__[k] = copy.deepcopy(v, memo)
@@ -69,10 +63,8 @@ class QuerySet(object):
        Allows the QuerySet to be pickled.
        """
        # Force the cache to be fully populated.
        len(self)

        self._fetch_all()
        obj_dict = self.__dict__.copy()
        obj_dict['_iter'] = None
        return obj_dict

    def __repr__(self):
@@ -82,95 +74,31 @@ class QuerySet(object):
        return repr(data)

    def __len__(self):
        # Since __len__ is called quite frequently (for example, as part of
        # list(qs), we make some effort here to be as efficient as possible
        # whilst not messing up any existing iterators against the QuerySet.
        if self._result_cache is None:
            if self._iter:
                self._result_cache = list(self._iter)
            else:
                self._result_cache = list(self.iterator())
        elif self._iter:
            self._result_cache.extend(self._iter)
        if self._prefetch_related_lookups and not self._prefetch_done:
            self._prefetch_related_objects()
        self._fetch_all()
        return len(self._result_cache)

    def __iter__(self):
        if self._prefetch_related_lookups and not self._prefetch_done:
            # We need all the results in order to be able to do the prefetch
            # in one go. To minimize code duplication, we use the __len__
            # code path which also forces this, and also does the prefetch
            len(self)

        if self._result_cache is None:
            self._iter = self.iterator()
            self._result_cache = []
        if self._iter:
            return self._result_iter()
        # Python's list iterator is better than our version when we're just
        # iterating over the cache.
        """
        The queryset iterator protocol uses three nested iterators in the
        default case:
            1. sql.compiler:execute_sql()
               - Returns 100 rows at time (constants.GET_ITERATOR_CHUNK_SIZE)
                 using cursor.fetchmany(). This part is responsible for
                 doing some column masking, and returning the rows in chunks.
            2. sql/compiler.results_iter()
               - Returns one row at time. At this point the rows are still just
                 tuples. In some cases the return values are converted to
                 Python values at this location (see resolve_columns(),
                 resolve_aggregate()).
            3. self.iterator()
               - Responsible for turning the rows into model objects.
        """
        self._fetch_all()
        return iter(self._result_cache)

    def _result_iter(self):
        pos = 0
        while 1:
            upper = len(self._result_cache)
            while pos < upper:
                yield self._result_cache[pos]
                pos = pos + 1
            if not self._iter:
                raise StopIteration
            if len(self._result_cache) <= pos:
                self._fill_cache()

    def __bool__(self):
        if self._prefetch_related_lookups and not self._prefetch_done:
            # We need all the results in order to be able to do the prefetch
            # in one go. To minimize code duplication, we use the __len__
            # code path which also forces this, and also does the prefetch
            len(self)

        if self._result_cache is not None:
            return bool(self._result_cache)
        try:
            next(iter(self))
        except StopIteration:
            return False
        return True
        self._fetch_all()
        return bool(self._result_cache)

    def __nonzero__(self):      # Python 2 compatibility
        return type(self).__bool__(self)

    def __contains__(self, val):
        # The 'in' operator works without this method, due to __iter__. This
        # implementation exists only to shortcut the creation of Model
        # instances, by bailing out early if we find a matching element.
        pos = 0
        if self._result_cache is not None:
            if val in self._result_cache:
                return True
            elif self._iter is None:
                # iterator is exhausted, so we have our answer
                return False
            # remember not to check these again:
            pos = len(self._result_cache)
        else:
            # We need to start filling the result cache out. The following
            # ensures that self._iter is not None and self._result_cache is not
            # None
            it = iter(self)

        # Carry on, one result at a time.
        while True:
            if len(self._result_cache) <= pos:
                self._fill_cache(num=1)
            if self._iter is None:
                # we ran out of items
                return False
            if self._result_cache[pos] == val:
                return True
            pos += 1

    def __getitem__(self, k):
        """
@@ -184,19 +112,6 @@ class QuerySet(object):
                "Negative indexing is not supported."

        if self._result_cache is not None:
            if self._iter is not None:
                # The result cache has only been partially populated, so we may
                # need to fill it out a bit more.
                if isinstance(k, slice):
                    if k.stop is not None:
                        # Some people insist on passing in strings here.
                        bound = int(k.stop)
                    else:
                        bound = None
                else:
                    bound = k + 1
                if len(self._result_cache) < bound:
                    self._fill_cache(bound - len(self._result_cache))
            return self._result_cache[k]

        if isinstance(k, slice):
@@ -370,7 +285,7 @@ class QuerySet(object):
        If the QuerySet is already fully cached this simply returns the length
        of the cached results set to avoid multiple SELECT COUNT(*) calls.
        """
        if self._result_cache is not None and not self._iter:
        if self._result_cache is not None:
            return len(self._result_cache)

        return self.query.get_count(using=self.db)
@@ -933,17 +848,11 @@ class QuerySet(object):
            c._setup_query()
        return c

    def _fill_cache(self, num=None):
        """
        Fills the result cache with 'num' more entries (or until the results
        iterator is exhausted).
        """
        if self._iter:
            try:
                for i in range(num or ITER_CHUNK_SIZE):
                    self._result_cache.append(next(self._iter))
            except StopIteration:
                self._iter = None
    def _fetch_all(self):
        if self._result_cache is None:
            self._result_cache = list(self.iterator())
        if self._prefetch_related_lookups and not self._prefetch_done:
            self._prefetch_related_objects()

    def _next_is_sticky(self):
        """
+19 −0
@@ -524,6 +524,25 @@ non-standard behavior has been preserved but moved to the model form field layer
and occurs only when the associated widget is
:class:`~django.forms.SelectMultiple` or a subclass.

QuerySet iteration
~~~~~~~~~~~~~~~~~~

``QuerySet`` iteration was changed to immediately convert all fetched rows to
``Model`` objects. In Django 1.5 and earlier, the fetched rows were converted
to ``Model`` objects in chunks of 100.

Existing code will keep working, but the number of rows converted to objects
might change in certain use cases. Such cases include partially looping over a
queryset and any usage that ends up calling ``__bool__`` or ``__contains__``.

Notably, most database backends already fetched all the rows from the database
in one go in Django 1.5.

It is still possible to convert the fetched rows to ``Model`` objects
lazily by using the :meth:`~django.db.models.query.QuerySet.iterator()`
method.
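
For illustration, assuming a model named ``Entry`` and a hypothetical
``process()`` helper, the first loop below builds every ``Entry`` instance as
soon as iteration starts, while the second converts rows lazily as they are
fetched and does not populate the result cache::

    # Converts all fetched rows to Entry instances up front and caches them.
    for entry in Entry.objects.all():
        process(entry)

    # Lazy alternative: rows become Entry instances only as they are needed.
    for entry in Entry.objects.all().iterator():
        process(entry)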

Miscellaneous
~~~~~~~~~~~~~

+8 −36
@@ -9,7 +9,6 @@ from django.conf import settings
from django.core.exceptions import FieldError
from django.db import DatabaseError, connection, connections, DEFAULT_DB_ALIAS
from django.db.models import Count, F, Q
from django.db.models.query import ITER_CHUNK_SIZE
from django.db.models.sql.where import WhereNode, EverythingNode, NothingNode
from django.db.models.sql.datastructures import EmptyResultSet
from django.test import TestCase, skipUnlessDBFeature
@@ -1211,16 +1210,6 @@ class Queries2Tests(TestCase):
            ordered=False
        )

    def test_ticket7411(self):
        # Saving to db must work even with partially read result set in another
        # cursor.
        for num in range(2 * ITER_CHUNK_SIZE + 1):
            _ = Number.objects.create(num=num)

        for i, obj in enumerate(Number.objects.all()):
            obj.save()
            if i > 10: break

    def test_ticket7759(self):
        # Count should work with a partially read result set.
        count = Number.objects.count()
@@ -1700,31 +1689,6 @@ class Queries6Tests(TestCase):
        ann1.notes.add(n1)
        ann2 = Annotation.objects.create(name='a2', tag=t4)

    # This next test used to cause really weird PostgreSQL behavior, but it was
    # only apparent much later when the full test suite ran.
    #  - Yeah, it leaves global ITER_CHUNK_SIZE to 2 instead of 100...
    #@unittest.expectedFailure
    def test_slicing_and_cache_interaction(self):
        # We can do slicing beyond what is currently in the result cache,
        # too.

        # We need to mess with the implementation internals a bit here to decrease the
        # cache fill size so that we don't read all the results at once.
        from django.db.models import query
        query.ITER_CHUNK_SIZE = 2
        qs = Tag.objects.all()

        # Fill the cache with the first chunk.
        self.assertTrue(bool(qs))
        self.assertEqual(len(qs._result_cache), 2)

        # Query beyond the end of the cache and check that it is filled out as required.
        self.assertEqual(repr(qs[4]), '<Tag: t5>')
        self.assertEqual(len(qs._result_cache), 5)

        # But querying beyond the end of the result set will fail.
        self.assertRaises(IndexError, lambda: qs[100])

    def test_parallel_iterators(self):
        # Test that parallel iterators work.
        qs = Tag.objects.all()
@@ -2533,6 +2497,14 @@ class WhereNodeTest(TestCase):
        w = WhereNode(children=[empty_w, NothingNode()], connector='OR')
        self.assertRaises(EmptyResultSet, w.as_sql, qn, connection)


class IteratorExceptionsTest(TestCase):
    def test_iter_exceptions(self):
        qs = ExtraInfo.objects.only('author')
        with self.assertRaises(AttributeError):
            list(qs)


class NullJoinPromotionOrTest(TestCase):
    def setUp(self):
        self.d1 = ModelD.objects.create(name='foo')