Commit 29132ebd authored by Anssi Kääriäinen's avatar Anssi Kääriäinen
Browse files

Fixed #17788 -- Added batch_size argument to qs.bulk_create()

The qs.bulk_create() method did not work with large batches on
SQLite3. This commit adds a way to split the bulk into smaller
batches. The default batch size is unlimited, except for SQLite3, where
the batch size is limited to 999 SQL parameters per batch.

Thanks to everybody who participated in the discussions at Trac.
parent fcad6c48
Loading
Loading
Loading
Loading
+19 −11
Original line number Diff line number Diff line
@@ -475,6 +475,14 @@ class BaseDatabaseOperations(object):
        """
        return None

    def bulk_batch_size(self, fields, objs):
        """
        Returns the maximum number of objects that may be inserted in a
        single batch on this backend.

        ``fields`` are the fields that will be inserted for each object;
        ``objs`` is the full sequence of objects to be inserted.

        The base implementation imposes no backend-specific cap, so every
        object fits in one batch.
        """
        # Unlimited by default: the batch is as large as the whole bulk.
        return len(objs)

    def cache_key_culling_sql(self):
        """
        Returns a SQL query that retrieves the first cache key greater than the
@@ -522,6 +530,17 @@ class BaseDatabaseOperations(object):
        """
        return ''

    def distinct_sql(self, fields):
        """
        Returns an SQL DISTINCT clause which removes duplicate rows from the
        result set. If any fields are given, only the given fields are being
        checked for duplicates.
        """
        # Plain DISTINCT is universally supported; DISTINCT ON specific
        # fields requires backend support and must be overridden.
        if not fields:
            return 'DISTINCT'
        raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')

    def drop_foreignkey_sql(self):
        """
        Returns the SQL command that drops a foreign key.
@@ -577,17 +596,6 @@ class BaseDatabaseOperations(object):
        """
        raise NotImplementedError('Full-text search is not implemented for this database backend')

    def distinct_sql(self, fields):
        """
        Returns an SQL DISTINCT clause which removes duplicate rows from the
        result set. If any fields are given, only the given fields are being
        checked for duplicates.
        """
        if fields:
            # Backends with DISTINCT ON support must override this method.
            raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
        return 'DISTINCT'

    def last_executed_query(self, cursor, sql, params):
        """
        Returns a string of the query last executed by the given cursor, with
+8 −1
Original line number Diff line number Diff line
@@ -85,7 +85,7 @@ class DatabaseFeatures(BaseDatabaseFeatures):
    supports_1000_query_parameters = False
    supports_mixed_date_datetime_comparisons = False
    has_bulk_insert = True
    can_combine_inserts_with_and_without_auto_increment_pk = True
    can_combine_inserts_with_and_without_auto_increment_pk = False

    @cached_property
    def supports_stddev(self):
@@ -107,6 +107,13 @@ class DatabaseFeatures(BaseDatabaseFeatures):
        return has_support

class DatabaseOperations(BaseDatabaseOperations):
    def bulk_batch_size(self, fields, objs):
        """
        SQLite has a compile-time default (SQLITE_LIMIT_VARIABLE_NUMBER) of
        999 variables per query.
        """
        # Each object consumes one parameter per inserted field, so cap the
        # batch at 999 parameters total. With no fields there is nothing to
        # bind, so the whole bulk fits in one batch.
        num_fields = len(fields)
        if num_fields > 0:
            return 999 // num_fields
        return len(objs)

    def date_extract_sql(self, lookup_type, field_name):
        # sqlite doesn't support extract, so we fake it with the user-defined
        # function django_extract that's registered in connect(). Note that
+23 −6
Original line number Diff line number Diff line
@@ -388,7 +388,7 @@ class QuerySet(object):
        obj.save(force_insert=True, using=self.db)
        return obj

    def bulk_create(self, objs):
    def bulk_create(self, objs, batch_size=None):
        """
        Inserts each of the instances into the database. This does *not* call
        save() on each of the instances, does not send any pre/post save
@@ -401,8 +401,10 @@ class QuerySet(object):
        # this could be implemented if you didn't have an autoincrement pk,
        # and 2) you could do it by doing O(n) normal inserts into the parent
        # tables to get the primary keys back, and then doing a single bulk
        # insert into the childmost table. We're punting on these for now
        # because they are relatively rare cases.
        # insert into the childmost table. Some databases might allow doing
        # this by using RETURNING clause for the insert query. We're punting
        # on these for now because they are relatively rare cases.
        assert batch_size is None or batch_size > 0
        if self.model._meta.parents:
            raise ValueError("Can't bulk create an inherited model")
        if not objs:
@@ -418,13 +420,14 @@ class QuerySet(object):
        try:
            if (connection.features.can_combine_inserts_with_and_without_auto_increment_pk
                and self.model._meta.has_auto_field):
                self.model._base_manager._insert(objs, fields=fields, using=self.db)
                self._batched_insert(objs, fields, batch_size)
            else:
                objs_with_pk, objs_without_pk = partition(lambda o: o.pk is None, objs)
                if objs_with_pk:
                    self.model._base_manager._insert(objs_with_pk, fields=fields, using=self.db)
                    self._batched_insert(objs_with_pk, fields, batch_size)
                if objs_without_pk:
                    self.model._base_manager._insert(objs_without_pk, fields=[f for f in fields if not isinstance(f, AutoField)], using=self.db)
                    fields= [f for f in fields if not isinstance(f, AutoField)]
                    self._batched_insert(objs_without_pk, fields, batch_size)
            if forced_managed:
                transaction.commit(using=self.db)
            else:
@@ -860,6 +863,20 @@ class QuerySet(object):
    ###################
    # PRIVATE METHODS #
    ###################
    def _batched_insert(self, objs, fields, batch_size):
        """
        Helper for bulk_create(): inserts ``objs`` one batch at a time,
        issuing a separate INSERT for each slice of at most ``batch_size``
        objects.

        When ``batch_size`` is not given, the backend's bulk_batch_size()
        decides how many objects fit in a single query.
        """
        if not objs:
            return
        ops = connections[self.db].ops
        # Fall back to the backend-reported limit (never below one object).
        batch_size = batch_size or max(ops.bulk_batch_size(fields, objs), 1)
        total = len(objs)
        start = 0
        while start < total:
            batch = objs[start:start + batch_size]
            self.model._base_manager._insert(batch, fields=fields,
                                             using=self.db)
            start += batch_size

    def _clone(self, klass=None, setup=False, **kwargs):
        if klass is None:
+6 −14
Original line number Diff line number Diff line
@@ -1350,7 +1350,7 @@ has a side effect on your data. For more, see `Safe methods`_ in the HTTP spec.
bulk_create
~~~~~~~~~~~

.. method:: bulk_create(objs)
.. method:: bulk_create(objs, batch_size=None)

.. versionadded:: 1.4

@@ -1372,20 +1372,12 @@ This has a number of caveats though:
* If the model's primary key is an :class:`~django.db.models.AutoField` it
  does not retrieve and set the primary key attribute, as ``save()`` does.

.. admonition:: Limits of SQLite
The ``batch_size`` parameter controls how many objects are created in a single
query. The default is to create all objects in one batch, except for SQLite,
where the default is such that at most 999 variables per query are used.

    SQLite sets a limit on the number of parameters per SQL statement. The
    maximum is defined by the SQLITE_MAX_VARIABLE_NUMBER_ compilation option,
    which defaults to 999. For instance, if your model has 8 fields (including
    the primary key), you cannot create more than 999 // 8 = 124 instances at
    a time. If you exceed this limit, you'll get an exception::

        django.db.utils.DatabaseError: too many SQL variables

    If your application's performance requirements exceed SQLite's limits, you
    should switch to another database engine, such as PostgreSQL.

.. _SQLITE_MAX_VARIABLE_NUMBER: http://sqlite.org/limits.html#max_variable_number
.. versionadded:: 1.5
    The ``batch_size`` parameter was added in version 1.5.

count
~~~~~
+5 −0
Original line number Diff line number Diff line
@@ -106,6 +106,11 @@ Django 1.5 also includes several smaller improvements worth noting:
* The :ref:`receiver <connecting-receiver-functions>` decorator is now able to
  connect to more than one signal by supplying a list of signals.

* :meth:`QuerySet.bulk_create()
  <django.db.models.query.QuerySet.bulk_create>` now has a ``batch_size``
  argument. By default the batch size is unlimited, except for SQLite, where
  a single batch is limited so that 999 parameters per query are not exceeded.

Backwards incompatible changes in 1.5
=====================================

Loading