Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Multiple databases in Django

Multiple databases in Django

Support for multiple databases has been introduced in Django 1.2 in 2010, and, while amazing, it's still a minefield filled with programming gotchas. I've stumbled upon many of them while developing a highly distributed system at SetJam and want to share my pain with you.

Knowing how to properly route queries, migrate schemas (using lovely South library) and then test the whole thing, while dodging chainsaws of Multi-DB API, will probably save you a few days of cursing.

Marek Stępniowski

May 16, 2013
Tweet

More Decks by Marek Stępniowski

Other Decks in Programming

Transcript

  1. NOT

  2. 1. A huge amount of data was unnecessarily duplicated. 2.

    Configuration outside of settings.py was error prone. 3. All of the issues found in the following solutions.
  3. DATABASES = { 'default': {'ENGINE': 'django.db.backends.mysql', 'NAME': 'setjam', 'USER': 'setjam'},

    'slave': {'ENGINE': 'django.db.backends.mysql', 'NAME': 'slave', 'USER': 'setjam', 'TEST_MIRROR': 'default'}, }
  4. DATABASES = { 'default': {'ENGINE': 'django.db.backends.mysql', 'NAME': 'setjam', 'USER': 'setjam'},

    'slave': {'ENGINE': 'django.db.backends.mysql', 'NAME': 'slave', 'USER': 'setjam', 'TEST_MIRROR': 'default'}, 'crawlers': {'ENGINE': 'django.db.backends.mysql', 'NAME': 'crawlers', 'USER': 'setjam'} }
  5. class AppRouter(object): def db_for_read(self, model, **hints): if model._meta.app_label == 'app':

    return 'appdb' def db_for_write(self, model, **hints): if model._meta.app_label == 'app': return 'appdb'
  6. class AppRouter(object): def db_for_read(self, model, **hints): if model._meta.app_label == 'app':

    return 'appdb' return None def db_for_write(self, model, **hints): if model._meta.app_label == 'app': return 'appdb' return None
  7. class AppRouter(object): def allow_relation(obj1, obj2, **hints): if (obj1._meta.app_label == 'app'

    and obj2._meta.app_label != 'app'): return False if (obj2._meta.app_label == 'app' and obj2._meta.app_label != 'app'): return False return True def allow_syncdb(db, model): if db == 'appdb': return model._meta.app_label == 'app'
  8. class HorizontalPartitioningRouter(object): parts = ['p1', 'p2', 'p3'] def _db(self, obj):

    if obj is None or obj.pk is None: return None else: return self.parts[ instance.pk % len(self.parts)] def db_for_read(self, model, **hints): return self._db(hints.get('instance')) def db_for_write(self, model, **hints): return self._db(hints.get('instance'))
  9. class AppRouter(object): def db_for_read(self, model, **hints): if model._meta.app_label == 'app':

    return 'appdb' def db_for_write(self, model, **hints): if model._meta.app_label == 'app': return 'appdb'
  10. class AppRouter(object): def db_for_read(self, model, **hints): if model._meta.app_label in (

    'app1', 'app2', 'app3'): return 'appdb' def db_for_write(self, model, **hints): if model._meta.app_label in ( 'app1', 'app2', 'app3'): return 'appdb'
  11. class AppRouter(object): def db_for_read(self, model, **hints): if model._meta.app_label in (

    'app1', 'app2', 'app3'): return 'appdb' def db_for_write(self, model, **hints): if model._meta.app_label in ( 'app1', 'app2', 'app3'): return 'appdb' class TMSCrawler(object): CRAWLER = True # ...Field definitions...
  12. class AppRouter(object): def _db(self, model): if getattr(model, 'CRAWLER', False): return

    'appdb' def db_for_read(self, model, **hints): return self._db(self, model) def db_for_write(self, model, **hints): return self._db(self, model) class TMSCrawler(object): CRAWLER = True # ...Field definitions...
  13. class ReadFromSlaveRouter(object): slave_db = 'slave' def db_for_read(self, model, **hints): if

    (model._meta.app_label not in ('django_cache', 'sessions', 'jobs', 'feedimporter') and not getattr(model, 'CRAWLER'): return self.slave_db def db_for_write(self, model, **hints): return 'default'
  14. class GenerateFeedCommand(Command): def post_publish(self): if self.incremental: # Purge all processed

    diffs from database (FeedDiff.objects .filter(id__lte=self.last_diff_id).raw_delete()) else: # If we have just published full feed, # we can get rid of all the diffs FeedDiff.objects.all().raw_delete()
  15. class QuerySet(object): @property def db(self): """Return the database that will

    be used if this query is executed now""" if self._for_write: return self._db or router.db_for_write(self.model) return self._db or router.db_for_read(self.model)
  16. How does a router know whether the query is for

    read or write? queryset._for_write
  17. class Fetcher(object): def run(self, pk, generation): show = self.prepare(pk, generation)

    # ...Magic... self.update_show(show.pk) self.postprocess(show.pk)
  18. class Fetcher(object): def run(self, pk, generation): show = self.prepare(pk, generation)

    # ...Magic... self.update_show(show.pk) self.postprocess(show.pk) def update_show(self, pk): (Show.objects.filter(pk=pk) .update(classified=True)) def postprocess(self, pk): if Show.objects.get(pk=pk).classified: # ...Do something...
  19. class Fetcher(object): def run(self, pk, generation): show = self.prepare(pk, generation)

    # ...Magic... self.update_show(show) self.postprocess(show) show.save() def update_show(self, show): show.classified = True def postprocess(self, show): if show.classified: # ...Do something...
  20. class Fetcher(object): def run(self, pk, generation): show = self.prepare(pk, generation)

    # ...Magic... self.update_show(show.pk) self.postprocess(show.pk) def update_show(self, pk): (Show.objects.filter(pk=pk) .update(classified=True)) def postprocess(self, pk): if (Show.objects.get(pk=pk) .using('default') .classified: # ...Do something...
  21. class BatchInsertManager(models.Manager): @transaction.commit_manually def batch_insert(self, row_iterator, replace=False): # ...Initialization... for

    row in row_iterator: sql, vals = self.prepare(row, fields) if query_len + len(sql) + len(vals) > max_query_len: count += self.perform_query( connection, header_sql + ', '.join(query_sql), query_vals) connection.commit() # ...Bookkeeping... else: query_sql.append(sql) query_vals.extend(vals) # ...More bookkeeping...
  22. How do transaction decorators know which database to work on?

    They don’t know. They always use default database.
  23. class BatchInsertManager(models.Manager): @transaction.commit_manually def batch_insert(self, row_iterator, replace=False): # ...Initialization... for

    row in row_iterator: sql, vals = self.prepare(row, fields) if query_len + len(sql) + len(vals) > max_query_len: count += self.perform_query( connection, header_sql + ', '.join(query_sql), query_vals) connection.commit() # ...Bookkeeping... else: query_sql.append(sql) query_vals.extend(vals) # ...More bookkeeping...
  24. 1. Perform all schema migrations on each database. 2. Always

    use .using in data migrations (code reviews help). 3. Accept that we will have a lot of empty tables :-(
  25. class RawDeleteQuerySet(QuerySet): @transaction.commit_on_success def raw_delete(self): qn = connection.ops.quote_name sql =

    'DELETE FROM %s' % qn(self.model._meta.db_table) cursor = connection.cursor() cursor.execute(sql) return cursor.rowcount
  26. class RawDeleteQuerySet(QuerySet): @transaction.commit_on_success def raw_delete(self): connection = connections[self.db] qn =

    connection.ops.quote_name sql = 'DELETE FROM %s' % qn(self.model._meta.db_table) cursor = connection.cursor() cursor.execute(sql) return cursor.rowcount
  27. class RawDeleteQuerySet(QuerySet): @transaction.commit_on_success def raw_delete(self): connection = connections[self.db] qn =

    connection.ops.quote_name sql = 'DELETE FROM %s' % qn(self.model._meta.db_table) cursor = connection.cursor() cursor.execute(sql) connection.set_dirty() return cursor.rowcount
  28. class RawDeleteQuerySet(QuerySet): def raw_delete(self): return (transaction.commit_on_success(self.db) (self._raw_delete)()) def _raw_delete(self): connection

    = connections[self.db] qn = connection.ops.quote_name sql = 'DELETE FROM %s' % qn(self.model._meta.db_table) cursor = connection.cursor() cursor.execute(sql) connection.set_dirty() return cursor.rowcount
  29. class RawDeleteQuerySet(QuerySet): def raw_delete(self): self._for_write = True return (transaction.commit_on_success(self.db) (self._raw_delete)())

    def _raw_delete(self): connection = connections[self.db] qn = connection.ops.quote_name sql = 'DELETE FROM %s' % qn(self.model._meta.db_table) cursor = connection.cursor() cursor.execute(sql) connection.set_dirty() return cursor.rowcount
  30. class RawDeleteQuerySet(QuerySet): def raw_delete(self): self._for_write = True return (transaction.commit_on_success(self.db) (self._raw_delete)())

    def _raw_delete(self): connection = connections[self.db] qn = connection.ops.quote_name sql = 'DELETE FROM %s' % qn(self.model._meta.db_table) cursor = connection.cursor() cursor.execute(sql) connection.set_dirty() return cursor.rowcount
  31. 1. Are you sure that there are no such bugs

    in your code? 2. Are you sure that there are no such bugs in 3rd party applications you use?
  32. 1. We need more (& better) documentation. 2. We need

    full support for multidb in schema migrations. 3. We need better debugging tools (whiny transaction decorators). 4. Attributes like _for_write should be made more visible. (5. We need better testing tools for master-slave configurations.) MultiDB is awesome, but…