
Multiple databases in Django

Support for multiple databases was introduced in Django 1.2 back in 2010 and, while amazing, it's still a minefield of programming gotchas. I've stumbled upon many of them while developing a highly distributed system at SetJam and want to share my pain with you.

Knowing how to properly route queries, migrate schemas (using the lovely South library) and then test the whole thing, while dodging the chainsaws of the multi-DB API, will probably save you a few days of cursing.
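
One testing detail worth flagging up front, since the slides use it without comment: the TEST_MIRROR key (Django 1.2-1.6) tells the test runner not to create a separate test database for that alias, but to reuse the connection of the database it mirrors, which is how a master-slave pair is simulated in tests. A minimal sketch with illustrative names:

    DATABASES = {
        'default': {
            'ENGINE': 'django.db.backends.mysql',
            'NAME': 'setjam',
            'USER': 'setjam',
        },
        'slave': {
            'ENGINE': 'django.db.backends.mysql',
            'NAME': 'slave',
            'USER': 'setjam',
            # During tests, reuse the test database created for 'default'
            # instead of creating a second, never-replicated-to one.
            'TEST_MIRROR': 'default',
        },
    }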


Marek Stępniowski

May 16, 2013

Transcript

  1. MultiDB

  2. MultiDB This

  15. We came to Django for the views... but stayed for the ORM
  16. need

  19. NOT

  23. 1 year BMDB (before MultiDB)

  25. READ_FROM_SLAVE = bool(os.environ.get('READ_FROM_SLAVE'))
      DATABASE_ENGINE = 'mysql'
      DATABASE_NAME = ('setjam_slave' if READ_FROM_SLAVE
                       else 'setjam')
      DATABASE_USER = 'setjam'
  26. It worked! (for

  27. 1. A huge amount of data was unnecessarily duplicated.
      2. Configuration outside of settings.py was error prone.
      3. All of the issues found in the following solutions.
  28. And then came MultiDB. And all was good.

  29. DATABASES = {
          'default': {'ENGINE': 'django.db.backends.mysql',
                      'NAME': 'setjam', 'USER': 'setjam'},
          'slave': {'ENGINE': 'django.db.backends.mysql',
                    'NAME': 'slave', 'USER': 'setjam',
                    'TEST_MIRROR': 'default'},
      }
  30. DATABASES = {
          'default': {'ENGINE': 'django.db.backends.mysql',
                      'NAME': 'setjam', 'USER': 'setjam'},
          'slave': {'ENGINE': 'django.db.backends.mysql',
                    'NAME': 'slave', 'USER': 'setjam',
                    'TEST_MIRROR': 'default'},
          'crawlers': {'ENGINE': 'django.db.backends.mysql',
                       'NAME': 'crawlers', 'USER': 'setjam'},
      }
  31. ./manage.py syncdb --database=default
      ./manage.py syncdb --database=slave

  32. ./manage.py syncdb --database=default
      ./manage.py syncdb --database=slave

      queryset.using('default')
      queryset.using('crawlers')

  33. ./manage.py syncdb --database=default
      ./manage.py syncdb --database=slave

      queryset.using('default')
      queryset.using('crawlers')

      Model.objects.db_manager('default')
      model.save(using='crawlers')
      model.delete(using='crawlers')
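
      For context, a short sketch of how these calls combine in practice; the Show model and its fields are assumptions, while the aliases match the settings above:

          # Per-query routing overrides (illustrative model and fields).
          fresh = Show.objects.using('default').filter(classified=True)
          crawled = Show.objects.using('crawlers').all()

          # A manager permanently bound to one connection.
          crawler_shows = Show.objects.db_manager('crawlers')

          # Route a single instance's writes explicitly.
          show = Show(title='Example')
          show.save(using='crawlers')
          show.delete(using='crawlers')
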
  34. Routers

  35. class AppRouter(object):
          def db_for_read(self, model, **hints):
              if model._meta.app_label == 'app':
                  return 'appdb'

          def db_for_write(self, model, **hints):
              if model._meta.app_label == 'app':
                  return 'appdb'
  36. class AppRouter(object):
          def db_for_read(self, model, **hints):
              if model._meta.app_label == 'app':
                  return 'appdb'
              return None

          def db_for_write(self, model, **hints):
              if model._meta.app_label == 'app':
                  return 'appdb'
              return None
  37. class AppRouter(object):
          def allow_relation(self, obj1, obj2, **hints):
              if (obj1._meta.app_label == 'app'
                      and obj2._meta.app_label != 'app'):
                  return False
              if (obj2._meta.app_label == 'app'
                      and obj1._meta.app_label != 'app'):
                  return False
              return True

          def allow_syncdb(self, db, model):
              if db == 'appdb':
                  return model._meta.app_label == 'app'
  38. DATABASE_ROUTERS = ('multidb.MasterSlaveRouter', 'routers.AppRouter')

  39. master-slave logical

  40. class HorizontalPartitioningRouter(object):
          parts = ['p1', 'p2', 'p3']

          def _db(self, obj):
              if obj is None or obj.pk is None:
                  return None
              else:
                  return self.parts[obj.pk % len(self.parts)]

          def db_for_read(self, model, **hints):
              return self._db(hints.get('instance'))

          def db_for_write(self, model, **hints):
              return self._db(hints.get('instance'))
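
      A note on when this router actually fires: Django only passes an instance hint for queries tied to a concrete object (saving or deleting it, or following its relations); a plain queryset lookup carries no hints, so _db() returns None and routing falls through to the next router or the default database. A small sketch with an assumed Show model:

          # Hypothetical Show model, for illustration.
          show = Show.objects.get(pk=42)  # no 'instance' hint -> _db() returns None
          show.classified = True
          show.save()                     # save() passes hints={'instance': show},
                                          # so the write goes to parts[42 % 3] == 'p1'
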
  41. class AppRouter(object):
          def db_for_read(self, model, **hints):
              if model._meta.app_label == 'app':
                  return 'appdb'

          def db_for_write(self, model, **hints):
              if model._meta.app_label == 'app':
                  return 'appdb'
  42. class AppRouter(object):
          def db_for_read(self, model, **hints):
              if model._meta.app_label in ('app1', 'app2', 'app3'):
                  return 'appdb'

          def db_for_write(self, model, **hints):
              if model._meta.app_label in ('app1', 'app2', 'app3'):
                  return 'appdb'
  43. class AppRouter(object):
          def db_for_read(self, model, **hints):
              if model._meta.app_label in ('app1', 'app2', 'app3'):
                  return 'appdb'

          def db_for_write(self, model, **hints):
              if model._meta.app_label in ('app1', 'app2', 'app3'):
                  return 'appdb'


      class TMSCrawler(models.Model):
          CRAWLER = True
          # ...Field definitions...
  44. class AppRouter(object):
          def _db(self, model):
              if getattr(model, 'CRAWLER', False):
                  return 'appdb'

          def db_for_read(self, model, **hints):
              return self._db(model)

          def db_for_write(self, model, **hints):
              return self._db(model)


      class TMSCrawler(models.Model):
          CRAWLER = True
          # ...Field definitions...
  45. class ReadFromSlaveRouter(object):
          slave_db = 'slave'

          def db_for_read(self, model, **hints):
              if (model._meta.app_label not in
                      ('django_cache', 'sessions', 'jobs', 'feedimporter')
                      and not getattr(model, 'CRAWLER', False)):
                  return self.slave_db

          def db_for_write(self, model, **hints):
              return 'default'
  46. How does a router know whether the query is for read or write?
  47. class GenerateFeedCommand(Command):
          def post_publish(self):
              if self.incremental:
                  # Purge all processed diffs from database
                  (FeedDiff.objects
                      .filter(id__lte=self.last_diff_id)
                      .raw_delete())
              else:
                  # If we have just published full feed,
                  # we can get rid of all the diffs
                  FeedDiff.objects.all().raw_delete()
  48. class QuerySet(object):
          @property
          def db(self):
              """Return the database that will be used if
              this query is executed now"""
              if self._for_write:
                  return self._db or router.db_for_write(self.model)
              return self._db or router.db_for_read(self.model)
  49. How does a router know whether the query is for read or write?

      queryset._for_write
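
      To make that concrete, a small sketch (the Show model is an assumption): simply evaluating a queryset goes through db_for_read, while operations like update() or delete() mark the queryset for write first, so the router's db_for_write is consulted instead.

          qs = Show.objects.filter(classified=False)

          list(qs)                             # evaluation: _for_write is False -> db_for_read
          qs.update(classified=True)           # update() sets _for_write = True -> db_for_write
          Show.objects.filter(pk=42).delete()  # delete() also routes through db_for_write
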
  50. Slave delay

  51. class Fetcher(object):
          def run(self, pk, generation):
              show = self.prepare(pk, generation)
              # ...Magic...
              self.update_show(show.pk)
              self.postprocess(show.pk)
  52. class Fetcher(object):
          def run(self, pk, generation):
              show = self.prepare(pk, generation)
              # ...Magic...
              self.update_show(show.pk)
              self.postprocess(show.pk)

          def update_show(self, pk):
              (Show.objects.filter(pk=pk)
                  .update(classified=True))

          def postprocess(self, pk):
              if Show.objects.get(pk=pk).classified:
                  # ...Do something...
  53. class Fetcher(object):
          def run(self, pk, generation):
              show = self.prepare(pk, generation)
              # ...Magic...
              self.update_show(show)
              self.postprocess(show)
              show.save()

          def update_show(self, show):
              show.classified = True

          def postprocess(self, show):
              if show.classified:
                  # ...Do something...
  54. class Fetcher(object):
          def run(self, pk, generation):
              show = self.prepare(pk, generation)
              # ...Magic...
              self.update_show(show.pk)
              self.postprocess(show.pk)

          def update_show(self, pk):
              (Show.objects.filter(pk=pk)
                  .update(classified=True))

          def postprocess(self, pk):
              if (Show.objects.using('default')
                      .get(pk=pk).classified):
                  # ...Do something...
  55. jbalogh / django-multidb-router
      multidb.PinningMasterSlaveRouter

      from multidb.pinning import use_master

      @use_master
      def func(*args, **kw):
          # Touches the master database.
  56. Transactions

  57. class BatchInsertManager(models.Manager):
          @transaction.commit_manually
          def batch_insert(self, row_iterator, replace=False):
              # ...Initialization...
              for row in row_iterator:
                  sql, vals = self.prepare(row, fields)
                  if query_len + len(sql) + len(vals) > max_query_len:
                      count += self.perform_query(
                          connection,
                          header_sql + ', '.join(query_sql),
                          query_vals)
                      connection.commit()
                      # ...Bookkeeping...
                  else:
                      query_sql.append(sql)
                      query_vals.extend(vals)
                      # ...More bookkeeping...
  58. How do transaction decorators know which database to work on?

  59. How do transaction decorators know which database to work on?

      They don't know. They always use the default database.
  60. class BatchInsertManager(models.Manager):
          @transaction.commit_manually
          def batch_insert(self, row_iterator, replace=False):
              # ...Initialization...
              for row in row_iterator:
                  sql, vals = self.prepare(row, fields)
                  if query_len + len(sql) + len(vals) > max_query_len:
                      count += self.perform_query(
                          connection,
                          header_sql + ', '.join(query_sql),
                          query_vals)
                      connection.commit()
                      # ...Bookkeeping...
                  else:
                      query_sql.append(sql)
                      query_vals.extend(vals)
                      # ...More bookkeeping...
  61. class BatchInsertManager(models.Manager):
          def batch_insert(self, row_iterator, replace=False):
              return (transaction.commit_manually(self._db)
                      (self._batch_insert)(row_iterator, replace=replace))

          def _batch_insert(self, row_iterator, replace=False):
              # ...Initialization...
              for row in row_iterator:
                  # ...Loop...
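
      The same routing can be written as a plain decorated function, assuming the pre-Django-1.6 transaction API used throughout these slides; the 'crawlers' alias is illustrative:

          from django.db import transaction

          # commit_on_success/commit_manually accept a database alias
          # (Django 1.2-1.5); without one they manage the 'default' connection.
          @transaction.commit_on_success(using='crawlers')
          def purge_crawler_data():
              # Every statement in here commits (or rolls back)
              # on the 'crawlers' connection.
              ...
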
  62. Going south

  63. ./manage.py migrate --database=default
      ./manage.py migrate --database=slave

  64. ./manage.py migrate --database=default
      ./manage.py migrate --database=slave

      orm.using('default')
      orm.using('crawlers')

  65. ticket #370 extended

  66. ticket #370 (3

  67. 1. Perform all schema migrations on each database.
      2. Always use .using in data migrations (code reviews help).
      3. Accept that we will have a lot of empty tables :-(
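
      A sketch of what rule 2 looks like in a South data migration; the app label, model, field and database alias are assumptions, while orm['app.Model'] and .using() are standard South/Django API:

          from south.v2 import DataMigration


          class Migration(DataMigration):

              def forwards(self, orm):
                  # Explicitly pin the data migration to the database
                  # this data lives in, instead of relying on routing.
                  (orm['crawlers_app.TMSCrawler'].objects
                      .using('crawlers')
                      .filter(processed=True)
                      .delete())

              def backwards(self, orm):
                  pass  # Nothing to undo in this illustration.
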
  68. Dodging chainsaws Practical

  69. class RawDeleteQuerySet(QuerySet):
          @transaction.commit_on_success
          def raw_delete(self):
              qn = connection.ops.quote_name
              sql = 'DELETE FROM %s' % qn(self.model._meta.db_table)
              cursor = connection.cursor()
              cursor.execute(sql)
              return cursor.rowcount
  70. class RawDeleteQuerySet(QuerySet):
          @transaction.commit_on_success
          def raw_delete(self):
              connection = connections[self.db]
              qn = connection.ops.quote_name
              sql = 'DELETE FROM %s' % qn(self.model._meta.db_table)
              cursor = connection.cursor()
              cursor.execute(sql)
              return cursor.rowcount
  72. class RawDeleteQuerySet(QuerySet):
          @transaction.commit_on_success
          def raw_delete(self):
              connection = connections[self.db]
              qn = connection.ops.quote_name
              sql = 'DELETE FROM %s' % qn(self.model._meta.db_table)
              cursor = connection.cursor()
              cursor.execute(sql)
              connection.set_dirty()
              return cursor.rowcount
  74. class RawDeleteQuerySet(QuerySet):
          def raw_delete(self):
              return (transaction.commit_on_success(self.db)
                      (self._raw_delete)())

          def _raw_delete(self):
              connection = connections[self.db]
              qn = connection.ops.quote_name
              sql = 'DELETE FROM %s' % qn(self.model._meta.db_table)
              cursor = connection.cursor()
              cursor.execute(sql)
              connection.set_dirty()
              return cursor.rowcount
  76. class RawDeleteQuerySet(QuerySet):
          def raw_delete(self):
              self._for_write = True
              return (transaction.commit_on_success(self.db)
                      (self._raw_delete)())

          def _raw_delete(self):
              connection = connections[self.db]
              qn = connection.ops.quote_name
              sql = 'DELETE FROM %s' % qn(self.model._meta.db_table)
              cursor = connection.cursor()
              cursor.execute(sql)
              connection.set_dirty()
              return cursor.rowcount
  78. class RawDeleteQuerySet(QuerySet):
          def raw_delete(self):
              self._for_write = True
              return (transaction.commit_on_success(self.db)
                      (self._raw_delete)())

          def _raw_delete(self):
              connection = connections[self.db]
              qn = connection.ops.quote_name
              sql = 'DELETE FROM %s' % qn(self.model._meta.db_table)
              cursor = connection.cursor()
              cursor.execute(sql)
              connection.set_dirty()
              return cursor.rowcount
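
      To round things off, the custom queryset still has to be wired into a model; a minimal sketch using the pre-1.6 manager API (the manager name is an assumption, FeedDiff reuses the model from the earlier slide):

          class RawDeleteManager(models.Manager):
              def get_query_set(self):  # renamed to get_queryset() in Django 1.6+
                  return RawDeleteQuerySet(self.model, using=self._db)


          class FeedDiff(models.Model):
              objects = RawDeleteManager()
              # ...Field definitions...
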
  79. 1. Are you sure that there are no such bugs in your code?
      2. Are you sure that there are no such bugs in 3rd party applications you use?
  80. Concluding

  81. MultiDB is awesome, but…
      1. We need more (& better) documentation.
      2. We need full support for multidb in schema migrations.
      3. We need better debugging tools (whiny transaction decorators).
      4. Attributes like _for_write should be made more visible.
      (5. We need better testing tools for master-slave configurations.)
  82. Marek Stępniowski @mstepniowski