Data Warehouse in Ruby, why not? I'll demonstrate how we Open Sourced our homegrown ETL engine. I'll show the challenges that we had and how we solved it here at ZestFinance.
task :source_history => :environment do puts "[ #{Time.now} ] importing history from the Source database (new sql)" SourceHistory.import puts "[ #{Time.now} ] imported #{SourceHistory.count} new entries into the SourceHistory" end
:source_history => :environment do puts "[ #{Time.now} ] importing history from the Source database (new sql)" SourceHistory.import puts "[ #{Time.now} ] imported #{SourceHistory.count} new entries into the SourceHistory" end
Petl extend self ! def extract max_id = Destination.maximum(:id) Source.since(max_id).collect(&:attributes) end ! def transform rows rows.collect do |row| row.slice *Destination.column_names end end ! def load rows Destination.create! rows end ! def source_count Source.since(0).count end ! def destination_count Destination.count end end
def query_source Source::History.since latest_date end ! def source_count Source::History.since(EPOCH.to_date).count end ! def latest_date (self.class.maximum(:Date) || EPOCH).to_date end def verify_count logger = Rails.logger if self.source_count != self.class.count logger.error "source_history_etl: counts don't match" end logger.info "source_history_etl: source count #{self.source_count}" logger.info "source_history_etl: destination count #{self.class.count}" end ! def import(models) puts " importing #{models.length}..." columns = self.class.column_names inserts = [] ! models.each do |model| values = columns.inject([]) do |memo,key| memo << (model.attributes[key].nil? ? 'NULL' : "'#{model.attributes[key]}'") end inserts << "(#{values.join(',')})" end ! sql ="INSERT IGNORE INTO #{self.class.table_name} (#{columns.join(',')}) VALUES #{inserts.join(",")}" self.class.connection.execute sql end end desc "imports new history data from source database" task :source_history => :environment do puts "[ #{Time.now} ] importing history from the Source database (new sql)" SourceHistory.import puts "[ #{Time.now} ] imported #{SourceHistory.count} new entries into the SourceHistory" end
! def extract max_id = Destination.maximum(:id) Source.since(max_id).collect(&:attributes) end ! def transform rows rows.collect do |row| row.slice *Destination.column_names end end ! def load rows Destination.import! rows end ! def source_count Source.since(0).count end ! def destination_count Destination.count end end desc "Update history table from LA Pro" task :history_raw => :environment do ETL::Example.perform end