Class: DataDuck::Table

Inherits:

Object

Object
DataDuck::Table

show all

Defined in: lib/dataduck/table.rb

Direct Known Subclasses

IntegrationTable

Class Attribute Summary collapse

.actions ⇒ Object

Returns the value of attribute actions.
.output_schema ⇒ Object

Returns the value of attribute output_schema.
.sources ⇒ Object

Returns the value of attribute sources.

Instance Attribute Summary collapse

#data ⇒ Object

Returns the value of attribute data.
#errors ⇒ Object

Returns the value of attribute errors.

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.actions ⇒ `Object`

Returns the value of attribute actions.



8
9
10

# File 'lib/dataduck/table.rb', line 8

# Reader for the @actions instance variable. The `transforms` and `validates`
# class methods append [type, method_name] pairs to this list.
#
# @return [Object] the stored value of @actions (nil if never assigned)
def actions
  @actions
end

.output_schema ⇒ `Object`

Returns the value of attribute output_schema.



7
8
9

# File 'lib/dataduck/table.rb', line 7

# Reader for the @output_schema instance variable, which the `output` class
# method fills by merging schema hashes into it.
#
# @return [Object] the stored value of @output_schema (nil if never assigned)
def output_schema
  @output_schema
end

.sources ⇒ `Object`

Returns the value of attribute sources.



6
7
8

# File 'lib/dataduck/table.rb', line 6

# Reader for the @sources instance variable, the list of source specs that
# the `source` class method appends to.
#
# @return [Object] the stored value of @sources (nil if never assigned)
def sources
  @sources
end

Instance Attribute Details

#data ⇒ `Object`

Returns the value of attribute data.



11
12
13

# File 'lib/dataduck/table.rb', line 11

# Reader for @data. `extract!` resets it to an array and concatenates the
# query results of every source into it; `etl!` resets it to [] when done.
#
# @return [Object] the stored value of @data (nil if never assigned)
def data
  @data
end

#errors ⇒ `Object`

Returns the value of attribute errors.



12
13
14

# File 'lib/dataduck/table.rb', line 12

# Reader for @errors. `extract!` and `transform!` initialise it to [] when it
# is unset, and `transform!` appends non-blank validation results to it.
#
# @return [Object] the stored value of @errors (nil if never assigned)
def errors
  @errors
end

Class Method Details

.output(schema) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 42

# Declares output columns by merging `schema` into the class-level output
# schema, creating that hash the first time it is needed.
#
# @param schema [Hash] column definitions to merge in; later calls override
#   earlier entries with the same key
# @return [Hash] the accumulated output schema
def self.output(schema)
  accumulated = (self.output_schema ||= {})
  accumulated.merge!(schema)
end

.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 26

# Registers one source for this table class by appending a spec hash to
# `sources`. The shape of the spec depends on the second argument:
#
# * its string form contains "select " (case-insensitive): stored as a
#   raw +:query+;
# * it responds to +each+ and no third argument is given: treated as the
#   column list, with the table name derived from the class name;
# * otherwise: the second argument is the table name and the third the columns.
#
# @param source_name [Object] passed to DataDuck::Source.source to look up the source
# @param source_table_or_query [Object, nil] table name, query text, or column list
# @param source_columns [Object, nil] column list when a table name is given
# @return [Object] the result of appending the spec to `sources`
def self.source(source_name, source_table_or_query = nil, source_columns = nil)
  self.sources ||= []

  looks_like_query = source_table_or_query.respond_to?(:to_s) &&
                     source_table_or_query.to_s.downcase.include?('select ')

  spec =
    if looks_like_query
      {query: source_table_or_query}
    elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
      {columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name)}
    else
      {columns: source_columns, table_name: source_table_or_query.to_s}
    end

  spec[:source] = DataDuck::Source.source(source_name)
  self.sources << spec
end

.transforms(transformation_name) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 14

# Registers a transformation step: appends [:transform, transformation_name]
# to the class-level `actions` list, creating the list on first use.
#
# @param transformation_name [Object] name of the method `transform!` will
#   call with each row
# @return [Object] the `actions` list after the append
def self.transforms(transformation_name)
  self.actions ||= []
  self.actions.push([:transform, transformation_name])
end

.validates(validation_name) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 20

# Registers a validation step: appends [:validate, validation_name] to the
# class-level `actions` list, creating the list on first use.
#
# @param validation_name [Object] name of the method `transform!` will call
#   with each row to obtain an error (or a blank value)
# @return [Object] the `actions` list after the append
def self.validates(validation_name)
  self.actions ||= []
  self.actions.push([:validate, validation_name])
end

Instance Method Details

#actions ⇒ `Object`

# File 'lib/dataduck/table.rb', line 47

# Collects the actions declared on this object's class and on each of its
# superclasses that is a strict subclass of Table, most-derived class first.
# Actions stored on Table itself are not included, because the walk stops as
# soon as a class is no longer strictly below Table.
#
# @return [Array] a new array of [type, method_name] pairs
def actions
  table_subclasses = self.class.ancestors.grep(Class).take_while { |klass| klass < Table }
  table_subclasses.flat_map { |klass| klass.actions || [] }
end

#autogenerate_identity? ⇒ `Boolean`

Returns:

(Boolean)



223
224
225

# File 'lib/dataduck/table.rb', line 223

# Whether `create_schema` should add a `dataduck_identity` column to the
# output schema. Always false here.
#
# @return [Boolean] false
def autogenerate_identity?
  false
end

#batch_size ⇒ `Object`



203
204
205

# File 'lib/dataduck/table.rb', line 203

# Number of rows per extraction batch. nil (the value returned here) means no
# batching: `etl!` stops after one pass and `limit_clause` emits no LIMIT.
# When non-nil, `check_table_valid!` requires it to be > 0 and requires an
# extract_by_column.
#
# @return [nil]
def batch_size
  nil
end

#building_name ⇒ `Object`



227
228
229

# File 'lib/dataduck/table.rb', line 227

# Name of the table that is written to while loading: the staging table when
# the table is being fully reloaded, otherwise the regular table name.
#
# @return [Object] `staging_name` or `name`
def building_name
  return self.staging_name if self.should_fully_reload?

  self.name
end

#check_table_valid! ⇒ `Object`

# File 'lib/dataduck/table.rb', line 58

# Validates the batching configuration. Does nothing when batch_size is nil.
#
# @raise [RuntimeError] if batch_size is not greater than zero
# @raise [RuntimeError] if batch_size is set but extract_by_column is nil
# @return [nil]
def check_table_valid!
  return if self.batch_size.nil?

  unless self.batch_size > 0
    raise "Table #{ self.name }'s batch_size must be > 0"
  end

  if self.extract_by_column.nil?
    raise "Table #{ self.name } has batch_size defined but no extract_by_column"
  end
end

#create_column_names ⇒ `Object`



243
244
245

# File 'lib/dataduck/table.rb', line 243

# Column names of the schema used to create the table, as sorted strings.
#
# @return [Array<String>] stringified keys of `create_schema`, sorted
def create_column_names
  column_names = self.create_schema.keys.map { |column| column.to_s }
  column_names.sort
end

#create_schema ⇒ `Object`

# File 'lib/dataduck/table.rb', line 235

# Schema used when creating the table: the output schema, plus a
# `dataduck_identity` column when autogenerate_identity? is true.
#
# @return [Object] `output_schema`, or the result of Util.deep_merge of it
#   with the identity column
def create_schema
  return output_schema unless self.autogenerate_identity?

  identity_column = {dataduck_identity: 'bigint identity(1, 1)'} # Redshift only
  Util.deep_merge(output_schema, identity_column)
end

#distribution_key ⇒ `Object`

# File 'lib/dataduck/table.rb', line 65

# Default distribution key: "id" when the output has an "id" column.
#
# @return [String, nil] "id", or nil when there is no such output column
def distribution_key
  self.output_column_names.include?("id") ? "id" : nil
end

#distribution_style ⇒ `Object`



73
74
75

# File 'lib/dataduck/table.rb', line 73

# Distribution style for this table. Always nil here; how a non-nil value is
# used is not visible in this file (presumably consumed by the destination —
# TODO confirm).
#
# @return [nil]
def distribution_style
  nil
end

#etl!(destinations, options = {}) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 77

# Runs extract -> transform -> load for this table against one destination,
# repeating in batches when batch_size is set.
#
# Each pass calls extract!, and when rows came back, transform!, drops nil
# rows, and calls load! if any rows remain. Without a batch_size there is a
# single pass. With one, another pass follows only when the extraction
# returned a full batch; the loop is capped at 1,000 passes.
#
# Whether a batch was full is judged from the number of rows extract!
# produced, not from what is left after transform!/compact!. Otherwise a full
# batch from which the transform dropped a row would be mistaken for the last
# batch and the remaining source rows would be skipped.
#
# @param destinations [Array] must contain exactly one destination
# @param options [Hash] passed to extract! and postprocess!; +:dates+ is set
#   to [Date.today] on this hash when it is nil
# @raise [ArgumentError] unless exactly one destination is given
# @return [Object] result of postprocess!, or of the "no data" log call
def etl!(destinations, options = {})
  if destinations.length != 1
    raise ArgumentError.new("DataDuck can only etl to one destination at a time for now.")
  end

  options[:dates] = [Date.today] if options[:dates].nil?

  self.check_table_valid!

  destination = destinations.first
  destination.drop_staging_table!(self) if self.should_fully_reload?

  data_processed = false
  batch_number = 0
  while batch_number < 1_000
    batch_number += 1
    self.extract!(destination, options)

    # Remember the batch size as extracted; transform!/compact! may shrink data.
    extracted_count = self.data.length
    if extracted_count > 0
      self.transform!
      self.data.compact!
      self.load!(destination) if self.data.length > 0
      data_processed = true
    end

    break if self.batch_size.nil?

    # Continue only after a full batch that loaded something. If every row was
    # dropped nothing was loaded, and extract_query's MAX(...) lookup on the
    # destination would see the same value again, so stop as before.
    if self.batch_size == extracted_count && self.data.length > 0
      DataDuck::Logs.info "Finished batch #{ batch_number }, continuing with the next batch"
    else
      DataDuck::Logs.info "Finished batch #{ batch_number } (last batch)"
      break
    end
  end

  self.data = []

  if data_processed
    if self.should_fully_reload?
      destination.finish_fully_reloading_table!(self)
    end

    self.postprocess!(destination, options)
  else
    DataDuck::Logs.info "No data extracted for table #{ self.name }"
  end
end

#extract!(destination = nil, options = {}) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 131

# Pulls rows from every source declared on the class into `data`.
#
# Resets `data` to an empty array (and `errors` to [] if unset), then for
# each source spec builds the query via extract_query, runs it on the spec's
# source and appends the results.
#
# @param destination [Object, nil] forwarded to extract_query
# @param options [Hash] accepted but not used by this implementation
# @return [Array] the accumulated `data`
def extract!(destination = nil, options = {})
  DataDuck::Logs.info "Extracting table #{ self.name }"

  self.errors ||= []
  self.data = []
  self.class.sources.each do |spec|
    rows = spec[:source].query(self.extract_query(spec, destination))
    self.data.concat(rows)
  end
  self.data
end

#extract_by_clause(value) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 171

# Builds the WHERE clause that restricts extraction to rows at or after
# `value` in extract_by_column. The comparison is strict (>) when the table
# is being fully reloaded and inclusive (>=) otherwise.
#
# @param value [Object, nil] lower bound; interpolated into the SQL in single quotes
# @return [String] the clause, or "" when value is nil/false
def extract_by_clause(value)
  return "" unless value

  comparison = self.should_fully_reload? ? '>' : '>='
  "WHERE #{ self.extract_by_column } #{ comparison } '#{ value }'"
end

#extract_by_column ⇒ `Object`

# File 'lib/dataduck/table.rb', line 207

# Default column used for incremental extraction: 'updated_at' when the
# output has such a column.
#
# @return [String, nil] 'updated_at', or nil when there is no such output column
def extract_by_column
  self.output_column_names.include?("updated_at") ? 'updated_at' : nil
end

#extract_query(source_spec, destination = nil) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 145

# Builds the SQL used to extract rows for one source spec.
#
# The base query is the spec's +:query+ if present, otherwise a SELECT of the
# spec's sorted columns (each wrapped in the source's escape character) from
# its +:table_name+. When extract_by_column is set and the destination
# already has the table being built, the current MAX of that column in the
# destination is looked up and turned into a filter via extract_by_clause.
# If the base query already appears to have a WHERE after its last "from",
# the filter is attached with AND instead. limit_clause is appended last.
#
# A nil destination (the default) is tolerated: the incremental filter is
# simply skipped instead of raising on `nil.table_names`.
#
# @param source_spec [Hash] spec with +:source+ and either +:query+ or
#   +:columns+ / +:table_name+
# @param destination [Object, nil] object responding to table_names and query
# @return [String] the assembled query
def extract_query(source_spec, destination = nil)
  escape_char = source_spec[:source].escape_char

  base_query = source_spec.has_key?(:query) ? source_spec[:query] :
     "SELECT #{ escape_char }#{ source_spec[:columns].sort.join(escape_char + ',' + escape_char) }#{ escape_char } FROM #{ source_spec[:table_name] }"

  extract_part = ""
  limit_part = self.limit_clause

  extract_column = self.extract_by_column
  if extract_column && destination && destination.table_names.include?(self.building_name)
    # The destination column carries no table qualifier, so drop any "table." prefix.
    bare_column = extract_column.include?(".") ? extract_column.split(".").last : extract_column
    max_row = destination.query("SELECT MAX(#{ bare_column }) AS val FROM #{ self.building_name }").first
    extract_by_value = max_row.nil? ? nil : max_row[:val]

    extract_part = self.extract_by_clause(extract_by_value)
  end

  if base_query.downcase.split("from").last.include?(' where ')
    # Non-mutating gsub: the string returned by extract_by_clause may be frozen.
    extract_part = extract_part.gsub('WHERE ', 'AND ')
  end

  [base_query, extract_part, limit_part].join(' ').strip
end

#identify_by_columns ⇒ `Object`

# File 'lib/dataduck/table.rb', line 213

# Default columns that identify a row: ["id"] when the output has an "id"
# column, otherwise none.
#
# @return [Array<String>] ["id"] or []
def identify_by_columns
  self.output_column_names.include?("id") ? ["id"] : []
end

#include_with_all? ⇒ `Boolean`

Returns:

(Boolean)



192
193
194

# File 'lib/dataduck/table.rb', line 192

# Always true here. The caller is not visible in this file; presumably this
# controls whether the table is included when all tables are processed —
# TODO confirm against the caller.
#
# @return [Boolean] true
def include_with_all?
  true
end

#indexes ⇒ `Object`

# File 'lib/dataduck/table.rb', line 196

# Default index columns: whichever of "id" and "created_at" (in that order)
# are present among the output columns.
#
# @return [Array<String>] a new array with zero, one or both names
def indexes
  %w[id created_at].select { |column| self.output_column_names.include?(column) }
end

#limit_clause ⇒ `Object`

# File 'lib/dataduck/table.rb', line 180

# ORDER BY / LIMIT suffix used for batched extraction. Only produced when
# both extract_by_column and batch_size are set.
#
# @return [String] the clause, or "" when batching does not apply
def limit_clause
  return "" unless self.extract_by_column && self.batch_size

  "ORDER BY #{ self.extract_by_column } LIMIT #{ self.batch_size }"
end

#load!(destination) ⇒ `Object`



188
189
190

# File 'lib/dataduck/table.rb', line 188

# Loads this table into the destination by delegating to
# destination.load_table!, passing the table itself.
#
# @param destination [Object] object responding to load_table!
# @return [Object] whatever destination.load_table! returns
def load!(destination)
  destination.load_table!(self)
end

#name ⇒ `Object`

# File 'lib/dataduck/table.rb', line 297

# Table name derived from the class name: the class name is converted with
# DataDuck::Util.camelcase_to_underscore, reduced to its last "/" segment
# when it starts with "data_duck/", and prefixed with `prefix`.
#
# @return [String] prefix followed by the derived name
def name
  underscored = DataDuck::Util.camelcase_to_underscore(self.class.name)
  underscored = underscored.split("/").last if underscored.start_with?("data_duck/")

  self.prefix + underscored
end

#output_column_names ⇒ `Object`



251
252
253

# File 'lib/dataduck/table.rb', line 251

# Names of the output columns, as sorted strings.
#
# @return [Array<String>] stringified keys of `output_schema`, sorted
def output_column_names
  column_names = self.output_schema.keys.map { |column| column.to_s }
  column_names.sort
end

#output_schema ⇒ `Object`



247
248
249

# File 'lib/dataduck/table.rb', line 247

# Output schema for this table: the first non-nil class-level output_schema
# found on this object's class or any of its superclasses, nearest first.
#
# The walk stops at the first class that does not respond to output_schema,
# so a class whose parent has no such method yields {} instead of raising
# NoMethodError. Schemas declared more than one level up the inheritance
# chain are also found.
#
# @return [Hash] the inherited schema, or {} when none is declared
def output_schema
  klass = self.class
  while klass.respond_to?(:output_schema)
    schema = klass.output_schema
    return schema if schema

    klass = klass.superclass
  end

  {}
end

#postprocess!(destination, options = {}) ⇒ `Object`



255
256
257

# File 'lib/dataduck/table.rb', line 255

# Post-load hook: delegates to destination.postprocess!, passing the table
# itself. `options` is accepted but not forwarded.
#
# @param destination [Object] object responding to postprocess!
# @param options [Hash] unused by this implementation
# @return [Object] whatever destination.postprocess! returns
def postprocess!(destination, options = {})
  destination.postprocess!(self)
end

#prefix ⇒ `Object`



306
307
308

# File 'lib/dataduck/table.rb', line 306

# String that `name` prepends to the derived table name. Empty here.
#
# @return [String] ""
def prefix
  ""
end

#recreate!(destination) ⇒ `Object`



259
260
261

# File 'lib/dataduck/table.rb', line 259

# Delegates to destination.recreate_table!, passing the table itself.
#
# @param destination [Object] object responding to recreate_table!
# @return [Object] whatever destination.recreate_table! returns
def recreate!(destination)
  destination.recreate_table!(self)
end

#should_fully_reload? ⇒ `Boolean`

Returns:

(Boolean)



219
220
221

# File 'lib/dataduck/table.rb', line 219

# Whether the table is rebuilt from scratch on each run. When true,
# `building_name` returns the staging name, `etl!` drops the staging table
# first and calls finish_fully_reloading_table! afterwards, and
# `extract_by_clause` uses a strict `>` comparison. Always false here.
#
# @return [Boolean] false
def should_fully_reload?
  false
end

#show ⇒ `Object`

# File 'lib/dataduck/table.rb', line 263

# Prints a human-readable summary of the table to standard output: its name,
# each source (table name or query, the source's name, and its columns), and
# the output schema with datatypes aligned in a column.
#
# Source specs created from a raw query have no +:columns+ entry; they are
# printed without a column list instead of raising on `nil.each`.
#
# @return [Object] the result of iterating the output schema
def show
  puts "Table #{ self.name }"
  self.class.sources.each do |source_spec|
    puts "\nSources from #{ source_spec[:table_name] || source_spec[:query] } on #{ source_spec[:source].name }"
    (source_spec[:columns] || []).each do |col_name|
      puts "  #{ col_name }"
    end
  end

  puts "\nOutputs "
  # Width of the longest column name; only used when the schema is non-empty.
  num_separators = self.output_schema.keys.map { |key| key.length }.max
  self.output_schema.each_pair do |column, datatype|
    puts "  #{ column }#{ ' ' * (num_separators + 2 - column.length) }#{ datatype }"
  end
end

#staging_name ⇒ `Object`



231
232
233

# File 'lib/dataduck/table.rb', line 231

# Name of the staging table: the table's `name` prefixed with "zz_dataduck_".
# `building_name` returns this when should_fully_reload? is true.
#
# @return [String]
def staging_name
  "zz_dataduck_#{ self.name }"
end

#transform! ⇒ `Object`

# File 'lib/dataduck/table.rb', line 279

# Applies every registered action, in order, to the rows in `data`.
#
# A :transform action replaces each row in place with the result of calling
# the named public method on it. A :validate action calls the named public
# method on each row and appends any non-blank result to `errors`. Actions of
# any other type are ignored.
#
# @return [Object] the result of iterating `actions`
def transform!
  DataDuck::Logs.info "Transforming table #{ self.name }"

  self.errors ||= []
  self.actions.each do |kind, method_name|
    case kind
    when :transform
      self.data.map! { |row| self.public_send(method_name, row) }
    when :validate
      self.data.each do |row|
        problem = self.public_send(method_name, row)
        self.errors << problem unless problem.blank?
      end
    end
  end
end

Class: DataDuck::Table

Direct Known Subclasses

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.actions ⇒ Object

.output_schema ⇒ Object

.sources ⇒ Object

Instance Attribute Details

#data ⇒ Object

#errors ⇒ Object

Class Method Details

.output(schema) ⇒ Object

.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ Object

.transforms(transformation_name) ⇒ Object

.validates(validation_name) ⇒ Object

Instance Method Details

#actions ⇒ Object

#autogenerate_identity? ⇒ Boolean

#batch_size ⇒ Object

#building_name ⇒ Object

#check_table_valid! ⇒ Object

#create_column_names ⇒ Object

#create_schema ⇒ Object

#distribution_key ⇒ Object

#distribution_style ⇒ Object

#etl!(destinations, options = {}) ⇒ Object

#extract!(destination = nil, options = {}) ⇒ Object

#extract_by_clause(value) ⇒ Object

#extract_by_column ⇒ Object

#extract_query(source_spec, destination = nil) ⇒ Object

#identify_by_columns ⇒ Object

#include_with_all? ⇒ Boolean

#indexes ⇒ Object

#limit_clause ⇒ Object

#load!(destination) ⇒ Object

#name ⇒ Object

#output_column_names ⇒ Object

#output_schema ⇒ Object

#postprocess!(destination, options = {}) ⇒ Object

#prefix ⇒ Object

#recreate!(destination) ⇒ Object

#should_fully_reload? ⇒ Boolean

#show ⇒ Object

#staging_name ⇒ Object

#transform! ⇒ Object

.actions ⇒ `Object`

.output_schema ⇒ `Object`

.sources ⇒ `Object`

#data ⇒ `Object`

#errors ⇒ `Object`

.output(schema) ⇒ `Object`

.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ `Object`

.transforms(transformation_name) ⇒ `Object`

.validates(validation_name) ⇒ `Object`

#actions ⇒ `Object`

#autogenerate_identity? ⇒ `Boolean`

#batch_size ⇒ `Object`

#building_name ⇒ `Object`

#check_table_valid! ⇒ `Object`

#create_column_names ⇒ `Object`

#create_schema ⇒ `Object`

#distribution_key ⇒ `Object`

#distribution_style ⇒ `Object`

#etl!(destinations, options = {}) ⇒ `Object`

#extract!(destination = nil, options = {}) ⇒ `Object`

#extract_by_clause(value) ⇒ `Object`

#extract_by_column ⇒ `Object`

#extract_query(source_spec, destination = nil) ⇒ `Object`

#identify_by_columns ⇒ `Object`

#include_with_all? ⇒ `Boolean`

#indexes ⇒ `Object`

#limit_clause ⇒ `Object`

#load!(destination) ⇒ `Object`

#name ⇒ `Object`

#output_column_names ⇒ `Object`

#output_schema ⇒ `Object`

#postprocess!(destination, options = {}) ⇒ `Object`

#prefix ⇒ `Object`

#recreate!(destination) ⇒ `Object`

#should_fully_reload? ⇒ `Boolean`

#show ⇒ `Object`

#staging_name ⇒ `Object`

#transform! ⇒ `Object`