Class: DataDuck::RedshiftDestination

Inherits:

Destination

Object
Database
Destination
DataDuck::RedshiftDestination

show all

Defined in:: lib/dataduck/redshift_destination.rb

Instance Attribute Summary collapse

#aws_key ⇒ Object

Returns the value of attribute aws_key.
#aws_secret ⇒ Object

Returns the value of attribute aws_secret.
#database ⇒ Object

Returns the value of attribute database.
#host ⇒ Object

Returns the value of attribute host.
#password ⇒ Object

Returns the value of attribute password.
#port ⇒ Object

Returns the value of attribute port.
#s3_bucket ⇒ Object

Returns the value of attribute s3_bucket.
#s3_region ⇒ Object

Returns the value of attribute s3_region.
#schema ⇒ Object

Returns the value of attribute schema.
#username ⇒ Object

Returns the value of attribute username.

Attributes inherited from Database

#name

Class Method Summary collapse

.value_to_string(value) ⇒ Object

Instance Method Summary collapse

Methods inherited from Destination

destination, destination_config, load_config!, only_destination

Constructor Details

#initialize(name, config) ⇒ `RedshiftDestination`

Returns a new instance of RedshiftDestination.

# File 'lib/dataduck/redshift_destination.rb', line 18

def initialize(name, config)
  load_value('aws_key', name, config)
  load_value('aws_secret', name, config)
  load_value('s3_bucket', name, config)
  load_value('s3_region', name, config)
  load_value('host', name, config)
  load_value('port', name, config)
  load_value('database', name, config)
  load_value('schema', name, config)
  load_value('username', name, config)
  load_value('password', name, config)

  @redshift_connection = nil

  super
end

Instance Attribute Details

#aws_key ⇒ `Object`

Returns the value of attribute aws_key.



7
8
9

# File 'lib/dataduck/redshift_destination.rb', line 7

def aws_key
  @aws_key
end

#aws_secret ⇒ `Object`

Returns the value of attribute aws_secret.



8
9
10

# File 'lib/dataduck/redshift_destination.rb', line 8

def aws_secret
  @aws_secret
end

#database ⇒ `Object`

Returns the value of attribute database.



13
14
15

# File 'lib/dataduck/redshift_destination.rb', line 13

def database
  @database
end

#host ⇒ `Object`

Returns the value of attribute host.



11
12
13

# File 'lib/dataduck/redshift_destination.rb', line 11

def host
  @host
end

#password ⇒ `Object`

Returns the value of attribute password.



16
17
18

# File 'lib/dataduck/redshift_destination.rb', line 16

def password
  @password
end

#port ⇒ `Object`

Returns the value of attribute port.



12
13
14

# File 'lib/dataduck/redshift_destination.rb', line 12

def port
  @port
end

#s3_bucket ⇒ `Object`

Returns the value of attribute s3_bucket.



9
10
11

# File 'lib/dataduck/redshift_destination.rb', line 9

def s3_bucket
  @s3_bucket
end

#s3_region ⇒ `Object`

Returns the value of attribute s3_region.



10
11
12

# File 'lib/dataduck/redshift_destination.rb', line 10

def s3_region
  @s3_region
end

#schema ⇒ `Object`

Returns the value of attribute schema.



14
15
16

# File 'lib/dataduck/redshift_destination.rb', line 14

def schema
  @schema
end

#username ⇒ `Object`

Returns the value of attribute username.



15
16
17

# File 'lib/dataduck/redshift_destination.rb', line 15

def username
  @username
end

Class Method Details

.value_to_string(value) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 288

def self.value_to_string(value)
  string_value = ''

  if value.respond_to?(:strftime)
    from_value = value.respond_to?(:utc) ? value.utc : value
    string_value =  from_value.strftime('%Y-%m-%d %H:%M:%S')
  elsif value.respond_to?(:to_s)
    string_value = value.to_s
  end

  string_value.gsub!('"', '""')

  return string_value
end

Instance Method Details

#connection ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 35

def connection
  @redshift_connection ||= Sequel.connect("redshift://#{ self.username }:#{ self.password }@#{ self.host }:#{ self.port }/#{ self.database }" +
          "?force_standard_strings=f",
      :client_min_messages => '',
      :force_standard_strings => false
  )
end

#copy_query(table, s3_path) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 43

def copy_query(table, s3_path)
  properties_joined_string = "\"#{ table.output_column_names.join('","') }\""
  query_fragments = []
  query_fragments << "COPY #{ table.staging_name } (#{ properties_joined_string })"
  query_fragments << "FROM '#{ s3_path }'"
  query_fragments << "CREDENTIALS 'aws_access_key_id=#{ self.aws_key };aws_secret_access_key=#{ self.aws_secret }'"
  query_fragments << "REGION '#{ self.s3_region }'"
  query_fragments << "CSV IGNOREHEADER 1 TRUNCATECOLUMNS ACCEPTINVCHARS EMPTYASNULL"
  query_fragments << "DATEFORMAT 'auto'"
  return query_fragments.join(" ")
end

#create_columns_on_data_warehouse!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 55

def create_columns_on_data_warehouse!(table)
  columns = get_columns_in_data_warehouse(table.building_name)
  column_names = columns.map { |col| col[:name].to_s }
  table.create_schema.map do |name, data_type|
    if !column_names.include?(name.to_s)
      redshift_data_type = self.type_to_redshift_type(data_type)
      self.query("ALTER TABLE #{ table.building_name } ADD #{ name } #{ redshift_data_type }")
    end
  end
end

#create_output_table_with_name!(table, name) ⇒ `Object`



91
92
93

# File 'lib/dataduck/redshift_destination.rb', line 91

def create_output_table_with_name!(table, name)
  self.query(self.create_table_query(table, name))
end

#create_output_tables!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 81

def create_output_tables!(table)
  self.create_output_table_with_name!(table, table.building_name)
  self.create_columns_on_data_warehouse!(table)

  if table.building_name != table.staging_name
    self.drop_staging_table!(table)
    self.create_output_table_with_name!(table, table.staging_name)
  end
end

#create_table_query(table, table_name = nil) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 66

def create_table_query(table, table_name = nil)
  table_name ||= table.name
  props_array = table.create_schema.map do |name, data_type|
    redshift_data_type = self.type_to_redshift_type(data_type)
    "\"#{ name }\" #{ redshift_data_type }"
  end
  props_string = props_array.join(', ')

  distribution_clause = table.distribution_key ? "DISTKEY(#{ table.distribution_key })" : ""
  distribution_style_clause = table.distribution_style ? "DISTSTYLE #{ distribution_style }" : ""
  index_clause = table.indexes.length > 0 ? "INTERLEAVED SORTKEY (#{ table.indexes.join(',') })" : ""

  "CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string }) #{ distribution_clause } #{ distribution_style_clause } #{ index_clause }"
end

#data_as_csv_string(data, property_names) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 95

def data_as_csv_string(data, property_names)
  data_string_components = [] # join strings this way for now, could be optimized later

  data_string_components << property_names.join(',') # header column
  data_string_components << "\n"

  data.each do |result|
    property_names.each_with_index do |property_name, index|
      value = result[property_name.to_sym]
      if value.nil?
        value = result[property_name.to_s]
      end

      if index == 0
        data_string_components << '"'
      end

      data_string_components << DataDuck::RedshiftDestination.value_to_string(value)

      if index == property_names.length - 1
        data_string_components << '"'
      else
        data_string_components << '","'
      end
    end
    data_string_components << "\n"
  end

  return data_string_components.join
end

#dbconsole(options = {}) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 140

def dbconsole(options = {})
  args = []
  args << "--host=#{ self.host }"
  args << "--username=#{ self.username }"
  args << "--dbname=#{ self.database }"
  args << "--port=#{ self.port }"

  ENV['PGPASSWORD'] = self.password

  self.find_command_and_execute("psql", *args)
end

#delete_before_inserting!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 183

def delete_before_inserting!(table)
  staging_name = table.staging_name
  building_name = table.building_name

  where_equals_parts = []
  table.identify_by_columns.each do |attribute|
    where_equals_parts << "#{ building_name }.#{ attribute } = #{ staging_name }.#{ attribute }"
  end

  delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ where_equals_parts.join(' AND ') }"
  self.query(delete_query)
end

#drop_staging_table!(table) ⇒ `Object`



152
153
154

# File 'lib/dataduck/redshift_destination.rb', line 152

def drop_staging_table!(table)
  self.query("DROP TABLE IF EXISTS #{ table.staging_name }")
end

#finish_fully_reloading_table!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 241

def finish_fully_reloading_table!(table)
  self.query("DROP TABLE IF EXISTS zz_dataduck_old_#{ table.name }")

  table_already_exists = self.table_names.include?(table.name)
  if table_already_exists
    self.query("ALTER TABLE #{ table.name } RENAME TO zz_dataduck_old_#{ table.name }")
  end

  self.query("ALTER TABLE #{ table.staging_name } RENAME TO #{ table.name }")
  self.query("DROP TABLE IF EXISTS zz_dataduck_old_#{ table.name }")
end

#get_columns_in_data_warehouse(table_name) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 156

def get_columns_in_data_warehouse(table_name)
  cols_query = "SELECT pg_table_def.column AS name, type AS data_type, distkey, sortkey FROM pg_table_def WHERE tablename='#{ table_name }'"
  results = self.query(cols_query)

  columns = []
  results.each do |result|
    columns << {
        name: result[:name],
        data_type: result[:data_type],
        distkey: result[:distkey],
        sortkey: result[:sortkey],
    }
  end

  return columns
end

#insert_from_staging!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 196

def insert_from_staging!(table)
  staging_name = table.staging_name
  building_name = table.building_name
  insert_query = "INSERT INTO #{ building_name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
  self.query(insert_query)
end

#load_table!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 253

def load_table!(table)
  DataDuck::Logs.info "Loading table #{ table.name }..."
  s3_object = self.upload_table_to_s3!(table)
  self.create_output_tables!(table)
  query_to_run = self.copy_query(table, s3_object.s3_path)
  self.query(query_to_run)
  s3_object.delete!

  if table.staging_name != table.building_name
    self.merge_from_staging!(table)
    self.drop_staging_table!(table)
  end
end

#merge_from_staging!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 173

def merge_from_staging!(table)
  if table.staging_name == table.building_name
    return
  end

  # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
  self.delete_before_inserting!(table)
  self.insert_from_staging!(table)
end

#postprocess!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 282

def postprocess!(table)
  DataDuck::Logs.info "Vacuuming table #{ table.name }"
  vacuum_type = table.indexes.length == 0 ? "FULL" : "REINDEX"
  self.query("VACUUM #{ vacuum_type } #{ table.name }")
end

#query(sql) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 203

def query(sql)
  Logs.debug("SQL executing on #{ self.name }:\n  " + sql)
  begin
    self.connection[sql].map { |elem| elem }
  rescue => err
    if err.to_s.include?("Check 'stl_load_errors' system table for details")
      self.raise_stl_load_error!
    else
      raise err
    end
  end
end

#raise_stl_load_error! ⇒ `Object`

Raises:

(RedshiftLoadError)

# File 'lib/dataduck/redshift_destination.rb', line 216

def raise_stl_load_error!
  load_error_sql = "SELECT filename, line_number, colname, position, err_code, err_reason FROM stl_load_errors ORDER BY starttime DESC LIMIT 1"
  load_error_details = self.connection[load_error_sql].map { |elem| elem }.first

  raise RedshiftLoadError.new("Error loading Redshift, '#{ load_error_details[:err_reason].strip }' " +
      "(code #{ load_error_details[:err_code] }) with file #{ load_error_details[:filename].strip } " +
      "for column '#{ load_error_details[:colname].strip }'. The error occurred at line #{ load_error_details[:line_number] }, position #{ load_error_details[:position] }.")
end

#recreate_table!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 267

def recreate_table!(table)
  DataDuck::Logs.info "Recreating table #{ table.name }..."

  if !self.table_names.include?(table.name)
    raise "Table #{ table.name } doesn't exist on the Redshift database, so it can't be recreated. Did you want to use `dataduck create #{ table.name }` instead?"
  end

  recreating_temp_name = "zz_dataduck_recreating_#{ table.name }"
  self.create_output_table_with_name!(table, recreating_temp_name)
  self.query("INSERT INTO #{ recreating_temp_name } (\"#{ table.create_column_names.join('","') }\") SELECT \"#{ table.create_column_names.join('","') }\" FROM #{ table.name }")
  self.query("ALTER TABLE #{ table.name } RENAME TO zz_dataduck_recreating_old_#{ table.name }")
  self.query("ALTER TABLE #{ recreating_temp_name } RENAME TO #{ table.name }")
  self.query("DROP TABLE zz_dataduck_recreating_old_#{ table.name }")
end

#table_names ⇒ `Object`



225
226
227

# File 'lib/dataduck/redshift_destination.rb', line 225

def table_names
  self.query("SELECT DISTINCT(tablename) AS name FROM pg_table_def WHERE schemaname='public' ORDER BY name").map { |item| item[:name] }
end

#type_to_redshift_type(which_type) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 126

def type_to_redshift_type(which_type)
  which_type = which_type.to_s

  if ["string", "text", "bigtext"].include?(which_type)
    {
        "string" => "varchar(255)",
        "text" => "varchar(8191)",
        "bigtext" => "varchar(65535)", # Redshift maximum
    }[which_type]
  else
    which_type
  end
end

#upload_table_to_s3!(table) ⇒ `Object`

# File 'lib/dataduck/redshift_destination.rb', line 229

def upload_table_to_s3!(table)
  now_epoch = Time.now.to_i.to_s
  filepath = "pending/#{ table.name.downcase }_#{ now_epoch }.csv"

  table_csv = self.data_as_csv_string(table.data, table.output_column_names)

  s3_obj = S3Object.new(filepath, table_csv, self.aws_key, self.aws_secret,
      self.s3_bucket, self.s3_region)
  s3_obj.upload!
  return s3_obj
end

Class: DataDuck::RedshiftDestination

Instance Attribute Summary collapse

Attributes inherited from Database

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Destination

Constructor Details

#initialize(name, config) ⇒ RedshiftDestination

Instance Attribute Details

#aws_key ⇒ Object

#aws_secret ⇒ Object

#database ⇒ Object

#host ⇒ Object

#password ⇒ Object

#port ⇒ Object

#s3_bucket ⇒ Object

#s3_region ⇒ Object

#schema ⇒ Object

#username ⇒ Object

Class Method Details

.value_to_string(value) ⇒ Object

Instance Method Details

#connection ⇒ Object

#copy_query(table, s3_path) ⇒ Object

#create_columns_on_data_warehouse!(table) ⇒ Object

#create_output_table_with_name!(table, name) ⇒ Object

#create_output_tables!(table) ⇒ Object

#create_table_query(table, table_name = nil) ⇒ Object

#data_as_csv_string(data, property_names) ⇒ Object

#dbconsole(options = {}) ⇒ Object

#delete_before_inserting!(table) ⇒ Object

#drop_staging_table!(table) ⇒ Object

#finish_fully_reloading_table!(table) ⇒ Object

#get_columns_in_data_warehouse(table_name) ⇒ Object

#insert_from_staging!(table) ⇒ Object

#load_table!(table) ⇒ Object

#merge_from_staging!(table) ⇒ Object

#postprocess!(table) ⇒ Object

#query(sql) ⇒ Object

#raise_stl_load_error! ⇒ Object

#recreate_table!(table) ⇒ Object

#table_names ⇒ Object

#type_to_redshift_type(which_type) ⇒ Object

#upload_table_to_s3!(table) ⇒ Object