Class: DataDuck::Table
- Inherits:
-
Object
- Object
- DataDuck::Table
- Defined in:
- lib/dataduck/table.rb
Direct Known Subclasses
Class Attribute Summary collapse
-
.actions ⇒ Object
Returns the value of attribute actions.
-
.output_schema ⇒ Object
Returns the value of attribute output_schema.
-
.sources ⇒ Object
Returns the value of attribute sources.
Instance Attribute Summary collapse
-
#data ⇒ Object
Returns the value of attribute data.
-
#errors ⇒ Object
Returns the value of attribute errors.
Class Method Summary collapse
- .output(schema) ⇒ Object
- .source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ Object
- .transforms(transformation_name) ⇒ Object
- .validates(validation_name) ⇒ Object
Instance Method Summary collapse
- #actions ⇒ Object
- #autogenerate_identity? ⇒ Boolean
- #batch_size ⇒ Object
- #building_name ⇒ Object
- #check_table_valid! ⇒ Object
- #create_column_names ⇒ Object
- #create_schema ⇒ Object
- #distribution_key ⇒ Object
- #distribution_style ⇒ Object
- #etl!(destinations, options = {}) ⇒ Object
- #extract!(destination = nil, options = {}) ⇒ Object
- #extract_by_clause(value) ⇒ Object
- #extract_by_column ⇒ Object
- #extract_query(source_spec, destination = nil) ⇒ Object
- #identify_by_columns ⇒ Object
- #include_with_all? ⇒ Boolean
- #indexes ⇒ Object
- #limit_clause ⇒ Object
- #load!(destination) ⇒ Object
- #name ⇒ Object
- #output_column_names ⇒ Object
- #output_schema ⇒ Object
- #postprocess!(destination, options = {}) ⇒ Object
- #prefix ⇒ Object
- #recreate!(destination) ⇒ Object
- #should_fully_reload? ⇒ Boolean
- #show ⇒ Object
- #staging_name ⇒ Object
- #transform! ⇒ Object
Class Attribute Details
.actions ⇒ Object
Returns the value of attribute actions.
8 9 10 |
# File 'lib/dataduck/table.rb', line 8
# Class-level attribute reader.
#
# @return [Object] the current value of @actions (nil until assigned)
def actions
  @actions
end
.output_schema ⇒ Object
Returns the value of attribute output_schema.
7 8 9 |
# File 'lib/dataduck/table.rb', line 7
# Class-level attribute reader.
#
# @return [Object] the current value of @output_schema (nil until assigned)
def output_schema
  @output_schema
end
.sources ⇒ Object
Returns the value of attribute sources.
6 7 8 |
# File 'lib/dataduck/table.rb', line 6
# Class-level attribute reader.
#
# @return [Object] the current value of @sources (nil until assigned)
def sources
  @sources
end
Instance Attribute Details
#data ⇒ Object
Returns the value of attribute data.
11 12 13 |
# File 'lib/dataduck/table.rb', line 11
# Instance attribute reader.
#
# @return [Object] the current value of @data (nil until assigned)
def data
  @data
end
#errors ⇒ Object
Returns the value of attribute errors.
12 13 14 |
# File 'lib/dataduck/table.rb', line 12
# Instance attribute reader.
#
# @return [Object] the current value of @errors (nil until assigned)
def errors
  @errors
end
Class Method Details
.output(schema) ⇒ Object
42 43 44 45 |
# File 'lib/dataduck/table.rb', line 42
# Merges +schema+ into the class-level output schema, creating the schema
# hash on first use.
#
# @param schema [Hash] entries to merge into output_schema
# @return [Hash] the merged output_schema hash
def self.output(schema)
  current_schema = (self.output_schema ||= {})
  current_schema.merge!(schema)
end
.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/dataduck/table.rb', line 26
# Registers one source specification on the class and appends it to +sources+.
#
# The second argument is interpreted in this order:
# * its string form contains "select " (case-insensitive) -> stored as :query
# * +source_columns+ is nil and it responds to #each -> treated as the column
#   list, with the table name derived from the class name
# * otherwise -> treated as the table name, with +source_columns+ as columns
#
# @param source_name [Object] passed to DataDuck::Source.source
# @param source_table_or_query [Object, nil] query text, column list, or table name
# @param source_columns [Object, nil] columns, used when a table name is given
# @return [Array] the class-level sources list
def self.source(source_name, source_table_or_query = nil, source_columns = nil)
  self.sources ||= []

  looks_like_query = source_table_or_query.respond_to?(:to_s) &&
                     source_table_or_query.to_s.downcase.include?('select ')

  source_spec =
    if looks_like_query
      { query: source_table_or_query }
    elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
      { columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name) }
    else
      { columns: source_columns, table_name: source_table_or_query.to_s }
    end

  source_spec[:source] = DataDuck::Source.source(source_name)
  self.sources << source_spec
end
.transforms(transformation_name) ⇒ Object
14 15 16 17 |
# File 'lib/dataduck/table.rb', line 14
# Appends a [:transform, transformation_name] entry to the class-level
# actions list, creating the list on first use.
#
# @param transformation_name [Object] stored as the second element of the entry
# @return [Array] the class-level actions list
def self.transforms(transformation_name)
  self.actions ||= []
  self.actions.push([:transform, transformation_name])
end
.validates(validation_name) ⇒ Object
20 21 22 23 |
# File 'lib/dataduck/table.rb', line 20
# Appends a [:validate, validation_name] entry to the class-level actions
# list, creating the list on first use.
#
# @param validation_name [Object] stored as the second element of the entry
# @return [Array] the class-level actions list
def self.validates(validation_name)
  self.actions ||= []
  self.actions.push([:validate, validation_name])
end
Instance Method Details
#actions ⇒ Object
47 48 49 50 51 52 53 54 55 56 |
# File 'lib/dataduck/table.rb', line 47
# Collects the actions declared on this object's class and on every ancestor
# class that is a strict subclass of Table, most-derived class first.
#
# @return [Array] concatenated action entries (empty when none are declared)
def actions
  lineage = []
  klass = self.class
  while klass < Table
    lineage << klass
    klass = klass.superclass
  end
  # A class that never declared an action has a nil list; treat it as empty.
  lineage.flat_map { |table_class| table_class.actions || [] }
end
#autogenerate_identity? ⇒ Boolean
222 223 224 |
# File 'lib/dataduck/table.rb', line 222
# Whether create_schema should add the dataduck_identity column.
#
# @return [Boolean] always false in this implementation
def autogenerate_identity?
  false
end
#batch_size ⇒ Object
202 203 204 |
# File 'lib/dataduck/table.rb', line 202
# Number of rows per extract batch.
#
# @return [nil] always nil in this implementation (etl! then runs a single pass)
def batch_size
  nil
end
#building_name ⇒ Object
226 227 228 |
# File 'lib/dataduck/table.rb', line 226
# Name of the table being written to: the staging name when the table is
# fully reloaded, otherwise the regular name.
#
# @return [String]
def building_name
  if self.should_fully_reload?
    self.staging_name
  else
    self.name
  end
end
#check_table_valid! ⇒ Object
58 59 60 61 62 63 |
# File 'lib/dataduck/table.rb', line 58
# Validates the batching configuration. Does nothing when batch_size is nil.
#
# @raise [RuntimeError] when batch_size is not greater than 0
# @raise [RuntimeError] when batch_size is set but extract_by_column is nil
# @return [nil]
def check_table_valid!
  return if self.batch_size.nil?

  unless self.batch_size > 0
    raise "Table #{ self.name }'s batch_size must be > 0"
  end

  if self.extract_by_column.nil?
    raise "Table #{ self.name } has batch_size defined but no extract_by_column"
  end
end
#create_column_names ⇒ Object
242 243 244 |
# File 'lib/dataduck/table.rb', line 242
# Column names of create_schema as strings, sorted.
#
# @return [Array<String>]
def create_column_names
  column_names = self.create_schema.keys.map { |column| column.to_s }
  column_names.sort
end
#create_schema ⇒ Object
234 235 236 237 238 239 240 |
# File 'lib/dataduck/table.rb', line 234
# Schema used when creating the table: output_schema, plus a
# dataduck_identity column when autogenerate_identity? is true.
#
# @return [Hash]
def create_schema
  return output_schema unless self.autogenerate_identity?

  identity_column = { dataduck_identity: 'bigint identity(1, 1)' } # Redshift only
  Util.deep_merge(output_schema, identity_column)
end
#distribution_key ⇒ Object
65 66 67 68 69 70 71 |
# File 'lib/dataduck/table.rb', line 65
# Distribution key for the table.
#
# @return [String, nil] "id" when the output has an "id" column, otherwise nil
def distribution_key
  self.output_column_names.include?("id") ? "id" : nil
end
#distribution_style ⇒ Object
73 74 75 |
# File 'lib/dataduck/table.rb', line 73
# Distribution style for the table.
#
# @return [nil] always nil in this implementation
def distribution_style
  nil
end
#etl!(destinations, options = {}) ⇒ Object
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/dataduck/table.rb', line 77
# Runs extract -> transform -> load for this table against one destination.
# When batch_size is set, repeats until an extract returns fewer rows than
# batch_size, capped at 1_000 batches.
#
# @param destinations [Array] must contain exactly one destination
# @param options [Hash] options[:dates] is set to [Date.today] when nil; the
#   hash is passed on to extract! and postprocess!
# @raise [ArgumentError] unless exactly one destination is given
def etl!(destinations, options = {})
  if destinations.length != 1
    raise ArgumentError.new("DataDuck can only etl to one destination at a time for now.")
  end

  # Written into the caller's hash on purpose: the original did the same.
  options[:dates] = [Date.today] if options[:dates].nil?

  self.check_table_valid!

  destination = destinations.first

  destination.drop_staging_table!(self) if self.should_fully_reload?

  data_processed = false
  batch_number = 0
  while batch_number < 1_000
    batch_number += 1
    self.extract!(destination, options)

    # Row count as returned by the source. transform!/compact! may drop rows,
    # so the count after transforming cannot tell whether this batch was full.
    extracted_count = self.data.length

    if extracted_count > 0
      self.transform!
      self.data.compact!
      self.load!(destination) if self.data.length > 0
      data_processed = true
    end

    break if self.batch_size.nil?

    if self.batch_size == extracted_count
      DataDuck::Logs.info "Finished batch #{ batch_number }, continuing with the next batch"
    else
      DataDuck::Logs.info "Finished batch #{ batch_number } (last batch)"
      break
    end
  end

  self.data = []

  if data_processed
    destination.finish_fully_reloading_table!(self) if self.should_fully_reload?
    self.postprocess!(destination, options)
  else
    DataDuck::Logs.info "No data extracted for table #{ self.name }"
  end
end
#extract!(destination = nil, options = {}) ⇒ Object
131 132 133 134 135 136 137 138 139 140 141 142 143 |
# File 'lib/dataduck/table.rb', line 131
# Resets data and fills it with the rows returned by every class-level
# source, in declaration order. Initializes errors to [] when unset.
#
# @param destination [Object, nil] forwarded to extract_query
# @param options [Hash] accepted but not used by this implementation
# @return [Array] the extracted rows (also stored in data)
def extract!(destination = nil, options = {})
  DataDuck::Logs.info "Extracting table #{ self.name }"

  self.errors ||= []
  self.data = []

  self.class.sources.each do |source_spec|
    sql = self.extract_query(source_spec, destination)
    rows = source_spec[:source].query(sql)
    self.data.concat(rows)
  end

  self.data
end
#extract_by_clause(value) ⇒ Object
171 172 173 174 175 176 177 |
# File 'lib/dataduck/table.rb', line 171
# Builds the WHERE clause that restricts an extract to rows at or after
# +value+ in extract_by_column.
#
# @param value [Object, nil] lower bound; interpolated inside single quotes
# @return [String] the clause, or "" when value is nil or false
def extract_by_clause(value)
  return "" unless value

  "WHERE #{ self.extract_by_column } >= '#{ value }'"
end
#extract_by_column ⇒ Object
206 207 208 209 210 |
# File 'lib/dataduck/table.rb', line 206
# Column used for incremental extraction.
#
# @return [String, nil] 'updated_at' when the output has that column, otherwise nil
def extract_by_column
  self.output_column_names.include?("updated_at") ? 'updated_at' : nil
end
#extract_query(source_spec, destination = nil) ⇒ Object
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
# File 'lib/dataduck/table.rb', line 145
# Builds the SQL used to extract rows for one source spec.
#
# The base query is source_spec[:query] when present, otherwise a SELECT of
# the sorted, escaped columns from source_spec[:table_name]. When a
# destination is given, extract_by_column is set and the destination already
# has the building table, the query is restricted to rows at or after the
# destination's current MAX of that column. limit_clause is appended last.
#
# @param source_spec [Hash] reads :source, and :query or :columns + :table_name
# @param destination [Object, nil] queried for table_names and the current MAX;
#   when nil the incremental restriction is skipped
# @return [String] the assembled query with surrounding whitespace stripped
def extract_query(source_spec, destination = nil)
  escape_char = source_spec[:source].escape_char

  base_query =
    if source_spec.has_key?(:query)
      source_spec[:query]
    else
      "SELECT #{ escape_char }#{ source_spec[:columns].sort.join(escape_char + ',' + escape_char) }#{ escape_char } FROM #{ source_spec[:table_name] }"
    end

  extract_part = ""
  limit_part = self.limit_clause

  # destination defaults to nil (extract! forwards its own nil default), so it
  # has to be checked before asking it for table names.
  if self.extract_by_column && destination && destination.table_names.include?(self.building_name)
    extract_by_column_without_table = self.extract_by_column.include?(".") ? self.extract_by_column.split(".").last : self.extract_by_column
    extract_by_value = destination.query("SELECT MAX(#{ extract_by_column_without_table }) AS val FROM #{ self.building_name }").first
    extract_by_value = extract_by_value.nil? ? nil : extract_by_value[:val]

    extract_part = self.extract_by_clause(extract_by_value)
  end

  # Accept any whitespace around WHERE so multi-line queries ("...\nWHERE ...")
  # are recognised and do not end up with a second WHERE.
  tail_after_from = base_query.downcase.split("from").last.to_s
  if tail_after_from.match?(/\swhere\s/)
    # Rewrite only the first keyword and leave the compared value alone.
    extract_part = extract_part.sub('WHERE ', 'AND ')
  end

  [base_query, extract_part, limit_part].join(' ').strip
end
#identify_by_columns ⇒ Object
212 213 214 215 216 |
# File 'lib/dataduck/table.rb', line 212
# Columns that identify a row.
#
# @return [Array<String>] ["id"] when the output has an "id" column, otherwise []
def identify_by_columns
  self.output_column_names.include?("id") ? ["id"] : []
end
#include_with_all? ⇒ Boolean
191 192 193 |
# File 'lib/dataduck/table.rb', line 191
# @return [Boolean] always true in this implementation
def include_with_all?
  true
end
#indexes ⇒ Object
195 196 197 198 199 200 |
# File 'lib/dataduck/table.rb', line 195
# Columns to index: whichever of "id" and "created_at" (in that order) are
# present in the output columns.
#
# @return [Array<String>]
def indexes
  %w[id created_at].select { |column| self.output_column_names.include?(column) }
end
#limit_clause ⇒ Object
179 180 181 182 183 184 185 |
# File 'lib/dataduck/table.rb', line 179
# ORDER BY / LIMIT suffix used for batched extraction.
#
# @return [String] the clause when both extract_by_column and batch_size are
#   set, otherwise ""
def limit_clause
  return "" unless self.extract_by_column && self.batch_size

  "ORDER BY #{ self.extract_by_column } LIMIT #{ self.batch_size }"
end
#load!(destination) ⇒ Object
187 188 189 |
# File 'lib/dataduck/table.rb', line 187
# Hands this table to the destination for loading.
#
# @param destination [Object] must respond to load_table!
# @return [Object] whatever destination.load_table! returns
def load!(destination)
  table = self
  destination.load_table!(table)
end
#name ⇒ Object
296 297 298 299 300 301 302 303 |
# File 'lib/dataduck/table.rb', line 296
# Table name derived from the class name: the class name is converted with
# DataDuck::Util.camelcase_to_underscore, a leading "data_duck/" namespace is
# dropped (keeping only the last path segment), and prefix is prepended.
#
# @return [String]
def name
  underscored = DataDuck::Util.camelcase_to_underscore(self.class.name)
  underscored = underscored.split("/").last if underscored.start_with?("data_duck/")

  self.prefix + underscored
end
#output_column_names ⇒ Object
250 251 252 |
# File 'lib/dataduck/table.rb', line 250
# Column names of output_schema as strings, sorted.
#
# @return [Array<String>]
def output_column_names
  column_names = self.output_schema.keys.map { |column| column.to_s }
  column_names.sort
end
#output_schema ⇒ Object
246 247 248 |
# File 'lib/dataduck/table.rb', line 246
# Output schema for this table: the first non-nil class-level output_schema
# found walking up from this object's class through its superclasses.
#
# The whole ancestor chain is searched (as #actions does), so a table class
# more than one level below the class that declared the schema still sees
# it. The walk stops at the first ancestor that has no output_schema reader.
#
# @return [Hash] the schema, or {} when no class in the chain declares one
def output_schema
  klass = self.class
  while klass.respond_to?(:output_schema)
    schema = klass.output_schema
    return schema if schema

    klass = klass.superclass
  end
  {}
end
#postprocess!(destination, options = {}) ⇒ Object
254 255 256 |
# File 'lib/dataduck/table.rb', line 254
# Hands this table to the destination for post-processing.
#
# @param destination [Object] must respond to postprocess!
# @param options [Hash] accepted but not used by this implementation
# @return [Object] whatever destination.postprocess! returns
def postprocess!(destination, options = {})
  table = self
  destination.postprocess!(table)
end
#prefix ⇒ Object
305 306 307 |
# File 'lib/dataduck/table.rb', line 305
# String prepended to the derived table name by #name.
#
# @return [String] always "" in this implementation
def prefix
  ""
end
#recreate!(destination) ⇒ Object
258 259 260 |
# File 'lib/dataduck/table.rb', line 258
# Asks the destination to recreate this table.
#
# @param destination [Object] must respond to recreate_table!
# @return [Object] whatever destination.recreate_table! returns
def recreate!(destination)
  table = self
  destination.recreate_table!(table)
end
#should_fully_reload? ⇒ Boolean
218 219 220 |
# File 'lib/dataduck/table.rb', line 218
# Whether the table is rebuilt from scratch through a staging table.
#
# @return [Boolean] always false in this implementation
def should_fully_reload?
  false
end
#show ⇒ Object
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 |
# File 'lib/dataduck/table.rb', line 262
# Prints a human-readable summary of the table to standard output: its name,
# each source (table name or query, source name and columns), and the output
# schema with datatypes aligned in a column.
#
# @return [Hash] the output schema (the result of the final each_pair)
def show
  puts "Table #{ self.name }"

  # sources is nil until the class declares a source.
  (self.class.sources || []).each do |source_spec|
    puts "\nSources from #{ source_spec[:table_name] || source_spec[:query] } on #{ source_spec[:source].name }"
    # Query sources have no :columns key, and table sources may be declared
    # without columns; Array() turns nil into an empty list.
    Array(source_spec[:columns]).each do |col_name|
      puts " #{ col_name }"
    end
  end

  puts "\nOutputs "
  num_separators = self.output_schema.keys.map { |key| key.length }.max
  self.output_schema.each_pair do |name, datatype|
    puts " #{ name }#{ ' ' * (num_separators + 2 - name.length) }#{ datatype }"
  end
end
#staging_name ⇒ Object
230 231 232 |
# File 'lib/dataduck/table.rb', line 230
# Name of the staging table: the table name prefixed with "zz_dataduck_".
#
# @return [String]
def staging_name
  table_name = self.name
  "zz_dataduck_#{ table_name }"
end
#transform! ⇒ Object
278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 |
# File 'lib/dataduck/table.rb', line 278
# Applies every declared action to data, in order.
#
# * :transform entries replace each row in place with the result of calling
#   the named public method with that row.
# * :validate entries call the named public method with each row and append
#   any non-blank result to errors.
#
# Initializes errors to [] when unset.
#
# @return [Array] the actions list that was iterated
def transform!
  DataDuck::Logs.info "Transforming table #{ self.name }"

  self.errors ||= []
  self.actions.each do |action_type, action_method_name|
    case action_type
    when :transform
      self.data.map! { |row| self.public_send(action_method_name, row) }
    when :validate
      self.data.each do |row|
        error = self.public_send(action_method_name, row)
        self.errors << error unless error.blank?
      end
    end
  end
end