Class: UrlProcessor::Base
- Inherits: Object
- Hierarchy: Object → UrlProcessor::Base
- Defined in:
- lib/url_processor/base.rb
Instance Attribute Summary collapse
-
#config ⇒ Object
readonly
Returns the value of attribute config.
Instance Method Summary collapse
- #find_in_batches(collection, batch_size) ⇒ Object
-
#initialize(c) ⇒ Base
constructor
A new instance of Base.
- #new_broken_link(params = {}) ⇒ Object
- #new_link_request(url, params = {}) ⇒ Object
- #pre_process_link(link) ⇒ Object
- #process_response(response) ⇒ Object
- #report_broken_link(link_id, params = {}) ⇒ Object
- #run ⇒ Object
Constructor Details
#initialize(c) ⇒ Base
Returns a new instance of Base.
[source lines 5–11]
# File 'lib/url_processor/base.rb', line 5

# Builds a processor around a validated configuration object.
#
# @param c [UrlProcessor::Config] the configuration driving this processor
# @raise [ArgumentError] if +c+ is not a UrlProcessor::Config
def initialize(c)
  # Fail fast with an explanatory message instead of a bare ArgumentError.
  raise ArgumentError, "expected UrlProcessor::Config, got #{c.class}" unless c.is_a? UrlProcessor::Config
  @config = c
  # connect to the db
  #OnlinesearchesModels::connect
end
Instance Attribute Details
#config ⇒ Object (readonly)
Returns the value of attribute config.
[source lines 3–5]
# File 'lib/url_processor/base.rb', line 3 def config @config end |
Instance Method Details
#find_in_batches(collection, batch_size) ⇒ Object
[source lines 48–77]
# File 'lib/url_processor/base.rb', line 48

# Yields +collection+ to the block in groups of at most +batch_size+.
#
# When the collection supports #find_in_batches (e.g. an ActiveRecord
# relation) that implementation is delegated to; otherwise the collection
# is iterated manually and buffered into batches.
#
# @param collection [Object] an enumerable, or anything responding to #find_in_batches
# @param batch_size [Integer] maximum number of elements per yielded group
def find_in_batches(collection, batch_size)
  if collection.respond_to? :find_in_batches
    collection.find_in_batches(batch_size: batch_size) do |group|
      # Output progress information
      # NOTE(review): +processed_links+ is not defined in this scope in the
      # visible source (it is a local variable of #run) — confirm a method or
      # accessor with this name exists, otherwise this branch raises NameError.
      config.logger.info "PROCESSED: #{processed_links}, NEXT GROUP SIZE: #{group.size}".yellow
      yield group
      # for debugging purposes we do not want to process everything
      if config.debug && processed_links >= config.batch_size
        # FIX: interpolate the +batch_size+ argument; the original used
        # @batch_size, an instance variable that is never assigned (nil).
        config.logger.debug "FINISHED first batch (#{batch_size} records), exiting".yellow
        return
      end
    end
  else
    elements = []
    collection.each do |element|
      elements << element
      if elements.size % batch_size == 0
        yield elements
        # FIX: start a fresh buffer. The original `elements = elements.clear`
        # emptied the very array that was just yielded, mutating any group
        # the caller retained a reference to.
        elements = []
      end
    end
    # done iterating, yield whatever else we have left, if we have stuff left
    yield elements unless elements.empty?
  end
end
#new_broken_link(params = {}) ⇒ Object
[source lines 13–15]
# File 'lib/url_processor/base.rb', line 13

# Factory hook for broken-link records; subclasses must supply a concrete
# implementation that builds a persistable broken-link object.
#
# @param params [Hash] attributes for the broken-link record
# @raise [NotImplementedError] always, in the base class
def new_broken_link(params = {})
  raise NotImplementedError, "new_broken_link not implemented"
end
#new_link_request(url, params = {}) ⇒ Object
[source lines 44–46]
# File 'lib/url_processor/base.rb', line 44

# Factory hook for building an HTTP request for +url+; subclasses must
# supply a concrete implementation.
#
# @param url [String] the URL to request
# @param params [Hash] request options
# @raise [NotImplementedError] always, in the base class
def new_link_request(url, params = {})
  # FIX: the original message said "link_request", which does not match the
  # method name and misleads anyone debugging a missing override.
  raise NotImplementedError, "new_link_request is not implemented"
end
#pre_process_link(link) ⇒ Object
[source lines 36–38]
# File 'lib/url_processor/base.rb', line 36

# Hook invoked for each link before its URLs are requested.
# The base implementation is a deliberate no-op; subclasses may override it
# to perform custom per-link preparation.
#
# @param link [Object] the record about to be processed
# @return [nil]
def pre_process_link(link)
  # intentionally left blank — override in subclasses
  nil
end
#process_response(response) ⇒ Object
[source lines 40–42]
# File 'lib/url_processor/base.rb', line 40

# Hook invoked with each completed HTTP response; subclasses must supply a
# concrete implementation.
#
# @param response [Object] the completed response to handle
# @raise [NotImplementedError] always, in the base class
def process_response(response)
  # FIX: corrected the "process_reponse" typo in the original error message.
  raise NotImplementedError, "process_response is not implemented"
end
#report_broken_link(link_id, params = {}) ⇒ Object
[source lines 17–34]
# File 'lib/url_processor/base.rb', line 17 def report_broken_link(link_id, params={}) link_data = params[:link_data] response_code = params[:response_code] begin link = config.get_link_by_id.call(link_id) broken_link = new_broken_link( :link_id => link.id, :fips_code => link.fips_code, :link_data => link_data, :response_code => response_code, :reported_by => 'QC Report' ) broken_link.save config.logger.debug "broken link created (#{broken_link.id}): #{broken_link.serializable_hash}".red rescue ActiveRecord::RecordNotFound => e config.logger.warn "#{e}".red end end |
#run ⇒ Object
[source lines 79–138]
# File 'lib/url_processor/base.rb', line 79

# Main entry point: iterates all configured links in batches, issues a HEAD
# request for each URL via a Typhoeus::Hydra, retries timeouts/DNS failures,
# falls back to GET on empty responses, and hands every final response to
# config.process_response.
def run
  processed_links = 0

  hydra = Typhoeus::Hydra.new(max_concurrency: config.max_concurrency,
                              max_total_connections: config.max_total_connections)

  find_in_batches(config.links.call, config.batch_size) do |group|
    group.each do |link|
      # any custom pre-processing
      pre_process_link(link)

      if link.urls.empty?
        # In the event that we have a link that actually has no urls associated with it
        report_broken_link link.id, :response_code => :has_no_urls if config.report_records_without_urls
      else
        # Each record has 2 urls associated with it, process each separately
        link.urls.each do |url|
          config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow

          link_request = config.new_link_request.call(
            url[:url],
            followlocation: true,
            method: :head,
            ssl_verifypeer: false,
            ssl_verifyhost: 2,
            # NOTE(review): the attribute names after "config." were lost in
            # the extracted source; +cookie_file+/+cookie_jar+ are a
            # reconstruction — verify against UrlProcessor::Config.
            cookiefile: config.cookie_file,
            cookiejar: config.cookie_jar,
            link_id: link.id,
            link_data: url[:link_data],
            timeout: config.max_timeout,
            connecttimeout: config.max_timeout,
            max_retries: config.max_retries,
            forbid_reuse: 1,
            nosignal: 1
          )

          link_request.on_complete do |response|
            processed_links += 1
            if ([:operation_timedout, :couldnt_resolve_host].include? response.return_code) && response.request.retry_request?
              config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
              hydra.queue response.request
            # FIX(reconstruction): the extracted source read "request.[:method]";
            # Typhoeus exposes request options via Request#options.
            elsif response.return_code == :got_nothing && response.request.options[:method] != :get
              config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow
              # set to GET request since HEAD may fail in some cases
              response.request.options[:method] = :get
              hydra.queue response.request
            else
              config.process_response.call response
            end
          end

          hydra.queue link_request
        end
      end
    end
    hydra.run
  end
end