Class: UrlProcessor::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/url_processor/base.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(c) ⇒ Base

Returns a new instance of Base.

Raises:

  • (ArgumentError)


5
6
7
8
9
10
11
# File 'lib/url_processor/base.rb', line 5

# Builds a processor around the given configuration.
#
# @param c [UrlProcessor::Config] configuration used by every other method
# @raise [ArgumentError] when +c+ is not a UrlProcessor::Config
def initialize(c)
  unless c.is_a? UrlProcessor::Config
    # Name the expected and the actual type so a misconfiguration is easy to diagnose.
    raise ArgumentError, "expected a UrlProcessor::Config, got #{c.class}"
  end
  @config = c

  # connect to the db
  #OnlinesearchesModels::connect
end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



3
4
5
# File 'lib/url_processor/base.rb', line 3

# Reader for the configuration object stored by #initialize.
#
# @return [UrlProcessor::Config] the configuration passed to the constructor
def config
  @config
end

Instance Method Details

#find_in_batches(collection, batch_size) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/url_processor/base.rb', line 48

# Yields the members of +collection+ to the given block in groups.
#
# If the collection responds to +find_in_batches+ itself, batching is delegated
# to it (called with +batch_size:+). Progress is logged before each group, and
# when +config.debug+ is set iteration stops as soon as at least
# +config.batch_size+ records have been yielded.
#
# Otherwise the collection is walked with +each+ and grouped manually; a final,
# shorter group is yielded when elements are left over.
#
# @param collection [#find_in_batches, #each] source of records
# @param batch_size [Integer] number of records per group
# @yieldparam group [Array] one group of records (array-like in the delegating branch)
def find_in_batches(collection, batch_size)
  if collection.respond_to? :find_in_batches
    # Count what this method has handed out. The counter kept by #run is a
    # local variable of that method and is not visible from here.
    yielded = 0

    collection.find_in_batches(batch_size: batch_size) do |group|
      # Output progress information
      config.logger.info "PROCESSED: #{yielded}, NEXT GROUP SIZE: #{group.size}".yellow

      yield group
      yielded += group.size

      # for debugging purposes we do not want to process everything
      if config.debug && yielded >= config.batch_size
        config.logger.debug "FINISHED first batch (#{config.batch_size} records), exiting".yellow
        return
      end
    end
  else
    elements = []
    collection.each do |element|
      elements << element
      if elements.size % batch_size == 0
        yield elements
        # Start a fresh array instead of clearing the one just yielded, so a
        # caller that keeps a reference to the group still sees its contents.
        elements = []
      end
    end
    # done iterating, yield whatever else we have left, if we have stuff left
    yield elements unless elements.empty?
  end
end


13
14
15
# File 'lib/url_processor/base.rb', line 13

# Factory hook for broken-link records; this base implementation always raises.
#
# @param params [Hash] attributes for the record (unused here)
# @raise [NotImplementedError] always
def new_broken_link(params={})
  raise NotImplementedError, "new_broken_link not implemented"
end


44
45
46
# File 'lib/url_processor/base.rb', line 44

# Factory hook for link requests; this base implementation always raises.
#
# @param url [Object] target of the request (unused here)
# @param params [Hash] request options (unused here)
# @raise [NotImplementedError] always
def new_link_request(url, params={})
  # The message names this method; it previously referred to "link_request".
  raise NotImplementedError, "new_link_request is not implemented"
end


36
37
38
# File 'lib/url_processor/base.rb', line 36

# Hook invoked by #run for each link before its URLs are examined.
# The base implementation has no effect and returns nil.
#
# @param link [Object] the link about to be processed (ignored here)
# @return [nil]
def pre_process_link(link)
  nil # intentionally a no-op
end

#process_response(response) ⇒ Object



40
41
42
# File 'lib/url_processor/base.rb', line 40

# Response-handling hook; this base implementation always raises.
#
# @param response [Object] the completed response (unused here)
# @raise [NotImplementedError] always
def process_response(response)
  # Message previously misspelled the method name as "process_reponse".
  raise NotImplementedError, "process_response is not implemented"
end


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/url_processor/base.rb', line 17

# Records a broken link for the link with the given id.
#
# The link is looked up through +config.get_link_by_id+, a record is built with
# #new_broken_link and saved. A missing link is logged as a warning rather than
# raised.
#
# @param link_id [Object] identifier handed to +config.get_link_by_id+
# @param params [Hash] optional details
# @option params [Object] :link_data data stored on the broken-link record
# @option params [Object] :response_code code stored on the broken-link record
def report_broken_link(link_id, params={})
  link_data = params[:link_data]
  response_code = params[:response_code]
  begin
    link = config.get_link_by_id.call(link_id)
    broken_link = new_broken_link(
      :link_id => link.id,
      :fips_code => link.fips_code,
      :link_data => link_data,
      :response_code => response_code,
      :reported_by => 'QC Report'
    )
    # Only claim the record was created when save reports success.
    # NOTE(review): assumes save returns a falsy value on failure
    # (ActiveRecord-style) — confirm for the concrete record class.
    if broken_link.save
      config.logger.debug "broken link created (#{broken_link.id}): #{broken_link.serializable_hash}".red
    else
      config.logger.warn "broken link could not be saved for link #{link.id}: #{broken_link.serializable_hash}".red
    end
  rescue ActiveRecord::RecordNotFound => e
    config.logger.warn "#{e}".red
  end
end

#run ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/url_processor/base.rb', line 79

# Checks every URL of every link supplied by the configuration.
#
# Links are obtained from +config.links.call+ and walked in groups of
# +config.batch_size+ through #find_in_batches. For each link the
# #pre_process_link hook runs first. A link without URLs is optionally
# reported as broken; otherwise one HEAD request per URL is built through
# +config.new_link_request+ and queued on a Typhoeus::Hydra, which is run once
# per group.
#
# Completed responses are either re-queued (timeout / DNS failure while the
# request still allows a retry, or an empty reply to a non-GET request, which
# is retried as GET) or handed to +config.process_response+.
#
# NOTE(review): requests are created with ssl_verifypeer: false — confirm that
# skipping certificate verification is intended.
# NOTE(review): processed_links is only ever incremented in this method; it is
# a local variable, so nothing outside #run can read it.
def run
  processed_links = 0
  retryable_codes = [:operation_timedout, :couldnt_resolve_host]
  hydra = Typhoeus::Hydra.new(
    max_concurrency: config.max_concurrency,
    max_total_connections: config.max_total_connections
  )

  find_in_batches(config.links.call, config.batch_size) do |group|
    group.each do |link|
      # any custom pre-processing
      pre_process_link(link)

      # A link that has no urls associated with it cannot be requested at all.
      if link.urls.empty?
        if config.report_records_without_urls
          report_broken_link(link.id, :response_code => :has_no_urls)
        end
        next
      end

      # A record may carry several urls; each one gets its own request.
      link.urls.each do |url|
        config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow

        link_request = config.new_link_request.call(
          url[:url],
          followlocation: true,
          method: :head,
          ssl_verifypeer: false,
          ssl_verifyhost: 2,
          cookiefile: config.cookies_file,
          cookiejar: config.cookies_file,
          link_id: link.id,
          link_data: url[:link_data],
          timeout: config.max_timeout,
          connecttimeout: config.max_timeout,
          max_retries: config.max_retries,
          forbid_reuse: 1,
          nosignal: 1
        )

        link_request.on_complete do |response|
          processed_links += 1
          code = response.return_code

          if retryable_codes.include?(code) && response.request.retry_request?
            config.logger.info "#{code} - #{response.effective_url} timed out, retrying".yellow
            hydra.queue response.request
          elsif code == :got_nothing && response.request.options[:method] != :get
            config.logger.info "#{code} - #{response.effective_url} empty response, attempting GET request instead".yellow

            # HEAD may fail in some cases, so fall back to GET and try again
            response.request.options[:method] = :get
            hydra.queue response.request
          else
            config.process_response.call response
          end
        end

        hydra.queue link_request
      end
    end

    # Execute everything queued for this group before moving to the next one.
    hydra.run
  end
end