Class: ScraperUtils::MechanizeUtils::AgentConfig

Inherits:
Object
  • Object
show all
Defined in:
lib/scraper_utils/mechanize_utils/agent_config.rb

Overview

Configuration for a Mechanize agent with sensible defaults and configurable settings. Supports global configuration through AgentConfig.configure and per-instance overrides.

Examples:

Setting global defaults

# Change the class-level defaults; AgentConfig.new reads them for any argument left as nil.
ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
  config.default_timeout = 90
  config.default_random_delay = 5
end

Creating an instance with defaults

# No arguments given, so every setting falls back to its class-level default.
config = ScraperUtils::MechanizeUtils::AgentConfig.new

Overriding specific settings

# Only timeout and random_delay are overridden; the remaining settings use the class-level defaults.
config = ScraperUtils::MechanizeUtils::AgentConfig.new(
  timeout: 120,
  random_delay: 10
)

Constant Summary collapse

DEFAULT_TIMEOUT =
60
DEFAULT_RANDOM_DELAY =
0
DEFAULT_MAX_LOAD =
50.0
MAX_LOAD_CAP =
80.0

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(timeout: nil, compliant_mode: nil, random_delay: nil, max_load: nil, disable_ssl_certificate_check: nil, australian_proxy: nil, user_agent: nil) ⇒ AgentConfig

Creates Mechanize agent configuration with sensible defaults overridable via configure

Parameters:

  • timeout (Integer, nil) (defaults to: nil)

    Timeout for agent connections (default: 60)

  • compliant_mode (Boolean, nil) (defaults to: nil)

    Comply with headers and robots.txt (default: true)

  • random_delay (Integer, nil) (defaults to: nil)

    Average random delay in seconds (default: 0)

  • max_load (Float, nil) (defaults to: nil)

    Maximum server load percentage (nil = no delay, default: 50%). When compliant_mode is true, max_load is capped at 80%.

  • disable_ssl_certificate_check (Boolean, nil) (defaults to: nil)

    Skip SSL verification (default: false)

  • australian_proxy (Boolean, nil) (defaults to: nil)

    Use proxy if available (default: false)

  • user_agent (String, nil) (defaults to: nil)

    Configure Mechanize user agent


99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 99

# Creates Mechanize agent configuration with sensible defaults overridable via configure.
#
# Every argument left as nil falls back to the matching class-level default
# (self.class.default_*).
#
# @param timeout [Integer, nil] Timeout for agent connections
# @param compliant_mode [Boolean, nil] Comply with headers and robots.txt
# @param random_delay [Integer, nil] Average random delay in seconds; a positive
#   value also populates @random_range
# @param max_load [Float, nil] Maximum server load percentage. In compliant mode a
#   nil value becomes DEFAULT_MAX_LOAD and the result is capped at MAX_LOAD_CAP;
#   otherwise nil means no AdaptiveDelay is created
# @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification
# @param australian_proxy [Boolean, nil] Use ScraperUtils.australian_proxy if it is set
# @param user_agent [String, nil] User agent string; the first "TODAY" in it is
#   replaced with today's date (YYYY-MM-DD)
# @raise [URI::InvalidURIError] if the proxy is enabled and its URL is malformed,
#   is not http/https, or lacks a host or port
def initialize(timeout: nil,
               compliant_mode: nil,
               random_delay: nil,
               max_load: nil,
               disable_ssl_certificate_check: nil,
               australian_proxy: nil,
               user_agent: nil)
  # `.nil?` checks (rather than `||`) so that an explicit `false` is respected.
  @timeout = timeout.nil? ? self.class.default_timeout : timeout
  @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
  @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
  @max_load = max_load.nil? ? self.class.default_max_load : max_load
  # Compliant mode always throttles: fill in a missing value and enforce the cap.
  @max_load = [@max_load || DEFAULT_MAX_LOAD, MAX_LOAD_CAP].min if @compliant_mode
  @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent

  @disable_ssl_certificate_check = if disable_ssl_certificate_check.nil?
                                     self.class.default_disable_ssl_certificate_check
                                   else
                                     disable_ssl_certificate_check
                                   end
  @australian_proxy = if australian_proxy.nil?
                        self.class.default_australian_proxy
                      else
                        australian_proxy
                      end

  # Validate proxy URL format if proxy will be used.
  # The flag is silently dropped when no proxy URL has been configured.
  @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
  if @australian_proxy
    uri = begin
            URI.parse(ScraperUtils.australian_proxy.to_s)
          rescue URI::InvalidURIError => e
            raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
          end
    unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
      raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
    end
    unless !uri.host.to_s.empty? && uri.port&.positive?
      raise URI::InvalidURIError, "Proxy URL must include host and port"
    end
  end

  if @random_delay&.positive?
    # Range runs from min_random to 3 * min_random, both rounded to 3 decimals.
    min_random = Math.sqrt(@random_delay * 3.0 / 13.0)
    @random_range = min_random.round(3)..(3 * min_random).round(3)
  end

  # User agent precedence: explicit argument, class-level default, then the
  # MORPH_USER_AGENT environment variable. Previously the environment variable
  # unconditionally replaced the value resolved above, so the `user_agent:`
  # argument and any configured default were ignored.
  today = Date.today.strftime("%Y-%m-%d")
  @user_agent ||= ENV.fetch("MORPH_USER_AGENT", nil)
  @user_agent = @user_agent&.sub("TODAY", today)
  if @compliant_mode
    # Compliant mode always identifies itself; generate a user agent if none was supplied.
    version = ScraperUtils::VERSION
    @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
  end

  @robots_checker = RobotsChecker.new(@user_agent) if @user_agent
  @adaptive_delay = AdaptiveDelay.new(max_load: @max_load) if @max_load
  display_options
end

Class Attribute Details

.default_australian_proxy ⇒ Boolean

Returns Default flag for Australian proxy preference.

Returns:

  • (Boolean)

    Default flag for Australian proxy preference


49
50
51
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 49

# Class-level default flag for Australian proxy preference.
# @return [Boolean] the value currently stored in @default_australian_proxy
def default_australian_proxy
  @default_australian_proxy
end

.default_compliant_mode ⇒ Boolean

Returns Default setting for compliance with headers and robots.txt.

Returns:

  • (Boolean)

    Default setting for compliance with headers and robots.txt


37
38
39
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 37

# Class-level default setting for compliance with headers and robots.txt.
# @return [Boolean] the value currently stored in @default_compliant_mode
def default_compliant_mode
  @default_compliant_mode
end

.default_disable_ssl_certificate_check ⇒ Boolean

Returns Default setting for SSL certificate verification.

Returns:

  • (Boolean)

    Default setting for SSL certificate verification


46
47
48
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 46

# Class-level default for skipping SSL certificate verification.
# @return [Boolean] the value currently stored in @default_disable_ssl_certificate_check
def default_disable_ssl_certificate_check
  @default_disable_ssl_certificate_check
end

.default_max_load ⇒ Float?

Returns Default maximum server load percentage (nil = no response delay).

Returns:

  • (Float, nil)

    Default maximum server load percentage (nil = no response delay)


43
44
45
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 43

# Class-level default maximum server load percentage (nil = no response delay).
# @return [Float, nil] the value currently stored in @default_max_load
def default_max_load
  @default_max_load
end

.default_random_delay ⇒ Integer?

Returns Default average random delay in seconds.

Returns:

  • (Integer, nil)

    Default average random delay in seconds


40
41
42
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 40

# Class-level default average random delay in seconds.
# @return [Integer, nil] the value currently stored in @default_random_delay
def default_random_delay
  @default_random_delay
end

.default_timeout ⇒ Integer

Returns Default timeout in seconds for agent connections.

Returns:

  • (Integer)

    Default timeout in seconds for agent connections


34
35
36
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 34

# Class-level default timeout in seconds for agent connections.
# @return [Integer] the value currently stored in @default_timeout
def default_timeout
  @default_timeout
end

.default_user_agent ⇒ String?

Returns Default Mechanize user agent.

Returns:

  • (String, nil)

    Default Mechanize user agent


52
53
54
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 52

# Class-level default Mechanize user agent.
# @return [String, nil] the value currently stored in @default_user_agent
def default_user_agent
  @default_user_agent
end

Instance Attribute Details

#max_load ⇒ Object (readonly)

Give access for testing


88
89
90
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 88

# Read-only access to the resolved max_load setting, exposed for testing.
# @return the value stored in @max_load
def max_load
  @max_load
end

#random_range ⇒ Object (readonly)

Give access for testing


88
89
90
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 88

# Read-only access to the computed random delay range, exposed for testing.
# @return the value stored in @random_range
def random_range
  @random_range
end

#user_agent ⇒ String (readonly)

Returns User agent string.

Returns:

  • (String)

    User agent string


84
85
86
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 84

# Read-only access to the user agent string held by this configuration.
# @return [String] the value stored in @user_agent
def user_agent
  @user_agent
end

Class Method Details

.configure {|self| ... } ⇒ void

This method returns an undefined value.

Configure default settings for all AgentConfig instances

Examples:

# The block receives the AgentConfig class itself, so the default_* settings are assigned on it.
AgentConfig.configure do |config|
  config.default_timeout = 90
  config.default_random_delay = 5
  config.default_max_load = 15
end

Yields:

  • (self)

    Yields self for configuration


63
64
65
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 63

# Configure default settings for all AgentConfig instances.
#
# Does nothing when called without a block.
#
# @yield [self] the receiver, so defaults can be assigned on it
# @return [void]
def configure
  return unless block_given?

  yield self
end

.reset_defaults! ⇒ void

This method returns an undefined value.

Reset all configuration options to their default values


69
70
71
72
73
74
75
76
77
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 69

# Reset all configuration options to their default values.
#
# Each default is taken from its MORPH_* environment variable when that
# variable is set to a non-empty string; otherwise the built-in default is used.
# An empty variable is treated as unset for every setting, so that e.g.
# MORPH_CLIENT_TIMEOUT="" no longer yields a timeout of 0 and
# MORPH_USER_AGENT="" no longer yields an empty user agent.
#
# @return [void]
def reset_defaults!
  # Returns the variable's value, or nil when it is missing or empty.
  env_value = lambda do |name|
    raw = ENV.fetch(name, nil).to_s
    raw.empty? ? nil : raw
  end

  @default_timeout = (env_value.call('MORPH_CLIENT_TIMEOUT') || DEFAULT_TIMEOUT).to_i # 60
  # Compliant unless MORPH_NOT_COMPLIANT is set to anything non-empty.
  @default_compliant_mode = env_value.call('MORPH_NOT_COMPLIANT').nil? # true
  @default_random_delay = (env_value.call('MORPH_RANDOM_DELAY') || DEFAULT_RANDOM_DELAY).to_i # 0
  @default_max_load = (env_value.call('MORPH_MAX_LOAD') || DEFAULT_MAX_LOAD).to_f # 50.0
  @default_disable_ssl_certificate_check = !env_value.call('MORPH_DISABLE_SSL_CHECK').nil? # false
  @default_australian_proxy = !env_value.call('MORPH_USE_PROXY').nil? # false
  @default_user_agent = env_value.call('MORPH_USER_AGENT') # nil unless set
end

Instance Method Details

#configure_agent(agent) ⇒ void

This method returns an undefined value.

Configures a Mechanize agent with these settings

Parameters:

  • agent (Mechanize)

    The agent to configure


160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/scraper_utils/mechanize_utils/agent_config.rb', line 160

# Configures a Mechanize agent with these settings.
#
# Applies SSL verification mode, timeouts, compliant-mode headers and proxy
# settings as enabled on this config, then registers this object's
# pre/post connect hooks on the agent.
#
# @param agent [Mechanize] The agent to configure
# @return [void]
def configure_agent(agent)
  agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if @disable_ssl_certificate_check

  if @timeout
    agent.open_timeout = @timeout
    agent.read_timeout = @timeout
  end
  if @compliant_mode
    agent.user_agent = user_agent
    agent.request_headers ||= {}
    agent.request_headers["Accept"] =
      "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    agent.request_headers["Upgrade-Insecure-Requests"] = "1"
  end
  if @australian_proxy
    agent.agent.set_proxy(ScraperUtils.australian_proxy)
    # Same guard as the compliant branch: the proxy can be enabled without
    # compliant mode, in which case nothing has initialised the headers yet.
    agent.request_headers ||= {}
    agent.request_headers["Accept-Language"] = "en-AU,en-US;q=0.9,en;q=0.8"
    verify_proxy_works(agent)
  end

  agent.pre_connect_hooks << method(:pre_connect_hook)
  agent.post_connect_hooks << method(:post_connect_hook)
end