Class: MechWarrior::Crawler
- Inherits:
-
Object
- Object
- MechWarrior::Crawler
- Defined in:
- lib/mech_warrior/crawler.rb
Instance Attribute Summary collapse
-
#agent_pool ⇒ Object
readonly
Returns the value of attribute agent_pool.
-
#default_host ⇒ Object
readonly
Returns the value of attribute default_host.
-
#default_protocol ⇒ Object
readonly
Returns the value of attribute default_protocol.
-
#logger ⇒ Object
readonly
Returns the value of attribute logger.
-
#opts ⇒ Object
readonly
Returns the value of attribute opts.
-
#output_file ⇒ Object
readonly
Returns the value of attribute output_file.
-
#pages ⇒ Object
readonly
Returns the value of attribute pages.
Instance Method Summary collapse
- #index_url(href) ⇒ Object
-
#initialize(override_opts = {}) ⇒ Crawler
constructor
A new instance of Crawler.
Constructor Details
#initialize(override_opts = {}) ⇒ Crawler
Returns a new instance of Crawler.
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/mech_warrior/crawler.rb', line 5
#
# Builds a crawler from DEFAULTS overlaid with +override_opts+, wires up the
# logger and the agent pool, seeds +pages+ with the start URL and, unless
# +:no_index+ is set, indexes that URL right away.
#
# @param override_opts [Hash] overrides merged over DEFAULTS. Keys read here:
#   :default_host, :default_protocol, :allowed_domains, :output_file,
#   :log_file_name, :logger_class, :pool_size, :start_url, :no_index.
# @return [Crawler] the new instance
def initialize(override_opts = {})
  @opts = DEFAULTS.merge(override_opts)
  @default_host = opts[:default_host]
  @default_protocol = opts[:default_protocol]

  # Hash#merge is shallow, so opts[:allowed_domains] is the very same object
  # held by DEFAULTS (or by the caller's hash). Appending with << would leak
  # this host into every later Crawler; build a fresh collection instead.
  opts[:allowed_domains] += [default_host]

  # Log to the caller-supplied IO if there is one, otherwise append to the
  # configured log file.
  @output_file = opts[:output_file] || File.open(opts[:log_file_name], 'a')
  @logger = opts[:logger_class].new(output_file)
  @agent_pool = MechCell.pool(size: opts[:pool_size], args: [logger])
  @pages = {}

  # Without an explicit :start_url, crawl from the root of the default host.
  start_url = opts[:start_url] || "#{default_protocol}#{default_host}/"
  pages[normalize_url(start_url)] = {}
  index_url(start_url) unless opts[:no_index]
  self
ensure
  # Runs on success and on failure. output_file is nil if construction failed
  # before it was assigned, hence the respond_to? guard.
  # NOTE(review): this also closes an IO passed in via :output_file, and the
  # logger keeps pointing at the closed IO afterwards — confirm that is intended.
  output_file.close if output_file.respond_to?(:close)
end
Instance Attribute Details
#agent_pool ⇒ Object (readonly)
Returns the value of attribute agent_pool.
3 4 5 |
# File 'lib/mech_warrior/crawler.rb', line 3
#
# Read-only accessor for the agent pool.
#
# @return [Object] whatever is currently stored in @agent_pool
def agent_pool
  @agent_pool
end
#default_host ⇒ Object (readonly)
Returns the value of attribute default_host.
3 4 5 |
# File 'lib/mech_warrior/crawler.rb', line 3
#
# Read-only accessor for the default host.
#
# @return [Object] whatever is currently stored in @default_host
def default_host
  @default_host
end
#default_protocol ⇒ Object (readonly)
Returns the value of attribute default_protocol.
3 4 5 |
# File 'lib/mech_warrior/crawler.rb', line 3
#
# Read-only accessor for the default protocol.
#
# @return [Object] whatever is currently stored in @default_protocol
def default_protocol
  @default_protocol
end
#logger ⇒ Object (readonly)
Returns the value of attribute logger.
3 4 5 |
# File 'lib/mech_warrior/crawler.rb', line 3
#
# Read-only accessor for the logger.
#
# @return [Object] whatever is currently stored in @logger
def logger
  @logger
end
#opts ⇒ Object (readonly)
Returns the value of attribute opts.
3 4 5 |
# File 'lib/mech_warrior/crawler.rb', line 3
#
# Read-only accessor for the options hash.
#
# @return [Object] whatever is currently stored in @opts
def opts
  @opts
end
#output_file ⇒ Object (readonly)
Returns the value of attribute output_file.
3 4 5 |
# File 'lib/mech_warrior/crawler.rb', line 3
#
# Read-only accessor for the log output.
#
# @return [Object] whatever is currently stored in @output_file
def output_file
  @output_file
end
#pages ⇒ Object (readonly)
Returns the value of attribute pages.
3 4 5 |
# File 'lib/mech_warrior/crawler.rb', line 3
#
# Read-only accessor for the pages collection.
#
# @return [Object] whatever is currently stored in @pages
def pages
  @pages
end
Instance Method Details
#index_url(href) ⇒ Object
22 23 24 25 26 |
# File 'lib/mech_warrior/crawler.rb', line 22
#
# Normalizes +href+, obtains a page future for the normalized URL and passes
# both to process_page.
#
# @param href [Object] the link to index; passed as-is to normalize_url
# @return [Object] whatever process_page returns
def index_url(href)
  target = normalize_url(href)
  process_page(page_future(target), target)
end