Class: MechWarrior::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/mech_warrior/crawler.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(override_opts = {}) ⇒ Crawler

Returns a new instance of Crawler.



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/mech_warrior/crawler.rb', line 5

def initialize(override_opts={})
  @opts  = DEFAULTS.merge(override_opts)
  @default_host     = opts[:default_host]
  @default_protocol = opts[:default_protocol]
  opts[:allowed_domains] << default_host
  @output_file = opts[:output_file] || File.open(opts[:log_file_name], 'a')
  @logger = opts[:logger_class].new(output_file)
  @agent_pool = MechCell.pool(size: opts[:pool_size], args: [logger])
  @pages = {}
  start_url = opts[:start_url] || "#{default_protocol}#{default_host}/"
  pages[normalize_url(start_url)] = {}
  index_url(start_url) unless opts[:no_index]
  self
ensure
  output_file.close if output_file.respond_to?(:close)
end

Instance Attribute Details

#agent_pool ⇒ Object (readonly)

Returns the value of attribute agent_pool.



3
4
5
# File 'lib/mech_warrior/crawler.rb', line 3

# Read-only accessor for the pool the constructor obtains from
# MechCell.pool (sized by opts[:pool_size]).
#
# @return [Object] the value of @agent_pool
def agent_pool
  @agent_pool
end

#default_host ⇒ Object (readonly)

Returns the value of attribute default_host.



3
4
5
# File 'lib/mech_warrior/crawler.rb', line 3

# Read-only accessor for opts[:default_host] as captured by the constructor,
# which also adds it to opts[:allowed_domains] and uses it to build the
# fallback start URL.
#
# @return [Object] the value of @default_host
def default_host
  @default_host
end

#default_protocol ⇒ Object (readonly)

Returns the value of attribute default_protocol.



3
4
5
# File 'lib/mech_warrior/crawler.rb', line 3

# Read-only accessor for opts[:default_protocol] as captured by the
# constructor, which prefixes it to the default host when no :start_url
# option is given.
#
# @return [Object] the value of @default_protocol
def default_protocol
  @default_protocol
end

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



3
4
5
# File 'lib/mech_warrior/crawler.rb', line 3

# Read-only accessor for the logger the constructor creates by calling
# opts[:logger_class].new with the output file.
#
# @return [Object] the value of @logger
def logger
  @logger
end

#opts ⇒ Object (readonly)

Returns the value of attribute opts.



3
4
5
# File 'lib/mech_warrior/crawler.rb', line 3

# Read-only accessor for the effective options: DEFAULTS merged with the
# overrides passed to the constructor.
#
# @return [Hash] the value of @opts
def opts
  @opts
end

#output_file ⇒ Object (readonly)

Returns the value of attribute output_file.



3
4
5
# File 'lib/mech_warrior/crawler.rb', line 3

# Read-only accessor for the log destination: opts[:output_file] when given,
# otherwise the file at opts[:log_file_name] opened in append mode.
#
# NOTE: the constructor's ensure clause closes this object (when it responds
# to #close) before returning, so it is already closed once construction
# has finished.
#
# @return [Object] the value of @output_file
def output_file
  @output_file
end

#pages ⇒ Object (readonly)

Returns the value of attribute pages.



3
4
5
# File 'lib/mech_warrior/crawler.rb', line 3

# Read-only accessor for the page table. The constructor creates it as a
# Hash and seeds it with the normalized start URL mapped to an empty Hash.
#
# @return [Hash] the value of @pages
def pages
  @pages
end

Instance Method Details

#index_url(href) ⇒ Object



22
23
24
25
26
# File 'lib/mech_warrior/crawler.rb', line 22

# Indexes a single link: normalizes +href+, asks page_future for the
# pending fetch of that URL, and hands both to process_page.
#
# @param href the URL (or link target) to index, in any form
#   normalize_url accepts
# @return [Object] whatever process_page returns
def index_url(href)
  target = normalize_url(href)
  process_page(page_future(target), target)
end