Class: Scrape::Application

Inherits:
Object
  • Object
show all
Defined in:
lib/scrape/application.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(scrapefile, options = {}, loader = Scrape::DefaultLoader.new) ⇒ Application

Returns a new instance of Application.



4
5
6
7
8
9
10
11
# File 'lib/scrape/application.rb', line 4

# Builds a new Application bound to the given scrapefile.
#
# scrapefile - path to the Scrapefile (expanded to an absolute path).
# options    - Hash; honors :ignore_robots_txt (defaults to false).
# loader     - object used to evaluate the scrapefile in #load_scrapefile.
def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
  @loader     = loader
  @scrapefile = File.expand_path(scrapefile)
  @sites      = {}
  @queue      = []
  @history    = []
  @ignore_robots_txt = options.fetch(:ignore_robots_txt, false)
end

Instance Attribute Details

#historyObject (readonly)

Returns the value of attribute history.



2
3
4
# File 'lib/scrape/application.rb', line 2

# URLs that have already been processed by #run; emptied by #reset.
def history
  @history
end

#ignore_robots_txtObject

Returns the value of attribute ignore_robots_txt.



2
3
4
# File 'lib/scrape/application.rb', line 2

# Whether robots.txt should be ignored; seeded from the
# :ignore_robots_txt option in #initialize.
# NOTE(review): a writer also exists (re-assigned in #load_scrapefile),
# presumably to propagate the flag to each site — confirm in source.
def ignore_robots_txt
  @ignore_robots_txt
end

#loaderObject (readonly)

Returns the value of attribute loader.



2
3
4
# File 'lib/scrape/application.rb', line 2

# The loader object used by #load_scrapefile to evaluate the scrapefile.
def loader
  @loader
end

#scrapefileObject (readonly)

Returns the value of attribute scrapefile.



2
3
4
# File 'lib/scrape/application.rb', line 2

# Absolute path to the scrapefile (expanded in #initialize).
def scrapefile
  @scrapefile
end

#sitesObject (readonly)

Returns the value of attribute sites.



2
3
4
# File 'lib/scrape/application.rb', line 2

# Hash of sites defined by the scrapefile; merged in #load_scrapefile.
def sites
  @sites
end

Instance Method Details

#[](url) ⇒ Object



52
53
54
# File 'lib/scrape/application.rb', line 52

# Returns the first registered site that accepts the given url,
# or nil when no site matches.
def [] url
  @sites.values.find { |candidate| candidate.accept? url }
end

#enqueue(*urls) ⇒ Object



41
42
43
44
45
# File 'lib/scrape/application.rb', line 41

# Appends urls to the work queue, skipping any already queued
# or already visited. Accepts nested arrays; duplicates within
# the arguments are also collapsed.
def enqueue *urls
  urls.flatten.each do |url|
    next if @history.include?(url) || @queue.include?(url)
    @queue << url
  end
end

#load_scrapefileObject



56
57
58
59
60
61
62
63
64
# File 'lib/scrape/application.rb', line 56

# Loads and evaluates the scrapefile exactly once. Merges any sites the
# loader returns into @sites, re-applies the ignore_robots_txt flag, and
# resets the queue/history via #reset.
#
# Raises Scrape::FileNotFound when the scrapefile does not exist.
def load_scrapefile
  return if @scrapefile_loaded
  # File.exists? was deprecated and removed in Ruby 3.2 — use File.exist?.
  raise Scrape::FileNotFound.new(scrapefile) unless File.exist? scrapefile
  result = loader.load scrapefile
  @sites.update result if result.is_a? Hash
  # Re-assign through the writer; NOTE(review): presumably the writer
  # pushes the flag down to each site — confirm the attr writer's body.
  self.ignore_robots_txt = ignore_robots_txt
  reset
  @scrapefile_loaded = true
end

#queueObject



37
38
39
# File 'lib/scrape/application.rb', line 37

# Returns a shallow copy of the pending URL queue so callers cannot
# mutate internal state.
def queue
  @queue[0..-1]
end

#resetObject



32
33
34
35
# File 'lib/scrape/application.rb', line 32

# Clears the visit history and refills the queue with each site's
# root URL (its string form).
def reset
  @history = []
  @queue = sites.values.collect(&:to_s)
end

#runObject



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/scrape/application.rb', line 13

# Drains the queue: each URL is logged, recorded in history, matched to a
# site via #[], and — when a site accepts it — parsed, with any discovered
# URLs fed back through #enqueue. Loops until the queue is empty.
def run
  load_scrapefile

  while url = @queue.shift
    Scrape.logger.info "Loading: #{url}..."
    @history << url
    site = self[url]
    if site.nil?
      Scrape.logger.info "Not defined."
      next
    end
    found = site.parse(url)
    if found
      enqueue(*found)
      Scrape.logger.info "Found #{found.length} urls."
    else
      Scrape.logger.info "Done."
    end
  end
end