Class: Scrape::Site

Inherits:
Object
Defined in:
lib/scrape/site.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(url, options = {}) ⇒ Site

Returns a new instance of Site. The query string and fragment are stripped from url; the :ignore_robots_txt option defaults to true.



# File 'lib/scrape/site.rb', line 8

def initialize url, options = {}
  @url = Addressable::URI.parse url
  @url.query = nil    # strip the query string ...
  @url.fragment = nil # ... and the fragment from the site root
  @matches = []
  @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
end
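
A minimal usage sketch (the URL and option value are illustrative; note that the query string and fragment are stripped, per the constructor above):

require 'scrape'

site = Scrape::Site.new "http://example.com/blog?page=2#latest", ignore_robots_txt: false
site.url.to_s # => "http://example.com/blog"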

Instance Attribute Details

#ignore_robots_txt ⇒ Object

Returns the value of attribute ignore_robots_txt.



# File 'lib/scrape/site.rb', line 6

def ignore_robots_txt
  @ignore_robots_txt
end

#matches ⇒ Object (readonly)

Returns the value of attribute matches.



# File 'lib/scrape/site.rb', line 5

def matches
  @matches
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



# File 'lib/scrape/site.rb', line 5

def url
  @url
end

Instance Method Details

#accept?(url) ⇒ Boolean

Returns true if the normalized URL falls under the site's root URL and is not disallowed by its robots.txt.

Returns:

  • (Boolean)


# File 'lib/scrape/site.rb', line 31

def accept? url
  url = normalize url
  url.start_with?(to_s) && !disallowed?(url)
end
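
A short sketch of the resulting behavior, assuming a site rooted at http://example.com/blog (the URLs are illustrative):

site = Scrape::Site.new "http://example.com/blog"
site.accept? "http://example.com/blog/post-1" # => true, under the site root
site.accept? "http://example.com/about"       # => false, outside the site root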

#add_match(matcher, &proc) ⇒ Object

Registers a Scrape::Match built from matcher and the given block; parse invokes it against every page whose URL matches.



# File 'lib/scrape/site.rb', line 16

def add_match matcher, &proc
  match = Scrape::Match.new(matcher, &proc)
  @matches << match
  match
end
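
A sketch of registering a handler (the pattern and block body are illustrative; a Regexp works as the matcher since parse tests it with =~, and the block is assumed to receive the parsed Nokogiri document via Match#invoke):

site.add_match %r{/blog/} do |doc|
  puts doc.at_css("title").text
end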

#ignore_robots_txt? ⇒ Boolean

Returns true when the site is configured to ignore its robots.txt.

Returns:

  • (Boolean)


# File 'lib/scrape/site.rb', line 44

def ignore_robots_txt?
  !!@ignore_robots_txt
end

#normalize(url, base_url = self.url) ⇒ Object

Resolves url against base_url with Addressable::URI.join and returns the absolute URL as a string.



# File 'lib/scrape/site.rb', line 36

def normalize url, base_url = self.url
  Addressable::URI.join(base_url, url).to_s
end
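
A short sketch of the resolution behavior inherited from Addressable::URI.join (the URLs are illustrative):

site = Scrape::Site.new "http://example.com/blog"
site.normalize "/about"                                   # => "http://example.com/about"
site.normalize "post-2", "http://example.com/blog/post-1" # => "http://example.com/blog/post-2"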

#parse(url) ⇒ Object

Fetches the page at url, runs every registered match against it, and returns the page's links that accept? admits.



# File 'lib/scrape/site.rb', line 22

def parse url
  url = normalize url
  doc = Nokogiri::HTML Scrape.open(url)

  # Invoke every registered handler whose matcher matches this URL.
  @matches.each{|match| match.invoke doc, url if match =~ url }

  # Collect the page's links, absolutize them, and keep the accepted ones.
  doc.css("a[href]").map{|node| normalize node['href'], url }.select{|link| accept? link }
end
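
A hedged sketch of a simple crawl loop built on parse; Scrape.open fetches each page, so this performs live requests, and the queue/seen bookkeeping is illustrative:

queue = [site.to_s]
seen  = []
until queue.empty?
  page = queue.shift
  next if seen.include?(page)
  seen << page
  queue.concat site.parse(page) # handlers run as a side effect
end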

#robots_txt ⇒ Object

Loads and memoizes the Scrape::RobotsTxt for the site's URL.



# File 'lib/scrape/site.rb', line 40

def robots_txt
  @robots_txt ||= Scrape::RobotsTxt.load url
end
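
Because the result is memoized, only the first call does any work (assuming Scrape::RobotsTxt.load fetches the file):

site.robots_txt # loads the site's robots.txt
site.robots_txt # returns the memoized instance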

#to_s ⇒ Object

Returns the site's URL as a string.



# File 'lib/scrape/site.rb', line 48

def to_s
  url.to_s
end