Class: Scrape::Site

Inherits:
Object
Defined in:
lib/scrape/site.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(url, options = {}) ⇒ Site

Returns a new instance of Site. The query string and fragment are stripped from url; the :ignore_robots_txt option defaults to true.



# File 'lib/scrape/site.rb', line 8

def initialize url, options = {}
  @url = Addressable::URI.parse url
  @url.query = nil    # strip the query string ...
  @url.fragment = nil # ... and the fragment from the site root
  @matches = []
  @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
end
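
A minimal usage sketch (the URL and option value are illustrative; note that the query string and fragment are stripped, per the constructor above):

require 'scrape'

site = Scrape::Site.new "http://example.com/blog?page=2#latest", ignore_robots_txt: false
site.url.to_s # => "http://example.com/blog"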

Instance Attribute Details

#ignore_robots_txt ⇒ Object

Returns the value of attribute ignore_robots_txt.



# File 'lib/scrape/site.rb', line 6

def ignore_robots_txt
  @ignore_robots_txt
end

#matches ⇒ Object (readonly)

Returns the value of attribute matches.



# File 'lib/scrape/site.rb', line 5

def matches
  @matches
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



# File 'lib/scrape/site.rb', line 5

def url
  @url
end

Instance Method Details

#accept?(url) ⇒ Boolean

Returns true if the normalized URL falls under the site's root URL and is not disallowed by its robots.txt.

Returns:

  • (Boolean)


# File 'lib/scrape/site.rb', line 31

def accept? url
  url = normalize url
  url.start_with?(to_s) && !disallowed?(url)
end
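
A short sketch of the resulting behavior, assuming a site rooted at http://example.com/blog (the URLs are illustrative):

site = Scrape::Site.new "http://example.com/blog"
site.accept? "http://example.com/blog/post-1" # => true, under the site root
site.accept? "http://example.com/about"       # => false, outside the site root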

#add_match(matcher, &proc) ⇒ Object

Registers a Scrape::Match built from matcher and the given block; parse invokes it against every page whose URL matches.



# File 'lib/scrape/site.rb', line 16

def add_match matcher, &proc
  match = Scrape::Match.new(matcher, &proc)
  @matches << match
  match
end
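
A sketch of registering a handler (the pattern and block body are illustrative; a Regexp works as the matcher since parse tests it with =~, and the block is assumed to receive the parsed Nokogiri document via Match#invoke):

site.add_match %r{/blog/} do |doc|
  puts doc.at_css("title").text
end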

#ignore_robots_txt? ⇒ Boolean

Returns true when the site is configured to ignore its robots.txt.

Returns:

  • (Boolean)


# File 'lib/scrape/site.rb', line 44

def ignore_robots_txt?
  !!@ignore_robots_txt
end

#normalize(url, base_url = self.url) ⇒ Object

Resolves url against base_url with Addressable::URI.join and returns the absolute URL as a string.



# File 'lib/scrape/site.rb', line 36

def normalize url, base_url = self.url
  Addressable::URI.join(base_url, url).to_s
end
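
A short sketch of the resolution behavior inherited from Addressable::URI.join (the URLs are illustrative):

site = Scrape::Site.new "http://example.com/blog"
site.normalize "/about"                                   # => "http://example.com/about"
site.normalize "post-2", "http://example.com/blog/post-1" # => "http://example.com/blog/post-2"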

#parse(url) ⇒ Object

Fetches the page at url, runs every registered match against it, and returns the page's links that accept? admits.



# File 'lib/scrape/site.rb', line 22

def parse url
  url = normalize url
  doc = Nokogiri::HTML Scrape.open(url)

  # Invoke every registered handler whose matcher matches this URL.
  @matches.each{|match| match.invoke doc, url if match =~ url }

  # Collect the page's links, absolutize them, and keep the accepted ones.
  doc.css("a[href]").map{|node| normalize node['href'], url }.select{|link| accept? link }
end
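
A hedged sketch of a simple crawl loop built on parse; Scrape.open fetches each page, so this performs live requests, and the queue/seen bookkeeping is illustrative:

queue = [site.to_s]
seen  = []
until queue.empty?
  page = queue.shift
  next if seen.include?(page)
  seen << page
  queue.concat site.parse(page) # handlers run as a side effect
end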

#robots_txt ⇒ Object

Loads and memoizes the Scrape::RobotsTxt for the site's URL.



# File 'lib/scrape/site.rb', line 40

def robots_txt
  @robots_txt ||= Scrape::RobotsTxt.load url
end
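
Because the result is memoized, only the first call does any work (assuming Scrape::RobotsTxt.load fetches the file):

site.robots_txt # loads the site's robots.txt
site.robots_txt # returns the memoized instance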

#to_s ⇒ Object

Returns the site's URL as a string.



# File 'lib/scrape/site.rb', line 48

def to_s
  url.to_s
end