Class: Scrape::Site
- Inherits: Object
- Defined in: lib/scrape/site.rb
Instance Attribute Summary
- #ignore_robots_txt ⇒ Object
  Returns the value of attribute ignore_robots_txt.
- #matches ⇒ Object (readonly)
  Returns the value of attribute matches.
- #url ⇒ Object (readonly)
  Returns the value of attribute url.
Instance Method Summary
- #accept?(url) ⇒ Boolean
- #add_match(matcher, &proc) ⇒ Object
- #ignore_robots_txt? ⇒ Boolean
- #initialize(url, options = {}) ⇒ Site (constructor)
  A new instance of Site.
- #normalize(url, base_url = self.url) ⇒ Object
- #parse(url) ⇒ Object
- #robots_txt ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize(url, options = {}) ⇒ Site
Returns a new instance of Site.
# File 'lib/scrape/site.rb', line 8

def initialize url, options = {}
  @url = Addressable::URI.parse url
  @url.query = nil
  @url.fragment = nil
  @matches = []
  @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
end
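For illustration, a minimal way to construct a site; the URL and the option value below are hypothetical, and :ignore_robots_txt is the option read by the constructor above:

  site = Scrape::Site.new "https://example.com/blog", ignore_robots_txt: false
  site.url.to_s  # => "https://example.com/blog" (query and fragment are stripped)
  site.matches   # => []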
Instance Attribute Details
#ignore_robots_txt ⇒ Object
Returns the value of attribute ignore_robots_txt.
# File 'lib/scrape/site.rb', line 6

def ignore_robots_txt
  @ignore_robots_txt
end
#matches ⇒ Object (readonly)
Returns the value of attribute matches.
# File 'lib/scrape/site.rb', line 5

def matches
  @matches
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
# File 'lib/scrape/site.rb', line 5

def url
  @url
end
Instance Method Details
#accept?(url) ⇒ Boolean
# File 'lib/scrape/site.rb', line 31

def accept? url
  url = normalize url
  url.starts_with(to_s) && !disallowed?(url)
end
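A hedged sketch of the check above, assuming the hypothetical site from the constructor example and a robots.txt that is either permissive or ignored; the URL is first normalized against the site root:

  site = Scrape::Site.new "https://example.com/blog"
  site.accept? "/blog/posts/1"              # normalized to https://example.com/blog/posts/1, inside the site root
  site.accept? "https://other.example/page" # outside the site root, so rejected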
#add_match(matcher, &proc) ⇒ Object
# File 'lib/scrape/site.rb', line 16

def add_match matcher, &proc
  match = Scrape::Match.new(matcher, &proc)
  @matches << match
  match
end
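A sketch of registering a matcher; the regexp and block are hypothetical. The (doc, url) pair mirrors the arguments #parse passes to Scrape::Match#invoke:

  site.add_match(/\/posts\/\d+/) do |doc, url|
    puts doc.at_css("title").text  # doc is the Nokogiri document built in #parse
  end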
#ignore_robots_txt? ⇒ Boolean
# File 'lib/scrape/site.rb', line 44

def ignore_robots_txt?
  !!@ignore_robots_txt
end
#normalize(url, base_url = self.url) ⇒ Object
# File 'lib/scrape/site.rb', line 36

def normalize url, base_url = self.url
  Addressable::URI.join(base_url, url).to_s
end
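For example, relative references are resolved against the site URL (or an explicit base) via Addressable::URI.join; the URLs are hypothetical:

  site.normalize "/about"                          # => "https://example.com/about"
  site.normalize "two", "https://example.com/one/" # => "https://example.com/one/two"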
#parse(url) ⇒ Object
# File 'lib/scrape/site.rb', line 22

def parse url
  url = normalize url
  doc = Nokogiri::HTML Scrape.open(url)
  @matches.each{|match| match.invoke doc, url if match =~ url }
  doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
end
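Putting it together, a hedged sketch of one crawl step: #parse fetches the page with Scrape.open, builds a Nokogiri document, invokes any matching callbacks, and returns the page's links that #accept? allows (the URL and matcher are hypothetical):

  site = Scrape::Site.new "https://example.com/blog"
  site.add_match(/posts/) { |doc, url| puts url }
  next_urls = site.parse site.url  # array of absolute URLs on the page that #accept? allows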
#robots_txt ⇒ Object
# File 'lib/scrape/site.rb', line 40

def robots_txt
  @robots_txt ||= Scrape::RobotsTxt.load url
end
#to_s ⇒ Object
# File 'lib/scrape/site.rb', line 48

def to_s
  url.to_s
end