Module: Graboid::Scraper::InstanceMethods
- Defined in:
- lib/graboid/scraper.rb
Instance Method Summary
- #all(opts = {}, reload = false) ⇒ Object (also: #scrape)
- #all_fragments ⇒ Object
- #attribute_map ⇒ Object
- #callbacks ⇒ Object
- #collection ⇒ Object
- #collection=(col) ⇒ Object
- #current_page ⇒ Object
- #current_page=(num) ⇒ Object
- #doc ⇒ Object
- #extract_instance(fragment) ⇒ Object
- #hash_map(fragment) ⇒ Object
- #initialize(opts = {}, &block) ⇒ Object
- #max_pages ⇒ Object
- #max_pages=(num) ⇒ Object
- #mode ⇒ Object
- #mode=(m) ⇒ Object
- #next_page? ⇒ Boolean
- #page_fragments ⇒ Object
- #paginate ⇒ Object
- #read_source ⇒ Object
- #reset_context ⇒ Object
- #source ⇒ Object
- #source=(src) ⇒ Object
Instance Method Details
#all(opts = {}, reload = false) ⇒ Object Also known as: scrape
69 70 71 72 73 74 |
# File 'lib/graboid/scraper.rb', line 69
#
# Scrapes the source and builds one object per fragment.
#
# @param opts [Hash] options; only +:max_pages+ is read here
# @param reload [Boolean] when truthy and a collection has already been
#   gathered, that collection is returned as-is instead of scraping again
# @return [Array] one +extract_instance+ result per fragment, or the existing
#   collection on the early-return path
#
# NOTE(review): the early return fires when +reload+ is truthy, which reads
# inverted relative to the parameter name. The behaviour is kept unchanged so
# existing callers are unaffected; confirm the intent before flipping it.
def all(opts = {}, reload = false)
  return collection if reload && !collection.empty?

  reset_context
  # Applied after reset_context, which sets max_pages back to 0.
  self.max_pages = opts[:max_pages] if opts[:max_pages].present?
  all_fragments.map { |frag| extract_instance(frag) }
end
#all_fragments ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/graboid/scraper.rb', line 78
#
# Gathers the fragments to extract from.
#
# Without a class-level pager, only the current page's fragments are returned.
# With a pager, fragments are accumulated into +collection+ page by page for as
# long as +next_page?+ holds, running the paginate callbacks around each page
# change, and the original source is put back once the loop finishes.
#
# @return [Object] the current page's fragments, or the accumulated collection
def all_fragments
  return page_fragments if self.class.pager.nil?

  original_source = source
  while next_page?
    self.collection = collection + page_fragments
    run_before_paginate_callbacks
    paginate
    run_after_paginate_callbacks
  end
  # paginate repoints source at each following page; restore the starting point.
  self.source = original_source
  collection
end
#attribute_map ⇒ Object
93 94 95 |
# File 'lib/graboid/scraper.rb', line 93
#
# Instance-level shortcut to the attribute map held by the class.
#
# @return [Object] whatever the class-level +attribute_map+ returns
def attribute_map
  klass = self.class
  klass.attribute_map
end
#callbacks ⇒ Object
97 98 99 |
# File 'lib/graboid/scraper.rb', line 97
#
# Instance-level shortcut to the callbacks held by the class.
#
# @return [Object] whatever the class-level +callbacks+ returns
def callbacks
  klass = self.class
  klass.callbacks
end
#collection ⇒ Object
101 102 103 |
# File 'lib/graboid/scraper.rb', line 101
#
# The collection gathered so far, starting out as an empty array.
#
# @return [Object] the stored collection (a new Array on first access)
def collection
  @collection = [] unless @collection
  @collection
end
#collection=(col) ⇒ Object
105 106 107 |
# File 'lib/graboid/scraper.rb', line 105
#
# Replaces the stored collection.
#
# @param col [Object] the new collection
def collection=(col)
  @collection = col
end
#current_page ⇒ Object
109 110 111 |
# File 'lib/graboid/scraper.rb', line 109
#
# The page counter, starting at 0 until one is assigned.
#
# @return [Object] the stored page number (0 on first access)
def current_page
  @current_page = 0 unless @current_page
  @current_page
end
#current_page=(num) ⇒ Object
113 114 115 |
# File 'lib/graboid/scraper.rb', line 113
#
# Sets the page counter.
#
# @param num [Object] the new page number
def current_page=(num)
  @current_page = num
end
#doc ⇒ Object
117 118 119 |
# File 'lib/graboid/scraper.rb', line 117
#
# Parses the current source with the Nokogiri parser that matches +mode+.
#
# The parser is chosen by upcasing the mode (:html -> Nokogiri::HTML,
# :xml -> Nokogiri::XML) and dispatching to that module-level method directly,
# rather than building a string of Ruby code and eval-ing it.
#
# @return [Object] whatever the selected Nokogiri parser method returns
def doc
  Nokogiri.public_send(mode.to_s.upcase, read_source)
end
#extract_instance(fragment) ⇒ Object
121 122 123 |
# File 'lib/graboid/scraper.rb', line 121
#
# Wraps the attributes extracted from one fragment in an OpenStruct.
#
# @param fragment [Object] the fragment handed on to +hash_map+
# @return [OpenStruct] a struct built from the hash +hash_map+ returns
def extract_instance(fragment)
  extracted = hash_map(fragment)
  OpenStruct.new(extracted)
end
#hash_map(fragment) ⇒ Object
125 126 127 128 129 130 131 132 133 |
# File 'lib/graboid/scraper.rb', line 125
#
# Builds a hash of extracted values for one fragment.
#
# For each +attribute_map+ entry, the entry's +:selector+ is run against the
# fragment (+css+ in :html mode, +xpath+ otherwise) and only the first matching
# node is used. With a +:processor+, the value is whatever the processor
# returns for that node; without one, it is the node's +inner_html+.
#
# When nothing matches and there is no processor, the value is nil instead of
# a NoMethodError raised from calling +inner_html+ on nil. A processor is still
# called with nil in that case, as before.
#
# @param fragment [Object] object responding to +css+ / +xpath+
# @return [Hash] extracted values keyed by each entry's first element
def hash_map(fragment)
  attribute_map.each_with_object({}) do |entry, extracted_hash|
    settings  = entry.last
    selector  = settings[:selector]
    processor = settings[:processor]

    matches = mode == :html ? fragment.css(selector) : fragment.xpath(selector)
    node    = matches.first

    extracted_hash[entry.first] =
      if processor.nil?
        node && node.inner_html
      else
        processor.call(node)
      end
  end
end
#initialize(opts = {}, &block) ⇒ Object
64 65 66 67 |
# File 'lib/graboid/scraper.rb', line 64
#
# Sets up the scraper with the source it will read from.
#
# @param opts [Hash] options; +:source+ is required
# @param block [Proc] accepted for interface compatibility; not used here
# @raise [ArgumentError] when +opts[:source]+ is not present
def initialize(opts = {}, &block)
  raise ArgumentError, 'opts[:source] is required' unless opts[:source].present?

  self.source = opts[:source]
end
#max_pages ⇒ Object
135 136 137 |
# File 'lib/graboid/scraper.rb', line 135
#
# The stored page limit, starting at 0 until one is assigned.
#
# @return [Object] the stored limit (0 on first access)
def max_pages
  @max_pages = 0 unless @max_pages
  @max_pages
end
#max_pages=(num) ⇒ Object
139 140 141 |
# File 'lib/graboid/scraper.rb', line 139
#
# Sets the page limit.
#
# @param num [Object] the new limit
def max_pages=(num)
  @max_pages = num
end
#mode ⇒ Object
143 144 145 |
# File 'lib/graboid/scraper.rb', line 143
#
# The parsing mode, :html until another one is assigned.
#
# @return [Object] the stored mode (:html on first access)
def mode
  @mode = :html unless @mode
  @mode
end
#mode=(m) ⇒ Object
147 148 149 150 |
# File 'lib/graboid/scraper.rb', line 147
#
# Sets the parsing mode.
#
# @param m [Symbol] either :html or :xml
# @raise [ArgumentError] for any other value; the stored mode is left untouched
def mode=(m)
  unless [:html, :xml].include?(m)
    raise ArgumentError, "mode must be :html or :xml, got #{m.inspect}"
  end

  @mode = m
end
#next_page? ⇒ Boolean
152 153 154 155 156 157 158 |
# File 'lib/graboid/scraper.rb', line 152
#
# Whether pagination should continue.
#
# With +max_pages+ at 0, the answer comes from the class-level pager: there is
# a next page as long as the pager returns something other than nil for the
# current document. Otherwise the answer is whether +current_page+ is still
# below +max_pages+; the pager is not consulted.
#
# @return [Boolean] always true or false (never nil)
def next_page?
  return !self.class.pager.call(doc).nil? if max_pages.zero?

  current_page < max_pages
end
#page_fragments ⇒ Object
160 161 162 |
# File 'lib/graboid/scraper.rb', line 160
#
# Selects the fragments on the current document using the class-level root
# selector.
#
# @return [Object] the result of running +css+ with that selector on +doc+
def page_fragments
  root = self.class.root_selector
  doc.css(root)
end
#paginate ⇒ Object
164 165 166 167 168 |
# File 'lib/graboid/scraper.rb', line 164
#
# Moves the scraper on to the next page.
#
# Asks the class-level pager for the next source using the current document,
# points +source+ at the answer and bumps +current_page+ by one. If the pager
# call raises a StandardError, the error is swallowed and +source+ becomes nil.
#
# @return [Object] the incremented page number
def paginate
  upcoming_url =
    begin
      self.class.pager.call(doc)
    rescue StandardError
      nil
    end

  self.source = upcoming_url
  self.current_page = current_page + 1
end
#read_source ⇒ Object
170 171 172 173 174 175 176 177 |
# File 'lib/graboid/scraper.rb', line 170
#
# Produces the raw input to parse.
#
# A source that begins with http:// or https:// is opened as a URL, sending
# +Graboid.user_agent+ as the User-Agent header. Any other String is returned
# unchanged, i.e. treated as inline markup. Anything else yields nil.
#
# The URL check is anchored to the start of the whole string (\A). With a
# per-line anchor (^), inline markup that merely contains a line starting with
# "http://" would be mistaken for a URL and handed to +open+.
#
# @return [Object, nil] the opened URL stream, the source string itself, or nil
def read_source
  case source
  when %r{\Ahttps?://}
    headers = { "User-Agent" => Graboid.user_agent }
    # open-uri (already needed for URL sources) exposes a public URI.open on
    # newer Rubies, where Kernel#open no longer fetches URLs; older Rubies only
    # support the Kernel#open form.
    URI.respond_to?(:open) ? URI.open(source, headers) : open(source, headers)
  when String
    source
  end
end
#reset_context ⇒ Object
179 180 181 182 183 |
# File 'lib/graboid/scraper.rb', line 179
#
# Puts the scraping state back to its starting values: page counter 0, empty
# collection, page limit 0.
#
# @return [Integer] 0, the value of the final assignment
def reset_context
  self.current_page = 0
  self.collection = []
  self.max_pages = 0
end
#source ⇒ Object
185 186 187 |
# File 'lib/graboid/scraper.rb', line 185
#
# The source currently assigned to the scraper.
#
# @return [Object, nil] the stored source, or nil when none has been set
def source
  @source
end
#source=(src) ⇒ Object
189 190 191 |
# File 'lib/graboid/scraper.rb', line 189
#
# Points the scraper at a new source.
#
# @param src [Object] the new source
def source=(src)
  @source = src
end