Module: Graboid::Scraper::InstanceMethods
- Defined in:
- lib/graboid/scraper.rb
Instance Method Summary
- #all(opts = {}, reload = false) ⇒ Object (also: #scrape)
- #all_fragments ⇒ Object
- #attribute_map ⇒ Object
- #callbacks ⇒ Object
- #collection ⇒ Object
- #collection=(col) ⇒ Object
- #current_page ⇒ Object
- #current_page=(num) ⇒ Object
- #doc ⇒ Object
- #extract_instance(fragment) ⇒ Object
- #hash_map(fragment) ⇒ Object
- #host ⇒ Object
- #initialize(opts = {}, &block) ⇒ Object
- #max_pages ⇒ Object
- #max_pages=(num) ⇒ Object
- #mode ⇒ Object
- #mode=(m) ⇒ Object
- #next_page? ⇒ Boolean
- #original_source ⇒ Object
- #page_fragments ⇒ Object
- #paginate ⇒ Object
- #read_source ⇒ Object
- #reset_context ⇒ Object
- #source ⇒ Object
- #source=(src) ⇒ Object
Instance Method Details
#all(opts = {}, reload = false) ⇒ Object Also known as: scrape
83 84 85 86 87 88 |
# File 'lib/graboid/scraper.rb', line 83
# Scrapes the source and builds one extracted instance per fragment.
#
# NOTE(review): the existing collection is returned only when +reload+ is
# truthy, which reads inverted for a flag with that name -- confirm intent
# before changing, since the collection holds raw fragments, not instances.
#
# @param opts [Hash] options; only +:max_pages+ is read here
# @param reload [Boolean] when truthy and the collection is non-empty, the
#   current collection is returned without scraping again
# @return [Array] one extracted instance per fragment (or the current
#   collection on the early return)
def all(opts = {}, reload = false)
  return collection if reload && !collection.empty?

  reset_context
  page_limit = opts[:max_pages]
  self.max_pages = page_limit unless page_limit.nil?
  all_fragments.map { |fragment| extract_instance(fragment) }
end
#all_fragments ⇒ Object
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/graboid/scraper.rb', line 92
# Collects the fragments of every page that should be scraped.
#
# Without a +pager+ method, or when the pager yields nothing for the first
# page, only the current page's fragments are returned. Otherwise pages are
# walked with #paginate (wrapped in the before/after paginate callbacks) and
# their fragments are accumulated into #collection.
#
# @return [Object] the current page's fragments on the early returns,
#   otherwise the accumulated collection
def all_fragments
  return page_fragments unless respond_to?(:pager)
  return page_fragments if pager(doc).nil?

  old_source = source
  begin
    while next_page?
      self.collection += page_fragments
      run_before_paginate_callbacks
      paginate
      run_after_paginate_callbacks
    end
    # With no page limit the loop stops on the first page that has no
    # next-page link, before that page was collected; pick it up here so the
    # last page is not silently dropped.
    self.collection += page_fragments if max_pages.zero?
  ensure
    # Always point back at the starting source, even if a callback or a
    # page fetch raised part-way through.
    self.source = old_source
  end
  collection
end
#attribute_map ⇒ Object
108 109 110 |
# File 'lib/graboid/scraper.rb', line 108
# Returns the attribute map held by the receiver's class.
#
# @return [Object] whatever the class-level +attribute_map+ returns
def attribute_map
  scraper_class = self.class
  scraper_class.attribute_map
end
#callbacks ⇒ Object
112 113 114 |
# File 'lib/graboid/scraper.rb', line 112
# Returns the callbacks registered on the receiver's class.
#
# @return [Object] whatever the class-level +callbacks+ returns
def callbacks
  scraper_class = self.class
  scraper_class.callbacks
end
#collection ⇒ Object
116 117 118 |
# File 'lib/graboid/scraper.rb', line 116
# Returns the accumulated collection, starting it as an empty array the
# first time it is read.
#
# @return [Array] the current collection
def collection
  @collection = [] unless @collection
  @collection
end
#collection=(col) ⇒ Object
120 121 122 |
# File 'lib/graboid/scraper.rb', line 120
# Replaces the accumulated collection.
#
# @param col [Object] the new collection
def collection=(col)
  @collection = col
end
#current_page ⇒ Object
124 125 126 |
# File 'lib/graboid/scraper.rb', line 124
# Returns the current page counter, starting it at 0 on first read.
#
# @return [Integer] the current page counter
def current_page
  @current_page = 0 unless @current_page
  @current_page
end
#current_page=(num) ⇒ Object
128 129 130 |
# File 'lib/graboid/scraper.rb', line 128
# Sets the current page counter.
#
# @param num [Object] the new counter value
def current_page=(num)
  @current_page = num
end
#doc ⇒ Object
132 133 134 |
# File 'lib/graboid/scraper.rb', line 132
# Parses the current source with the Nokogiri parser that matches #mode
# (+Nokogiri::HTML+ for +:html+, +Nokogiri::XML+ for +:xml+).
#
# The result is not cached: every call reads the source again.
#
# @return [Object] the document returned by the Nokogiri parser
def doc
  # Dispatch by method name rather than building a string for eval; the
  # upcased mode is the name of the parser method on the Nokogiri module.
  Nokogiri.public_send(mode.to_s.upcase, read_source)
end
#extract_instance(fragment) ⇒ Object
136 137 138 |
# File 'lib/graboid/scraper.rb', line 136
# Wraps the attributes extracted from a fragment in an OpenStruct.
#
# @param fragment [Object] the fragment handed to #hash_map
# @return [OpenStruct] one accessor per extracted attribute
def extract_instance(fragment)
  attributes = hash_map(fragment)
  OpenStruct.new(attributes)
end
#hash_map(fragment) ⇒ Object
140 141 142 143 144 145 146 147 148 |
# File 'lib/graboid/scraper.rb', line 140
# Extracts every mapped attribute from a fragment.
#
# Each attribute_map entry supplies a +:selector+ and an optional
# +:processor+. The selector is run with +css+ in +:html+ mode and +xpath+
# otherwise; the first matching node is either passed to the processor or,
# without a processor, reduced to its +inner_html+.
#
# @param fragment [Object] a node responding to +css+ / +xpath+
# @return [Hash] attribute name => extracted value; +nil+ when there is no
#   processor and the selector matched nothing
def hash_map(fragment)
  attribute_map.each_with_object({}) do |entry, extracted|
    settings  = entry.last
    selector  = settings[:selector]
    processor = settings[:processor]

    nodes = mode == :html ? fragment.css(selector) : fragment.xpath(selector)
    node  = nodes.first

    extracted[entry.first] =
      if processor.nil?
        # A selector that matches nothing must not abort the whole scrape
        # with NoMethodError; record the attribute as nil instead.
        node && node.inner_html
      else
        processor.call(node)
      end
  end
end
#host ⇒ Object
204 205 206 |
# File 'lib/graboid/scraper.rb', line 204
# Returns the first URL-like prefix found in the source, or nil when the
# source contains none.
#
# The pattern is greedy: the match runs from the scheme through the last "/"
# on the matched line, so a source with a path yields everything up to and
# including its final slash, and a source with no slash after the scheme
# yields nil.
#
# @return [String, nil] the matched prefix
def host
  source[/http[s]?:\/\/.*\//]
end
#initialize(opts = {}, &block) ⇒ Object
78 79 80 81 |
# File 'lib/graboid/scraper.rb', line 78
# Builds a scraper for the given source.
#
# @param opts [Hash] options; +:source+ is required
# @param block [Proc] accepted but not used by this method
# @raise [ArgumentError] when +opts[:source]+ is nil
def initialize(opts = {}, &block)
  src = opts[:source]
  raise ArgumentError if src.nil?

  self.source = src
end
#max_pages ⇒ Object
150 151 152 |
# File 'lib/graboid/scraper.rb', line 150
# Returns the page limit, starting it at 0 on first read.
#
# @return [Integer] the page limit
def max_pages
  @max_pages = 0 unless @max_pages
  @max_pages
end
#max_pages=(num) ⇒ Object
154 155 156 |
# File 'lib/graboid/scraper.rb', line 154
# Sets the page limit.
#
# @param num [Object] the new limit
def max_pages=(num)
  @max_pages = num
end
#mode ⇒ Object
158 159 160 |
# File 'lib/graboid/scraper.rb', line 158
# Returns the parsing mode, starting it as +:html+ on first read.
#
# @return [Symbol] the current mode
def mode
  @mode = :html unless @mode
  @mode
end
#mode=(m) ⇒ Object
162 163 164 165 |
# File 'lib/graboid/scraper.rb', line 162
# Sets the parsing mode.
#
# @param m [Symbol] either +:html+ or +:xml+
# @raise [ArgumentError] for any other value
def mode=(m)
  case m
  when :html, :xml
    @mode = m
  else
    raise ArgumentError
  end
end
#next_page? ⇒ Boolean
167 168 169 170 171 172 173 |
# File 'lib/graboid/scraper.rb', line 167
# Tells whether pagination should continue from the current page.
#
# With a non-zero page limit only the page counter is compared against the
# limit; the pager is not consulted. With a zero limit the answer depends on
# whether the pager yields something for the current document.
#
# @return [Boolean, nil] true/false in limited mode; true or nil when the
#   limit is zero
def next_page?
  unless max_pages.zero?
    return current_page <= max_pages - 1
  end

  pager(doc).nil? ? nil : true
end
#original_source ⇒ Object
175 176 177 |
# File 'lib/graboid/scraper.rb', line 175
# Returns the first source that was recorded for this scraper.
#
# @return [Object, nil] the recorded source, or nil if none was recorded
def original_source
  @original_source
end
#page_fragments ⇒ Object
179 180 181 |
# File 'lib/graboid/scraper.rb', line 179
# Selects the fragments of the current document using the class-level root
# selector.
#
# @return [Object] the result of the document's +css+ query
def page_fragments
  page = doc
  page.css(self.class.root_selector)
end
#paginate ⇒ Object
183 184 185 186 187 |
# File 'lib/graboid/scraper.rb', line 183
# Moves to the next page: the pager's result for the current document
# becomes the new source and the page counter is incremented.
#
# @return [Integer] the incremented page counter
def paginate
  self.source = pager(doc)
  self.current_page = current_page + 1
end
#read_source ⇒ Object
189 190 191 192 193 194 195 196 |
# File 'lib/graboid/scraper.rb', line 189
# Produces the raw input for the parser.
#
# A source that begins with +http://+ or +https://+ is fetched (sending the
# configured Graboid user agent); any other String is returned unchanged as
# inline markup.
#
# @return [Object, String, nil] the opened remote resource, the inline
#   string, or nil when the source is neither
def read_source
  src = source
  case src
  # Anchor with \A, not ^: ^ matches at any line start, so multi-line markup
  # containing a line that begins with "http://" would otherwise be treated
  # as a URL and handed to the opener.
  when /\Ahttp[s]?:\/\//
    # Open through the parsed URI (open-uri, which the previous open(url)
    # call already relied on). Unlike Kernel#open this never interprets the
    # string as a pipe command or local path, and it still works on Ruby 3
    # where Kernel#open no longer opens URLs.
    URI.parse(src).open("User-Agent" => Graboid.user_agent)
  when String
    src
  end
end
#reset_context ⇒ Object
198 199 200 201 202 |
# File 'lib/graboid/scraper.rb', line 198
# Clears the scraping state: empties the collection and zeroes both the page
# counter and the page limit.
def reset_context
  self.collection   = []
  self.current_page = 0
  self.max_pages    = 0
end
#source ⇒ Object
208 209 210 |
# File 'lib/graboid/scraper.rb', line 208
# Returns the source currently being scraped.
#
# @return [Object, nil] the current source
def source
  @source
end
#source=(src) ⇒ Object
212 213 214 215 |
# File 'lib/graboid/scraper.rb', line 212
# Sets the source to scrape. The first value ever assigned is also kept as
# the original source; later assignments leave it untouched.
#
# @param src [Object] the new source
def source=(src)
  first_assignment = @original_source.nil?
  @original_source = src if first_assignment
  @source = src
end