Module: Graboid::Scraper::InstanceMethods

Defined in:
lib/graboid/scraper.rb

Instance Method Summary

Instance Method Details

#all(opts = {}, reload = false) ⇒ Object Also known as: scrape



83
84
85
86
87
88
# File 'lib/graboid/scraper.rb', line 83

# Scrapes the source and builds one record per root fragment.
#
# @param opts [Hash] recognised key: :max_pages, applied after the context
#   reset whenever it is not nil
# @param reload [Boolean] when truthy and the stored collection is not
#   empty, that collection is returned untouched and nothing is scraped
# @return [Object] the extract_instance results, or the stored collection on
#   the early-return path
#
# NOTE(review): the early return fires when +reload+ is truthy, which reads
# as inverted relative to the parameter name -- confirm the intent.
def all(opts = {}, reload = false)
  if reload && !self.collection.empty?
    return self.collection
  end

  reset_context
  page_limit = opts[:max_pages]
  self.max_pages = page_limit unless page_limit.nil?
  all_fragments.map { |fragment| extract_instance(fragment) }
end

#all_fragments ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/graboid/scraper.rb', line 92

# Gathers the root fragments of the scrape, following pagination when the
# scraper responds to #pager.
#
# Without a pager (or when the pager yields nil for the starting document)
# only the current page's fragments are returned. Otherwise each visited
# page's fragments are appended to #collection, the paginate callbacks run
# around every page change, and the source that was active on entry is put
# back before returning.
#
# @return [Object] page_fragments in the single-page case, otherwise the
#   accumulated collection
def all_fragments
  return page_fragments unless self.respond_to?(:pager)
  return page_fragments if self.pager(self.doc).nil?

  old_source = self.source

  begin
    while next_page?
      self.collection += page_fragments
      run_before_paginate_callbacks
      paginate
      run_after_paginate_callbacks
    end

    # With no page limit the loop stops as soon as the pager reports no
    # further page, i.e. while positioned on the final page and before that
    # page has been harvested. Collect it here so it is not dropped.
    self.collection += page_fragments if max_pages.zero?
  ensure
    # Restore the entry source even when a page fails part-way through.
    self.source = old_source
  end

  self.collection
end

#attribute_map ⇒ Object



108
109
110
# File 'lib/graboid/scraper.rb', line 108

# Instance-level shortcut to the attribute map held by the object's class.
#
# @return [Object] whatever the class-level attribute_map returns
def attribute_map
  owner = self.class
  owner.attribute_map
end

#callbacks ⇒ Object



112
113
114
# File 'lib/graboid/scraper.rb', line 112

# Instance-level shortcut to the callbacks held by the object's class.
#
# @return [Object] whatever the class-level callbacks method returns
def callbacks
  owner = self.class
  owner.callbacks
end

#collection ⇒ Object



116
117
118
# File 'lib/graboid/scraper.rb', line 116

# Returns the stored collection, creating an empty Array the first time it
# is requested (or whenever the stored value is nil/false).
#
# @return [Object] the stored collection
def collection
  @collection = [] unless @collection
  @collection
end

#collection=(col) ⇒ Object



120
121
122
# File 'lib/graboid/scraper.rb', line 120

# Replaces the stored collection.
#
# @param col [Object] the new collection
# @return [Object] col
def collection=(col)
  instance_variable_set(:@collection, col)
end

#current_page ⇒ Object



124
125
126
# File 'lib/graboid/scraper.rb', line 124

# Returns the stored page counter, initialising it to 0 on first use (or
# whenever the stored value is nil/false).
#
# @return [Object] the stored page counter
def current_page
  @current_page = 0 unless @current_page
  @current_page
end

#current_page=(num) ⇒ Object



128
129
130
# File 'lib/graboid/scraper.rb', line 128

# Overwrites the stored page counter.
#
# @param num [Object] the new counter value
# @return [Object] num
def current_page=(num)
  @current_page = num
end

#doc ⇒ Object



132
133
134
# File 'lib/graboid/scraper.rb', line 132

# Parses the current source with the Nokogiri entry point named after #mode
# (:html -> Nokogiri::HTML, :xml -> Nokogiri::XML).
#
# The parser is selected with public_send rather than by building a Ruby
# source string and eval-ing it. Nothing is cached here: every call reads
# the source and parses it again.
#
# @return [Object] whatever the selected Nokogiri parser returns for
#   read_source
def doc
  parser_name = self.mode.to_s.upcase
  Nokogiri.public_send(parser_name, read_source)
end

#extract_instance(fragment) ⇒ Object



136
137
138
# File 'lib/graboid/scraper.rb', line 136

# Wraps the attributes extracted from one fragment in an OpenStruct.
#
# @param fragment [Object] the fragment handed on to hash_map
# @return [OpenStruct] a struct built from hash_map's result
def extract_instance(fragment)
  attributes = hash_map(fragment)
  OpenStruct.new(attributes)
end

#hash_map(fragment) ⇒ Object



140
141
142
143
144
145
146
147
148
# File 'lib/graboid/scraper.rb', line 140

# Extracts one value per attribute_map entry from the given fragment.
#
# For each entry the :selector is run against the fragment (css in :html
# mode, xpath otherwise) and only the first matching node is used. With a
# :processor the node is handed to processor.call; without one the node's
# inner_html is taken.
#
# When the selector matches nothing and there is no processor, the
# attribute is stored as nil instead of raising NoMethodError on nil. A
# processor still receives nil in that situation and decides for itself.
#
# @param fragment [Object] node-like object responding to css / xpath
# @return [Hash] attribute name => extracted value
def hash_map(fragment)
  attribute_map.each_with_object({}) do |entry, extracted|
    name      = entry.first
    spec      = entry.last
    selector  = spec[:selector]
    processor = spec[:processor]

    matches = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
    node    = matches.first

    extracted[name] =
      if !processor.nil?
        processor.call(node)
      elsif node.nil?
        nil
      else
        node.inner_html
      end
  end
end

#host ⇒ Object



204
205
206
# File 'lib/graboid/scraper.rb', line 204

# Returns the leading part of the source that matches
# "http(s)://" followed by everything up to the last "/" on that line.
#
# Because the match is greedy, a source with a path yields the URL up to
# and including the final slash, not just the host name.
#
# @return [String, nil] the matched prefix, or nil when the source contains
#   no such match
def host
  current = self.source
  current[/http[s]?:\/\/.*\//]
end

#initialize(opts = {}, &block) ⇒ Object

Raises:

  • (ArgumentError)


78
79
80
81
# File 'lib/graboid/scraper.rb', line 78

# Builds a scraper around the given source.
#
# @param opts [Hash] must carry a non-nil :source entry
# @param block [Proc] accepted but not used by this method
# @raise [ArgumentError] when opts[:source] is nil
def initialize(opts = {}, &block)
  given_source = opts[:source]
  raise ArgumentError if given_source.nil?

  self.source = given_source
end

#max_pages ⇒ Object



150
151
152
# File 'lib/graboid/scraper.rb', line 150

# Returns the stored page limit, initialising it to 0 on first use (or
# whenever the stored value is nil/false).
#
# @return [Object] the stored page limit
def max_pages
  @max_pages = 0 unless @max_pages
  @max_pages
end

#max_pages=(num) ⇒ Object



154
155
156
# File 'lib/graboid/scraper.rb', line 154

# Overwrites the stored page limit.
#
# @param num [Object] the new limit
# @return [Object] num
def max_pages=(num)
  @max_pages = num
end

#mode ⇒ Object



158
159
160
# File 'lib/graboid/scraper.rb', line 158

# Returns the stored parsing mode, initialising it to :html on first use
# (or whenever the stored value is nil/false).
#
# @return [Object] the stored mode
def mode
  @mode = :html unless @mode
  @mode
end

#mode=(m) ⇒ Object

Raises:

  • (ArgumentError)


162
163
164
165
# File 'lib/graboid/scraper.rb', line 162

# Sets the parsing mode; only :html and :xml are accepted.
#
# @param m [Symbol] :html or :xml
# @return [Symbol] m
# @raise [ArgumentError] for any other value; the stored mode is left as it
#   was
def mode=(m)
  case m
  when :html, :xml
    @mode = m
  else
    raise ArgumentError
  end
end

#next_page? ⇒ Boolean

Returns:

  • (Boolean)


167
168
169
170
171
172
173
# File 'lib/graboid/scraper.rb', line 167

# Tells whether pagination should continue.
#
# With max_pages at 0 the answer depends only on the pager: true while it
# yields something non-nil for the current document. With a non-zero
# max_pages the pager is not consulted and the answer is whether
# current_page is still at most max_pages - 1.
#
# Always returns a real boolean; the pager branch yields false rather than
# nil when there is no further page.
#
# @return [Boolean]
def next_page?
  return !self.pager(doc).nil? if max_pages.zero?

  current_page <= max_pages - 1
end

#original_source ⇒ Object



175
176
177
# File 'lib/graboid/scraper.rb', line 175

# Reads the stored original source.
#
# @return [Object, nil] the value of @original_source, nil when unset
def original_source
  instance_variable_get(:@original_source)
end

#page_fragments ⇒ Object



179
180
181
# File 'lib/graboid/scraper.rb', line 179

# Selects the root fragments of the current document using the class-level
# root_selector as a CSS selector.
#
# @return [Object] the result of doc.css for that selector
def page_fragments
  document = doc
  selector = self.class.root_selector
  document.css(selector)
end

#paginate ⇒ Object



183
184
185
186
187
# File 'lib/graboid/scraper.rb', line 183

# Moves to the next page: the pager's result for the current document
# becomes the new source and the page counter goes up by one.
#
# @return [Object] the incremented page counter
def paginate
  self.source = self.pager(doc)
  self.current_page = current_page + 1
end

#read_source ⇒ Object



189
190
191
192
193
194
195
196
# File 'lib/graboid/scraper.rb', line 189

# Produces the raw input for the parser from the current source.
#
# A source that starts with http:// or https:// is fetched with the
# Graboid.user_agent value sent as the User-Agent header; any other String
# is returned as-is (treated as literal markup). Every other kind of source
# yields nil.
#
# The scheme check is anchored with \A so that a multi-line String which
# merely contains a URL at the start of a later line is treated as markup
# instead of being opened. The fetch goes through URI#open (open-uri)
# rather than Kernel#open, which no longer opens URLs on Ruby 3.0+ and
# gives special meaning to strings beginning with "|".
#
# NOTE(review): relies on open-uri already being loaded, as the previous
# Kernel#open based fetch did -- confirm against the file's requires.
#
# @return [Object, String, nil] the opened remote stream, the literal
#   source string, or nil
def read_source
  case self.source
  when /\Ahttps?:\/\//
    URI.parse(self.source).open("User-Agent" => Graboid.user_agent)
  when String
    self.source
  end
end

#reset_context ⇒ Object



198
199
200
201
202
# File 'lib/graboid/scraper.rb', line 198

# Puts the scraping state back to its starting values: an empty collection
# and both the page counter and the page limit at 0.
#
# @return [Integer] 0
def reset_context
  self.collection = []
  self.current_page = 0
  self.max_pages = 0
end

#source ⇒ Object



208
209
210
# File 'lib/graboid/scraper.rb', line 208

# Reads the stored current source.
#
# @return [Object, nil] the value of @source, nil when unset
def source
  instance_variable_get(:@source)
end

#source=(src) ⇒ Object



212
213
214
215
# File 'lib/graboid/scraper.rb', line 212

# Sets the current source. The first value assigned while @original_source
# is still nil is also remembered as the original source; later
# assignments leave that one alone.
#
# @param src [Object] the new source
# @return [Object] src
def source=(src)
  if @original_source.nil?
    @original_source = src
  end

  @source = src
end