Module: Graboid::Scraper::InstanceMethods

Defined in:
lib/graboid/scraper.rb

Instance Method Summary

Instance Method Details

#all(opts = {}, reload = false) ⇒ Object Also known as: scrape



69
70
71
72
73
74
# File 'lib/graboid/scraper.rb', line 69

# Scrapes the source and returns one extracted record per fragment.
#
# The context is reset first, then :max_pages (when present in opts) is
# applied, and every fragment from all_fragments is passed through
# extract_instance.
#
# @param opts [Hash] only the :max_pages key is read here
# @param reload [Boolean] when truthy and the stored collection is not empty,
#   that collection is returned untouched and nothing is scraped
# @return [Array] the extract_instance results, or the stored collection on
#   the early-return path
#
# NOTE(review): the early return fires when reload is truthy, which reads
# inverted for a flag with this name, and it hands back the stored fragments
# rather than extracted records -- confirm the intent with callers.
def all(opts = {}, reload = false)
  return collection if reload && !collection.empty?

  reset_context
  page_limit = opts[:max_pages]
  self.max_pages = page_limit if page_limit.present?
  all_fragments.map { |fragment| extract_instance(fragment) }
end

#all_fragments ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/graboid/scraper.rb', line 78

# Gathers the fragments to extract from, following pagination when the class
# has a pager.
#
# Without a pager only the current page's fragments are returned and the
# stored collection is left alone. With a pager, each pass appends the current
# page's fragments to the collection and then calls paginate, wrapped by the
# before/after paginate callbacks, for as long as next_page? holds.
#
# @return [Object] page_fragments when no pager is set, otherwise the
#   accumulated collection
def all_fragments
  return page_fragments if self.class.pager.nil?

  original_source = source
  begin
    while next_page?
      self.collection = collection + page_fragments
      run_before_paginate_callbacks
      paginate
      run_after_paginate_callbacks
    end
  ensure
    # paginate repoints source at each following page; put the starting
    # source back even when a page or callback raises, so the scraper is not
    # left stranded mid-pagination.
    self.source = original_source
  end

  collection
end

#attribute_map ⇒ Object



93
94
95
# File 'lib/graboid/scraper.rb', line 93

# Instance-level shortcut to the attribute map kept on the class.
#
# @return [Object] whatever self.class.attribute_map returns
def attribute_map
  owner = self.class
  owner.attribute_map
end

#callbacks ⇒ Object



97
98
99
# File 'lib/graboid/scraper.rb', line 97

# Instance-level shortcut to the callbacks kept on the class.
#
# @return [Object] whatever self.class.callbacks returns
def callbacks
  owner = self.class
  owner.callbacks
end

#collection ⇒ Object



101
102
103
# File 'lib/graboid/scraper.rb', line 101

# The stored collection, created as an empty array on first access.
#
# @return [Array] the same array object on every call until it is reassigned
def collection
  @collection = [] unless @collection
  @collection
end

#collection=(col) ⇒ Object



105
106
107
# File 'lib/graboid/scraper.rb', line 105

# Replaces the stored collection.
#
# @param col [Object] the new collection; stored as given, without copying
# @return [Object] col
def collection=(col)
  instance_variable_set(:@collection, col)
end

#current_page ⇒ Object



109
110
111
# File 'lib/graboid/scraper.rb', line 109

# The page counter, starting at 0 when it has not been set yet.
#
# @return [Object] the stored counter (0 by default)
def current_page
  @current_page = 0 unless @current_page
  @current_page
end

#current_page=(num) ⇒ Object



113
114
115
# File 'lib/graboid/scraper.rb', line 113

# Sets the page counter.
#
# @param num [Object] the new counter value; stored as given
# @return [Object] num
def current_page=(num)
  @current_page = num
end

#doc ⇒ Object



117
118
119
# File 'lib/graboid/scraper.rb', line 117

# Parses the current source with the Nokogiri parser named by mode.
#
# The upper-cased mode selects the parser method (Nokogiri::HTML for :html,
# Nokogiri::XML for :xml), which is called with the result of read_source.
# Nothing is cached here: every call reads and parses the source again.
#
# The parser is chosen by method dispatch rather than by eval-ing a built
# string, so a mode value can never be executed as Ruby code.
#
# @return [Object] whatever the selected Nokogiri parser returns
# @raise [NoMethodError] when mode does not name a public Nokogiri method
def doc
  parser_name = mode.to_s.upcase
  Nokogiri.public_send(parser_name, read_source)
end

#extract_instance(fragment) ⇒ Object



121
122
123
# File 'lib/graboid/scraper.rb', line 121

# Wraps the values extracted from one fragment in an OpenStruct.
#
# @param fragment [Object] passed straight through to hash_map
# @return [OpenStruct] built from the hash that hash_map returns
def extract_instance(fragment)
  extracted = hash_map(fragment)
  OpenStruct.new(extracted)
end

#hash_map(fragment) ⇒ Object



125
126
127
128
129
130
131
132
133
# File 'lib/graboid/scraper.rb', line 125

# Builds a hash of extracted values for one fragment, one entry per
# attribute_map entry.
#
# For each entry the :selector is run against the fragment (css in :html
# mode, xpath otherwise) and only the first matching node is used. With a
# :processor the processor is called with that node (nil when nothing
# matched); without one the node's inner_html is stored.
#
# @param fragment [Object] must respond to css / xpath
# @return [Hash] attribute name => extracted value; nil when there is no
#   processor and the selector matched nothing (this used to raise
#   NoMethodError on nil)
def hash_map(fragment)
  attribute_map.each_with_object({}) do |entry, extracted|
    name      = entry.first
    options   = entry.last
    selector  = options[:selector]
    processor = options[:processor]

    matches = mode == :html ? fragment.css(selector) : fragment.xpath(selector)
    node    = matches.first

    extracted[name] =
      if !processor.nil?
        processor.call(node)
      elsif node
        node.inner_html
      end
  end
end

#initialize(opts = {}, &block) ⇒ Object

Raises:

  • (ArgumentError)


64
65
66
67
# File 'lib/graboid/scraper.rb', line 64

# Creates a scraper for the given source.
#
# @param opts [Hash] must carry a :source value for which present? is true
# @param block [Proc] accepted but not used by this method
# @raise [ArgumentError] when opts[:source] is not present
def initialize(opts = {}, &block)
  given_source = opts[:source]
  raise ArgumentError unless given_source.present?

  self.source = given_source
end

#max_pages ⇒ Object



135
136
137
# File 'lib/graboid/scraper.rb', line 135

# The page limit, starting at 0 when it has not been set yet.
# next_page? treats a zero limit as "no fixed limit" and asks the pager.
#
# @return [Object] the stored limit (0 by default)
def max_pages
  @max_pages = 0 unless @max_pages
  @max_pages
end

#max_pages=(num) ⇒ Object



139
140
141
# File 'lib/graboid/scraper.rb', line 139

# Sets the page limit.
#
# @param num [Object] the new limit; stored as given
# @return [Object] num
def max_pages=(num)
  @max_pages = num
end

#mode ⇒ Object



143
144
145
# File 'lib/graboid/scraper.rb', line 143

# The parsing mode, :html unless another one has been set.
#
# @return [Object] the stored mode (:html by default)
def mode
  @mode = :html unless @mode
  @mode
end

#mode=(m) ⇒ Object

Raises:

  • (ArgumentError)


147
148
149
150
# File 'lib/graboid/scraper.rb', line 147

# Sets the parsing mode.
#
# @param m [Symbol] :html or :xml
# @return [Symbol] m
# @raise [ArgumentError] for any other value; the stored mode is left as is
def mode=(m)
  case m
  when :html, :xml
    @mode = m
  else
    raise ArgumentError
  end
end

#next_page? ⇒ Boolean

Returns:

  • (Boolean)


152
153
154
155
156
157
158
# File 'lib/graboid/scraper.rb', line 152

# Tells the pagination loop whether to run another pass.
#
# With max_pages at zero the class pager is called with the parsed document
# and the answer is whether it returned something other than nil. With a
# non-zero max_pages the pager is not consulted: the answer is whether
# current_page is still at or below max_pages - 1.
#
# @return [Boolean] always true or false (the no-next-page case used to
#   return nil)
def next_page?
  return !self.class.pager.call(doc).nil? if max_pages.zero?

  current_page <= max_pages - 1
end

#page_fragments ⇒ Object



160
161
162
# File 'lib/graboid/scraper.rb', line 160

# Selects the fragments of the current document with the class's root
# selector. The lookup always goes through css, whatever the mode.
#
# @return [Object] the result of doc.css for self.class.root_selector
def page_fragments
  document = doc
  document.css(self.class.root_selector)
end

#paginate ⇒ Object



164
165
166
167
168
# File 'lib/graboid/scraper.rb', line 164

# Moves the scraper on by one page.
#
# The class pager is called with the parsed document and its result becomes
# the new source; current_page is then incremented. Any StandardError raised
# while parsing the document or running the pager is swallowed and leaves
# the source set to nil.
#
# @return [Object] the incremented current_page
def paginate
  upcoming_url =
    begin
      self.class.pager.call(doc)
    rescue StandardError
      nil
    end

  self.source = upcoming_url
  self.current_page = current_page + 1
end

#read_source ⇒ Object



170
171
172
173
174
175
176
177
# File 'lib/graboid/scraper.rb', line 170

# Resolves the current source into something that can be parsed.
#
# A string that begins with http:// or https:// is opened with a User-Agent
# header taken from Graboid.user_agent; any other string is returned as-is
# and treated as the markup itself.
#
# The scheme check is anchored with \A, i.e. to the start of the whole
# string. The previous ^ anchor matched at the start of any line, so
# multi-line markup containing a line that began with "http://" was mistaken
# for a URL.
#
# NOTE(review): this relies on Kernel#open accepting URLs, which presumably
# means open-uri is loaded elsewhere -- confirm.
#
# @return [Object, nil] the opened handle, the raw string, or nil when the
#   source is not a String
def read_source
  current = source

  case current
  when %r{\Ahttps?://}
    open(current, "User-Agent" => Graboid.user_agent)
  when String
    current
  end
end

#reset_context ⇒ Object



179
180
181
182
183
# File 'lib/graboid/scraper.rb', line 179

# Puts the scraping state back to its starting values: an empty collection,
# page counter 0 and page limit 0.
#
# @return [Integer] 0, the value of the last assignment
def reset_context
  # Drop everything gathered by an earlier run.
  self.collection = []
  # Start counting pages from the beginning again.
  self.current_page = 0
  # Zero means no fixed page limit for next_page?.
  self.max_pages = 0
end

#source ⇒ Object



185
186
187
# File 'lib/graboid/scraper.rb', line 185

# The source currently being scraped.
#
# @return [Object, nil] the stored source, nil when none has been set
def source
  instance_variable_get(:@source)
end

#source=(src) ⇒ Object



189
190
191
# File 'lib/graboid/scraper.rb', line 189

# Points the scraper at a new source.
#
# @param src [Object] the new source; stored as given
# @return [Object] src
def source=(src)
  instance_variable_set(:@source, src)
end