Module: Graboid::Entity::ClassMethods

Defined in:
lib/graboid/entity.rb

Instance Method Summary collapse

Instance Method Details

#all(opts = {}) ⇒ Object



107
108
109
110
111
# File 'lib/graboid/entity.rb', line 107

# Builds one entity instance for every fragment found in the source.
#
# Pagination state is reset first; the optional :max_pages entry of +opts+
# is applied afterwards, so it survives the reset.
#
# @param opts [Hash] options; only :max_pages is read here
# @return [Array] the result of #extract_instance for each fragment
def all(opts = {})
  reset_context
  page_limit = opts[:max_pages]
  self.max_pages = page_limit if page_limit.present?
  all_fragments.map { |fragment| extract_instance(fragment) }
end

#all_fragments ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/graboid/entity.rb', line 76

# Collects the fragments of every page reachable from the current source.
#
# Without a registered pager block (@pager is nil) this is simply the
# fragments of the current page. With a pager, pages are visited while
# #next_page? is truthy; each page's fragments are appended to #collection
# and the before/after paginate callbacks are run around #paginate.
#
# The source that was configured on entry is always restored, even when a
# page fetch or a callback raises part-way through, so a failed run does not
# leave the class pointing at an intermediate page.
#
# NOTE(review): when #next_page? turns falsy because the pager yields nil,
# the loop exits before that final page's fragments are appended — confirm
# whether skipping the last page is intended.
#
# @return [Object] the page fragments (no pager) or the accumulated collection
def all_fragments
  return page_fragments if @pager.nil?

  original_source = source
  begin
    while next_page?
      self.collection += page_fragments
      run_before_paginate_callbacks
      paginate
      run_after_paginate_callbacks
    end
  ensure
    # Pagination overwrites source with each next-page URL; put it back.
    self.source = original_source
  end
  collection
end

#attribute_map ⇒ Object



58
59
60
# File 'lib/graboid/entity.rb', line 58

# Returns whatever is stored under the :attribute_map inheritable attribute.
# Other methods in this module index it by attribute name and read
# :selector / :processor from each entry.
def attribute_map
  read_inheritable_attribute(:attribute_map)
end

#collection ⇒ Object



50
51
52
# File 'lib/graboid/entity.rb', line 50

# Fragments accumulated so far; starts out as an empty array the first time
# it is read (or whenever it has been set to nil/false).
def collection
  @collection = [] unless @collection
  @collection
end

#collection=(col) ⇒ Object



54
55
56
# File 'lib/graboid/entity.rb', line 54

# Replaces the accumulated fragment collection with +col+.
#
# @param col [Object] the new collection
# @return [Object] the assigned value
def collection=(col)
  @collection = col
end

#current_page ⇒ Object



149
150
151
# File 'lib/graboid/entity.rb', line 149

# Number of pages paginated so far in the current run; defaults to 0.
def current_page
  @current_page || (@current_page = 0)
end

#current_page=(num) ⇒ Object



153
154
155
# File 'lib/graboid/entity.rb', line 153

# Sets the page counter.
#
# @param num [Object] the new counter value
# @return [Object] the assigned value
def current_page=(num)
  @current_page = num
end

#doc ⇒ Object



46
47
48
# File 'lib/graboid/entity.rb', line 46

# Parses the current source with the Nokogiri parser matching #mode.
#
# The parser entry point is the upper-cased mode name (Nokogiri::HTML or
# Nokogiri::XML for the modes accepted by #mode=). It is dispatched with
# public_send instead of building and eval-ing a code string.
#
# @return [Object] whatever the Nokogiri parser returns for #read_source
def doc
  Nokogiri.public_send(mode.to_s.upcase, read_source)
end

#extract_instance(fragment) ⇒ Object



62
63
64
# File 'lib/graboid/entity.rb', line 62

# Instantiates the entity from a single fragment.
#
# @param fragment [Object] a fragment accepted by #hash_map
# @return [Object] the result of +new+ called with the extracted attributes
def extract_instance(fragment)
  attributes = hash_map(fragment)
  new(attributes)
end

#hash_map(fragment) ⇒ Object



66
67
68
69
70
71
72
73
74
# File 'lib/graboid/entity.rb', line 66

# Extracts a { attribute name => value } hash from one fragment.
#
# For every entry of #attribute_map the entry's :selector is evaluated
# against the fragment (CSS when #mode is :html, XPath otherwise) and the
# first matching node is used:
# * with a :processor, the value is processor.call(node) — node may be nil;
# * without one, the value is the node's inner_html, or nil when nothing
#   matched (previously this case raised NoMethodError on nil).
#
# @param fragment [Object] an object responding to #css / #xpath
# @return [Hash] extracted values keyed by attribute name
def hash_map(fragment)
  attribute_map.each_with_object({}) do |(name, options), extracted|
    selector  = options[:selector]
    processor = options[:processor]
    matches   = mode == :html ? fragment.css(selector) : fragment.xpath(selector)
    node      = matches.first

    extracted[name] =
      if processor.nil?
        node && node.inner_html
      else
        processor.call(node)
      end
  end
end

#inferred_selector ⇒ Object



42
43
44
# File 'lib/graboid/entity.rb', line 42

# Default root selector: a CSS class selector built from the receiver's
# underscored string form. Computed once and cached.
def inferred_selector
  return @inferred_selector if @inferred_selector

  @inferred_selector = ".#{to_s.underscore}"
end

#max_pages ⇒ Object



141
142
143
# File 'lib/graboid/entity.rb', line 141

# Page limit for pagination; defaults to 0. #next_page? treats zero as
# "no limit" and defers to the pager block instead.
def max_pages
  @max_pages = 0 unless @max_pages
  @max_pages
end

#max_pages=(num) ⇒ Object



145
146
147
# File 'lib/graboid/entity.rb', line 145

# Sets the page limit used by #next_page?.
#
# @param num [Object] the new limit
# @return [Object] the assigned value
def max_pages=(num)
  @max_pages = num
end

#mode ⇒ Object



132
133
134
# File 'lib/graboid/entity.rb', line 132

# Current parsing mode; :html unless another mode has been assigned.
def mode
  @mode || (@mode = :html)
end

#mode=(m) ⇒ Object

Raises:

  • (ArgumentError)


136
137
138
139
# File 'lib/graboid/entity.rb', line 136

# Sets the parsing mode.
#
# @param m [Symbol] either :html or :xml
# @return [Symbol] the assigned mode
# @raise [ArgumentError] when +m+ is anything else; the message names the
#   accepted values and the rejected one
def mode=(m)
  unless [:html, :xml].include?(m)
    raise ArgumentError, "mode must be :html or :xml, got #{m.inspect}"
  end

  @mode = m
end

#next_page? ⇒ Boolean

Returns:

  • (Boolean)


95
96
97
98
99
100
101
# File 'lib/graboid/entity.rb', line 95

# Whether pagination should continue.
#
# With no page limit (#max_pages is zero) the pager block decides: another
# page exists as long as it yields a non-nil value for the current document.
# With a limit, pages are counted against it instead and the pager is not
# consulted.
#
# Always returns a real boolean (the unlimited branch used to return nil
# instead of false).
#
# @return [Boolean]
def next_page?
  if max_pages.zero?
    !@pager.call(doc).nil?
  else
    current_page < max_pages
  end
end

#page_fragments ⇒ Object



103
104
105
# File 'lib/graboid/entity.rb', line 103

# Fragments on the current page: every node of the parsed document that
# matches the root selector (always evaluated as a CSS selector).
def page_fragments
  document = doc
  document.css(root_selector)
end

#pager(&block) ⇒ Object



128
129
130
# File 'lib/graboid/entity.rb', line 128

# Registers the block used to find the next page. Elsewhere in this module
# it is called with the parsed document; its result becomes the next source
# and nil signals that there is no further page.
#
# @return [Proc, nil] the stored block (nil when called without one)
def pager(&block)
  @pager = block
end

#paginate ⇒ Object



89
90
91
92
93
# File 'lib/graboid/entity.rb', line 89

# Advances to the next page: asks the pager block for the next source,
# stores it as #source and bumps the page counter.
#
# Any StandardError raised while parsing the document or running the pager
# is deliberately swallowed and treated as "no next page" (source becomes
# nil).
#
# @return [Integer] the incremented page counter
def paginate
  upcoming =
    begin
      @pager.call(doc)
    rescue StandardError
      nil
    end

  self.source = upcoming
  self.current_page += 1
end

#read_source ⇒ Object



119
120
121
122
123
124
125
126
# File 'lib/graboid/entity.rb', line 119

# Resolves #source to something the parser can read.
#
# * A string that *starts* with http:// or https:// is fetched, sending
#   Graboid.user_agent as the User-Agent header.
# * Any other string is treated as inline markup and returned unchanged.
# * Anything else (e.g. nil) yields nil.
#
# The URL test is anchored with \A rather than ^: ^ matches at every line
# start, so multi-line markup containing a line that begins with "http://"
# was mistaken for a URL and handed to open.
#
# @return [Object, nil] an opened remote stream, the inline string, or nil
def read_source
  case source
  when %r{\Ahttps?://}
    headers = { "User-Agent" => Graboid.user_agent }
    # URI.open is open-uri's public entry point on newer Rubies (Kernel#open
    # stopped opening URLs in Ruby 3.0); fall back to open where it is absent.
    URI.respond_to?(:open) ? URI.open(source, headers) : open(source, headers)
  when String
    source
  end
end

#reset_context ⇒ Object



113
114
115
116
117
# File 'lib/graboid/entity.rb', line 113

# Clears per-run pagination state: empties the collection and zeroes both
# the page counter and the page limit.
#
# @return [Integer] 0 (the value of the last assignment)
def reset_context
  self.collection = []
  self.current_page = 0
  self.max_pages = 0
end

#root_selector ⇒ Object



38
39
40
# File 'lib/graboid/entity.rb', line 38

# Selector used to locate fragments: the explicitly configured one when
# present, otherwise the inferred default.
def root_selector
  return @root_selector if @root_selector

  inferred_selector
end

#selector(selector) ⇒ Object Also known as: root



32
33
34
# File 'lib/graboid/entity.rb', line 32

# Configures the root selector explicitly, overriding the inferred default.
#
# @param selector [Object] the selector to store
# @return [Object] the stored selector
def selector(selector)
  @root_selector = selector
end

#set(name, opts = {}, &block) ⇒ Object Also known as: field



23
24
25
26
27
28
# File 'lib/graboid/entity.rb', line 23

# Declares an attribute to extract.
#
# The stored options are a copy of +opts+ with:
# * :selector defaulting to ".<name>" when none (or a blank one) is given;
# * :processor set to the block, when a block is supplied.
#
# +opts+ itself is never modified, so one options hash can safely be reused
# for several fields (previously the first call wrote its default selector
# into the shared hash and later fields inherited it).
#
# @param name [Object] attribute name, also the key in #attribute_map
# @param opts [Hash] optional :selector / :processor settings
# @return [Hash] the options stored for +name+
def set(name, opts = {}, &block)
  options = opts.dup
  options[:selector]  = ".#{name}" unless options[:selector].present?
  options[:processor] = block if block

  attribute_map[name] = options
end

#source ⇒ Object



15
16
17
# File 'lib/graboid/entity.rb', line 15

# The configured source, as last assigned through #source=.
# #read_source fetches it when it is an http(s) URL string and returns any
# other string as-is.
def source
  @source
end

#source=(src) ⇒ Object



19
20
21
# File 'lib/graboid/entity.rb', line 19

# Sets the source to read from.
#
# @param src [Object] the new source (a URL or inline markup string, per
#   #read_source)
# @return [Object] the assigned value
def source=(src)
  @source = src
end