Module: Graboid::Entity::ClassMethods

Defined in:
lib/graboid/entity.rb

Instance Method Summary collapse

Instance Method Details

#all(opts = {}) ⇒ Object



126
127
128
129
130
# File 'lib/graboid/entity.rb', line 126

# Scrapes the source and builds one entity per matched fragment.
#
# Paging state from any earlier run is cleared first (which also resets the
# page limit), and only then is the optional +:max_pages+ limit applied.
#
# @param opts [Hash] options; only +:max_pages+ is read here
# @return [Array] the results of +extract_instance+, one per fragment
def all(opts = {})
  reset_context
  page_limit = opts[:max_pages]
  self.max_pages = page_limit unless page_limit.nil?
  all_fragments.map { |fragment| extract_instance(fragment) }
end

#all_fragments ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/graboid/entity.rb', line 95

# Collects the root fragments from every page of the source.
#
# Without a pager block the fragments of the current document are returned
# directly. With one, pages are walked for as long as +next_page?+ holds:
# each page's fragments are appended to +collection+, then +paginate+ moves
# +source+ on to the next page, wrapped by the before/after paginate
# callbacks.
#
# @return [Object] the current page's fragments when no pager is set,
#   otherwise the accumulated +collection+
def all_fragments
  return page_fragments if @pager.nil?

  starting_source = source
  begin
    while next_page?
      self.collection = collection + page_fragments
      run_before_paginate_callbacks
      paginate
      run_after_paginate_callbacks
    end
  ensure
    # Restore the starting source even when a callback, fetch or pager
    # raises, so a failed walk does not leave +source+ on a later page.
    self.source = starting_source
  end

  collection
end

#attribute_map ⇒ Object



77
78
79
# File 'lib/graboid/entity.rb', line 77

# Looks up the +:attribute_map+ inheritable attribute.
#
# @return [Object] whatever +read_inheritable_attribute+ yields for that key
def attribute_map
  read_inheritable_attribute(:attribute_map)
end

#collection ⇒ Object



69
70
71
# File 'lib/graboid/entity.rb', line 69

# Returns the stored collection, creating an empty array on first use.
#
# @return [Object] the value of +@collection+ (an empty Array if it was unset)
def collection
  @collection = [] unless @collection
  @collection
end

#collection=(col) ⇒ Object



73
74
75
# File 'lib/graboid/entity.rb', line 73

# Replaces the stored collection.
#
# @param col [Object] the new collection; stored as-is in +@collection+
# @return [Object] the assigned value
def collection=(col)
  @collection = col
end

#current_page ⇒ Object



168
169
170
# File 'lib/graboid/entity.rb', line 168

# Returns the page counter, starting it at 0 on first use.
#
# @return [Object] the value of +@current_page+ (0 if it was unset)
def current_page
  @current_page = 0 unless @current_page
  @current_page
end

#current_page=(num) ⇒ Object



172
173
174
# File 'lib/graboid/entity.rb', line 172

# Sets the page counter.
#
# @param num [Object] the new counter value; stored as-is in +@current_page+
# @return [Object] the assigned value
def current_page=(num)
  @current_page = num
end

#doc ⇒ Object



65
66
67
# File 'lib/graboid/entity.rb', line 65

# Parses the current source with the Nokogiri entry point matching +mode+.
#
# The upper-cased mode name (+HTML+ or +XML+ for the modes accepted by
# +mode=+) is dispatched as a method on +Nokogiri+, so no code string is
# built and evaluated. The source is re-read and re-parsed on every call.
#
# @return [Object] whatever the selected Nokogiri parser returns for
#   +read_source+
def doc
  parser = mode.to_s.upcase
  Nokogiri.public_send(parser, read_source)
end

#extract_instance(fragment) ⇒ Object



81
82
83
# File 'lib/graboid/entity.rb', line 81

# Builds a new instance from the attributes extracted out of +fragment+.
#
# @param fragment [Object] a document fragment, passed straight to +hash_map+
# @return [Object] the result of +new+ called with the extracted hash
def extract_instance(fragment)
  attributes = hash_map(fragment)
  new(attributes)
end

#hash_map(fragment) ⇒ Object



85
86
87
88
89
90
91
92
93
# File 'lib/graboid/entity.rb', line 85

# Extracts one value per mapped attribute from +fragment+.
#
# For every entry in +attribute_map+ the entry's +:selector+ is run against
# the fragment (+css+ in +:html+ mode, +xpath+ otherwise) and only the first
# matching node is used. With no +:processor+ the node's +inner_html+ is
# stored; otherwise the processor is called with the node and its result is
# stored.
#
# NOTE(review): no fallback when nothing matches; if +first+ then returns
# nil, the no-processor path presumably raises NoMethodError. Confirm that
# this is intended.
#
# @param fragment [Object] an object responding to +css+ / +xpath+
# @return [Hash] attribute name => extracted value
def hash_map(fragment)
  attribute_map.each_with_object({}) do |(name, options), extracted|
    selector   = options[:selector]
    processor  = options[:processor]
    nodes      = mode == :html ? fragment.css(selector) : fragment.xpath(selector)
    first_node = nodes.first

    extracted[name] =
      if processor.nil?
        first_node.inner_html
      else
        processor.call(first_node)
      end
  end
end

#inferred_selector ⇒ Object



61
62
63
# File 'lib/graboid/entity.rb', line 61

# Derives a default selector from the receiver's name and caches it.
#
# The selector is a dot followed by +to_s.underscore+ of the receiver.
# +underscore+ is not core Ruby; presumably it comes from ActiveSupport
# (TODO confirm).
#
# @return [String] the cached selector
def inferred_selector
  return @inferred_selector if @inferred_selector

  @inferred_selector = ".#{to_s.underscore}"
end

#inherited(subclass) ⇒ Object



27
28
29
30
31
32
# File 'lib/graboid/entity.rb', line 27

# Copies every registered inheritable instance variable onto +subclass+.
#
# Values are copied by reference, so the subclass initially shares the
# same objects as the receiver. When no attribute has been registered yet
# (+@inherited_attributes+ is nil) nothing is copied; previously this case
# raised NoMethodError.
#
# NOTE(review): +super+ is not called, so any other +inherited+ hook
# further up the chain is skipped. Confirm that is intended.
#
# @param subclass [Class] the class being created
# @return [Array] the list of attribute names that were processed
def inherited(subclass)
  (@inherited_attributes || []).each do |attribute_name|
    ivar = "@#{attribute_name}"
    subclass.instance_variable_set(ivar, instance_variable_get(ivar))
  end
end

#inherited_attributes(*args) ⇒ Object



16
17
18
19
20
21
22
23
24
25
# File 'lib/graboid/entity.rb', line 16

# Registers class-level attributes that +inherited+ copies to subclasses,
# and defines a reader/writer pair for each on the receiver's singleton
# class.
#
# Called with no arguments it simply returns the current list. Names that
# are already registered are not added a second time.
#
# @param args [Array<Symbol, String>] attribute names to register
# @return [Array] all registered names, starting with +:inherited_attributes+
def inherited_attributes(*args)
  @inherited_attributes ||= [:inherited_attributes]
  # Build a new array instead of appending in place: +inherited+ hands this
  # same array object to subclasses, so mutating it would leak names from
  # one class into another.
  @inherited_attributes |= args
  # Define the accessors directly on the singleton class; no code string
  # needs to be assembled and evaluated.
  (class << self; self; end).send(:attr_accessor, *args)
  @inherited_attributes
end

#max_pages ⇒ Object



160
161
162
# File 'lib/graboid/entity.rb', line 160

# Returns the page limit, defaulting to 0 on first use.
#
# +next_page?+ treats 0 as "no fixed limit" and asks the pager instead.
#
# @return [Object] the value of +@max_pages+ (0 if it was unset)
def max_pages
  @max_pages = 0 unless @max_pages
  @max_pages
end

#max_pages=(num) ⇒ Object



164
165
166
# File 'lib/graboid/entity.rb', line 164

# Sets the page limit.
#
# @param num [Object] the new limit; stored as-is in +@max_pages+
# @return [Object] the assigned value
def max_pages=(num)
  @max_pages = num
end

#mode ⇒ Object



151
152
153
# File 'lib/graboid/entity.rb', line 151

# Returns the parsing mode, defaulting to +:html+ on first use.
#
# @return [Object] the value of +@mode+ (+:html+ if it was unset)
def mode
  @mode = :html unless @mode
  @mode
end

#mode=(m) ⇒ Object

Raises:

  • (ArgumentError)


155
156
157
158
# File 'lib/graboid/entity.rb', line 155

# Sets the parsing mode.
#
# @param m [Symbol] either +:html+ or +:xml+
# @return [Symbol] the assigned mode
# @raise [ArgumentError] if +m+ is anything else; the message names the
#   accepted values and the rejected one
def mode=(m)
  unless [:html, :xml].include?(m)
    raise ArgumentError, "mode must be :html or :xml, got #{m.inspect}"
  end

  @mode = m
end

#next_page? ⇒ Boolean

Returns:

  • (Boolean)


114
115
116
117
118
119
120
# File 'lib/graboid/entity.rb', line 114

# Tells +all_fragments+ whether to process another page.
#
# With no fixed limit (+max_pages+ is zero) the pager block is called with
# the current document, and any non-nil result means there is a next page.
# With a limit, paging continues while +current_page+ is at most
# +max_pages - 1+; the pager is not consulted in that case.
#
# @return [Boolean] always +true+ or +false+ (the unlimited branch used to
#   return +nil+ when the pager yielded nil)
def next_page?
  if max_pages.zero?
    !@pager.call(doc).nil?
  else
    current_page <= max_pages - 1
  end
end

#page_fragments ⇒ Object



122
123
124
# File 'lib/graboid/entity.rb', line 122

# Selects the root fragments of the current document.
#
# The root selector is always applied with +css+, whatever +mode+ is.
#
# @return [Object] the result of +css(root_selector)+ on the parsed document
def page_fragments
  document = doc
  document.css(root_selector)
end

#pager(&block) ⇒ Object



147
148
149
# File 'lib/graboid/entity.rb', line 147

# Stores the block used to find the next page.
#
# Elsewhere the stored block is called with the parsed document, and a nil
# result is treated as "no further page". Calling this without a block
# stores nil.
#
# @yield [doc] the parsed document of the current page
# @return [Proc, nil] the stored block
def pager(&block)
  @pager = block
end

#paginate ⇒ Object



108
109
110
111
112
# File 'lib/graboid/entity.rb', line 108

# Advances to the next page.
#
# Asks the pager block for the next source, makes it the current +source+
# and bumps +current_page+ by one. Any StandardError raised while calling
# the pager is swallowed and treated as "no next page", so +source+ becomes
# nil in that case.
#
# @return [Object] the new value of +current_page+
def paginate
  next_page_url =
    begin
      @pager.call(doc)
    rescue StandardError
      nil
    end

  self.source       = next_page_url
  self.current_page = current_page + 1
end

#read_source ⇒ Object



138
139
140
141
142
143
144
145
# File 'lib/graboid/entity.rb', line 138

# Resolves +source+ into something the parser can read.
#
# A source that *begins* with +http://+ or +https://+ is fetched with the
# configured User-Agent header; any other String is returned unchanged as
# inline markup; anything else yields nil.
#
# The URL test is anchored with +\A+ rather than +^+. +^+ matches at the
# start of any line, so multi-line markup containing a line that begins
# with "http://" used to be mistaken for a URL and passed to +open+.
#
# +URI.open+ is used when available because +Kernel#open+ no longer fetches
# URLs on Ruby 3.0 and later; older setups fall back to +open+. Either way
# this assumes open-uri has been required elsewhere in the project, as the
# original call did.
#
# @return [Object, String, nil] the opened remote resource, the inline
#   source string, or nil
def read_source
  current = source
  case current
  when %r{\Ahttps?://}
    if URI.respond_to?(:open)
      URI.open(current, "User-Agent" => Graboid.user_agent)
    else
      open(current, "User-Agent" => Graboid.user_agent)
    end
  when String
    current
  end
end

#reset_context ⇒ Object



132
133
134
135
136
# File 'lib/graboid/entity.rb', line 132

# Clears the per-run paging state: empties the collection and sets both the
# page counter and the page limit back to 0.
#
# @return [Integer] 0, the value of the last assignment
def reset_context
  self.collection   = []
  self.current_page = 0
  self.max_pages    = 0
end

#root_selector ⇒ Object



57
58
59
# File 'lib/graboid/entity.rb', line 57

# Returns the selector used to find root fragments.
#
# An explicitly configured +@root_selector+ wins; otherwise the selector
# inferred from the receiver's name is used.
#
# @return [Object] the configured selector or +inferred_selector+
def root_selector
  return @root_selector if @root_selector

  inferred_selector
end

#selector(selector) ⇒ Object Also known as: root



51
52
53
# File 'lib/graboid/entity.rb', line 51

# Configures the selector used to find root fragments.
#
# @param selector [Object] the selector; stored as-is in +@root_selector+
# @return [Object] the assigned selector
def selector(selector)
  @root_selector = selector
end

#set(name, opts = {}, &block) ⇒ Object Also known as: field



42
43
44
45
46
47
# File 'lib/graboid/entity.rb', line 42

# Declares an attribute to extract and how to extract it.
#
# The stored options default +:selector+ to a dot followed by the attribute
# name, and +:processor+ to the given block when there is one.
#
# The options are copied before defaults are filled in. The caller's hash
# used to be mutated in place, so a hash reused for several fields kept the
# first field's selector for all of them.
#
# @param name [Symbol, String] attribute name, also the key in +attribute_map+
# @param opts [Hash] options; +:selector+ and +:processor+ are the keys
#   handled here
# @yield optional processor, stored under +:processor+
# @return [Hash] the options stored for +name+
def set(name, opts = {}, &block)
  settings = opts.dup
  settings[:selector]  = ".#{name}" if settings[:selector].nil?
  settings[:processor] = block if block

  attribute_map[name] = settings
end

#source ⇒ Object



34
35
36
# File 'lib/graboid/entity.rb', line 34

# Returns the current source: per +read_source+, either an http(s) URL to
# fetch or a String of inline markup.
#
# @return [Object] the value of +@source+ (nil if never set)
def source
  @source
end

#source=(src) ⇒ Object



38
39
40
# File 'lib/graboid/entity.rb', line 38

# Sets the current source.
#
# @param src [Object] the new source; stored as-is in +@source+
# @return [Object] the assigned value
def source=(src)
  @source = src
end