Class: Apollo::Crawler::BaseCrawler

Inherits: Object

Defined in: lib/apollo_crawler/crawler/base_crawler.rb


Constructor Details

#initialize ⇒ BaseCrawler

Returns a new instance of BaseCrawler.

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 32

def initialize
	@backlog = []
	@visited = []
end

Class Method Details

.create_metadoc(url, doc) ⇒ Object

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 172

def self.create_metadoc(url, doc)
	return {
		'url' => url,
		'doc' => doc.body.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
		'hash' => Digest::SHA256.new.update(doc.body).hexdigest,
		'created_at' => Time.now.utc,
		'expires_at' => nil,
		'version' => 0
	}
end
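
A minimal sketch of the hash this method builds; the doc argument is only assumed to respond to #body (as a fetched HTTP response does), and the URL and body below are made up for illustration:

# Hypothetical stand-in for whatever the configured fetcher returns;
# create_metadoc only calls #body on it.
Response = Struct.new(:body)
response = Response.new('<html><body>Hello</body></html>')

metadoc = Apollo::Crawler::BaseCrawler.create_metadoc('http://example.com/', response)
metadoc['url']        # => "http://example.com/"
metadoc['hash']       # => SHA-256 hex digest of the body
metadoc['expires_at'] # => nil, no expiration is set yet
metadoc['version']    # => 0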

.fetch(url) ⇒ Object

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 50

def self.fetch(url)
	RbConfig::DEFAULT_FETCHER.fetch(url)
end

.name_re ⇒ Object

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 37

def self.name_re()
	return /crawler$/
end

.try_get_doc(root, url) ⇒ Object

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 62

def self.try_get_doc(root, url)
	doc = BaseCrawler.try_get_url(root, url)
	
	# TODO: Set expiration header
	return {
		:doc => doc,
		:url => url
	}
end
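
A short sketch of the returned hash; note that :doc holds the URI joined by try_get_url (or nil when joining fails), not a fetched document:

# Example values for illustration only
Apollo::Crawler::BaseCrawler.try_get_doc('http://example.com/', 'about.html')
# => { :doc => #<URI::HTTP http://example.com/about.html>, :url => "about.html" }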

.try_get_url(root, url) ⇒ Object

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 54

def self.try_get_url(root, url)
	begin
		return URI.join(root, url)
	rescue
		return nil
	end
end
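
A usage sketch with made-up URLs; relative links are resolved against the root, and anything URI.join rejects comes back as nil instead of raising:

Apollo::Crawler::BaseCrawler.try_get_url('http://example.com/news/', 'article-1.html')
# => #<URI::HTTP http://example.com/news/article-1.html>

Apollo::Crawler::BaseCrawler.try_get_url('http://example.com/', 'not a uri')
# => nil (URI::InvalidURIError is rescued)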

Instance Method Details

#enqueue_url(url) ⇒ Object

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 130

def enqueue_url(url)
	urls = []
	return urls if url.nil?
	# We support both a single URL and a list of URLs
	if(url.kind_of?(Array))
		urls = urls.concat(url)
	else
		urls << url
	end

	urls.each do |u|
		if(url_processed?(u) == false)
			@backlog << u
		end
	end
end
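
A short usage sketch with made-up URLs; both a single URL and an array are accepted, and anything already in the backlog or the visited list is skipped:

crawler = Apollo::Crawler::BaseCrawler.new

crawler.enqueue_url('http://example.com/')                             # single URL
crawler.enqueue_url(['http://example.com/a', 'http://example.com/b'])  # list of URLs
crawler.enqueue_url('http://example.com/')                             # already queued, skipped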

#etl(url = nil, opts = {}, &block) ⇒ Object

  • (0) Figure out URL

  • (1) Extract Data

  • (2) Extract Links

  • (3) Go to (0) eventually

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 76

def etl(url=nil, opts={}, &block)
	# Use the URL passed in, fall back to the default, and bail out if neither is valid
	if(url.nil? || url.empty?)
		url = self.url
	end

	# TODO: Be more aggressive, use assert; it is the client's responsibility!
	if(url.nil?)
		return nil
	end

	enqueue_url(url)

	# Counter of processed documents (pages)
	docs_processed = 0

	res = []
	# TODO: Respect limit of documents/urls processed
	while(@backlog.empty? == false)
		url = @backlog.shift

		# puts "Processing '#{url}'"
		doc = self.process_url(url)
		
		# Increase counter of processed documents
		docs_processed = docs_processed + 1

		@visited << url

		# Process document if it was successfully retrieved
		if(!doc.nil?)
			# TODO: Use log4r and log it only on info level
			if block_given?
				yield doc
			end

			# Add document to queue of results
			res << doc

			enqueue_url(doc[:links].map(){ |l| l[:link] }) if doc[:links]
		end

		# Break if the limit of documents to process was reached
		break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
	end

	# Return processed documents
	return res
end
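
An end-to-end sketch of how a concrete crawler can be built on top of this loop. ExampleCrawler and its URLs are hypothetical (not part of the library), and the require line assumes the gem's usual entry point; the subclass supplies url, extract_data and extract_links, and etl then drives the fetch/extract/enqueue cycle and yields each processed document to the block:

require 'apollo_crawler'   # assumed entry point for the gem

class ExampleCrawler < Apollo::Crawler::BaseCrawler
  def name
    'Example'
  end

  def url
    'http://example.com/'
  end

  # Pull out whatever data this crawler cares about
  def extract_data(doc)
    doc.css('h1').map { |node| { :text => node.text } }
  end

  # Return links in the { :link => ... } shape the etl loop expects
  def extract_links(doc)
    doc.css('a')
       .map { |a| self.class.try_get_url(url, a['href']) }
       .compact
       .map { |u| { :link => u } }
  end
end

crawler = ExampleCrawler.new
results = crawler.etl(nil, :doc_limit => 10) do |doc|
  puts "#{doc[:crawler]}: #{doc[:data].length} records, #{doc[:links].length} links"
end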

#extract_data(doc) ⇒ Object

Extracts data from the document

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 226

def extract_data(doc)
	res = []
	return res
end

#extract_links(doc) ⇒ Object

Extracts links to other documents from this document

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 232

def extract_links(doc)
	res = []
	return res
end

#fetch_document(url) ⇒ Object

Fetches the document

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 184

def fetch_document(url)
	# TODO: Refactor following idiom
	if(url == nil)
		url = self.url
	end

	if(url.nil?)
		return nil
	end

	url = url.to_s

	# TODO: Use some (custom-made) low-level HTTP protocol cache - just to be safe
	cache = Apollo::Cache::Factory.instance.construct
	metadoc = cache.try_get(url) do
		max_attempts = 3
		attempt_no = 0
		success = false
		
		doc = nil
		while(attempt_no < max_attempts && success == false) do
			begin
				doc = BaseCrawler.fetch(url)
				success = true
			rescue Exception => e
				puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
				sleep 1

				attempt_no = attempt_no + 1
				success = false
			end
		end

		# Create metadata
		BaseCrawler.create_metadoc(url, doc)
	end

	# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
	return Nokogiri::HTML(metadoc['doc'])
end
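
A brief usage sketch, reusing the hypothetical ExampleCrawler from the #etl example above; on success the method returns a parsed Nokogiri document, so the usual CSS/XPath queries apply:

crawler = ExampleCrawler.new
doc = crawler.fetch_document('http://example.com/')
puts doc.css('title').text unless doc.nil?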

#nameObject

Name of the crawler

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 42

def name
	return "Crawler Base" 
end

#process_url(url) ⇒ Object

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 147

def process_url(url)
	doc = self.fetch_document(url)
	if(doc.nil?)
		return nil
	end

	# Try to extract data from the document
	data = self.extract_data(doc)

	# Try to extract links to other documents
	links = self.extract_links(doc)
	
	# TODO: Make it configurable whether links extracted from the doc should be printed
	# puts links.inspect

	# Format ETL result
	res = { 
		:crawler => self.class.name,
		:data => data,
		:links => links
	}

	return res
end
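
The shape of the result hash, again using the hypothetical ExampleCrawler from the #etl example above:

crawler = ExampleCrawler.new   # hypothetical subclass from the #etl sketch
result = crawler.process_url('http://example.com/')
result[:crawler] # => "ExampleCrawler"
result[:data]    # => whatever extract_data returned for this page
result[:links]   # => [{ :link => ... }, ...] as returned by extract_links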

#urlObject

# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 46

def url
	return nil
end

#url_processed?(url) ⇒ Boolean

Returns:

  • (Boolean)
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 126

def url_processed?(url)
	return @backlog.include?(url) || @visited.include?(url)
end