Class: TheFox::Sengi::Crawler
- Inherits:
-
Object
- Object
- TheFox::Sengi::Crawler
- Defined in:
- lib/sengi/crawler.rb
Instance Method Summary
- #go ⇒ Object
-
#initialize(url, options) ⇒ Crawler
constructor
A new instance of Crawler.
Constructor Details
#initialize(url, options) ⇒ Crawler
Returns a new instance of Crawler.
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/sengi/crawler.rb', line 21
#
# Builds a crawler for a single URL.
#
# Any of the option keys 'serial', 'relative', 'force', 'debug',
# 'parent_id' and 'level' that the caller did not supply are filled in
# with their defaults. The defaults are written into the passed-in hash
# itself (no copy is made), so the caller's hash is modified.
#
# @param url [Object] the URL to crawl; stored as given
# @param options [Hash] string-keyed crawl options
# @return [Crawler] a new instance of Crawler
def initialize(url, options)
  @url = url
  @options = options

  # Only fill in keys that are absent; an explicit false/nil/0 from the
  # caller is kept as-is.
  {
    'serial' => false,
    'relative' => false,
    'force' => false,
    'debug' => false,
    'parent_id' => 0,
    'level' => 0
  }.each do |key, default|
    @options[key] = default unless @options.key?(key)
  end

  # Working state; all start out unset.
  @redis = nil
  @uri = nil
  @request = nil
  @response = nil
  @html_doc = nil
  @url_delay = nil
  @url_separate_delay = nil
  @url_reschedule = nil
end
Instance Method Details
#go ⇒ Object
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/sengi/crawler.rb', line 44
#
# Runs one crawl pass for the configured URL, printing a progress line
# after each step.
#
# The steps run in a fixed order. The method stops early when:
# * the URI is reported as blacklisted,
# * the URI is flagged ignored after insert_url (skipped when the
#   'debug' or 'force' option is set),
# * no HTTP response was obtained,
# * the URI is flagged ignored after the response was processed
#   (skipped when the 'force' option is set),
# * no HTML document is available.
#
# @return [nil]
def go
  # Every progress line below the header line is prefixed with a tab.
  report = ->(message) { puts "\t" + message }

  redis_setup
  uri_setup

  relative_flag = @options['relative'] ? 'y' : 'n'
  puts "#{Time.now.strftime('%F %T')} perform: #{@options['parent_id']} #{@options['level']} #{relative_flag} #{@uri}"

  check_blacklist
  report.call("blacklisted: #{@uri.is_blacklisted ? 'YES' : 'no'}")
  return if @uri.is_blacklisted

  insert_url
  report.call("url: #{@uri.id}")
  # Either 'debug' or 'force' overrides the ignore flag at this stage.
  if @uri.is_ignored && !(@options['debug'] || @options['force'])
    report.call("ignored reason: #{@uri.is_ignored_reason}")
    return
  end

  insert_domain
  report.call("domain id: #{@uri.domain_id}")

  insert_request
  report.call("request id: #{@uri.request_id}")

  make_http_request
  report.call("http response: #{@response.nil? ? 'FAILED' : 'ok'}")
  return if @response.nil?

  insert_response
  report.call("response: #{@uri.response_id} #{@uri.response_size}")

  report.call('process http response')
  process_http_response
  report.call("http response")

  # After the response has been processed only 'force' overrides the
  # ignore flag; 'debug' is not consulted here.
  if @uri.is_ignored && !@options['force']
    report.call("ignored reason: #{@uri.is_ignored_reason}")
    return
  end

  if @html_doc.nil?
    report.call('HTML INVALID')
    return
  end

  report.call('process html links')
  process_html_links

  # NOTE(review): 'process html meta' is announced but no matching step
  # is invoked in this listing -- confirm whether a call is missing.
  report.call('process html meta')

  report.call('url done')
end