Method: Apollo::Crawler::BaseCrawler#etl
Defined in: lib/apollo_crawler/crawler/base_crawler.rb
#etl(url = nil, opts = {}, &block) ⇒ Object
Runs the crawler's extract/transform/load loop:

- (0) Figure out URL
- (1) Extract Data
- (2) Extract Links
- (3) Go to (0) eventually
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 76

def etl(url=nil, opts={}, &block)
  # Look for passed URL, use default instead and fail if it is not valid
  if(url.nil? || url.empty?)
    url = self.url
  end

  # TODO: Be more aggressive, use assert, it is the client's responsibility!
  if(url.nil?)
    return nil
  end

  enqueue_url(url)

  # Counter of processed documents (pages)
  docs_processed = 0

  res = []

  # TODO: Respect limit of documents/urls processed
  while(@backlog.empty? == false)
    url = @backlog.shift

    # puts "Processing '#{url}'"
    doc = self.process_url(url)

    # Increase counter of processed documents
    docs_processed = docs_processed + 1

    @visited << url

    # Process document if it was successfully retrieved
    if(!doc.nil?)
      # TODO: Use log4r and log it only on info level
      if block_given?
        yield doc
      end

      # Add document to queue of results
      res << doc

      enqueue_url(doc[:links].map(){ |l| l[:link] }) if doc[:links]
    end

    # Break if limit of documents to be processed was reached
    break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
  end

  # Return processed documents
  return res
end
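
For orientation, here is a minimal usage sketch (not taken from the gem's documentation). It assumes a hypothetical subclass MyCrawler that only overrides url, and that BaseCrawler's constructor and process_url provide the backlog/visited state and document hash (with a :links key) that the listing above relies on. Each successfully retrieved document is yielded to the block, and the :doc_limit option caps how many pages are processed.

  require 'apollo_crawler'

  # Hypothetical crawler used only for illustration
  class MyCrawler < Apollo::Crawler::BaseCrawler
    # Default URL used when #etl is called without one
    def url
      "http://example.com/"
    end
  end

  crawler = MyCrawler.new

  # Yield each retrieved document and stop after at most 10 pages
  docs = crawler.etl(nil, :doc_limit => 10) do |doc|
    links = doc[:links] || []
    puts "Fetched a document with #{links.length} links"
  end

  puts "Processed #{docs.length} documents"

The return value is the array of all processed document hashes, so the block is optional; passing the block simply lets callers react to each document as it is fetched instead of waiting for the whole backlog to drain.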