Method: Apollo::Crawler::BaseCrawler#etl

Defined in:
lib/apollo_crawler/crawler/base_crawler.rb

#etl(url = nil, opts = {}, &block) ⇒ Object

  • (0) Figure out the URL to fetch

  • (1) Extract data from the fetched document

  • (2) Extract links and enqueue them in the backlog

  • (3) Go back to (0) until the backlog is empty or the document limit is reached (see the usage sketch below)
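
A minimal usage sketch, assuming a hypothetical concrete subclass MyCrawler < Apollo::Crawler::BaseCrawler that supplies a default url and implements process_url; the block form and the :doc_limit option follow directly from the source shown below.

# MyCrawler is a placeholder name for any concrete BaseCrawler subclass.
crawler = MyCrawler.new

# Process at most 10 documents; each fetched document is yielded to the block
# as soon as it is retrieved.
docs = crawler.etl(nil, :doc_limit => 10) do |doc|
  links = doc[:links] || []
  puts "Fetched a document with #{links.length} links"
end

# etl also returns the array of all processed documents.
puts "Processed #{docs.length} documents in total"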



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 76

def etl(url=nil, opts={}, &block)
  # Use the passed URL if given; fall back to the crawler's default URL otherwise
  if(url.nil? || url.empty?)
    url = self.url
  end

  # TODO: Be more aggressive, use assert, it is the client's responsibility!
  if(url.nil?)
    return nil
  end

  enqueue_url(url)

  # Counter of processed documents (pages)
  docs_processed = 0

  res = []
  # TODO: Respect limit of documents/urls processed
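  # Drain the URL backlog: pop one URL per iteration, process it, and enqueue
  # any links extracted from the resulting document.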
  while(@backlog.empty? == false)
    url = @backlog.shift

    # puts "Processing '#{url}'"
    doc = self.process_url(url)
    
    # Increase counter of processed documents
    docs_processed = docs_processed + 1

    @visited << url

    # Process the document if it was successfully retrieved
    if(!doc.nil?)
      # TODO: Use log4r and log it only on info level
      if block_given?
        yield doc
      end

      # Add document to queue of results
      res << doc

      enqueue_url(doc[:links].map(){ |l| l[:link] }) if doc[:links]
    end

    # Break if the limit of documents to process was reached
    break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
  end

  # Return the processed documents
  return res
end
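
A sketch of consuming the return value, based only on what the method above exposes: etl returns an array of document hashes, and the loop itself relies only on an optional :links key holding hashes with a :link entry; any other keys depend on what process_url produced. The crawler variable reuses the hypothetical MyCrawler instance from the sketch above.

docs = crawler.etl

docs.each do |doc|
  # :links is optional; other keys in doc come from process_url.
  (doc[:links] || []).each do |l|
    puts l[:link]
  end
end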