Class: GeneralScraper
- Inherits: Object
- Includes: ParsePage
- Defined in: lib/generalscraper.rb
Instance Method Summary
- #check_results(page, *requested_page) ⇒ Object
  Checks that the page with links loaded.
- #get_json_data ⇒ Object
  Gets the JSON of all the collected data.
- #get_links(page, &block) ⇒ Object
  Gets the links on the page that match the CSS selector given in the block.
- #getData ⇒ Object
  Gets all data and returns it as JSON.
- #getURLs ⇒ Object
  Returns a list of search result URLs.
- #initialize(operators, searchterm, requests, solver_details, cm_hash) ⇒ GeneralScraper (constructor)
  A new instance of GeneralScraper.
- #navigate_save_results(page) ⇒ Object
  Categorizes the links on a results page into results and further search pages.
- #next_search_page(link) ⇒ Object
  Processes search links and goes to the next page.
- #report_bulk(results) ⇒ Object
  Adds the page hash to the output for bulk reporting.
- #report_incremental(results, link) ⇒ Object
  Reports results back to Harvester incrementally.
- #report_results(results, link) ⇒ Object
  Determines whether to report results incrementally or in bulk.
- #report_status(status_msg) ⇒ Object
  Reports a status message to Harvester.
- #search ⇒ Object
  Searches for links on Google.
- #site_url_save(link) ⇒ Object
  Parses and saves the URLs for search results.
Methods included from ParsePage
#fixEncode, #getContent, #getHTMLText, #getMetadata, #getPDF, #getPageData
Constructor Details
#initialize(operators, searchterm, requests, solver_details, cm_hash) ⇒ GeneralScraper
Returns a new instance of GeneralScraper.
# File 'lib/generalscraper.rb', line 13

def initialize(operators, searchterm, requests, solver_details, cm_hash)
  @operators = operators
  @searchterm = searchterm
  @op_val = @operators.split(" ")[0].split(":")[1]
  @requests = requests
  @solver_details = solver_details
  @output = Array.new
  @urllist = Array.new
  @startindex = 10

  # Handle crawler manager info
  @cm_url = cm_hash[:crawler_manager_url] if cm_hash
  @selector_id = cm_hash[:selector_id] if cm_hash
end
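For orientation, here is a hypothetical construction call. Every value below is an illustrative placeholder, not something prescribed by the gem; in particular, requests stands in for whatever browser/request-manager object you drive the scraper with.

require 'generalscraper'

# All values are illustrative placeholders.
scraper = GeneralScraper.new(
  "site:example.com",    # operators: the first "key:value" pair supplies @op_val
  "transparency report", # searchterm appended to the operators
  requests,              # browser/request-manager object (placeholder)
  nil,                   # solver_details: nil disables CAPTCHA solving
  { crawler_manager_url: "http://localhost:3000", # optional cm_hash; pass nil
    selector_id: "general" }                      #   for bulk-only reporting
)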
Instance Method Details
#check_results(page, *requested_page) ⇒ Object
Checks that the page with links loaded.
# File 'lib/generalscraper.rb', line 37

def check_results(page, *requested_page)
  if page.include?("To continue, please type the characters below:")
    # Solve CAPTCHA if enabled
    if @solver_details
      c = Captcha.new(@requests, @solver_details)
      c.solve

      # Proceed as normal
      sleep(1)
      check_results(@requests.get_updated_current_page)
    else
      # Restart and try again if CAPTCHA-solving not enabled
      report_status("CAPTCHA Found. CAPTCHA solving not enabled. Trying to restart browser.")
      @requests.restart_browser
      check_results(@requests.get_page(requested_page), requested_page)
    end
  else
    # No CAPTCHA found :)
    begin
      navigate_save_results(page)
    rescue => e
      report_status("Error: " + e.to_s + " Retrying...")
      @requests.restart_browser
      check_results(@requests.get_page(requested_page), requested_page)
    end
  end
end
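check_results recurses into itself after restarting the browser, so CAPTCHA interstitials and transient page errors both funnel back through the same entry point; note there is no attempt cap, so a persistent CAPTCHA loops indefinitely. A minimal standalone sketch of the pattern, with hypothetical names and an explicit cap added for illustration:

def fetch_with_retry(client, url, attempts = 3)
  page = client.get_page(url)
  # Treat the CAPTCHA interstitial as a failed fetch
  raise "CAPTCHA interstitial" if page.include?("type the characters below")
  page
rescue => e
  raise e if attempts <= 0
  client.restart_browser
  fetch_with_retry(client, url, attempts - 1)
end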
#get_json_data ⇒ Object
Gets the JSON of all the collected data.
# File 'lib/generalscraper.rb', line 167

def get_json_data
  return JSON.pretty_generate(@output)
end
#get_links(page, &block) ⇒ Object
Gets the links on the page that match the CSS selector given in the block.
# File 'lib/generalscraper.rb', line 65

def get_links(page, &block)
  html = Nokogiri::HTML(page)

  # Get array of links
  return yield(html).inject(Array.new) do |link_arr, al|
    begin
      link_arr.push(al["href"])
    rescue => e
      report_status("Error getting links: " + e.to_s)
    end

    link_arr
  end
end
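The block receives the parsed Nokogiri document and must return the node set whose hrefs should be collected. A self-contained illustration of the same extraction, with sample HTML invented for the example:

require 'nokogiri'

html = Nokogiri::HTML('<h3 class="r"><a href="http://example.com">hit</a></h3>')
hrefs = html.css("h3.r a").map { |a| a["href"] }
# => ["http://example.com"]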
#getData ⇒ Object
Gets all data and returns it as JSON.
# File 'lib/generalscraper.rb', line 112

def getData
  search

  @urllist.each do |url|
    begin
      report_results(getPageData(url), url)
    rescue
    end
  end

  report_status("Finished collecting data for " + @operators.to_s + " " + @searchterm.to_s)
  @requests.close_all_browsers
end
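A hypothetical end-to-end run in bulk mode (no crawler manager configured), assuming a scraper constructed as in the earlier example:

scraper.getData             # searches, then fetches each result URL
puts scraper.get_json_data  # pretty-printed JSON of everything collected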
#getURLs ⇒ Object
Returns a list of search result URLs.
# File 'lib/generalscraper.rb', line 160

def getURLs
  search
  @requests.close_all_browsers
  return JSON.pretty_generate(@urllist)
end
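When only the result URLs are wanted, getURLs runs the search but skips page collection. Note it returns a JSON string rather than an array, so parse it if you need a Ruby object:

require 'json'

urls = JSON.parse(scraper.getURLs)  # => ["http://...", ...] (illustrative)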
#navigate_save_results(page) ⇒ Object
Categorizes the links on a results page into results and further search pages.
# File 'lib/generalscraper.rb', line 81

def navigate_save_results(page)
  # Save result links for page
  result_links = get_links(page) { |html| html.css("h3.r").css("a") }
  result_links.each do |link|
    site_url_save(link)
  end

  # Go to next page
  next_pages = get_links(page) { |html| html.css("#pnnext") }
  next_pages.each do |link|
    report_status("Going to next page: google.com" + link)
    next_search_page("google.com" + link)
  end
end
#next_search_page(link) ⇒ Object
Processes search links and goes to the next page.
# File 'lib/generalscraper.rb', line 102

def next_search_page(link)
  page_index_num = link.split("&start=")[1].split("&sa=N")[0]

  if page_index_num.to_i == @startindex
    @startindex += 10
    check_results(@requests.get_page(link), link)
  end
end
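The start parameter in Google's pagination URLs advances by 10 per page; comparing it to @startindex ensures each page is queued only once. A worked example of the string parsing, with the URL invented for illustration:

link = "/search?q=test&start=10&sa=N"
link.split("&start=")[1].split("&sa=N")[0]  # => "10"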
#report_bulk(results) ⇒ Object
Adds the page hash to the output for bulk reporting.
# File 'lib/generalscraper.rb', line 154

def report_bulk(results)
  @output.push(results)
end
#report_incremental(results, link) ⇒ Object
Reports results back to Harvester incrementally.
# File 'lib/generalscraper.rb', line 135

def report_incremental(results, link)
  curl_url = @cm_url + "/relay_results"
  c = Curl::Easy.http_post(curl_url,
                           Curl::PostField.content('selector_id', @selector_id),
                           Curl::PostField.content('status_message', "Collected " + link),
                           Curl::PostField.content('results', JSON.pretty_generate([results])))
end
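Curl::Easy and Curl::PostField come from the curb gem; the relay is a plain form-encoded POST. A minimal standalone sketch of the same call shape, with a hypothetical endpoint and values:

require 'curb'

c = Curl::Easy.http_post("http://localhost:3000/relay_results",
                         Curl::PostField.content('selector_id', 'general'),
                         Curl::PostField.content('results', '[{"url":"http://example.com"}]'))
puts c.response_code  # the crawler manager's HTTP status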
#report_results(results, link) ⇒ Object
Determines whether to report results incrementally or in bulk.
# File 'lib/generalscraper.rb', line 126

def report_results(results, link)
  if @cm_url
    report_incremental(results, link)
  else
    report_bulk(results)
  end
end
#report_status(status_msg) ⇒ Object
Reports a status message to Harvester.
# File 'lib/generalscraper.rb', line 144

def report_status(status_msg)
  if @cm_url
    curl_url = @cm_url + "/update_status"
    c = Curl::Easy.http_post(curl_url,
                             Curl::PostField.content('selector_id', @selector_id),
                             Curl::PostField.content('status_message', status_msg))
  end
end
#search ⇒ Object
Searches for links on Google.
# File 'lib/generalscraper.rb', line 30

def search
  check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
                "http://google.com", (@operators + " " + @searchterm))
  report_status("Got search results for " + @operators.to_s + " " + @searchterm.to_s)
end
#site_url_save(link) ⇒ Object
Parses and saves the URLs for search results.
# File 'lib/generalscraper.rb', line 97

def site_url_save(link)
  @urllist.push(link)
end