Class: GeneralScraper

Inherits:
Object
Includes:
ParsePage
Defined in:
lib/generalscraper.rb

Instance Method Summary

Methods included from ParsePage

#fixEncode, #getContent, #getHTMLText, #getMetadata, #getPDF, #getPageData

Constructor Details

#initialize(operators, searchterm, requests, solver_details, cm_hash) ⇒ GeneralScraper

Returns a new instance of GeneralScraper.



# File 'lib/generalscraper.rb', line 13

def initialize(operators, searchterm, requests, solver_details, cm_hash)
  @operators = operators
  @searchterm = searchterm
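  # Pull the value out of the first operator, e.g. "site:example.com" -> "example.com"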
  @op_val = @operators.split(" ")[0].split(":")[1]
  @requests = requests
  @solver_details = solver_details
  
  @output = Array.new
  @urllist = Array.new
  @startindex = 10

  # Handle crawler manager info
  @cm_url = cm_hash[:crawler_manager_url] if cm_hash
  @selector_id = cm_hash[:selector_id] if cm_hash
end
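
A minimal construction sketch. The requests object is assumed to be a browser-automation wrapper exposing #get_page, #get_updated_current_page, #restart_browser, and #close_all_browsers (the interface inferred from this file); all values below are illustrative.

scraper = GeneralScraper.new(
  "site:example.com",   # operators string; @op_val becomes "example.com"
  "open data",          # search term appended to the operators in the query
  requests,             # assumed page-fetching client (see note above)
  nil,                  # solver_details: nil disables CAPTCHA solving
  {crawler_manager_url: "http://harvester.example", selector_id: "42"})  # optional; pass nil to report in bulk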

Instance Method Details

#check_results(page, *requested_page) ⇒ Object

Check that the page with links has loaded



# File 'lib/generalscraper.rb', line 37

def check_results(page, *requested_page)
  if page.include?("To continue, please type the characters below:")
    # Solve CAPTCHA if enabled
    if @solver_details
      c = Captcha.new(@requests, @solver_details)
      c.solve
      
      # Proceed as normal
      sleep(1)
      check_results(@requests.get_updated_current_page)
      
    else # Restart and try again if CAPTCHA-solving not enabled
      report_status("CAPTCHA Found. CAPTCHA solving not enabled. Trying to restart browser.")
      @requests.restart_browser
      check_results(@requests.get_page(requested_page), requested_page)
    end
  else # No CAPTCHA found :)
    begin
      navigate_save_results(page)
    rescue => e
      report_status("Error: " + e.to_s + " Retrying...")
      @requests.restart_browser
      check_results(@requests.get_page(requested_page), requested_page)
    end
  end
end

#get_json_data ⇒ Object

Get the JSON of all the data



# File 'lib/generalscraper.rb', line 167

def get_json_data
  return JSON.pretty_generate(@output)
end

#get_links(page, &block) ⇒ Object

Gets the links from the page that match the CSS selector in the block



# File 'lib/generalscraper.rb', line 65

def get_links(page, &block)
  html = Nokogiri::HTML(page)

  # Get array of links
  return yield(html).inject(Array.new) do |link_arr, al|
    begin
      link_arr.push(al["href"])
    rescue => e
      report_status("Error getting links: " + e.to_s)
    end
   
    link_arr
  end
end
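
For example, a hypothetical call with a saved results page; the block returns the node set whose "href" attributes get collected:

saved_page = File.read("results.html")  # illustrative file name
hrefs = scraper.get_links(saved_page) { |html| html.css("h3.r").css("a") }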

#getData ⇒ Object

Gets all the data and returns it in JSON



# File 'lib/generalscraper.rb', line 112

def getData
  search
  @urllist.each do |url|
    begin
      report_results(getPageData(url), url)
    rescue
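      # Silently skip pages that fail to download or parse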
    end
  end

  report_status("Finished collecting data for " + @operators.to_s + " " + @searchterm.to_s)
  @requests.close_all_browsers
end
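
A bulk-mode sketch (no crawler manager configured), using the scraper constructed above:

scraper.getData              # runs the search and fetches every result page
puts scraper.get_json_data   # collected page data as pretty-printed JSON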

#getURLs ⇒ Object

Returns a list of search result URLs



# File 'lib/generalscraper.rb', line 160

def getURLs
  search
  @requests.close_all_browsers
  return JSON.pretty_generate(@urllist)
end
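
Or, to collect only the result URLs without fetching each page:

puts scraper.getURLs   # JSON array of result URLs; also closes all browsers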

#navigate_save_results(page) ⇒ Object

Categorizes the links on the results page into results and other search pages



# File 'lib/generalscraper.rb', line 81

def navigate_save_results(page)
  # Save result links for page
  result_links = get_links(page) {|html| html.css("h3.r").css("a")}
  result_links.each do |link|
    site_url_save(link)
  end

  # Go to next page
  next_pages = get_links(page) {|html| html.css("#pnnext")}
  next_pages.each do |link|
    report_status("Going to next page: google.com"+link)
    next_search_page("google.com"+link)
  end
end
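
Note: the "h3.r" and "#pnnext" selectors match Google's result-title headings and next-page control as of this version; they will need updating if Google changes its result markup.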

#next_search_page(link) ⇒ Object

Process search links and go to the next page



# File 'lib/generalscraper.rb', line 102

def next_search_page(link)
  page_index_num = link.split("&start=")[1].split("&sa=N")[0]

  if page_index_num.to_i == @startindex
    @startindex += 10
    check_results(@requests.get_page(link), link)
  end
end
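
For illustration, the start parameter is extracted from a hypothetical Google pagination link like this:

link = "/search?q=test&start=10&sa=N"
link.split("&start=")[1].split("&sa=N")[0]   # => "10"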

#report_bulk(results) ⇒ Object

Add page hash to output for bulk reporting



# File 'lib/generalscraper.rb', line 154

def report_bulk(results)
  @output.push(results)
end

#report_incremental(results, link) ⇒ Object

Report results back to Harvester incrementally



# File 'lib/generalscraper.rb', line 135

def report_incremental(results, link)
  curl_url = @cm_url+"/relay_results"
  c = Curl::Easy.http_post(curl_url,
                           Curl::PostField.content('selector_id', @selector_id),
                           Curl::PostField.content('status_message', "Collected " + link),
                           Curl::PostField.content('results', JSON.pretty_generate([results])))
end

#report_results(results, link) ⇒ Object

Figures out how to report results: incrementally if a crawler manager is configured, otherwise in bulk



# File 'lib/generalscraper.rb', line 126

def report_results(results, link)
  if @cm_url
    report_incremental(results, link)
  else
    report_bulk(results)
  end
end

#report_status(status_msg) ⇒ Object

Report Harvester status message



# File 'lib/generalscraper.rb', line 144

def report_status(status_msg)
  if @cm_url
    curl_url = @cm_url+"/update_status"
    c = Curl::Easy.http_post(curl_url,
                             Curl::PostField.content('selector_id', @selector_id),
                             Curl::PostField.content('status_message', status_msg))
  end
end

#search ⇒ Object

Searches for links on Google



# File 'lib/generalscraper.rb', line 30

def search
  check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
                "http://google.com", (@operators + " " + @searchterm))
  report_status("Got search results for " + @operators.to_s + " " + @searchterm.to_s)
end

#site_url_save(link) ⇒ Object

Parse and save the URLs for search results



# File 'lib/generalscraper.rb', line 97

def site_url_save(link)
  @urllist.push(link)
end