Module: Scrapey
- Defined in:
- lib/scrapey/scrapey.rb,
lib/scrapey/tee.rb,
lib/scrapey/tor.rb,
lib/scrapey/cache.rb,
lib/scrapey/multi.rb,
lib/scrapey/database.rb,
lib/scrapey/template.rb,
lib/scrapey/constants.rb,
lib/scrapey/cache/disk.rb,
lib/scrapey/cache/redis.rb
Overview
require 'phantom_mechanize'
Defined Under Namespace
Modules: Template Classes: Tee
Constant Summary collapse
- VERSION =
"0.0.23"
- BASEDIR =
File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
- URL =
"https://github.com/monkeysuffrage/scrapey"
Class Method Summary collapse
Instance Method Summary collapse
- #cache_filename(url) ⇒ Object
- #change_identity ⇒ Object
- #check_db_config ⇒ Object
- #debug(msg) ⇒ Object
- #delete_cache(url) ⇒ Object
- #disable_cache ⇒ Object
- #enqueue(url) ⇒ Object
- #fields(*args) ⇒ Object
- #get(*args) ⇒ Object
- #get_or_post(method, url, options = {}, *args) ⇒ Object
- #init_db ⇒ Object
- #is_cached?(url) ⇒ Boolean
-
#load_cache(url) ⇒ Object
def load_cache url filename = cache_filename url return nil unless File::exists?(filename) debug "Loading #{filename} from cache" begin Mechanize::Page.new URI.parse(url), [], Marshal.load(File.open(filename, "rb"){|f| f.read}), nil, @agent rescue Exception => e puts e.message end end.
- #multi_get(*args) ⇒ Object
- #multi_get_or_post(method, all_urls, options = {}) ⇒ Object
- #multi_head(*args) ⇒ Object
- #multi_post(*args) ⇒ Object
- #phget(*args) ⇒ Object
- #post(*args) ⇒ Object
- #save(item, output = nil) ⇒ Object
- #save_cache(url, body, options = {}) ⇒ Object
- #save_images(urls) ⇒ Object
- #set_proxy(*args) ⇒ Object
- #tables(*args) ⇒ Object
- #truncate(*args) ⇒ Object
- #ts ⇒ Object
- #use_cache(options = {}) ⇒ Object
- #use_tor ⇒ Object
- #visited?(url) ⇒ Boolean
- #with_cache(cassette_name = 'my_cassette') ⇒ Object
- #without_cache ⇒ Object
Class Method Details
.init(b) ⇒ Object
5 6 7 8 9 10 11 12 |
# File 'lib/scrapey/scrapey.rb', line 5 def self.init b eval "include Scrapey", b # some defaults that I like eval "@agent ||= Mechanize.new{|a| a.history.max_size = 10}", b eval "@agent.user_agent = 'Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}'", b eval "@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE", b end |
Instance Method Details
#cache_filename(url) ⇒ Object
5 6 7 |
# File 'lib/scrapey/cache/disk.rb', line 5 def cache_filename url @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url).sub(/(.)(.)/, '\1/\2/\1\2') + ".cache" end |
#change_identity ⇒ Object
8 9 10 11 12 13 14 |
# File 'lib/scrapey/tor.rb', line 8 def change_identity debug "changing identity..." localhost = Net::Telnet::new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/) localhost.cmd('AUTHENTICATE ""') {|c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c} localhost.cmd('signal NEWNYM') {|c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c} localhost.close end |
#check_db_config ⇒ Object
2 3 4 |
# File 'lib/scrapey/database.rb', line 2 def check_db_config raise 'No database configured' unless @config['database'] end |
#debug(msg) ⇒ Object
100 101 102 |
# File 'lib/scrapey/scrapey.rb', line 100 def debug msg puts msg if @debug end |
#delete_cache(url) ⇒ Object
48 49 50 |
# File 'lib/scrapey/cache/disk.rb', line 48 def delete_cache url FileUtils.rm(cache_filename(url)) rescue nil end |
#disable_cache ⇒ Object
20 21 22 23 24 |
# File 'lib/scrapey/cache.rb', line 20 def disable_cache @use_cache = false yield @use_cache = true end |
#enqueue(url) ⇒ Object
108 109 110 111 112 |
# File 'lib/scrapey/scrapey.rb', line 108 def enqueue url @url_list ||= File.open("#{BASEDIR}/config/urls.txt", 'w') @url_list << url @url_list << "\n" end |
#fields(*args) ⇒ Object
49 50 51 |
# File 'lib/scrapey/scrapey.rb', line 49 def fields *args @fields = args end |
#get(*args) ⇒ Object
41 |
# File 'lib/scrapey/scrapey.rb', line 41 def get *args; get_or_post 'get', *args; end |
#get_or_post(method, url, options = {}, *args) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/scrapey/scrapey.rb', line 15 def get_or_post method, url, options={}, *args agent = ['goto', 'visit'].include?(method) ? @browser : @agent begin new_args = method, url unless options.empty? && args.empty? new_args << options args.each{|arg| new_args << arg} end key = method == 'post' ? url + options.to_s : url doc = load_cache(key) if @use_cache return doc if doc page = agent.send *new_args # str = page.respond_to?('root') ? page.root.to_s : page.body # save_cache(url, str) if @use_cache save_cache(key, page.body) if @use_cache #exit if Object.const_defined? :Ocra page rescue Exception => e puts e.message raise e end end |
#init_db ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/scrapey/database.rb', line 24 def init_db [ 'active_record', 'active_record/schema', 'active_record/connection_adapters/abstract/schema_definitions', @config['database']['adapter'], 'tzinfo', 'active_support/all', 'active_support/multibyte/chars' ].each{|lib| require lib} ActiveRecord::Base.establish_connection(@config['database']) end |
#is_cached?(url) ⇒ Boolean
9 10 11 |
# File 'lib/scrapey/cache/disk.rb', line 9 def is_cached? url File.exists? cache_filename(url) end |
#load_cache(url) ⇒ Object
# Rebuilds a page for +url+ from its disk cache entry, if one exists.
#
# The cache file (located via cache_filename) is read in binary mode and
# passed through Marshal.load; the result is used as the body of a new
# Mechanize::Page bound to @agent.
#
# @param url [String] the URL (cache key) to look up
# @return [Mechanize::Page, nil] the cached page, or nil when there is no
#   cache file or the entry could not be loaded (the error message is
#   printed to stdout in that case)
def load_cache url
  filename = cache_filename url
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  return nil unless File.exist?(filename)
  debug "Loading #{filename} from cache"
  begin
    # NOTE(review): Marshal.load can execute arbitrary code on crafted input;
    # only safe as long as the cache directory holds files we wrote ourselves.
    body = Marshal.load(File.binread(filename))
    Mechanize::Page.new URI.parse(url), [], body, nil, @agent
  rescue StandardError => e
    # Best effort: a corrupt or unreadable entry is reported and treated as a
    # cache miss. Interrupt/SystemExit are deliberately not swallowed.
    puts e.message
  end
end
# Writes the Marshal dump of +doc+ to the disk cache file for +url+.
#
# @param url [String] the URL (cache key); mapped to a path by cache_filename
# @param doc [Object] the object to serialize
# @param options [Hash] accepted but not used by this implementation
def save_cache url, doc, options = {}
  path = cache_filename(url)
  # The file is opened (and truncated) before the object is serialized.
  File.open(path, "wb") do |cache_file|
    cache_file << Marshal.dump(doc)
  end
end
30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/scrapey/cache/disk.rb', line 30 def load_cache url filename = cache_filename url return nil unless File::exists?(filename) debug "Loading #{filename} from cache" begin Mechanize::Page.new URI.parse(url), [], Marshal.load(Zlib::Inflate.inflate(File.open(filename, "rb"){|f| f.read})), nil, @agent rescue Exception => e puts e.message # delete_cache url # Mechanize::Page.new URI.parse(url), [], '<html></html>', nil, @agent end end |
#multi_get(*args) ⇒ Object
50 |
# File 'lib/scrapey/multi.rb', line 50 def multi_get *args; multi_get_or_post 'get_content', *args; end |
#multi_get_or_post(method, all_urls, options = {}) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/scrapey/multi.rb', line 7 def multi_get_or_post method, all_urls, options = {} # some sensible defaults threads = options[:threads] || 20 on_success = options[:on_success] || :on_success on_error = options[:on_error] || :on_error user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}" proxy = options[:proxy] || nil timeout = options[:timeout] || 1000 follow_redirect = options[:follow_redirect] || true @lock ||= Mutex.new @http_clients ||= threads.times.map do c = HTTPClient.new proxy, user_agent c.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE c.receive_timeout = timeout yield c if block_given? c end debug 'starting multi' all_urls.each_slice(threads) do |urls| urls.each_with_index.map do |url, i| Thread.new do begin response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect rescue Exception => e error = e end @lock.synchronize do if response send on_success, url, response else send on_error, url, e end end end end.each{|thread| thread.join} end end |
#multi_head(*args) ⇒ Object
52 |
# File 'lib/scrapey/multi.rb', line 52 def multi_head *args; multi_get_or_post 'head', *args; end |
#multi_post(*args) ⇒ Object
51 |
# File 'lib/scrapey/multi.rb', line 51 def multi_post *args; multi_get_or_post 'post_content', *args; end |
#phget(*args) ⇒ Object
43 |
# File 'lib/scrapey/scrapey.rb', line 43 def phget *args; get_or_post 'phget', *args; end |
#post(*args) ⇒ Object
42 |
# File 'lib/scrapey/scrapey.rb', line 42 def post *args; get_or_post 'post', *args; end |
#save(item, output = nil) ⇒ Object
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/scrapey/scrapey.rb', line 69 def save item, output = nil output ||= @output @csvs ||= {} unless @csvs[output] obj = {} begin fn = output.gsub(/(?<!csv)$/, '.csv') obj[:csv] = CSV.open fn, 'w' rescue Exception => e if e.is_a?(Errno::EACCES) puts "Unable to access #{fn} - is it locked?" exit else raise e end end obj[:fields] = output == @output && @fields && !@fields.empty? ? @fields : item.keys obj[:csv] << obj[:fields] @csvs[output] = obj end @csvs[output][:csv] << @csvs[output][:fields].map{|f| item[f]} end |
#save_cache(url, body, options = {}) ⇒ Object
43 44 45 |
# File 'lib/scrapey/cache/disk.rb', line 43 def save_cache url, doc, options = {} File.open(cache_filename(url), "wb") {|f| f << Zlib::Deflate.deflate(Marshal.dump(doc)) } end |
#save_images(urls) ⇒ Object
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/scrapey/scrapey.rb', line 53 def save_images urls folder = "#{BASEDIR}/images" Dir.mkdir(folder) unless Dir.exists?(folder) names = [] urls.each do |url| name = url[/[^\/]+$/] binding.pry unless name names << name fn = "#{folder}/#{name}" next if File.exists?(fn) file = @agent.get(url) File.open(fn, 'wb'){|f| f << file.body} end names end |
#set_proxy(*args) ⇒ Object
45 46 47 |
# File 'lib/scrapey/scrapey.rb', line 45 def set_proxy *args @agent.set_proxy *args end |
#tables(*args) ⇒ Object
6 7 8 9 10 11 12 13 14 15 |
# File 'lib/scrapey/database.rb', line 6 def tables *args check_db_config missing_tables = false args.each do |arg| model = Object.const_set(arg, Class.new(ActiveRecord::Base) {}) missing_tables = true unless model.table_exists? end schema = "#{BASEDIR}/src/schema.rb" require schema if missing_tables && File.exists?(schema) end |
#truncate(*args) ⇒ Object
17 18 19 20 21 22 |
# File 'lib/scrapey/database.rb', line 17 def truncate *args check_db_config args.each do |arg| ActiveRecord::Base.connection.execute("TRUNCATE TABLE #{Object.const_get(arg).table_name}") end end |
#ts ⇒ Object
104 105 106 |
# File 'lib/scrapey/scrapey.rb', line 104 def ts Time.now.to_i.to_s end |
#use_cache(options = {}) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/scrapey/cache.rb', line 3 def use_cache options = {} @use_cache = true if @redis = options.delete(:redis) require 'scrapey/cache/redis' else require 'scrapey/cache/disk' @config['cache_dir'] ||= "#{BASEDIR}/cache" FileUtils.mkdir_p @config['cache_dir'] ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"].each do |l1| ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"].each do |l2| FileUtils.mkdir_p "#{@config['cache_dir']}/#{l1}/#{l2}" end end end end |
#use_tor ⇒ Object
4 5 6 |
# File 'lib/scrapey/tor.rb', line 4 def use_tor set_proxy('localhost', 8118) end |
#visited?(url) ⇒ Boolean
93 94 95 96 97 98 |
# File 'lib/scrapey/scrapey.rb', line 93 def visited? url @visited ||= [] return true if @visited.include? url @visited << url false end |
#with_cache(cassette_name = 'my_cassette') ⇒ Object
31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/scrapey/cache.rb', line 31 def with_cache cassette_name = 'my_cassette' require 'vcr' require 'fakeweb' VCR.configure do |c| c.cassette_library_dir = "#{BASEDIR}/cache" c.hook_into :fakeweb c.allow_http_connections_when_no_cassette = true end VCR.use_cassette(cassette_name, :record => :new_episodes, :match_requests_on => [:method, :uri, :body]) do yield end end |
#without_cache ⇒ Object
27 28 29 |
# File 'lib/scrapey/cache.rb', line 27 def without_cache yield end |