Module: Scrapey

Defined in:
lib/scrapey/scrapey.rb,
lib/scrapey/tee.rb,
lib/scrapey/tor.rb,
lib/scrapey/cache.rb,
lib/scrapey/multi.rb,
lib/scrapey/database.rb,
lib/scrapey/template.rb,
lib/scrapey/constants.rb,
lib/scrapey/cache/disk.rb,
lib/scrapey/cache/redis.rb

Overview

require 'phantom_mechanize'

Defined Under Namespace

Modules: Template Classes: Tee

Constant Summary collapse

# Version string; it is interpolated into the default user-agent strings.
VERSION =
"0.0.23"
# Absolute directory of the running script ($0), with a trailing "/src" removed.
BASEDIR =
File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
# Project URL; it is interpolated into the default user-agent strings.
URL =
"https://github.com/monkeysuffrage/scrapey"

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.init(b) ⇒ Object



5
6
7
8
9
10
11
12
# File 'lib/scrapey/scrapey.rb', line 5

# Mixes Scrapey into the scope described by +b+ and installs a default
# Mechanize agent there. Every statement is evaluated against the supplied
# binding, so +@agent+ ends up on the caller's +self+, not on this module.
#
# @param b [Binding] binding of the scope to set up
# @return [Object] the value of the last evaluated statement
def self.init(b)
  eval("include Scrapey", b)

  # Defaults: short history, identifying user agent, no certificate checks.
  eval("@agent ||= Mechanize.new{|a| a.history.max_size = 10}", b)
  identity = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  eval("@agent.user_agent = '#{identity}'", b)
  eval("@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE", b)
end

Instance Method Details

#cache_filename(url) ⇒ Object



5
6
7
# File 'lib/scrapey/cache/disk.rb', line 5

# Builds the on-disk cache path for a key.
#
# The MD5 hex digest of +url+ is placed under two directory levels named
# after its first and second characters: <cache_dir>/a/b/ab....cache
#
# @param url [String] cache key
# @return [String] path below @config['cache_dir']
def cache_filename(url)
  prefix = @config['cache_dir'] + "/"
  sharded = Digest::MD5.hexdigest(url).sub(/(.)(.)/, '\1/\2/\1\2')
  prefix + sharded + ".cache"
end

#change_identity ⇒ Object



8
9
10
11
12
13
14
# File 'lib/scrapey/tor.rb', line 8

# Asks the Tor control service on localhost:9051 for a new route by sending
# AUTHENTICATE (empty password) followed by "signal NEWNYM".
#
# The control connection is closed even when one of the commands fails, so a
# failed switch no longer leaks the socket.
#
# @return [nil]
# @raise [RuntimeError] when either command is not answered with "250 OK\n"
def change_identity
  debug "changing identity..."
  localhost = Net::Telnet::new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
  begin
    localhost.cmd('AUTHENTICATE ""') {|c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c}
    localhost.cmd('signal NEWNYM') {|c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c}
    nil
  ensure
    localhost.close
  end
end

#check_db_config ⇒ Object



2
3
4
# File 'lib/scrapey/database.rb', line 2

# Guard used by the database helpers: fails unless @config has a truthy
# 'database' entry.
#
# @return [nil]
# @raise [RuntimeError] 'No database configured'
def check_db_config
  return if @config['database']

  raise 'No database configured'
end

#debug(msg) ⇒ Object



100
101
102
# File 'lib/scrapey/scrapey.rb', line 100

# Prints +msg+ to standard output, but only while @debug is truthy.
#
# @param msg [Object] message to print
# @return [nil]
def debug(msg)
  return unless @debug

  puts msg
end

#delete_cache(url) ⇒ Object



48
49
50
# File 'lib/scrapey/cache/disk.rb', line 48

# Best-effort removal of the cache file for +url+.
#
# Any StandardError (for example a missing file) is swallowed on purpose.
#
# @param url [String] cache key
# @return [Array<String>, nil] result of FileUtils.rm, or nil on failure
def delete_cache(url)
  FileUtils.rm(cache_filename(url))
rescue StandardError
  nil
end

#disable_cache ⇒ Object



20
21
22
23
24
# File 'lib/scrapey/cache.rb', line 20

# Runs the given block with caching switched off.
#
# The previous value of @use_cache is restored afterwards, also when the
# block raises. Previously an exception left caching disabled, and a normal
# return forced it to +true+ even if caching had never been enabled.
#
# @yield code to run without the cache
# @return [Object] the block's value
def disable_cache
  previous = @use_cache
  @use_cache = false
  yield
ensure
  @use_cache = previous
end

#enqueue(url) ⇒ Object



108
109
110
111
112
# File 'lib/scrapey/scrapey.rb', line 108

# Appends +url+ plus a newline to <BASEDIR>/config/urls.txt.
#
# The file is opened in write mode on first use and the handle is kept in
# @url_list for later calls.
#
# @param url [String] URL to record
# @return [File] the open list file
def enqueue(url)
  @url_list = File.open("#{BASEDIR}/config/urls.txt", 'w') unless @url_list
  @url_list << url << "\n"
end

#fields(*args) ⇒ Object



49
50
51
# File 'lib/scrapey/scrapey.rb', line 49

# Stores the given names in @fields; #save uses them as the column list for
# the default output.
#
# @param args [Array] field names, in column order
# @return [Array] the stored list
def fields(*args)
  @fields = args
end

#get(*args) ⇒ Object



41
# File 'lib/scrapey/scrapey.rb', line 41

# Shortcut for get_or_post with the 'get' method.
#
# @param args [Array] url, followed by anything else to forward
# @return [Object] whatever get_or_post returns
def get(*args)
  get_or_post('get', *args)
end

#get_or_post(method, url, options = {}, *args) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/scrapey/scrapey.rb', line 15

# Performs a request through @agent (or @browser for 'goto'/'visit') with
# optional caching.
#
# With @use_cache set, the cache is consulted first and the page body is
# stored after a successful request. The cache key is the url, with
# options.to_s appended for 'post'.
#
# Only StandardError is intercepted for logging. The previous
# `rescue Exception` also caught Interrupt and SystemExit just to print
# their message before re-raising.
#
# @param method [String] agent method to invoke ('get', 'post', ...)
# @param url [String] target URL
# @param options [Hash] forwarded to the agent when non-empty (or when
#   further args are given)
# @param args [Array] extra positional arguments for the agent
# @return [Object] the cached document when found, otherwise the agent's page
# @raise [StandardError] any request error, after its message is printed
def get_or_post method, url, options={}, *args
  agent = ['goto', 'visit'].include?(method) ? @browser : @agent
  new_args = [method, url]
  new_args.push(options, *args) unless options.empty? && args.empty?

  key = method == 'post' ? url + options.to_s : url
  doc = load_cache(key) if @use_cache
  return doc if doc

  page = agent.send(*new_args)
  # str = page.respond_to?('root') ? page.root.to_s : page.body
  # save_cache(url, str) if @use_cache
  save_cache(key, page.body) if @use_cache

  #exit if Object.const_defined? :Ocra
  page
rescue StandardError => e
  puts e.message
  raise
end

#init_db ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/scrapey/database.rb', line 24

# Loads ActiveRecord, its supporting libraries and the adapter named in
# @config['database']['adapter'], then connects using @config['database'].
#
# @return [Object] result of ActiveRecord::Base.establish_connection
def init_db
  libraries = [
    'active_record',
    'active_record/schema',
    'active_record/connection_adapters/abstract/schema_definitions',
    @config['database']['adapter'],
    'tzinfo',
    'active_support/all',
    'active_support/multibyte/chars'
  ]
  libraries.each do |library|
    require library
  end
  ActiveRecord::Base.establish_connection(@config['database'])
end

#is_cached?(url) ⇒ Boolean

Returns:

  • (Boolean)


9
10
11
# File 'lib/scrapey/cache/disk.rb', line 9

# Tells whether a cache file exists for +url+.
#
# Uses File.exist?; the File.exists? alias was removed in Ruby 3.2.
#
# @param url [String] cache key
# @return [Boolean]
def is_cached? url
  File.exist? cache_filename(url)
end

#load_cache(url) ⇒ Object

# Variant of load_cache that reads the cache file as plain Marshal data
# (no Zlib inflation) and wraps the result in a Mechanize::Page.
# Returns nil when the file is missing; on any exception the message is
# printed and the value of puts (nil) is returned.
def load_cache url

  filename = cache_filename url
  return nil unless File::exists?(filename)
  debug "Loading #{filename} from cache"
  begin
    Mechanize::Page.new URI.parse(url), [], Marshal.load(File.open(filename, "rb"){|f| f.read}), nil, @agent
  rescue Exception => e
    puts e.message
  end
end

# Variant of save_cache that writes Marshal.dump(doc) to the cache file
# without compression. The options argument is accepted but not used.
def save_cache url, doc, options = {}
  File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
end


30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/scrapey/cache/disk.rb', line 30

# Loads a cached body for +url+ and wraps it in a Mechanize::Page.
#
# The cache file holds Zlib-deflated Marshal data (see save_cache).
# Marshal.load must only be used on trusted data, so the cache directory
# has to be writable by trusted processes only.
#
# Changes: File.exist? replaces File::exists? (removed in Ruby 3.2), and
# only StandardError is swallowed instead of every Exception.
#
# NOTE(review): URI.parse(url) may reject keys that are not plain URLs;
# such entries fall into the rescue branch and return nil - confirm that
# this is acceptable for every caller.
#
# @param url [String] cache key, also used as the page URI
# @return [Mechanize::Page, nil] nil when the entry is missing or unreadable
def load_cache url
  filename = cache_filename url
  return nil unless File.exist?(filename)
  debug "Loading #{filename} from cache"
  begin
    body = Marshal.load(Zlib::Inflate.inflate(File.binread(filename)))
    Mechanize::Page.new URI.parse(url), [], body, nil, @agent
  rescue StandardError => e
    # Best effort: report the problem and behave like a cache miss.
    puts e.message
    # delete_cache url
    # Mechanize::Page.new URI.parse(url), [], '<html></html>', nil, @agent
    nil
  end
end

#multi_get(*args) ⇒ Object



50
# File 'lib/scrapey/multi.rb', line 50

# Shortcut for multi_get_or_post with the 'get_content' client method.
#
# @param args [Array] urls and options, forwarded unchanged
# @return [Object] whatever multi_get_or_post returns
def multi_get(*args)
  multi_get_or_post('get_content', *args)
end

#multi_get_or_post(method, all_urls, options = {}) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/scrapey/multi.rb', line 7

# Fetches many URLs concurrently with a pool of HTTPClient instances.
#
# URLs are processed in slices of +threads+; each URL in a slice gets its
# own thread and its own client. Callbacks are invoked under a mutex, so
# they never run concurrently.
#
# Fixes compared with the previous version:
# * :follow_redirect => false is honoured (`value || true` was always true).
# * The memoized client pool grows when a later call asks for more threads
#   than the first one; before, the extra slots were nil.
# * Only StandardError is turned into an on_error callback.
#
# @param method [String] HTTPClient method name ('get_content', 'head', ...)
# @param all_urls [Enumerable] URLs to request
# @param options [Hash] :threads (20), :on_success, :on_error (callback
#   method names), :user_agent, :proxy, :timeout (1000), :follow_redirect
#   (true), :query, :headers
# @yield [client] each newly created HTTPClient, for extra configuration
# @return [Object] the result of all_urls.each_slice
def multi_get_or_post method, all_urls, options = {}

  # some sensible defaults
  threads         = options[:threads]         || 20
  on_success      = options[:on_success]      || :on_success
  on_error        = options[:on_error]        || :on_error
  user_agent      = options[:user_agent]      || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  proxy           = options[:proxy]
  timeout         = options[:timeout]         || 1000
  follow_redirect = options[:follow_redirect].nil? ? true : options[:follow_redirect]

  @lock ||= Mutex.new

  # Reuse existing clients and add only those that are missing.
  @http_clients ||= []
  (threads - @http_clients.size).times do
    c = HTTPClient.new proxy, user_agent
    c.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
    c.receive_timeout = timeout
    yield c if block_given?
    @http_clients << c
  end

  debug 'starting multi'

  all_urls.each_slice(threads) do |urls|
    workers = urls.each_with_index.map do |url, i|
      Thread.new do
        response = nil
        error = nil
        begin
          response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect
        rescue StandardError => e
          error = e
        end
        @lock.synchronize do
          if response
            send on_success, url, response
          else
            send on_error, url, error
          end
        end
      end
    end
    workers.each { |thread| thread.join }
  end
end

#multi_head(*args) ⇒ Object



52
# File 'lib/scrapey/multi.rb', line 52

# Shortcut for multi_get_or_post with the 'head' client method.
#
# @param args [Array] urls and options, forwarded unchanged
# @return [Object] whatever multi_get_or_post returns
def multi_head(*args)
  multi_get_or_post('head', *args)
end

#multi_post(*args) ⇒ Object



51
# File 'lib/scrapey/multi.rb', line 51

# Shortcut for multi_get_or_post with the 'post_content' client method.
#
# @param args [Array] urls and options, forwarded unchanged
# @return [Object] whatever multi_get_or_post returns
def multi_post(*args)
  multi_get_or_post('post_content', *args)
end

#phget(*args) ⇒ Object



43
# File 'lib/scrapey/scrapey.rb', line 43

# Shortcut for get_or_post with the 'phget' method.
#
# @param args [Array] url, followed by anything else to forward
# @return [Object] whatever get_or_post returns
def phget(*args)
  get_or_post('phget', *args)
end

#post(*args) ⇒ Object



42
# File 'lib/scrapey/scrapey.rb', line 42

# Shortcut for get_or_post with the 'post' method.
#
# @param args [Array] url, followed by anything else to forward
# @return [Object] whatever get_or_post returns
def post(*args)
  get_or_post('post', *args)
end

#save(item, output = nil) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/scrapey/scrapey.rb', line 69

# Appends +item+ as one row of a CSV file.
#
# A writer is created per output name on first use (".csv" is appended
# unless the name already ends in "csv") and kept in @csvs. The header row
# is @fields for the default output when @fields is set and non-empty,
# otherwise the keys of the first item saved to that output.
#
# @param item [Hash] row values keyed by field
# @param output [String, nil] output name; defaults to @output
# @return [CSV] the writer the row was appended to
def save(item, output = nil)
  output ||= @output
  @csvs ||= {}
  entry = @csvs[output]
  unless entry
    fn = output.gsub(/(?<!csv)$/, '.csv')
    begin
      writer = CSV.open(fn, 'w')
    rescue Errno::EACCES
      # Typically the file is open in another program; nothing to do but quit.
      puts "Unable to access #{fn} - is it locked?"
      exit
    end
    use_declared_fields = output == @output && @fields && !@fields.empty?
    columns = use_declared_fields ? @fields : item.keys
    writer << columns
    entry = { :csv => writer, :fields => columns }
    @csvs[output] = entry
  end
  row = entry[:fields].map { |field| item[field] }
  entry[:csv] << row
end

#save_cache(url, body, options = {}) ⇒ Object



43
44
45
# File 'lib/scrapey/cache/disk.rb', line 43

# Writes +doc+ to the cache file for +url+ as Zlib-deflated Marshal data.
#
# @param url [String] cache key
# @param doc [Object] value to store
# @param options [Hash] accepted but not used here
# @return [File] the file handle used for writing (closed on return)
def save_cache(url, doc, options = {})
  File.open(cache_filename(url), "wb") do |io|
    payload = Zlib::Deflate.deflate(Marshal.dump(doc))
    io << payload
  end
end

#save_images(urls) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/scrapey/scrapey.rb', line 53

# Downloads each URL into <BASEDIR>/images, skipping files already present.
#
# The file name is the part of the URL after the last "/".
#
# Changes: Dir.exist?/File.exist? replace the aliases removed in Ruby 3.2,
# and the leftover `binding.pry` debugger call for URLs without a file name
# is replaced by an explicit error.
#
# @param urls [Enumerable<String>] image URLs
# @return [Array<String>] file names, in the order of +urls+
# @raise [ArgumentError] when no file name can be taken from a URL
def save_images urls
  folder = "#{BASEDIR}/images"
  Dir.mkdir(folder) unless Dir.exist?(folder)
  names = []
  urls.each do |url|
    name = url[/[^\/]+$/]
    raise ArgumentError, "Cannot derive an image file name from #{url.inspect}" unless name
    names << name
    fn = "#{folder}/#{name}"
    next if File.exist?(fn)
    file = @agent.get(url)
    File.open(fn, 'wb'){|f| f << file.body}
  end
  names
end

#set_proxy(*args) ⇒ Object



45
46
47
# File 'lib/scrapey/scrapey.rb', line 45

# Forwards the proxy settings to @agent.set_proxy.
#
# @param args [Array] arguments passed through unchanged
# @return [Object] whatever the agent returns
def set_proxy(*args)
  @agent.set_proxy(*args)
end

#tables(*args) ⇒ Object



6
7
8
9
10
11
12
13
14
15
# File 'lib/scrapey/database.rb', line 6

# Defines one top-level ActiveRecord model class per given name and loads
# <BASEDIR>/src/schema.rb when any model's table is missing and that file
# exists.
#
# Uses File.exist?; the File.exists? alias was removed in Ruby 3.2.
#
# @param args [Array<String, Symbol>] constant names for the models
# @return [Boolean, nil] result of require, or nil when nothing was loaded
# @raise [RuntimeError] when no database is configured (check_db_config)
def tables *args
  check_db_config
  missing_tables = false
  # Every model must be defined, so the loop cannot stop at the first miss.
  args.each do |arg|
    model = Object.const_set(arg, Class.new(ActiveRecord::Base))
    missing_tables = true unless model.table_exists?
  end
  schema = "#{BASEDIR}/src/schema.rb"
  require schema if missing_tables && File.exist?(schema)
end

#truncate(*args) ⇒ Object



17
18
19
20
21
22
# File 'lib/scrapey/database.rb', line 17

# Issues "TRUNCATE TABLE <table_name>" for each named model.
#
# @param args [Array<String, Symbol>] top-level model constant names
# @return [Array] the names that were passed in
# @raise [RuntimeError] when no database is configured (check_db_config)
def truncate(*args)
  check_db_config
  args.each do |model_name|
    connection = ActiveRecord::Base.connection
    table = Object.const_get(model_name).table_name
    connection.execute("TRUNCATE TABLE #{table}")
  end
end

#ts ⇒ Object



104
105
106
# File 'lib/scrapey/scrapey.rb', line 104

# Current time as whole seconds since the Unix epoch, as a string.
#
# @return [String]
def ts
  seconds = Time.now.to_i
  "#{seconds}"
end

#use_cache(options = {}) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/scrapey/cache.rb', line 3

# Turns caching on and loads the matching backend.
#
# With a :redis option (removed from +options+ and stored in @redis) the
# redis backend is required. Otherwise the disk backend is required and the
# cache directory (default <BASEDIR>/cache) is created together with its
# 16 x 16 hex-named subdirectories.
#
# @param options [Hash] :redis selects the redis backend
# @return [Object] result of the last statement of the chosen branch
def use_cache(options = {})
  @use_cache = true
  @redis = options.delete(:redis)
  if @redis
    require 'scrapey/cache/redis'
  else
    require 'scrapey/cache/disk'
    @config['cache_dir'] ||= "#{BASEDIR}/cache"
    FileUtils.mkdir_p @config['cache_dir']
    hex_digits = %w[0 1 2 3 4 5 6 7 8 9 a b c d e f]
    hex_digits.each do |first|
      hex_digits.each do |second|
        FileUtils.mkdir_p "#{@config['cache_dir']}/#{first}/#{second}"
      end
    end

  end
end

#use_tor ⇒ Object



4
5
6
# File 'lib/scrapey/tor.rb', line 4

# Points the agent at a proxy on localhost:8118 via set_proxy.
#
# @return [Object] whatever set_proxy returns
def use_tor
  host = 'localhost'
  port = 8118
  set_proxy(host, port)
end

#visited?(url) ⇒ Boolean

Returns:

  • (Boolean)


93
94
95
96
97
98
# File 'lib/scrapey/scrapey.rb', line 93

# Reports whether +url+ was seen before and records it when it is new.
#
# @param url [Object] value to look up in @visited
# @return [Boolean] true when already recorded, false on first sight
def visited?(url)
  @visited ||= []
  already_seen = @visited.include?(url)
  @visited << url unless already_seen
  already_seen
end

#with_cache(cassette_name = 'my_cassette') ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/scrapey/cache.rb', line 31

# Runs the block inside a VCR cassette.
#
# VCR is configured to keep cassettes in <BASEDIR>/cache, hook into fakeweb
# and allow real connections outside a cassette. The cassette records new
# episodes and matches requests on method, uri and body.
#
# @param cassette_name [String] cassette to use
# @yield code whose HTTP traffic should go through the cassette
# @return [Object] whatever VCR.use_cassette returns
def with_cache(cassette_name = 'my_cassette')
  require 'vcr'
  require 'fakeweb'

  VCR.configure do |settings|
    settings.cassette_library_dir = "#{BASEDIR}/cache"
    settings.hook_into :fakeweb
    settings.allow_http_connections_when_no_cassette = true
  end

  matchers = [:method, :uri, :body]
  VCR.use_cassette(cassette_name, :record => :new_episodes, :match_requests_on => matchers) { yield }
end

#without_cache ⇒ Object



27
28
29
# File 'lib/scrapey/cache.rb', line 27

# Runs the given block and returns its value; no other work is done here.
#
# @yield code to run
# @return [Object] the block's value
def without_cache
  result = yield
  result
end