Class: Scrapers::ManningBooks::Scraper
- Inherits: Object
- Inheritance chain: Object → Scrapers::ManningBooks::Scraper
- Defined in:
- lib/scrapers/manning_books.rb
Instance Attribute Summary
-
#delay_time ⇒ Object
Returns the value of attribute delay_time.
-
#destination ⇒ Object
Returns the value of attribute destination.
-
#dry_run ⇒ Object
Returns the value of attribute dry_run.
-
#pw ⇒ Object
Returns the value of attribute pw.
-
#user ⇒ Object
Returns the value of attribute user.
Instance Method Summary
- #build_book_list(page) ⇒ Object
- #download_books(agent, books) ⇒ Object
-
#initialize(options = {}) ⇒ Scraper
constructor
A new instance of Scraper.
- #login(agent) {|agent| ... } ⇒ Object
- #scrape ⇒ Object
- #wait_a_bit(delay) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Scraper
Returns a new instance of Scraper.
15 16 17 18 19 20 21 22 |
# File 'lib/scrapers/manning_books.rb', line 15
# Builds a scraper from a string-keyed options hash.
#
# A NetrcReader for NETRC_MANNING_ENTRY is always constructed; its user and
# pw values are used as the fallbacks for the "user" and "pw" options.
#
# @param options [Hash{String => Object}] recognised keys:
#   "user", "pw", "delay" (default DELAY_TIME), "destination" (default "."),
#   "dry_run" (default false)
# @return [Scraper] a new instance of Scraper
def initialize(options = {})
  netrc_reader = ::Scrapers::NetrcReader.new(NETRC_MANNING_ENTRY)
  @user = options.fetch("user", netrc_reader.user)
  @pw = options.fetch("pw", netrc_reader.pw)
  @delay_time = options.fetch("delay", DELAY_TIME)
  @destination = options.fetch("destination", ".")
  @dry_run = options.fetch("dry_run", false)
end
Instance Attribute Details
#delay_time ⇒ Object
Returns the value of attribute delay_time.
13 14 15 |
# File 'lib/scrapers/manning_books.rb', line 13
# Reader for the delay_time attribute.
#
# @return [Object] the current value of @delay_time
def delay_time
  @delay_time
end
#destination ⇒ Object
Returns the value of attribute destination.
13 14 15 |
# File 'lib/scrapers/manning_books.rb', line 13
# Reader for the destination attribute.
#
# @return [Object] the current value of @destination
def destination
  @destination
end
#dry_run ⇒ Object
Returns the value of attribute dry_run.
13 14 15 |
# File 'lib/scrapers/manning_books.rb', line 13
# Reader for the dry_run attribute.
#
# @return [Object] the current value of @dry_run
def dry_run
  @dry_run
end
#pw ⇒ Object
Returns the value of attribute pw.
13 14 15 |
# File 'lib/scrapers/manning_books.rb', line 13
# Reader for the pw attribute.
#
# @return [Object] the current value of @pw
def pw
  @pw
end
#user ⇒ Object
Returns the value of attribute user.
13 14 15 |
# File 'lib/scrapers/manning_books.rb', line 13
# Reader for the user attribute.
#
# @return [Object] the current value of @user
def user
  @user
end
Instance Method Details
#build_book_list(page) ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/scrapers/manning_books.rb', line 54
# Turns the given page into a list of book descriptions.
#
# Every node matching '.book' yields one hash. Its :downloads entry maps the
# last word of each anchor label containing "download" (lower-cased, as a
# symbol) to that anchor's href. Anchors whose label does not mention
# "download" are ignored.
#
# A book without a '.book_downloads' container gets an empty :downloads hash,
# and anchors with no child nodes are skipped, instead of raising on nil.
#
# @param page [#search] page-like object answering CSS queries
# @return [Array<Hash>] hashes of the form { title: String, downloads: Hash }
def build_book_list(page)
  page.search('.book').map do |book|
    title = book.at('[data-type=title]').children.first.text

    container = book.at('.book_downloads')
    anchors = container ? container.search('a') : []

    downloads = anchors.map do |link|
      label_node = link.children.first
      next unless label_node # anchor with no children: nothing to classify

      label = label_node.text.downcase
      next unless label.include?("download")

      [label.split(" ").last.to_sym, link.attr(:href)]
    end.compact.to_h

    { title: title, downloads: downloads }
  end
end
#download_books(agent, books) ⇒ Object
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/scrapers/manning_books.rb', line 68
# Downloads the pdf, epub and kindle editions of each book and reports
# progress on standard output. Other download types are ignored.
#
# When dry_run is set nothing is fetched or saved: the agent is not touched,
# the progress line says the download was skipped, and the entry is keyed by
# the basename of the href (query string and fragment removed). Previously
# dry-run mode reported "saved" with whatever page the agent was already on
# and keyed every entry by that single filename.
#
# wait_a_bit(delay_time) is called after each book.
#
# @param agent [#get, #current_page] Mechanize-style agent
# @param books [Array<Hash>] hashes with :title and :downloads (type => href)
# @return [Hash{Object => Hash}] title => { filename => href }
def download_books(agent, books)
  books.map do |book|
    puts "Retrieving #{book[:title]}"
    downloads = book[:downloads].map do |type, href|
      next unless %i[pdf epub kindle].include?(type)

      print " downloading #{type} ..."
      if dry_run
        # Best-effort name derived from the link, since nothing was fetched.
        filename = File.basename(href.to_s.sub(/[?#].*\z/, ""))
        puts "skipped (dry run)"
      else
        agent.get href
        agent.current_page.save!
        filename = agent.current_page.filename
        puts "saved #{filename}"
      end
      [filename, href]
    end.compact.to_h
    wait_a_bit delay_time
    [book[:title], downloads]
  end.to_h
end
#login(agent) {|agent| ... } ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/scrapers/manning_books.rb', line 39
# Requests DASHBOARD_URL, signs in when the agent did not end up on the
# dashboard, then yields the agent to the given block.
#
# Sign-in fills the 'email' and 'password' typed fields of the current page's
# form with user and pw, submits it, and pauses two seconds before checking
# the resulting location.
#
# @param agent [#get, #current_page] Mechanize-style agent
# @yield [agent] runs once the dashboard is the current page
# @return [Object] the block's return value
# @raise [RuntimeError] when no block is given, or when the dashboard is not
#   the current page after the form was submitted
def login(agent, &block)
  raise "Must provide a block to execute after logged in to site" unless block_given?

  dashboard = DASHBOARD_URL.to_s
  agent.get DASHBOARD_URL

  # Compare string forms on both sides. The post-submit check already did so;
  # comparing the raw uri object here could never equal a String constant,
  # which forced the sign-in branch even on an authenticated session.
  unless agent.current_page.uri.to_s == dashboard
    form = agent.current_page.form
    form.field_with(:type => 'email').value = user
    form.field_with(:type => 'password').value = pw
    form.submit
    sleep 2
    raise "could not log in" unless agent.current_page.uri.to_s == dashboard
  end

  yield agent
end
#scrape ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/scrapers/manning_books.rb', line 24
# Runs the whole scrape from inside the destination directory: starts a
# Mechanize session, logs in, builds the book list from the page that is
# current after login, and downloads the books.
#
# @return [Object, nil] the value returned by download_books, or nil when the
#   login block never ran
def scrape
  @results = nil
  Dir.chdir(destination) do
    Mechanize.start do |agent|
      login(agent) do |session|
        catalog = build_book_list(session.current_page)
        @results = download_books(session, catalog)
      end
    end
  end
  @results
end
#wait_a_bit(delay) ⇒ Object
84 85 86 87 88 89 90 91 92 93 |
# File 'lib/scrapers/manning_books.rb', line 84
# Pauses for roughly +delay+ seconds while drawing a one-character spinner
# ("-", "*", "|", "+") that is redrawn at most once per second.
#
# The remaining time is checked before each sleep, so a delay of zero or less
# returns without sleeping, and a fractional delay sleeps only for the
# fraction that is left instead of being rounded to whole seconds.
#
# @param delay [Numeric] seconds to wait
# @return [nil] the result of the final print
def wait_a_bit(delay)
  puts "delaying for #{delay} second(s)"
  remaining = delay
  %w[- * | +].cycle do |glyph|
    break if remaining <= 0

    print "\r#{glyph}"
    pause = [remaining, 1].min
    sleep pause
    remaining -= pause
  end
  print "\r"
end