Module: Scrapers::Xkcd

Defined in:
lib/scrapers/xkcd.rb

Constant Summary collapse

XKCD_URL =
"http://xkcd.com"
PUBDATE_FORMAT =
"%F"

Class Method Summary collapse

Class Method Details

.get_pubdate(url) ⇒ Object

Fetch the HTTP headers of the image file, which include its Last-Modified date. We’ll use that as the publication date.



42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/scrapers/xkcd.rb', line 42

# Look up the publication date of a comic image.
#
# Sends an HTTP HEAD request for the image and uses the Last-Modified
# response header as the publication date.
#
# @param url [String] URL of the comic image; scheme-relative ("//host/path")
#   and relative references are resolved against XKCD_URL
# @return [String] the date formatted with PUBDATE_FORMAT; today's date when
#   the header is missing or cannot be parsed
def self.get_pubdate(url)
  # Net::HTTP::Head.new only accepts a full HTTP(S) URI, so a reference
  # without a scheme is resolved against the site URL first. Absolute URLs
  # pass through URI.join unchanged.
  url = URI.join(XKCD_URL, url)
  head_req = Net::HTTP::Head.new url

  # Without use_ssl an https URL would be requested in plain HTTP on port 443.
  head = Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == "https") do |http|
    http.request head_req
  end

  last_modified = head["Last-Modified"]
  return Time.now.strftime(PUBDATE_FORMAT) if last_modified.nil?

  begin
    parsed = Time.parse(last_modified)
  rescue ArgumentError
    # Time.parse raises ArgumentError for text it cannot interpret; fall back
    # to today's date rather than failing the whole scrape.
    return Time.now.strftime(PUBDATE_FORMAT)
  end
  parsed.strftime(PUBDATE_FORMAT)
end

.scrape(comic = nil) ⇒ Object

Get the current or numbered xkcd comic

comic = (string) the number of the xkcd comic to retrieve. Gets the current comic if nil.

returns hash containing comic info:

{:title => "comic's title",
 :url => "url to comic",
 :img_src => "source url to comic image",
 :hover_text => "the hover (mouse-over) text",
 :pubdate => "publication date",
}


23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/scrapers/xkcd.rb', line 23

# Get the current or a numbered xkcd comic.
#
# @param comic [String, nil] number of the comic to retrieve; the page at
#   XKCD_URL (the current comic) is fetched when nil
# @return [Hash] comic info with the keys
#   :url        - URL of the comic page that was fetched
#   :img_src    - "src" attribute of the comic image
#   :img_title  - "title" attribute of the comic image (the hover text)
#   :hover_text - same value as :img_title
#   :img_alt    - "alt" attribute of the comic image
#   :title      - same value as :img_alt
#   :pubdate    - result of get_pubdate for the image
def self.scrape(comic=nil)
  results = {}

  url = URI.parse XKCD_URL
  url.path = "/#{comic}/" unless comic.nil?
  results[:url] = url.to_s

  # URI::HTTP#open is provided by open-uri. Unlike bare Kernel#open it always
  # treats its receiver as a URL, and it keeps working on Ruby 3+, where
  # Kernel#open no longer fetches URLs. The block form closes the stream once
  # the page has been parsed.
  doc = url.open { |page| Nokogiri::HTML(page) }

  # Kept in its own local so the `comic` argument is not overwritten.
  image = doc.at_css("#comic img")
  hover = image.attr("title")
  alt = image.attr("alt")

  results[:img_src] = image.attr("src")
  results[:img_title] = hover
  results[:hover_text] = hover
  results[:img_alt] = alt
  results[:title] = alt
  results[:pubdate] = get_pubdate(results[:img_src])

  results
end