Class: Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/apod_cli/scraper.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Scraper

Returns a new instance of Scraper.



9
10
11
# File 'lib/apod_cli/scraper.rb', line 9

# Loads the archive index through the class-level index_data method and stores
# it in the @@data class variable, which is shared by every Scraper instance.
def initialize
  archive_index = self.class.index_data
  @@data = archive_index
end

Class Method Details

.get_page ⇒ Object



17
18
19
# File 'lib/apod_cli/scraper.rb', line 17

# Downloads the APOD archive index page and parses it with Nokogiri.
#
# Uses URI.open instead of a bare `open`: since Ruby 3.0, Kernel#open no longer
# opens URLs. The block form closes the downloaded stream once it is parsed.
# NOTE(review): relies on open-uri being required elsewhere in the file, as the
# original `open(url)` call also needed — confirm.
#
# @return [Object] the parsed document returned by Nokogiri::HTML
def self.get_page
  URI.open("http://apod.nasa.gov/apod/archivepix.html") { |io| Nokogiri::HTML(io) }
end

.index_data ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/apod_cli/scraper.rb', line 32

# Builds the picture index from the archive page returned by get_page.
#
# Each non-empty text line inside "body b" is expected to look like
# "2015 January 01: Some Title". Every such line becomes a hash with
#   :date - "YYYY-MM-DD"
#   :name - the title text after the first colon, stripped
#   :link - absolute URL built from the href of the anchor whose text is :name
#
# @return [Array<Hash>] entries in the order they appear on the page, plus one
#   hard-coded entry (see the end of the method)
# @raise [TypeError] when a title has no anchor with matching text
# @raise [IndexError] when fewer than 4410 entries were scraped, because the
#   hard-coded insert position is then out of range
def self.index_data
  months = %w[January February March April May June July August September October November December]
  content = get_page.css("body b")

  # `reject`, not `reject!`: the bang form returns nil when no element is
  # removed, which made the following `pop` blow up on nil.
  date_titles = content.text.split("\n").reject(&:empty?)
  # The last non-empty line is discarded (presumably not a picture entry — TODO confirm).
  date_titles.pop

  # Anchor text (stripped) => relative href.
  links_hash = content.css("a").each_with_object({}) do |link, by_name|
    by_name[link.text.strip] = link.attribute("href").value
  end

  array = []
  last_idx = date_titles.length - 1
  toggle_2007 = false
  date_titles.each_with_index do |dt, idx|
    # One title on the page contains a stray "\n", so it is split over two
    # lines. A line whose successor does not begin with a month name is the
    # first half of such a title: skip it here...
    if idx != last_idx && months.index(date_titles[idx + 1][/[a-zA-Z]+/].to_s).nil?
      toggle_2007 = true
      next
    end
    # ...and skip the second half on the following iteration.
    if toggle_2007
      toggle_2007 = false
      next
    end

    month_str = format("%02d", months.index(dt[/[a-zA-Z]+/].to_s) + 1)
    # First two-digit run that is delimited by non-digits, i.e. the " 01:" part.
    day_str = dt.match(/[^0-9][0-9]{2}[^0-9]/).to_s.gsub(/[: ]/, "")
    name = dt[/:(.+)/, 1].to_s.strip

    array << {
      date: "#{dt[/[0-9]{4}/]}-#{month_str}-#{day_str}",
      name: name,
      link: "http://apod.nasa.gov/apod/" + links_hash[name]
    }
  end

  # The entry skipped above is hard-coded. The negative index counts from the
  # end of the array, so entries added at the front of the list do not move it.
  array.insert(-4411, {date: "2007-07-16", name: "The Lagoon Nebula in Gas, Dust, and Stars", link: "http://apod.nasa.gov/apod/ap070716.html"})
  array
end

Instance Method Details

#data ⇒ Object



13
14
15
# File 'lib/apod_cli/scraper.rb', line 13

# Returns the value held in the @@data class variable, which the constructor
# fills with the result of the class-level index_data call.
def data
  @@data
end

#pic_data(url) ⇒ Object



21
22
23
24
25
26
27
28
29
30
# File 'lib/apod_cli/scraper.rb', line 21

# Fetches one APOD picture page and collects its name, explanation and image link.
#
# The page is downloaded and parsed once; previously it was requested up to
# three times. URI.open replaces the bare `open`, which no longer opens URLs
# since Ruby 3.0.
# NOTE(review): relies on open-uri being required elsewhere in the file, as the
# original `open(url)` call also needed — confirm.
#
# @param url [String] address of the picture page; it must contain the :link of
#   one of the entries returned by #data
# @return [Hash] with keys
#   :name - the :name of the matching #data entry
#   :expl - the "Explanation:" paragraph, collapsed onto a single line
#   :link - URL of the first "p a img" image on the page, or the entry's own
#           :link when the page has no such image
# @raise [NoMethodError] when no #data entry matches url
def pic_data(url)
  page = URI.open(url) { |io| Nokogiri::HTML(io) }

  # Take the text from "Explanation:" up to the first run of three blank-ish
  # lines, then squeeze all whitespace down to single spaces.
  explanation = page.css("body").text
                    .match(/Explanation:[\s\S]+?(\n(\s*)){3}/).to_s
                    .gsub(/\n/, " ").gsub(/\s{2,}/, " ").strip

  entry = data.find { |hash| url.include?(hash[:link]) }
  name = entry[:name]

  images = page.css("p a img")
  link = if images.empty?
           entry[:link]
         else
           "http://apod.nasa.gov/apod/#{images.attribute("src")}"
         end

  { name: name, expl: explanation, link: link }
end