Class: HackerCurse::HackerNewsParser

Inherits:
AbstractSiteParser
Defined in:
lib/hacker/curse/hackernewsparser.rb

Instance Attribute Summary

Attributes inherited from AbstractSiteParser

#host, #htmloutfile, #more_url, #num_pages, #save_html, #subforum

Instance Method Summary

Methods inherited from AbstractSiteParser

#get_comments, #get_comments_url, #get_doc_for_url, #get_first_page, #get_next_page, #human_age_to_unix, #load_from_yml, #save_comments_as_yml, #save_page_as_yml, #to_yml

Constructor Details

#initialize(config = {}) ⇒ HackerNewsParser

Returns a new instance of HackerNewsParser.



# File 'lib/hacker/curse/hackernewsparser.rb', line 6

def initialize config={}
  @host = config[:host] || "https://news.ycombinator.com"
  subforum = config[:subforum] || "news"
  _url = "#{@host}/#{subforum}"
  @subforum = subforum
  config[:url] ||= _url
  super config
end
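
A minimal usage sketch (the require path and the "newest" subforum are assumptions for illustration; only :host and :subforum are read directly by this constructor):

require "hacker/curse"    # assumed entry point for the gem

# defaults: https://news.ycombinator.com, subforum "news"
hn = HackerCurse::HackerNewsParser.new

# or target another subforum; :host can also be overridden
newest = HackerCurse::HackerNewsParser.new :subforum => "newest"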

Instance Method Details

#_retrieve_comments(url) ⇒ Object

Currently returns a Hash containing various entries relating to the main article (these can be ignored). It also contains an array :comments of hashes: :head holds the text of the head, :comment holds the text of the comment, and there are further entries for the submitter.

 hash[:comments].each do |e| e[:comment] ; end

Returns:

  • Array of ForumComment objects, e.g. pages.each do |co| puts co.comment_text, co.head; end

# File 'lib/hacker/curse/hackernewsparser.rb', line 29

def _retrieve_comments url
  arr = to_hash_comment url
  # TODO break head into points age etc
  pages = hash_to_comment_class arr
  return pages
end
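
In practice this method is reached through the inherited #get_comments. A sketch, assuming #get_comments takes the comments URL, delegates here, and returns a collection that responds as documented above:

comments = hn.get_comments "item?id=1"    # hypothetical item id
comments.each do |co|
  puts co.head            # byline of the comment
  puts co.comment_text    # body of the comment
end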

#_retrieve_page(url) ⇒ Object



# File 'lib/hacker/curse/hackernewsparser.rb', line 14

def _retrieve_page url
  #puts "got url #{url} "
  raise "url should be string" unless url.is_a? String
  arr = to_hash url
  page = hash_to_class arr
  #to_yml "#{@subforum}.yml", arr
  return page
end
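
The inherited #get_first_page is the usual entry point; a hedged sketch of walking the resulting ForumPage (the ForumArticle readers shown are assumptions based on the hash keys built in #to_hash):

page = hn.get_first_page            # assumed to call _retrieve_page with the subforum URL
puts page.url
puts page.next_url                  # the "More" link, used for paging
page.articles.each do |a|
  puts a.title                      # assumed reader for the :title key
  puts a.article_url                # assumed reader for the :article_url key
end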

#hash_to_class(h) ⇒ Object



# File 'lib/hacker/curse/hackernewsparser.rb', line 112

def hash_to_class h
  p = ForumPage.new
  p.url = h[:url]
  p.next_url = h[:next_url]
  p.create_date = h[:create_date]
  p.subforum = h[:subforum]
  art = h[:articles]
  arts = []
  art.each do |a|
    fa = ForumArticle.new a
    fa.parent = self
    arts << fa
  end
  p.articles = arts
  return p
end
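
The hash expected here mirrors what #to_hash builds below; roughly (illustrative values):

h = {
  :url         => "https://news.ycombinator.com/news",
  :next_url    => "https://news.ycombinator.com/news?p=2",
  :create_date => Time.now,
  :subforum    => "news",
  :articles    => [ { :title => "A story", :article_url => "https://example.com/post" } ]
}
page = hn.hash_to_class h    # => ForumPage holding one ForumArticle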

#hash_to_comment_class(arr) ⇒ Object



# File 'lib/hacker/curse/hackernewsparser.rb', line 35

def hash_to_comment_class arr
  page = ForumArticle.new arr
  return page
end
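
This is a thin wrapper: the hash built by #to_hash_comment below is handed straight to ForumArticle, which is exactly the composition performed by #_retrieve_comments:

article = hn.hash_to_comment_class(hn.to_hash_comment("item?id=1"))   # hypothetical item id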

#oldhash_to_comment_class(arr) ⇒ Object



# File 'lib/hacker/curse/hackernewsparser.rb', line 39

def oldhash_to_comment_class arr
  co = arr[:comments]
  pages = Array.new
  co.each do |h|
    page = ForumComment.new h
    pages << page
  end
  return pages
end

#to_hash(url) ⇒ Object

Converts the front page to a Hash.



# File 'lib/hacker/curse/hackernewsparser.rb', line 129

def to_hash url
  doc  = get_doc_for_url url
  count = 0
  page = {}
  page[:url] = url
  now = Time.now
  page[:create_date_seconds] = now.to_i
  page[:create_date] = now
  page[:subforum] = @subforum

  arr = Array.new
  h = {}
  links = doc.xpath("//table/tr/td/table/tr")
  links.each_with_index do |li, i|
    x = li.css("td.title")
    if !x.empty?
      #puts "   ---- title ----- #{x.count} "
      count = x[0].text
      #puts count
      if x.count < 2
        # this block is for the next_url
        article_url = x[0].css("a")[0]["href"]   # link url
        #puts article_url
        h = {}
        h[:title] = count
        h[:article_url] = article_url
        more = count
        more_url = "#{@host}/#{article_url}"
        #arr << h
        page[:next_url] = more_url
        #puts li
      end
      break if x.count < 2

      # actual article url
      title = x[1].css("a")[0].text   # title
      article_url = x[1].css("a")[0]["href"]   # link url
      #puts article_url
      #puts title
      h = {}
      #h[:number] = count
      h[:title] = title
      # ask option does not have hostname since it is relative to HN
      if article_url.index("http") != 0
        article_url = "#{@host}/#{article_url}"
      end

      h[:article_url] = article_url
      arr << h
    else 
      x = li.css("td.subtext")
      if !x.empty?
        fulltext = x.text
        #puts "   ---- subtext ----- (#{fulltext})"
        submitter = nil
        submitter_url = nil
        comment = nil
        comments_url = nil
        t = x.css("a")
        t.each_with_index do |tt, ii|
          case ii
          when 0
            submitter = tt.text
            submitter_url = tt["href"]
          when 1
            comment = tt.text
            comments_url = tt["href"]
            comments_url = "#{@host}/#{comments_url}"
          end
        end
        points = x.css("span").text rescue ""
        #puts submitter
        #puts submitter_url
        #puts comment
        #puts comments_url
        #puts points
        h[:submitter] = submitter
        h[:submitter_url] = submitter_url
        h[:comment_count] = comment.to_i.to_s.rjust(4)
        h[:comments_url] = comments_url
        h[:points] = points.to_i.to_s.rjust(4)
        m = fulltext.scan(/\d+ \w+ ago/)
        if !m.empty?    # scan returns an Array, so guard against no match
          #h[:age_text] = m.first
          h[:age_text] = m.first.scan(/\d+ \w/).first.rjust(4)
          h[:age] = human_age_to_unix(m.first)
        end
        #puts "fulltext: #{fulltext} "
        h[:byline] = fulltext
      end
    end
  end
  #return arr
  page[:articles] = arr
  return page
end
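
Each hash pushed onto page[:articles] carries the keys set above; a rough sketch of one entry (field names come from the code, values are invented, the padding comes from the rjust(4) calls, and :age is assumed to be a Unix timestamp going by the name human_age_to_unix):

{
  :title         => "A story title",
  :article_url   => "https://example.com/post",
  :submitter     => "someuser",
  :submitter_url => "user?id=someuser",
  :comment_count => "  42",
  :comments_url  => "https://news.ycombinator.com/item?id=1",
  :points        => " 100",
  :age_text      => " 3 h",
  :age           => 1418000000,    # unix time via human_age_to_unix
  :byline        => "100 points by someuser 3 hours ago | 42 comments"
}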

#to_hash_comment(url) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/hacker/curse/hackernewsparser.rb', line 48

def to_hash_comment url
  # For testing I may send in a saved file, so I don't keep hitting HN
  if !File.exist? url
    unless url.index("http")
      url = @host + "/" + url
    end
  end
  page = Nokogiri::HTML(open(url))   # Kernel#open reads local files; with open-uri it also fetches URLs
  h = {}
  title = page.css("td.title")
  article_url = title.css("a").first["href"]
  h[:title] = title.text
  h[:article_url] = article_url

  subtext = page.css("td.subtext")
  h[:byline] = subtext.text
  # TODO extract age_text
  h[:age_text] = subtext.text.scan(/\d+ \w+ ago/).first
  score = subtext.css("span").text
  h[:points] = score
  subtext.css("a").each_with_index do |e, i|
    link = e["href"]
    text = e.text
    if link.index("user") == 0
      h[:submitter] = text
      h[:submitter_url] = link
    elsif link.index("item") == 0
      h[:comment_count] = text
      h[:comments_url] = link
    end
  end

  # need to get points
  comheads = page.css("span.comhead") # .collect do |e| e.text ; end
  comments = page.css("span.comment").collect do |e| e.text ; end
  comheads.delete(comheads.first)
  # array of comments
  carr = Array.new
  comheads.zip(comments) do |head,c| 
    hh = {}
    hh[:head] = head.text
    #$stderr.puts "head:: #{head.text}"
    m = head.text.scan(/\d+ \w+ ago/)
    if !m.empty?
      hh[:age_text] = m.first.scan(/\d+ \w/).first.rjust(4)
      hh[:age] = human_age_to_unix(m.first)
      head.css("a").each_with_index do |e, i|
        link = e["href"]
        text = e.text
        if link.index("user") == 0
          hh[:submitter] = text
          hh[:submitter_url] = link
        elsif link.index("item") == 0
          hh[:text] = text
          hh[:comment_url] = link
        end
      end
    end
    hh[:comment_text] = c
    carr << hh 
  end

  h[:comments] = carr
  return h
end
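
And a corresponding sketch of the hash returned here (values again invented; at this level the article fields keep the raw link and span text, e.g. "42 comments" and "100 points", since no rjust is applied):

{
  :title         => "A story title",
  :article_url   => "https://example.com/post",
  :byline        => "100 points by someuser 3 hours ago | 42 comments",
  :age_text      => "3 hours ago",
  :points        => "100 points",
  :submitter     => "someuser",
  :submitter_url => "user?id=someuser",
  :comment_count => "42 comments",
  :comments_url  => "item?id=1",
  :comments      => [
    { :head          => "someuser 3 hours ago | link",
      :age_text      => " 3 h",
      :age           => 1418000000,
      :submitter     => "someuser",
      :submitter_url => "user?id=someuser",
      :text          => "link",
      :comment_url   => "item?id=2",
      :comment_text  => "Body of the first comment." }
  ]
}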