Class: HackerCurse::RedditNewsParser

Inherits:
AbstractSiteParser
Defined in:
lib/hacker/curse/redditnewsparser.rb

Instance Attribute Summary

Attributes inherited from AbstractSiteParser

#host, #htmloutfile, #more_url, #num_pages, #save_html, #subforum

Instance Method Summary

Methods inherited from AbstractSiteParser

#get_comments, #get_comments_url, #get_doc_for_url, #get_first_page, #get_next_page, #human_age_to_unix, #load_from_yml, #save_comments_as_yml, #save_page_as_yml, #to_yml

Constructor Details

#initialize(config = {}) ⇒ RedditNewsParser

Returns a new instance of RedditNewsParser.



# File 'lib/hacker/curse/redditnewsparser.rb', line 6

def initialize config={}
  @host = config[:host] || "https://www.reddit.com"
  subforum = config[:subforum] || "unknown"
  _url="#{@host}/r/#{subforum}/.mobile"
  config[:url] ||= _url
  @subforum = subforum
  super config
end
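
A minimal usage sketch (assumptions: the require path follows the "Defined in" entry above, and the :subforum value is illustrative):

require 'hacker/curse/redditnewsparser'   # path assumed from the "Defined in" entry

parser = HackerCurse::RedditNewsParser.new :subforum => "ruby"
# with no :url in the config, the URL defaults to https://www.reddit.com/r/ruby/.mobile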

Instance Method Details

#_retrieve_comments(url) ⇒ Object

Reddit-specific implementation of comment retrieval.

Returns:

  • an array of ForumComment objects; for each, you may retrieve the full hash or individual items such as comment_text, points, age, age_text, submitter, and head



# File 'lib/hacker/curse/redditnewsparser.rb', line 26

def _retrieve_comments url
  arr = to_hash_comment url
  pages = hash_to_comment_class arr
  return pages
end
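
A sketch following the Returns note above (the URL is illustrative; the body delegates to to_hash_comment and hash_to_comment_class below):

comments = parser._retrieve_comments "/r/ruby/comments/abc123/"   # illustrative path
comments.each do |c|
  puts c.comment_text   # other accessors per the Returns note: points, age, age_text, submitter, head
end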

#_retrieve_page(url) ⇒ Object



# File 'lib/hacker/curse/redditnewsparser.rb', line 14

def _retrieve_page url
  $stderr.puts "_retrieve_page got url #{url} "
  raise "url should be string" unless url.is_a? String
  arr = to_hash url
  return nil unless arr # exception was caught
  page = hash_to_class arr
  #to_yml "#{@subforum}OLD.yml", arr
  return page
end
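
A sketch of fetching one page, assuming ForumPage exposes the articles assigned in hash_to_class below (normally reached via the inherited #get_first_page; the URL is illustrative):

page = parser._retrieve_page "https://www.reddit.com/r/ruby/.mobile"
page.articles.each do |article|
  # each entry is a ForumArticle built by hash_to_class
end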

#hash_to_class(h) ⇒ Object

Reddit-specific conversion of a page hash into a ForumPage.



# File 'lib/hacker/curse/redditnewsparser.rb', line 122

def hash_to_class h
  p = ForumPage.new
  p.url = h[:url]
  p.next_url = h[:next_url]
  p.create_date = h[:create_date]
  p.subforum = h[:subforum]
  #p.create_date_seconds = h[:create_date_seconds]
  art = h[:articles]
  arts = []
  art.each do |a|
    fa = ForumArticle.new a
    fa.parent = self
    arts << fa
  end
  p.articles = arts
  return p
end
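
The hash it expects mirrors what #to_hash below produces; a minimal sketch with illustrative values:

h = { :url         => "https://www.reddit.com/r/ruby/.mobile",
      :next_url    => nil,
      :create_date => Time.now,
      :subforum    => "ruby",
      :articles    => [ { :title => "A post", :article_url => "http://example.com/" } ] }
page = parser.hash_to_class h
page.articles.size   # => 1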

#hash_to_comment_class(arr) ⇒ Object

Reddit-specific conversion of a comments hash into a ForumArticle.



# File 'lib/hacker/curse/redditnewsparser.rb', line 227

def hash_to_comment_class arr
  page = ForumArticle.new arr
  return page
end

#old_hash_to_comment_class(arr) ⇒ Object

This returns an array of ForumComment objects, but that means the article title etc. is not included; if the output is saved, that information may be required.


# File 'lib/hacker/curse/redditnewsparser.rb', line 233

def old_hash_to_comment_class arr
  co = arr[:comments]
  pages = Array.new
  co.each do |h|
    page = ForumComment.new h
    pages << page
  end
  return pages
end

#to_hash(url) ⇒ Object

Parses a Reddit page into a hash containing :url, :next_url, and :articles (an array of hashes, one per article).



# File 'lib/hacker/curse/redditnewsparser.rb', line 32

def to_hash url
  page = {}
  arr = Array.new
  doc  = get_doc_for_url url
  return nil unless doc # exception was caught
  page[:url] = url
  now = Time.now
  page[:create_date_seconds] = now.to_i
  page[:create_date] = now
  page[:subforum] = @subforum
  #filename = "r.#{subr}.yml"
  links = doc.css("li div.link")
  links.each do |li|
    h = {}
    e = li.css("a.title")
    if !e.empty?
      e = e.first
      h[:title] = e.text
      h[:article_url] = e["href"]
    end
    e = li.css("a.domain")
    if !e.empty?
      e = e.first
      h[:domain] = e.text
      h[:domain_url] = e["href"]
    end
    e = li.css("a.author")
    if !e.empty?
      e = e.first
      h[:submitter] = e.text
      h[:submitter_url] = e["href"]
    end
    e = li.css("span.buttons > a")
    if !e.empty?
      e = e.first
      #h[:comment_count] = e.text.to_i
      h[:comment_count] = e.text.to_i.to_s.rjust(4)
      h[:comments_url] = e["href"]
    else
      h[:comment_count] = "   0"
      h[:comments_url] = ""
    end
    byline = li.css("p.byline").text
    h[:byline] = byline
    # 2014-08-14 - 13:34 in some cases the byline just says "17 minutes ago" with no BAR or "by"
    # In one case in 'science' the name itself had BARs so the parse failed
    # In another case there were no comments, so parts[2] was nil !!
    parts = byline.split("|")
    age = points = nil
    parts.each do |ppp|
      if ppp.index("points")
        points = ppp.strip
      elsif ppp.index("comments")
        # we've taken it already
      elsif ppp.index(" ago ")
        age = ppp.split("by").first.strip
      end
    end


    #age = parts.last.split("by").first.strip

    #age = parts[2].split("by").first.strip
    if age
      if age.scan(/\d+ \w/).first.nil?
        raise "Nil in age: #{age} , parts = #{parts}"
      end
    end
    h[:age_text] = age.scan(/\d+ \w/).first.rjust(4) if age
    #h[:age_text] = age
    h[:age] = human_age_to_unix(age) if age
    #h[:points] = points.to_i
    h[:points] = points.to_i.to_s.rjust(4)
    #puts points
    #puts age
    arr << h
  end
  # some cases like rising do not have next/prev
  #next_prev_url = doc.css("p.nextprev").first.css("a").first["href"]
  next_prev_url = doc.css("p.nextprev").first
  if next_prev_url #&& !next_prev_url.empty?
    next_prev_url = next_prev_url.css("a").first["href"]
    page[:next_url] = next_prev_url
  end
  page[:articles] = arr
  #arr << { :next_prev_url => next_prev_url }
  #@more_url = next_prev_url
  return page
end
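
A sketch of the returned structure, with keys as assigned in the body above (values illustrative; :next_url is absent on pages such as rising that have no next/prev link):

page = parser.to_hash "https://www.reddit.com/r/ruby/.mobile"
page.keys
# => [:url, :create_date_seconds, :create_date, :subforum, :next_url, :articles]
first = page[:articles].first
first[:title]          # article title
first[:comments_url]   # can be passed on to to_hash_comment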

#to_hash_comment(url) ⇒ Object

Returns a hash; under :comments it holds an array of hashes containing comment details.



# File 'lib/hacker/curse/redditnewsparser.rb', line 153

def to_hash_comment url
  # for testing i may send in a saved file, so i don't keep hitting the site
  if !File.exist? url
    unless url.index("http")
      url = @host + "/" + url
    end
  end
  # comments are nested and there is a div for that;
  # also blockquotes for when a commenter quotes another.
  doc = Nokogiri::HTML(open(url))  # open(url) on an http(s) URL requires open-uri
  h = {}
  main = doc.css("li div.link")
  maintext = main.text
  #puts maintext
  #puts main.css("a").count
  #puts main.css("a").first
  # this dumps the whole line
  h[:main_text] = maintext
  main.css("a").each_with_index do |l, i|
    # this breaks the main line into text and links
    case i
    when 0
      h[:title] = l.text
      h[:article_url] = l["href"]
    when 1
      h[:comment_count] = l.text
      h[:comments_url] = l["href"]
    when 2
      h[:submitter] = l.text
      h[:submitter_url] = l["href"]
    when 3
      h[:domain] = l.text
      h[:domain_url] = l["href"]
    end
  end
  byline = main.css("p.byline").text
  h[:byline] = byline
  points = byline.scan(/\d+ point/).first
  age_text = byline.scan(/\d+ \w+ ago/).first
  h[:points] = points
  h[:age_text] = age_text

  arr = []
  comments = doc.css("li div.comment")
  comments.each_with_index do |co, ix|
    #puts  ix
    hh = {}
    arr << hh
    comment = co.css("div.md").text
    hh[:comment_text] = comment
    byline = co.css("p.byline")
    #puts "byline:"
    #puts byline
    bytext = byline.text
    hh[:head] = bytext
    #puts "bytext:"
    #puts bytext
    m = bytext.scan(/\d+ \w+ ago/)
    hh[:age_text] = m.first.sub(/ago/,"")
    hh[:age] = human_age_to_unix(m.first)
    link = byline.css("a").first
    if link
      commenter = link.text
      hh[:submitter] = commenter
      submitter_url = link["href"]
      hh[:submitter_url] = submitter_url
    end
    points = byline.css("span.score").text rescue ""
    hh[:points] = points.sub(/points?/,"")
  end
  h[:comments] = arr
  return h
end
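
A sketch of reading the result (the URL is illustrative; keys as assigned above):

h = parser.to_hash_comment "/r/ruby/comments/abc123/"
h[:title]        # article title
h[:points]       # e.g. "12 point", as scanned from the byline
h[:comments].each do |c|
  puts c[:submitter], c[:age_text], c[:comment_text]
end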