Class: HackerCurse::RedditNewsParser
- Inherits: AbstractSiteParser
  - Object
  - AbstractSiteParser
  - HackerCurse::RedditNewsParser
- Defined in: lib/hacker/curse/redditnewsparser.rb
Instance Attribute Summary
Attributes inherited from AbstractSiteParser
#host, #htmloutfile, #more_url, #num_pages, #save_html, #subforum
Instance Method Summary
- #_retrieve_comments(url) ⇒ Object
  Retrieves a reddit comments page and returns it as a ForumArticle.
- #_retrieve_page(url) ⇒ Object
  Retrieves a reddit listing page and returns it as a ForumPage.
- #hash_to_class(h) ⇒ Object
  Converts a page hash from #to_hash into a ForumPage of ForumArticles.
- #hash_to_comment_class(arr) ⇒ Object
  Converts a comment hash from #to_hash_comment into a ForumArticle.
- #initialize(config = {}) ⇒ RedditNewsParser constructor
  A new instance of RedditNewsParser.
- #old_hash_to_comment_class(arr) ⇒ Object
  Returns an array of ForumComments; however, the article title etc. is then not included, and if the output is saved, that information may be required.
- #to_hash(url) ⇒ Object
  Parses a reddit page into a hash containing :url, :next_url and :articles (an array of hashes, one per article).
- #to_hash_comment(url) ⇒ Object
  Returns a hash; its :comments key holds an array of hashes with comment details.
Methods inherited from AbstractSiteParser
#get_comments, #get_comments_url, #get_doc_for_url, #get_first_page, #get_next_page, #human_age_to_unix, #load_from_yml, #save_comments_as_yml, #save_page_as_yml, #to_yml
Constructor Details
#initialize(config = {}) ⇒ RedditNewsParser
Returns a new instance of RedditNewsParser.
    # File 'lib/hacker/curse/redditnewsparser.rb', line 6

    def initialize config={}
      @host = config[:host] || "https://www.reddit.com"
      subforum = config[:subforum] || "unknown"
      _url = "#{@host}/r/#{subforum}/.mobile"
      config[:url] ||= _url
      @subforum = subforum
      super config
    end
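For orientation, a minimal usage sketch (the require path is an assumption; :subforum is the only option needed, since :host and :url fall back to the defaults computed above):

    require 'hacker/curse'   # assumed require path for this gem

    # Build a parser for r/ruby; the URL defaults to
    # https://www.reddit.com/r/ruby/.mobile as computed in initialize.
    parser = HackerCurse::RedditNewsParser.new :subforum => "ruby"
    page   = parser.get_first_page   # inherited from AbstractSiteParser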
Instance Method Details
#_retrieve_comments(url) ⇒ Object
    # File 'lib/hacker/curse/redditnewsparser.rb', line 26

    def _retrieve_comments url
      arr = to_hash_comment url
      pages = hash_to_comment_class arr
      return pages
    end
#_retrieve_page(url) ⇒ Object
    # File 'lib/hacker/curse/redditnewsparser.rb', line 14

    def _retrieve_page url
      $stderr.puts "_retrieve_page got url #{url} "
      raise "url should be string" unless url.is_a? String
      arr = to_hash url
      return nil unless arr # exception was caught
      page = hash_to_class arr
      #to_yml "#{@subforum}OLD.yml", arr
      return page
    end
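_retrieve_page is the to_hash plus hash_to_class pipeline behind the inherited get_first_page and get_next_page; invoked directly, the same flow looks like this sketch (the URL shown is illustrative):

    parser = HackerCurse::RedditNewsParser.new :subforum => "ruby"
    h      = parser.to_hash "https://www.reddit.com/r/ruby/.mobile"
    page   = parser.hash_to_class h   # a ForumPage wrapping ForumArticles
    # page.next_url can be fed back in to walk the listing.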
#hash_to_class(h) ⇒ Object
    # File 'lib/hacker/curse/redditnewsparser.rb', line 122

    def hash_to_class h
      p = ForumPage.new
      p.url = h[:url]
      p.next_url = h[:next_url]
      p.create_date = h[:create_date]
      p.subforum = h[:subforum]
      #p.create_date_seconds = h[:create_date_seconds]
      art = h[:articles]
      arts = []
      art.each do |a|
        fa = ForumArticle.new a
        fa.parent = self
        arts << fa
      end
      p.articles = arts
      return p
    end
#hash_to_comment_class(arr) ⇒ Object
    # File 'lib/hacker/curse/redditnewsparser.rb', line 227

    def hash_to_comment_class arr
      page = ForumArticle.new arr
      return page
    end
#old_hash_to_comment_class(arr) ⇒ Object
This returns an array of ForumComments; however, the article title etc. is then not included, and if the output is saved, that information may be required.
    # File 'lib/hacker/curse/redditnewsparser.rb', line 233

    def old_hash_to_comment_class arr
      co = arr[:comments]
      pages = Array.new
      co.each do |h|
        page = ForumComment.new h
        pages << page
      end
      return pages
    end
#to_hash(url) ⇒ Object
Parses a reddit page into a hash containing :url, :next_url and :articles (an array of hashes, one per article).
    # File 'lib/hacker/curse/redditnewsparser.rb', line 32

    def to_hash url
      page = {}
      arr = Array.new
      doc = get_doc_for_url url
      return nil unless doc # exception was caught
      page[:url] = url
      now = Time.now
      page[:create_date_seconds] = now.to_i
      page[:create_date] = now
      page[:subforum] = @subforum
      #filename = "r.#{subr}.yml"
      links = doc.css("li div.link")
      links.each do |li|
        h = {}
        e = li.css("a.title")
        if !e.empty?
          e = e.first
          h[:title] = e.text
          h[:article_url] = e["href"]
        end
        e = li.css("a.domain")
        if !e.empty?
          e = e.first
          h[:domain] = e.text
          h[:domain_url] = e["href"]
        end
        e = li.css("a.author")
        if !e.empty?
          e = e.first
          h[:submitter] = e.text
          h[:submitter_url] = e["href"]
        end
        e = li.css("span.buttons > a")
        if !e.empty?
          e = e.first
          #h[:comment_count] = e.text.to_i
          h[:comment_count] = e.text.to_i.to_s.rjust(4)
          h[:comments_url] = e["href"]
        else
          h[:comment_count] = "   0"
          h[:comments_url] = ""
        end
        byline = li.css("p.byline").text
        h[:byline] = byline
        # 2014-08-14 - 13:34 in some cases the byline just says "17 minutes ago"
        #   with no BAR or "by".
        # In one case in 'science' the name itself had BARs so the parse failed.
        # In another case there were no comments, so parts[2] was nil !!
        parts = byline.split("|")
        age = points = nil
        parts.each do |ppp|
          if ppp.index("points")
            points = ppp.strip
          elsif ppp.index("comments")
            # we've taken it already
          elsif ppp.index(" ago ")
            age = ppp.split("by").first.strip
          end
        end
        #age = parts.last.split("by").first.strip
        #age = parts[2].split("by").first.strip
        if age
          if age.scan(/\d+ \w/).first.nil?
            raise "Nil in age: #{age} , parts = #{parts}"
          end
        end
        h[:age_text] = age.scan(/\d+ \w/).first.rjust(4) if age
        #h[:age_text] = age
        h[:age] = human_age_to_unix(age) if age
        #h[:points] = points.to_i
        h[:points] = points.to_i.to_s.rjust(4)
        #puts points
        #puts age
        arr << h
      end
      # some cases like rising do not have next prev
      #next_prev_url = doc.css("p.nextprev").first.css("a").first["href"]
      next_prev_url = doc.css("p.nextprev").first
      if next_prev_url #&& !next_prev_url.empty?
        next_prev_url = next_prev_url.css("a").first["href"]
        page[:next_url] = next_prev_url
      end
      page[:articles] = arr
      #arr << { :next_prev_url => next_prev_url }
      #@more_url = next_prev_url
      return page
    end
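The returned hash has roughly this shape (keys taken from the method body above; all values below are placeholders, not real data):

    {
      :url                 => "https://www.reddit.com/r/ruby/.mobile",
      :create_date         => Time.now,        # page fetch time
      :create_date_seconds => Time.now.to_i,
      :subforum            => "ruby",
      :next_url            => "...",           # absent when the page has no nextprev link
      :articles            => [
        { :title => "...", :article_url => "...",
          :domain => "...", :domain_url => "...",
          :submitter => "...", :submitter_url => "...",
          :comment_count => "  12",            # right-justified to width 4
          :comments_url => "...",
          :byline => "...",
          :age_text => " 2 h",                 # first "\d+ \w" of the age, rjust(4)
          :age => 1234567890,                  # Unix time via human_age_to_unix
          :points => "  45" }                  # right-justified to width 4
      ]
    }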
#to_hash_comment(url) ⇒ Object
Returns a hash; its :comments key holds an array of hashes containing comment details.
    # File 'lib/hacker/curse/redditnewsparser.rb', line 153

    def to_hash_comment url
      # for testing i may send in a saved file, so i don't keep hitting HN
      if !File.exists? url
        unless url.index("http")
          url = @host + "/" + url
        end
      end
      # comments are nested and there is a div for that,
      # Also blockquotes for when commenter quotes another.
      doc = Nokogiri::HTML(open(url))
      h = {}
      main = doc.css("li div.link")
      maintext = main.text
      #puts maintext
      #puts main.css("a").count
      #puts main.css("a").first # this dumps the whole line
      h[:main_text] = maintext
      main.css("a").each_with_index do |l, i|
        # this breaks the main line into text and links
        case i
        when 0
          h[:title] = l.text
          h[:article_url] = l["href"]
        when 1
          h[:comment_count] = l.text
          h[:comments_url] = l["href"]
        when 2
          h[:submitter] = l.text
          h[:submitter_url] = l["href"]
        when 3
          h[:domain] = l.text
          h[:domain_url] = l["href"]
        end
      end
      byline = main.css("p.byline").text
      h[:byline] = byline
      points = byline.scan(/\d+ point/).first
      age_text = byline.scan(/\d+ \w+ ago/).first
      h[:points] = points
      h[:age_text] = age_text
      arr = []
      comments = doc.css("li div.comment")
      comments.each_with_index do |co, ix|
        #puts ix
        hh = {}
        arr << hh
        comment = co.css("div.md").text
        hh[:comment_text] = comment
        byline = co.css("p.byline")
        #puts "byline:"
        #puts byline
        bytext = byline.text
        hh[:head] = bytext
        #puts "bytext:"
        #puts bytext
        m = bytext.scan(/\d+ \w+ ago/)
        hh[:age_text] = m.first.sub(/ago/,"")
        hh[:age] = human_age_to_unix(m.first)
        link = byline.css("a").first
        if link
          commenter = link.text
          hh[:submitter] = commenter
          submitter_url = link["href"]
          hh[:submitter_url] = submitter_url
        end
        points = byline.css("span.score").text rescue ""
        hh[:points] = points.sub(/points?/,"")
      end
      h[:comments] = arr
      return h
    end
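A sketch of fetching one article's comments end to end, built only from the methods shown on this page (the subreddit and URL are illustrative):

    parser = HackerCurse::RedditNewsParser.new :subforum => "programming"
    h      = parser.to_hash "https://www.reddit.com/r/programming/.mobile"
    curl   = h[:articles].first[:comments_url]
    # curl may be relative; to_hash_comment prepends @host when the
    # string does not contain "http" and is not a saved local file.
    article = parser._retrieve_comments curl   # a ForumArticle with nested comment hashes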