Class: HackerCurse::HackerNewsParser
- Inherits: AbstractSiteParser
  - Object
    - AbstractSiteParser
      - HackerCurse::HackerNewsParser
- Defined in: lib/hacker/curse/hackernewsparser.rb
Instance Attribute Summary
Attributes inherited from AbstractSiteParser
#host, #htmloutfile, #more_url, #num_pages, #save_html, #subforum
Instance Method Summary
- #_retrieve_comments(url) ⇒ Object
  Currently returns a Hash.
- #_retrieve_page(url) ⇒ Object
- #hash_to_class(h) ⇒ Object
- #hash_to_comment_class(arr) ⇒ Object
- #initialize(config = {}) ⇒ HackerNewsParser constructor
  A new instance of HackerNewsParser.
- #oldhash_to_comment_class(arr) ⇒ Object
- #to_hash(url) ⇒ Object
  Convert the front page to a hash.
- #to_hash_comment(url) ⇒ Object
Methods inherited from AbstractSiteParser
#get_comments, #get_comments_url, #get_doc_for_url, #get_first_page, #get_next_page, #human_age_to_unix, #load_from_yml, #save_comments_as_yml, #save_page_as_yml, #to_yml
Constructor Details
#initialize(config = {}) ⇒ HackerNewsParser
Returns a new instance of HackerNewsParser.
# File 'lib/hacker/curse/hackernewsparser.rb', line 6

def initialize config={}
  @host = config[:host] || "https://news.ycombinator.com"
  subforum = config[:subforum] || "news"
  _url = "#{@host}/#{subforum}"
  @subforum = subforum
  config[:url] ||= _url
  super config
end
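For example, a parser for a different subforum could be built as follows; a minimal sketch, assuming the gem's requires are in place (the require path mirrors the "Defined in" path above):

require 'hacker/curse/hackernewsparser'

# :host and :subforum are the only options the constructor reads directly;
# the derived :url and everything else is passed up to AbstractSiteParser.
parser = HackerCurse::HackerNewsParser.new :subforum => "newest"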
Instance Method Details
#_retrieve_comments(url) ⇒ Object
Currently returns a Hash containing various entries relating to the main article, which callers can ignore. The hash holds an array :comments of hashes: :head holds the text of the comment header, :comment_text holds the body text of the comment, and there are entries for the submitter.
hash[:comments].each do |e| e[:comment_text] ; end
# File 'lib/hacker/curse/hackernewsparser.rb', line 29

def _retrieve_comments url
  arr = to_hash_comment url
  # TODO break head into points age etc
  pages = hash_to_comment_class arr
  return pages
end
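This is presumably reached through the inherited #get_comments, but it can also be called directly; a minimal sketch, assuming the parser built above and a live item URL (the id is a placeholder):

article = parser._retrieve_comments "https://news.ycombinator.com/item?id=1"
# article is the ForumArticle that hash_to_comment_class builds
# from the hash returned by to_hash_comment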
#_retrieve_page(url) ⇒ Object
# File 'lib/hacker/curse/hackernewsparser.rb', line 14

def _retrieve_page url
  #puts "got url #{url} "
  raise "url should be string" unless url.is_a? String
  arr = to_hash url
  page = hash_to_class arr
  #to_yml "#{@subforum}.yml", arr
  return page
end
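A usage sketch, assuming #host is readable (it is listed under the inherited attributes) and that ForumPage exposes readers for the attributes hash_to_class assigns below:

page = parser._retrieve_page "#{parser.host}/news"
puts page.next_url      # the "More" link captured by to_hash
puts page.articles.size # one ForumArticle per story row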
#hash_to_class(h) ⇒ Object
# File 'lib/hacker/curse/hackernewsparser.rb', line 112

def hash_to_class h
  p = ForumPage.new
  p.url = h[:url]
  p.next_url = h[:next_url]
  p.create_date = h[:create_date]
  p.subforum = h[:subforum]
  art = h[:articles]
  arts = []
  art.each do |a|
    fa = ForumArticle.new a
    fa.parent = self
    arts << fa
  end
  p.articles = arts
  return p
end
#hash_to_comment_class(arr) ⇒ Object
# File 'lib/hacker/curse/hackernewsparser.rb', line 35

def hash_to_comment_class arr
  page = ForumArticle.new arr
  return page
end
#oldhash_to_comment_class(arr) ⇒ Object
# File 'lib/hacker/curse/hackernewsparser.rb', line 39

def oldhash_to_comment_class arr
  co = arr[:comments]
  pages = Array.new
  co.each do |h|
    page = ForumComment.new h
    pages << page
  end
  return pages
end
#to_hash(url) ⇒ Object
Convert the front page to a hash.
# File 'lib/hacker/curse/hackernewsparser.rb', line 129

def to_hash url
  doc = get_doc_for_url url
  count = 0
  page = {}
  page[:url] = url
  now = Time.now
  page[:create_date_seconds] = now.to_i
  page[:create_date] = now
  page[:subforum] = @subforum
  arr = Array.new
  h = {}
  links = doc.xpath("//table/tr/td/table/tr")
  links.each_with_index do |li, i|
    x = li.css("td.title")
    if !x.empty?
      #puts " ---- title ----- #{x.count} "
      count = x[0].text
      #puts count
      if x.count < 2
        # this block is for the next_url
        article_url = x[0].css("a")[0]["href"] # link url
        #puts article_url
        h = {}
        h[:title] = count
        h[:article_url] = article_url
        more = count
        more_url = "#{@host}/#{article_url}"
        #arr << h
        page[:next_url] = more_url
        #puts li
      end
      break if x.count < 2
      # actual article url
      title = x[1].css("a")[0].text          # title
      article_url = x[1].css("a")[0]["href"] # link url
      #puts article_url
      #puts title
      h = {}
      #h[:number] = count
      h[:title] = title
      # ask option does not have hostname since it is relative to HN
      if article_url.index("http") != 0
        article_url = "#{@host}/#{article_url}"
      end
      h[:article_url] = article_url
      arr << h
    else
      x = li.css("td.subtext")
      if !x.empty?
        fulltext = x.text
        #puts " ---- subtext ----- (#{fulltext})"
        submitter = nil
        submitter_url = nil
        comment = nil
        comments_url = nil
        t = x.css("a")
        t.each_with_index do |tt, ii|
          case ii
          when 0
            submitter = tt.text
            submitter_url = tt["href"]
          when 1
            comment = tt.text
            comments_url = tt["href"]
            comments_url = "#{@host}/#{comments_url}"
          end
        end
        points = x.css("span").text rescue ""
        #puts submitter
        #puts submitter_url
        #puts comment
        #puts comments_url
        #puts points
        h[:submitter] = submitter
        h[:submitter_url] = submitter_url
        h[:comment_count] = comment.to_i.to_s.rjust(4)
        h[:comments_url] = comments_url
        h[:points] = points.to_i.to_s.rjust(4)
        m = fulltext.scan(/\d+ \w+ ago/)
        unless m.empty?
          #h[:age_text] = m.first
          h[:age_text] = m.first.scan(/\d+ \w/).first.rjust(4)
          h[:age] = human_age_to_unix(m.first)
        end
        #puts "fulltext: #{fulltext} "
        h[:byline] = fulltext
      end
    end
  end
  #return arr
  page[:articles] = arr
  return page
end
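A short sketch of consuming the result; the keys are the ones assigned in the body above, and the URL mirrors the one the constructor builds:

page = parser.to_hash "#{parser.host}/news"
page[:articles].each do |a|
  puts "#{a[:points]} #{a[:title]} (#{a[:comment_count]} comments)"
end
puts page[:next_url] # "More" link, when the page has one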
#to_hash_comment(url) ⇒ Object
# File 'lib/hacker/curse/hackernewsparser.rb', line 48

def to_hash_comment url
  # for testing i may send in a saved file, so i don't keep hitting HN
  if !File.exist? url
    unless url.index("http")
      url = @host + "/" + url
    end
  end
  page = Nokogiri::HTML(open(url)) # Kernel#open fetches URLs when "open-uri" is loaded
  h = {}
  title = page.css("td.title")
  article_url = title.css("a").first["href"]
  h[:title] = title.text
  h[:article_url] = article_url
  subtext = page.css("td.subtext")
  h[:byline] = subtext.text
  # TODO extract age_text
  h[:age_text] = subtext.text.scan(/\d+ \w+ ago/).first
  score = subtext.css("span").text
  h[:points] = score
  subtext.css("a").each_with_index do |e, i|
    link = e["href"]
    text = e.text
    if link.index("user") == 0
      h[:submitter] = text
      h[:submitter_url] = link
    elsif link.index("item") == 0
      h[:comment_count] = text
      h[:comments_url] = link
    end
  end
  # need to get points
  comheads = page.css("span.comhead") # .collect do |e| e.text ; end
  comments = page.css("span.comment").collect do |e| e.text ; end
  comheads.delete(comheads.first)
  # array of comments
  carr = Array.new
  comheads.zip(comments) do |head, c|
    hh = {}
    hh[:head] = head.text
    #$stderr.puts "head:: #{head.text}"
    m = head.text.scan(/\d+ \w+ ago/)
    if !m.empty?
      hh[:age_text] = m.first.scan(/\d+ \w/).first.rjust(4)
      hh[:age] = human_age_to_unix(m.first)
      head.css("a").each_with_index do |e, i|
        link = e["href"]
        text = e.text
        if link.index("user") == 0
          hh[:submitter] = text
          hh[:submitter_url] = link
        elsif link.index("item") == 0
          hh[:text] = text
          hh[:comment_url] = link
        end
      end
    end
    hh[:comment_text] = c
    carr << hh
  end
  h[:comments] = carr
  return h
end
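A matching sketch for the comment page; a relative path gets @host prepended, while an existing local file is parsed as-is, which suits offline testing (the item id is a placeholder):

h = parser.to_hash_comment "item?id=1"
puts h[:title]
h[:comments].each do |c|
  # :submitter and :age_text are only set when the comment head
  # carries an "n units ago" byline
  puts "#{c[:age_text]} #{c[:submitter]}: #{c[:comment_text]}"
end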