Class: HackerCurse::AbstractSiteParser
- Inherits: Object
- Defined in: lib/hacker/curse/abstractsiteparser.rb
Overview
rn = RNParser.new [url]
rn.subreddit = "ruby"
resultset = rn.get_next_page :page => prevresultset, :number => 5
resultset.each do |art|
  art.title, art.points
  art.comments
end

hn = HNewsParser @options
hn.subxxx = "news" / "newest"

redditnews.rb -s ruby --pages 2
hackernews.rb -s newest --pages 2 -d '|'
Direct Known Subclasses
Instance Attribute Summary

- #host ⇒ Object
  Returns the value of attribute host.
- #htmloutfile ⇒ Object
  Returns the value of attribute htmloutfile.
- #more_url ⇒ Object (readonly)
  Returns the value of attribute more_url.
- #num_pages ⇒ Object
  Returns the value of attribute num_pages.
- #save_html ⇒ Object
  Whether the HTML should be saved.
- #subforum ⇒ Object
  Returns the value of attribute subforum.
Instance Method Summary

- #_retrieve_comments(url) ⇒ Object
- #_retrieve_page(url) ⇒ Object
- #get_comments(index) ⇒ Object (also: #get_comments_for_link)
- #get_comments_url(index) ⇒ Object
- #get_doc_for_url(url) ⇒ Object
  Returns a Nokogiri HTML doc and writes the HTML to a file if required.
- #get_first_page ⇒ Object
  Retrieves the first page of articles from the start URL.
- #get_next_page(opts = {}) ⇒ Object (also: #get_next)
- #human_age_to_unix(age_text) ⇒ Object
- #initialize(options = {}) ⇒ AbstractSiteParser
  constructor
  Creates a new parser from an options hash.
- #load_from_yml(filename = "hn.yml") ⇒ Object
  Test helper: loads previously saved results from a YAML file so we don't keep hitting HN while testing and getting IP-blocked.
- #save_comments_as_yml(outputfile, url) ⇒ Object
  Retrieves the comments for a URL and stores them in outputfile in YAML format.
- #save_page_as_yml(outputfile, page) ⇒ Object
  After calling get_next_page, one may pass its return value to this method to convert it into an array of hashes and store it as a YAML file. It's a bit roundabout: first the page is broken down into this structure and then the whole thing is deconstructed.
- #to_yml(outfile, arr = @arr) ⇒ Object
  Writes the array as YAML. This doesn't work well across multiple pages, since it is called once per page and each call overwrites the previous file. It should be called by the final (concrete) class.
Constructor Details
#initialize(options = {}) ⇒ AbstractSiteParser
Creates a new parser from an options hash (:url, :save_html, :htmloutfile, :num_pages).
# File 'lib/hacker/curse/abstractsiteparser.rb', line 139

def initialize options={}
  @options     = options
  @url         = @options[:url]
  @save_html   = @options[:save_html]
  @htmloutfile = @options[:htmloutfile]
  @num_pages   = @options[:num_pages] || 1
  @more_url    = nil
  #puts "initialize: url is #{@url} "
end
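A minimal construction sketch, assuming a concrete subclass (here the RNParser named in the overview) simply forwards an options hash to this constructor; the option keys are the ones read above:

  # assumption: RNParser passes its options hash through to AbstractSiteParser#initialize
  rn = RNParser.new :url       => "http://www.reddit.com/r/ruby",
                    :num_pages => 2,
                    :save_html => false
  page = rn.get_first_page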
Instance Attribute Details
#host ⇒ Object
Returns the value of attribute host.
# File 'lib/hacker/curse/abstractsiteparser.rb', line 132

def host
  @host
end
#htmloutfile ⇒ Object
Returns the value of attribute htmloutfile.
# File 'lib/hacker/curse/abstractsiteparser.rb', line 137

def htmloutfile
  @htmloutfile
end
#more_url ⇒ Object (readonly)
Returns the value of attribute more_url.
# File 'lib/hacker/curse/abstractsiteparser.rb', line 131

def more_url
  @more_url
end
#num_pages ⇒ Object
Returns the value of attribute num_pages.
# File 'lib/hacker/curse/abstractsiteparser.rb', line 133

def num_pages
  @num_pages
end
#save_html ⇒ Object
Whether the HTML should be saved.

# File 'lib/hacker/curse/abstractsiteparser.rb', line 136

def save_html
  @save_html
end
#subforum ⇒ Object
Returns the value of attribute subforum.
# File 'lib/hacker/curse/abstractsiteparser.rb', line 134

def subforum
  @subforum
end
Instance Method Details
#_retrieve_comments(url) ⇒ Object
# File 'lib/hacker/curse/abstractsiteparser.rb', line 239

def _retrieve_comments url
  raise "Must be implemented by concrete class "
end
#_retrieve_page(url) ⇒ Object
# File 'lib/hacker/curse/abstractsiteparser.rb', line 175

def _retrieve_page url
  raise "must be implemented by concrete class"
end
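Both _retrieve_comments and _retrieve_page only raise here, so every concrete site parser must override them. A hypothetical skeleton (the class name and parsing details are assumptions; only the hook names and get_doc_for_url come from this class):

  class MySiteParser < HackerCurse::AbstractSiteParser
    # hypothetical subclass: should return a page object offering next_url, each and merge_page
    def _retrieve_page url
      doc = get_doc_for_url url   # helper provided by the abstract class
      # ... parse doc and build the page ...
    end

    # hypothetical: should return the parsed comments for one article
    def _retrieve_comments url
      doc = get_doc_for_url url
      # ... parse doc and build the comment list ...
    end
  end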
#get_comments(index) ⇒ Object Also known as: get_comments_for_link
# File 'lib/hacker/curse/abstractsiteparser.rb', line 254

def get_comments index
  url = get_comments_url index
  if url
    #puts url
    comments = convert_comment_url url
    return comments
    #else
    #puts "Sorry no url for #{index} "
  end
  return []
end
#get_comments_url(index) ⇒ Object
# File 'lib/hacker/curse/abstractsiteparser.rb', line 243

def get_comments_url index
  arr = @arr
  entry = arr[index]
  if entry
    if entry.key? :comments_url
      return entry[:comments_url]
    end
  end
  return nil
end
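Taken together, get_comments_url looks up the stored :comments_url for the article at index, and get_comments converts that URL into parsed comments, returning an empty array when there is none. A usage sketch, assuming a parser instance (for example the hypothetical MySiteParser above) whose @arr has already been populated, e.g. by load_from_yml below:

  idx = 0
  url = parser.get_comments_url idx     # nil if the entry has no :comments_url
  comments = parser.get_comments idx    # also aliased as get_comments_for_link
  comments.size                         # 0 when no comments URL was found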
#get_doc_for_url(url) ⇒ Object
Returns a Nokogiri HTML doc and writes the HTML to a file if required.

# File 'lib/hacker/curse/abstractsiteparser.rb', line 216

def get_doc_for_url url
  #puts "get_doc #{url} "
  out = open(url)
  doc = Nokogiri::HTML(out)
  if @save_html
    subforum = @subforum || "unknown"
    outfile = @htmloutfile || "#{subforum}.html"
    #if !File.exists? url
    out.rewind
    File.open(outfile, 'w') {|f| f.write(out.read) }
    #end
  end
  return doc
end
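The HTML copy is written only when save_html is set, and the filename falls back to "#{subforum}.html" when no htmloutfile was given. A small sketch using the constructor options (MySiteParser is the hypothetical subclass from above):

  parser = MySiteParser.new :save_html   => true,
                            :htmloutfile => "newest.html"
  doc = parser.get_doc_for_url "https://news.ycombinator.com/newest"
  doc.css("a").size   # doc is a Nokogiri HTML document; the raw HTML is also saved to newest.html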
#get_first_page ⇒ Object
Retrieves the first page of articles from the start URL.

# File 'lib/hacker/curse/abstractsiteparser.rb', line 148

def get_first_page
  #@arr = to_hash @url
  page = _retrieve_page @url
end
#get_next_page(opts = {}) ⇒ Object Also known as: get_next
# File 'lib/hacker/curse/abstractsiteparser.rb', line 152

def get_next_page opts={}
  page = opts[:page]
  num_pages = opts[:num_pages] || @num_pages
  num_pages ||= 1
  u = @more_url || @url
  if page
    u = page.next_url
  end
  pages = nil
  num_pages.times do |i|
    page = _retrieve_page u
    if pages.nil?
      pages = page
    else
      pages.merge_page page
    end
    u = page.next_url
    break unless u # sometimes there is no next
    @more_url = u
  end
  return pages
end
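Each call remembers the next-page URL in @more_url, so repeated calls walk forward through the listing; passing :page => an earlier result resumes from that page's next_url instead. A sketch, reusing the parser instance from the earlier examples:

  first  = parser.get_next_page                   # starts from @url, fetches @num_pages pages
  second = parser.get_next_page :num_pages => 2   # continues from the stored more_url
  replay = parser.get_next_page :page => first    # resumes from first.next_url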
#human_age_to_unix(age_text) ⇒ Object
# File 'lib/hacker/curse/abstractsiteparser.rb', line 266

def human_age_to_unix age_text
  i = age_text.to_i
  ff=1
  if age_text.index("hour")
    i *= ff*60*60
  elsif age_text.index("second")
    i *= ff
  elsif age_text.index("minute")
    i *= ff*60
  elsif age_text.index("day")
    i *= ff*60*60*24
  elsif age_text.index("month")
    i *= ff*60*60*24*30
  elsif age_text.index("week")
    i *= ff*60*60*24*7
  elsif age_text.index("year")
    i *= ff*60*60*24*365
  else
    #raise "don't know how to convert #{age_text} "
    return 0
  end
  return (Time.now.to_i - i)
end
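The method multiplies the leading integer of strings such as "3 hours ago" by the length of the unit in seconds and subtracts that from the current time; unrecognised units yield 0. For example:

  parser.human_age_to_unix "3 hours ago"   # => Time.now.to_i - 3*60*60
  parser.human_age_to_unix "2 days ago"    # => Time.now.to_i - 2*60*60*24
  parser.human_age_to_unix "unknown"       # => 0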
#load_from_yml(filename = "hn.yml") ⇒ Object
Test helper: loads previously saved results from a YAML file so we don't keep hitting HN while testing and getting IP-blocked.

# File 'lib/hacker/curse/abstractsiteparser.rb', line 231

def load_from_yml filename="hn.yml"
  @arr = YAML::load( File.open( filename ) )
  next_url = @arr.last[:article_url]
  unless next_url.index("http")
    next_url = @host + "/" + next_url
  end
  @more_url = next_url
end
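A possible offline test flow, assuming an "hn.yml" file written earlier by one of the save methods below:

  parser.load_from_yml "hn.yml"   # fills @arr and derives @more_url from the last article
  parser.get_comments_url 0       # now works against the loaded array, no network access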
#save_comments_as_yml(outputfile, url) ⇒ Object
Retrieves the comments for a URL and stores them in outputfile in YAML format.

# File 'lib/hacker/curse/abstractsiteparser.rb', line 209

def save_comments_as_yml outputfile, url
  pages = _retrieve_comments url
  if pages
    to_yml outputfile, pages.hash
  end
end
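A sketch of capturing one article's comment thread to disk, assuming the parser and index from the earlier examples:

  url = parser.get_comments_url 0
  parser.save_comments_as_yml "comments.yml", url if url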
#save_page_as_yml(outputfile, page) ⇒ Object
After calling get_next_page, one may pass its return value to this method to convert it into an array of hashes and store it as a YAML file. It's a bit roundabout: first the page is broken down into this structure and then the whole thing is deconstructed.

# File 'lib/hacker/curse/abstractsiteparser.rb', line 195

def save_page_as_yml outputfile, page
  h = {}
  h[:url] = page.url
  h[:next_url] = page.next_url
  h[:subforum] = page.subforum
  h[:create_date] = page.create_date
  articles = []
  page.each do |a| articles << a.hash; end
  h[:articles] = articles
  to_yml outputfile, h
end
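A sketch of the round trip described above, assuming the parser from the earlier examples:

  page = parser.get_next_page
  parser.save_page_as_yml "ruby_page.yml", page   # url, next_url, subforum, create_date, articles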
#to_yml(outfile, arr = @arr) ⇒ Object
Writes the array as YAML. This doesn't work well across multiple pages: since it is called once per page, each call overwrites the previous file. It should be called by the final (concrete) class.

# File 'lib/hacker/curse/abstractsiteparser.rb', line 181

def to_yml outfile, arr = @arr
  require 'yaml'
  # cannot just convert / to __ in filename since path gets converted too
  #if outfile.index("/")
  #outfile = outfile.gsub("/","__")
  #end
  File.open(outfile, 'w' ) do |f|
    f << YAML::dump(arr)
  end
end
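to_yml is the low-level writer used by the two save methods above; a direct-call sketch (some_hash stands for any object YAML can dump):

  parser.to_yml "articles.yml"          # dumps @arr
  parser.to_yml "page.yml", some_hash   # or pass the structure to dump explicitly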