Class: HackerCurse::AbstractSiteParser

Inherits:
Object
  • Object
show all
Defined in:
lib/hacker/curse/abstractsiteparser.rb

Overview

rn = RNParser.new [url]; rn.subreddit = "ruby"; resultset = rn.get_next_page :page => prevresultset, :number => 5; resultset.each do |art|

art.title, art.points
art.comments

end

hn = HNewsParser @options; hn.subxxx = "news" / "newest"

redditnews.rb -s ruby --pages 2 hackernews.rb -s newest --pages 2 -d '|'

Direct Known Subclasses

HackerNewsParser, RedditNewsParser

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ AbstractSiteParser

HOST = "news.ycombinator.com"



139
140
141
142
143
144
145
146
147
# File 'lib/hacker/curse/abstractsiteparser.rb', line 139

def initialize options={}
  @options = options
  @url = @options[:url]
  @save_html = @options[:save_html]
  @htmloutfile = @options[:htmloutfile]
  @num_pages = @options[:num_pages] || 1
  @more_url = nil
  #puts "initialize: url is #{@url} "
end

Instance Attribute Details

#hostObject

Returns the value of attribute host.



132
133
134
# File 'lib/hacker/curse/abstractsiteparser.rb', line 132

def host
  @host
end

#htmloutfileObject

Returns the value of attribute htmloutfile.



137
138
139
# File 'lib/hacker/curse/abstractsiteparser.rb', line 137

def htmloutfile
  @htmloutfile
end

#more_urlObject (readonly)

Returns the value of attribute more_url.



131
132
133
# File 'lib/hacker/curse/abstractsiteparser.rb', line 131

def more_url
  @more_url
end

#num_pagesObject

Returns the value of attribute num_pages.



133
134
135
# File 'lib/hacker/curse/abstractsiteparser.rb', line 133

def num_pages
  @num_pages
end

#save_htmlObject

should the html be saved



136
137
138
# File 'lib/hacker/curse/abstractsiteparser.rb', line 136

def save_html
  @save_html
end

#subforumObject

Returns the value of attribute subforum.



134
135
136
# File 'lib/hacker/curse/abstractsiteparser.rb', line 134

def subforum
  @subforum
end

Instance Method Details

#_retrieve_comments(url) ⇒ Object



250
251
252
# File 'lib/hacker/curse/abstractsiteparser.rb', line 250

def _retrieve_comments url
  raise "Must be implemented by concrete class "
end

#_retrieve_page(url) ⇒ Object



176
177
178
# File 'lib/hacker/curse/abstractsiteparser.rb', line 176

def _retrieve_page url
  raise "must be implemented by concrete class"
end

#get_comments(index) ⇒ Object Also known as: get_comments_for_link



265
266
267
268
269
270
271
272
273
274
275
# File 'lib/hacker/curse/abstractsiteparser.rb', line 265

def get_comments index
  url = get_comments_url index
  if url
    #puts url
    comments = convert_comment_url url
    return comments
  #else
    #puts "Sorry no url for #{index} "
  end
  return []
end

#get_comments_url(index) ⇒ Object



254
255
256
257
258
259
260
261
262
263
# File 'lib/hacker/curse/abstractsiteparser.rb', line 254

def get_comments_url index
  arr = @arr
  entry = arr[index]
  if entry
    if entry.key? :comments_url
      return entry[:comments_url]
    end
  end
  return nil
end

#get_doc_for_url(url) ⇒ Object

returns nokogiri html doc and writes the html to file if required. returns nil if HTTPError



218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/hacker/curse/abstractsiteparser.rb', line 218

def get_doc_for_url url
  $stderr.puts "get_doc #{url} "
  doc = nil
  # 2016-03-20 - added check since sometimes server error was coming
  begin
    out = open(url)
  rescue StandardError=>e
    $stderr.puts "\tError: #{e}"
    # 2016-03-20 - adding exit since it will go to client that shelled this command.
    exit 1
  else
    doc  = Nokogiri::HTML(out)
    if @save_html
      subforum = @subforum || "unknown"
      outfile = @htmloutfile || "#{subforum}.html"
      #if !File.exists? url
      out.rewind
      File.open(outfile, 'w') {|f| f.write(out.read) }
      #end
    end
  end
  return doc
end

#get_first_pageObject

puts "initialize: url is #@url "



148
149
150
151
152
# File 'lib/hacker/curse/abstractsiteparser.rb', line 148

def get_first_page
  #@arr = to_hash @url
  # 2016-03-20 - 23:45 page can be nil if HTTPError
  page = _retrieve_page @url
end

#get_next_page(opts = {}) ⇒ Object Also known as: get_next



153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/hacker/curse/abstractsiteparser.rb', line 153

def get_next_page opts={}
  page = opts[:page]
  num_pages = opts[:num_pages] || @num_pages
  num_pages ||= 1
  u = @more_url || @url
  if page 
    u = page.next_url
  end
  pages = nil
  num_pages.times do |i|
    page = _retrieve_page u
    if pages.nil?
      pages = page
    else
      pages.merge_page page
    end
    u = page.next_url
    break unless u  # sometimes there is no next
    @more_url = u
  end
  return pages
end

#human_age_to_unix(age_text) ⇒ Object



277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# File 'lib/hacker/curse/abstractsiteparser.rb', line 277

def human_age_to_unix age_text
  i = age_text.to_i
  ff=1
  if age_text.index("hour")
    i *= ff*60*60
  elsif age_text.index("second")
    i *= ff
  elsif age_text.index("minute")
    i *= ff*60
  elsif age_text.index("day")
    i *= ff*60*60*24
  elsif age_text.index("month")
    i *= ff*60*60*24*30
  elsif age_text.index("week")
    i *= ff*60*60*24*7
  elsif age_text.index("year")
    i *= ff*60*60*24*365
  else
    #raise "don't know how to convert #{age_text} "
    return 0
  end
  return (Time.now.to_i - i)
end

#load_from_yml(filename = "hn.yml") ⇒ Object

this is a test method so we don't keep hitting HN while testing and getting our IP blocked.



242
243
244
245
246
247
248
249
# File 'lib/hacker/curse/abstractsiteparser.rb', line 242

def load_from_yml filename="hn.yml"
  @arr = YAML::load( File.open( filename ) )
  next_url = @arr.last[:article_url]
  unless next_url.index("http")
    next_url = @host + "/" + next_url
  end
  @more_url = next_url
end

#save_comments_as_yml(outputfile, url) ⇒ Object

retrieves the comments for a url and stores in outputfile in YML format



210
211
212
213
214
215
# File 'lib/hacker/curse/abstractsiteparser.rb', line 210

def save_comments_as_yml outputfile, url
  pages = _retrieve_comments url
  if pages 
    to_yml outputfile, pages.hash
  end
end

#save_page_as_yml(outputfile, page) ⇒ Object

after calling get_next_page, one may pass its return value to this method to convert it into an array of hashes and store it as a YML file. It's a bit silly: first we break the hash down into this structure

and then deconstruct the whole thing.


196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/hacker/curse/abstractsiteparser.rb', line 196

def save_page_as_yml outputfile, page
  h = {}
  h[:url] = page.url
  h[:next_url] = page.next_url
  h[:subforum] = page.subforum
  h[:create_date] = page.create_date
  articles = []
  page.each do |a| articles << a.hash; end

  h[:articles] = articles

  to_yml outputfile, h
end

#to_yml(outfile, arr = @arr) ⇒ Object

write as yml. This doesn't work well with multiple pages, since we call it once per page

and each call overwrites the previous output.
This should be called by the concrete (final) class.


182
183
184
185
186
187
188
189
190
191
# File 'lib/hacker/curse/abstractsiteparser.rb', line 182

def to_yml outfile, arr = @arr
  require 'yaml'
  # cannot just convert / to __ in filename since path gets converted too
  #if outfile.index("/")
    #outfile = outfile.gsub("/","__")
  #end
  File.open(outfile, 'w' ) do |f|
    f << YAML::dump(arr)
  end
end