Class: HackerCurse::AbstractSiteParser

Inherits:
Object
  • Object
show all
Defined in:
lib/hacker/curse/abstractsiteparser.rb

Overview

rn = RNParser.new [url] rn.subreddit = "ruby" resultset = rn.get_next_page :page => prevresultset, :number => 5 resultset.each do |art|

art.title, art.points
art.comments

end

hn = HNewsParser @options hn.subxxx = "news" / "newest"

redditnews.rb -s ruby --pages 2 hackernews.rb -s newest --pages 2 -d '|'

Direct Known Subclasses

HackerNewsParser, RedditNewsParser

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ AbstractSiteParser

HOST = "news.ycombinator.com"



139
140
141
142
143
144
145
146
147
# File 'lib/hacker/curse/abstractsiteparser.rb', line 139

# Builds a parser from an options hash.
#
# options - Hash of settings:
#   :url         - base URL to fetch articles from
#   :save_html   - when truthy, fetched HTML is also written to disk
#   :htmloutfile - file name used when saving HTML
#   :num_pages   - default number of pages per retrieval (defaults to 1)
def initialize options={}
  @options     = options
  @url         = options[:url]
  @save_html   = options[:save_html]
  @htmloutfile = options[:htmloutfile]
  @num_pages   = options[:num_pages] || 1
  # No "more" link is known until a page has been fetched.
  @more_url    = nil
end

Instance Attribute Details

#hostObject

Returns the value of attribute host.



132
133
134
# File 'lib/hacker/curse/abstractsiteparser.rb', line 132

# Site host, prefixed onto relative "more"/article links when absolutizing them.
def host() @host end

#htmloutfileObject

Returns the value of attribute htmloutfile.



137
138
139
# File 'lib/hacker/curse/abstractsiteparser.rb', line 137

# File name the raw HTML is written to when saving is enabled.
def htmloutfile() @htmloutfile end

#more_urlObject (readonly)

Returns the value of attribute more_url.



131
132
133
# File 'lib/hacker/curse/abstractsiteparser.rb', line 131

# Last seen "more"/next-page link, updated as pages are retrieved (read-only).
def more_url() @more_url end

#num_pagesObject

Returns the value of attribute num_pages.



133
134
135
# File 'lib/hacker/curse/abstractsiteparser.rb', line 133

# Default number of pages fetched per call to get_next_page.
def num_pages() @num_pages end

#save_htmlObject

should the html be saved



136
137
138
# File 'lib/hacker/curse/abstractsiteparser.rb', line 136

# Whether fetched HTML should also be saved to disk.
def save_html() @save_html end

#subforumObject

Returns the value of attribute subforum.



134
135
136
# File 'lib/hacker/curse/abstractsiteparser.rb', line 134

# Current site section (subreddit / HN listing); also used as the
# default HTML output file name when saving.
def subforum() @subforum end

Instance Method Details

#_retrieve_comments(url) ⇒ Object



239
240
241
# File 'lib/hacker/curse/abstractsiteparser.rb', line 239

# Abstract hook: concrete site parsers must override this to fetch and
# parse the comment tree at +url+.  Always raises here.
def _retrieve_comments url
  fail "Must be implemented by concrete class "
end

#_retrieve_page(url) ⇒ Object



175
176
177
# File 'lib/hacker/curse/abstractsiteparser.rb', line 175

# Abstract hook: concrete site parsers must override this to fetch and
# parse the article listing at +url+.  Always raises here.
def _retrieve_page url
  fail "must be implemented by concrete class"
end

#get_comments(index) ⇒ Object Also known as: get_comments_for_link



254
255
256
257
258
259
260
261
262
263
264
# File 'lib/hacker/curse/abstractsiteparser.rb', line 254

# Fetches the comment tree for the article at +index+ in the current
# result set.  Returns an empty array when the entry has no comments URL.
def get_comments index
  url = get_comments_url index
  return convert_comment_url(url) if url
  []
end

#get_comments_url(index) ⇒ Object



243
244
245
246
247
248
249
250
251
252
# File 'lib/hacker/curse/abstractsiteparser.rb', line 243

# Looks up the comments URL recorded for the article at +index+ in the
# most recently parsed result set (@arr).  Returns nil when the index is
# out of range or the entry carries no :comments_url key.
def get_comments_url index
  entry = @arr[index]
  return entry[:comments_url] if entry && entry.key?(:comments_url)
  nil
end

#get_doc_for_url(url) ⇒ Object

returns nokogiri html doc and writes html if required.



216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/hacker/curse/abstractsiteparser.rb', line 216

# Fetches +url+ and parses it into a Nokogiri HTML document.  When
# @save_html is set, the raw HTML is additionally written to
# @htmloutfile, or to "<subforum>.html" when no outfile was configured.
#
# NOTE(review): Kernel#open is used here; with open-uri loaded it fetches
# URLs, but it also spawns a subprocess for arguments starting with "|" —
# confirm +url+ is never attacker-controlled (URI.open would be safer).
def get_doc_for_url url
  #puts "get_doc #{url} "
  out = open(url)
  doc  = Nokogiri::HTML(out)
  if @save_html
    subforum = @subforum || "unknown"
    outfile = @htmloutfile || "#{subforum}.html"
    #if !File.exists? url
    # Nokogiri has already consumed the stream; rewind before re-reading.
    out.rewind
      File.open(outfile, 'w') {|f| f.write(out.read) }
    #end
  end
  return doc
end

#get_first_pageObject

puts "initialize: url is #@url "



148
149
150
151
# File 'lib/hacker/curse/abstractsiteparser.rb', line 148

# Retrieves the first page of articles from the configured base URL.
def get_first_page
  _retrieve_page @url
end

#get_next_page(opts = {}) ⇒ Object Also known as: get_next



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/hacker/curse/abstractsiteparser.rb', line 152

# Retrieves the next page (or several pages) of articles.
#
# opts:
#   :page      - a previously returned page object; paging resumes from
#                its next_url
#   :num_pages - how many pages to fetch and merge (defaults to @num_pages,
#                then 1)
#
# Returns a single page object with all fetched pages merged into it, and
# records the last seen "more" link in @more_url for subsequent calls.
def get_next_page opts={}
  page = opts[:page]
  num_pages = opts[:num_pages] || @num_pages
  num_pages ||= 1
  # Resume from the last "more" link if known, else start at the base url;
  # an explicitly passed page overrides both.
  u = @more_url || @url
  if page 
    u = page.next_url
  end
  pages = nil
  num_pages.times do |i|
    page = _retrieve_page u
    if pages.nil?
      pages = page
    else
      # Fold each subsequent page's articles into the first page object.
      pages.merge_page page
    end
    u = page.next_url
    break unless u  # sometimes there is no next
    # NOTE(review): when a page has no next link we break BEFORE updating
    # @more_url, so it keeps a stale value from an earlier iteration —
    # confirm a later call re-fetching from that link is intended.
    @more_url = u
  end
  return pages
end

#human_age_to_unix(age_text) ⇒ Object



266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'lib/hacker/curse/abstractsiteparser.rb', line 266

# Converts a human-readable age such as "2 hours ago" into an absolute
# unix timestamp by subtracting the parsed time span from the current time.
#
# age_text - String starting with an integer count and containing one of
#            the unit words "second", "minute", "hour", "day", "week",
#            "month", "year" (plural forms match as substrings).
#
# Returns the Integer timestamp, or 0 when no known unit word is found
# (preserves the original silent fallback instead of raising).
#
# The table replaces the original repetitive elsif chain, which also
# carried a dead multiplier variable (ff = 1).
def human_age_to_unix age_text
  seconds_per_unit = {
    "second" => 1,
    "minute" => 60,
    "hour"   => 60 * 60,
    "day"    => 60 * 60 * 24,
    "week"   => 60 * 60 * 24 * 7,
    "month"  => 60 * 60 * 24 * 30,   # approximation: 30-day month
    "year"   => 60 * 60 * 24 * 365
  }
  _, secs = seconds_per_unit.find { |unit, _| age_text.index(unit) }
  return 0 unless secs
  Time.now.to_i - age_text.to_i * secs
end

#load_from_yml(filename = "hn.yml") ⇒ Object

this is a test method so we don’t keep hitting HN while testing out and getting IP blocked.



231
232
233
234
235
236
237
238
# File 'lib/hacker/curse/abstractsiteparser.rb', line 231

# Test helper: loads a result set previously saved with to_yml from
# +filename+, so we don't keep hitting the live site (and get IP blocked)
# while developing.  Sets @arr and primes @more_url for paging.
#
# Fixes: the original passed an un-closed File.open handle to YAML,
# leaking the descriptor until GC; File.read reads and closes.  The saved
# files are symbol-keyed, which Psych 4's safe YAML.load rejects, so we
# use unsafe_load where available.
#
# NOTE(review): unsafe_load deserializes arbitrary objects — only ever
# load files produced locally by to_yml, never untrusted input.
def load_from_yml filename="hn.yml"
  yaml_text = File.read(filename)
  @arr = if YAML.respond_to?(:unsafe_load)
           YAML.unsafe_load(yaml_text)
         else
           YAML.load(yaml_text)
         end
  next_url = @arr.last[:article_url]   # assumes at least one saved article — TODO confirm
  unless next_url.index("http")
    next_url = @host + "/" + next_url  # absolutize relative "more" links
  end
  @more_url = next_url
end

#save_comments_as_yml(outputfile, url) ⇒ Object

retrieves the comments for a url and stores in outputfile in YML format



209
210
211
212
213
214
# File 'lib/hacker/curse/abstractsiteparser.rb', line 209

# Retrieves the comment tree for +url+ and, when anything came back,
# persists its hash form to +outputfile+ as YAML.
def save_comments_as_yml outputfile, url
  pages = _retrieve_comments url
  return unless pages
  to_yml outputfile, pages.hash
end

#save_page_as_yml(outputfile, page) ⇒ Object

after calling get_next_page, one may pass its return value to this method to convert it into an array of hashes and store it as a yml file. It's a bit silly: first we break the hash down into this structure

and then deconstruct the whole thing.


195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/hacker/curse/abstractsiteparser.rb', line 195

# Flattens a page object (as returned by get_next_page) into a plain
# hash — page metadata plus one hash per article — and writes it to
# +outputfile+ via to_yml.
def save_page_as_yml outputfile, page
  flattened = {
    url:         page.url,
    next_url:    page.next_url,
    subforum:    page.subforum,
    create_date: page.create_date
  }
  collected = []
  page.each { |article| collected << article.hash }
  flattened[:articles] = collected
  to_yml outputfile, flattened
end

#to_yml(outfile, arr = @arr) ⇒ Object

write as yml, this doesn’t work if multiple pages since we call x times

so previous is overwritten
This should be called with final class


181
182
183
184
185
186
187
188
189
190
# File 'lib/hacker/curse/abstractsiteparser.rb', line 181

# Serializes +arr+ (defaults to the last parsed result set, @arr) to
# +outfile+ as YAML.  Each call rewrites the file, so dumping several
# pages to the same path keeps only the last one.
def to_yml outfile, arr = @arr
  require 'yaml'
  # Note: a "/" in outfile is treated as a path separator, so it cannot
  # simply be escaped out of the name here.
  File.open(outfile, "w") do |io|
    io.write YAML.dump(arr)
  end
end