Class: DQReadability::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/dq-readability.rb

Constant Summary collapse

# Defaults that #initialize merges underneath the caller-supplied options.
DEFAULT_OPTIONS = {
  # #content retries with looser settings while the extracted text is shorter than this.
  :retry_length => 250,
  # Paragraphs shorter than this are not scored; also used by #clean_conditionally.
  :min_text_length => 25,
  # Initial values of the three switches that #content relaxes one by one on retry.
  :remove_unlikely_candidates => true,
  :weight_classes => true,
  :clean_conditionally => true,
  # Only referenced from commented-out code in #sanitize.
  :remove_empty_nodes => true,
  # Thresholds and format blacklist consulted by #image_meets_criteria?.
  :min_image_width => 130,
  :min_image_height => 80,
  :ignore_image_format => [],
  # When true, #content returns the whole serialized document instead of an extracted article.
  :bypass => false,
  # When true, #sanitize prepends a <head> that loads MathJax.
  :math => false
}.freeze
# Patterns shared by the extraction steps. Frozen like DEFAULT_OPTIONS so the
# table cannot be mutated by accident at runtime.
REGEXES = {
  # class/id fragments that make #remove_unlikely_candidates! drop an element ...
  :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
  # ... unless the same class/id string also matches this pattern.
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
  # class/id fragments worth +25 / -25 in #class_weight.
  :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
  # A <div> whose inner HTML matches this is left as a <div> by
  # #transform_misused_divs_into_paragraphs!.
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  # Runs of two or more <br> tags; #initialize turns them into paragraph breaks.
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
  # Opening/closing <font> tags; #initialize rewrites them to <span>.
  :replaceFontsRe => /<(\/?)font[^>]*>/i,
  :trimRe => /^\s+|\s+$/,
  :normalizeRe => /\s{2,}/,
  :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
  :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input, options = {}) ⇒ Document

Returns a new instance of Document.



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/dq-readability.rb', line 41

# Reads the page at +input+, normalises its markup and parses it into @html.
#
# @param input [String] location of the page; it is read and also stored as
#   @url, which #make_html parses to resolve relative links
# @param options [Hash] overrides merged over DEFAULT_OPTIONS
#
# NOTE(review): +input+ is handed to +open+. With plain Kernel#open a string
# beginning with "|" runs a subprocess, so callers must not pass untrusted
# input here.
def initialize(input, options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @url = input

  # Prefer URI.open when open-uri provides it (Kernel#open stopped handling
  # URLs in Ruby 3.0). The block form closes the handle once it has been read.
  opener = URI.respond_to?(:open) ? URI : Kernel
  @input = opener.open(input) { |io| io.read }

  # Any Ruby whose strings carry an encoding can take this path; the previous
  # RUBY_VERSION pattern only recognised 1.9 and 2.x.
  if @input.respond_to?(:encoding) && !@options[:encoding]
    @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
    @options[:encoding] = @input.encoding.to_s
  end

  # Double line breaks become paragraph boundaries and <font> becomes <span>.
  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
  @weight_classes = @options[:weight_classes]
  @clean_conditionally = @options[:clean_conditionally]
  @best_candidate_has_image = true
  @bypass = @options[:bypass]
  @math = @options[:math]
  make_html
end

Instance Attribute Details

#best_candidate ⇒ Object

Returns the value of attribute best_candidate.



39
40
41
# File 'lib/dq-readability.rb', line 39

# Reader for @best_candidate, which #prepare_candidates assigns from
# #select_best_candidate.
def best_candidate
  @best_candidate
end

#best_candidate_has_image ⇒ Object

Returns the value of attribute best_candidate_has_image.



39
40
41
# File 'lib/dq-readability.rb', line 39

# Reader for @best_candidate_has_image: set to true in #initialize and reset
# to false by #images when it is called with reload = true.
def best_candidate_has_image
  @best_candidate_has_image
end

#candidates ⇒ Object

Returns the value of attribute candidates.



39
40
41
# File 'lib/dq-readability.rb', line 39

# Reader for @candidates, the node => score-entry hash that
# #prepare_candidates stores from #score_paragraphs.
def candidates
  @candidates
end

#html ⇒ Object

Returns the value of attribute html.



39
40
41
# File 'lib/dq-readability.rb', line 39

# Reader for @html, the parsed document that #make_html (re)builds.
def html
  @html
end

#options ⇒ Object

Returns the value of attribute options.



39
40
41
# File 'lib/dq-readability.rb', line 39

# Reader for @options: DEFAULT_OPTIONS merged with the hash given to #initialize.
def options
  @options
end

Instance Method Details

#author ⇒ Object

Looks through the @html document for the author. Precedence information is on the wiki (TODO: attach the wiki URL if it is accepted). Returns nil if no author is detected.



262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/dq-readability.rb', line 262

# Finds the page author by trying several markup conventions in order and
# returning the first hit, stripped of surrounding whitespace.
#
# @return [String, nil] the author, or nil when no source yields one
def author
  # First choice, a Dublin Core meta tag:
  # <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
  @html.xpath('//meta[@name = "dc.creator"]').each do |meta|
    value = meta['content']
    return value.strip if value
  end

  text_queries = [
    # <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
    # <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
    '//*[contains(@class, "vcard")]//*[contains(@class, "fn")]',
    # <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
    # TODO: strip out the (rel)?
    '//a[@rel = "author"]',
    # Last resort: any element with id="author".
    '//*[@id = "author"]'
  ]

  text_queries.each do |query|
    @html.xpath(query).each do |node|
      label = node.text
      return label.strip if label
    end
  end

  nil
end

#class_weight(e) ⇒ Object



425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
# File 'lib/dq-readability.rb', line 425

# Scores an element by its class and id attributes: each attribute
# contributes -25 when it matches REGEXES[:negativeRe] and +25 when it matches
# REGEXES[:positiveRe] (an attribute can match both).
#
# @param e [#[]] element exposing :class and :id
# @return [Integer] the summed weight; 0 when @weight_classes is off
def class_weight(e)
  return 0 unless @weight_classes

  [:class, :id].inject(0) do |total, attribute|
    value = e[attribute]
    next total unless value && value != ""

    total -= 25 if value =~ REGEXES[:negativeRe]
    total += 25 if value =~ REGEXES[:positiveRe]
    total
  end
end

#clean_conditionally(node, candidates, selector) ⇒ Object



584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
# File 'lib/dq-readability.rb', line 584

# Removes descendants of +node+ matching +selector+ that look like
# boilerplate rather than content. Does nothing when @clean_conditionally is off.
#
# An element goes immediately when class weight plus content score is
# negative. Otherwise, if its text has fewer than ten commas, it is removed
# when the first of the structural rules below applies.
#
# @param node [#css] subtree to clean (mutated in place)
# @param candidates [Hash] element => { :content_score => Numeric } entries
# @param selector [String] CSS selector choosing the elements to inspect
def clean_conditionally(node, candidates, selector)
  return unless @clean_conditionally

  node.css(selector).each do |el|
    weight = class_weight(el)
    entry = candidates[el]
    content_score = entry ? entry[:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
      next
    end

    # Comma-rich text is treated as prose and left alone.
    next unless el.text.count(",") < 10

    counts = {}
    %w[p img li a embed input].each { |kind| counts[kind] = el.css(kind).length }
    counts["li"] -= 100

    # For every img under a noscript tag discount one from the count to avoid double counting
    counts["img"] -= el.css("noscript").css("img").length

    # Text length excluding any surrounding whitespace
    content_length = el.text.strip.length
    link_density = get_link_density(el)

    reason =
      if counts["img"] > counts["p"] + 2
        "too many images"
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        "more <li>s than <p>s"
      elsif counts["input"] > (counts["p"] / 3).to_i
        "less than 3x <p>s than <input>s"
      elsif content_length < options[:min_text_length] && counts["img"] != 1
        "too short a content length without a single image"
      elsif weight < 25 && link_density > 0.2
        "too many links for its weight (#{weight})"
      elsif weight >= 25 && link_density > 0.5
        "too many links for its weight (#{weight})"
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        "<embed>s with too short a content length, or too many <embed>s"
      end

    next unless reason

    debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
    el.remove
  end
end

#content(remove_unlikely_candidates = :default) ⇒ Object



308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# File 'lib/dq-readability.rb', line 308

# Returns the readable HTML for the page.
#
# With the :bypass option set, the document is re-parsed and returned whole.
# Otherwise the best candidate is extracted and sanitized; while the extracted
# text is shorter than options[:retry_length], one heuristic is switched off
# per attempt (unlikely-candidate removal, then class weighting, then
# conditional cleaning) and extraction is retried on a freshly parsed document.
#
# @param remove_unlikely_candidates [Boolean, Symbol] pass false to disable
#   unlikely-candidate removal for this and later calls; any other value
#   leaves the current setting alone
# @return [String] serialized HTML
def content(remove_unlikely_candidates = :default)
  # Truthiness check: a nil :bypass means "no bypass", same as false.
  if @bypass
    make_html
    s = Nokogiri::XML::Node::SaveOptions
    save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML
    return @html.serialize(:save_with => save_opts)
  end

  @remove_unlikely_candidates = false if remove_unlikely_candidates == false

  prepare_candidates
  article = get_article(@candidates, @best_candidate)

  cleaned_article = sanitize(article, @candidates, options)
  # The length is measured on +article+ after #sanitize has worked on it.
  return cleaned_article unless article.text.strip.length < options[:retry_length]

  if @remove_unlikely_candidates
    @remove_unlikely_candidates = false
  elsif @weight_classes
    @weight_classes = false
  elsif @clean_conditionally
    @clean_conditionally = false
  else
    # nothing we can do
    return cleaned_article
  end

  make_html
  content
end

#debug(str) ⇒ Object



467
468
469
# File 'lib/dq-readability.rb', line 467

# Prints +str+ with puts, but only when the :debug option is set.
#
# @param str [String] message to print
# @return [nil]
def debug(str)
  return unless options[:debug]

  puts str
end

#get_article(candidates, best_candidate) ⇒ Object



342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
# File 'lib/dq-readability.rb', line 342

# Builds the article container: a new <div> holding copies of the best
# candidate and of those siblings that also look like content (preambles,
# content split by removed ads, and so on).
#
# A sibling is copied when it is the best candidate itself, when its candidate
# score reaches the threshold (the larger of 10 and 20% of the best score), or
# when it is a <p> that is either longer than 80 characters with link density
# below 0.25, or shorter than 80 characters, link-free and containing a period
# followed by a space or end of line.
#
# @param candidates [Hash] element => { :content_score => Numeric } entries
# @param best_candidate [Hash] entry with :elem and :content_score
# @return [Nokogiri::XML::Node] the new <div>
def get_article(candidates, best_candidate)
  top = best_candidate[:elem]
  threshold = [10, best_candidate[:content_score] * 0.2].max
  wrapper = Nokogiri::XML::Node.new('div', @html)

  top.parent.children.each do |sibling|
    entry = candidates[sibling]
    keep = sibling == top || (entry && entry[:content_score] >= threshold)

    if sibling.name.downcase == "p"
      density = get_link_density(sibling)
      body = sibling.text
      size = body.length
      keep ||= (size > 80 && density < 0.25) ||
               (size < 80 && density == 0 && body =~ /\.( |$)/)
    end

    next unless keep

    # Work on a copy, otherwise the document being processed would change.
    copy = sibling.dup
    copy.name = "div" unless %w[div p].include?(sibling.name.downcase)
    wrapper << copy
  end

  wrapper
end

#get_image_size(url) ⇒ Object



238
239
240
241
242
243
244
245
246
247
# File 'lib/dq-readability.rb', line 238

# Asks FastImage for the dimensions of the image at +url+.
#
# @param url [String] image location
# @return [Hash, nil] { :width => w, :height => h }, or nil when the size is
#   unavailable or any error is raised (the error is passed to #debug)
def get_image_size(url)
  width, height = FastImage.size(url)
  raise "Couldn't get size." if width.nil? || height.nil?

  { :width => width, :height => height }
rescue => error
  debug("Image error: #{error}")
  nil
end


#get_link_density(elem) ⇒ Object



389
390
391
392
393
# File 'lib/dq-readability.rb', line 389

# Fraction of an element's text that sits inside <a> descendants.
#
# @param elem [#css, #text] element to measure
# @return [Float] link text length divided by total text length; 0.0 for an
#   element without text (the plain division would give NaN there)
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map(&:text).join("").length
  link_length / text_length.to_f
end

#image_meets_criteria?(image) ⇒ Boolean

Returns:

  • (Boolean)


249
250
251
252
# File 'lib/dq-readability.rb', line 249

# Decides whether an image is acceptable for #images.
#
# @param image [Hash] with :format, :width and :height
# @return [Boolean] false when the lower-cased format is listed in
#   options[:ignore_image_format]; otherwise whether both dimensions reach
#   options[:min_image_width] / options[:min_image_height] (a missing minimum
#   counts as 0)
def image_meets_criteria?(image)
  ignored_formats = options[:ignore_image_format]
  return false if ignored_formats.include?(image[:format].downcase)

  min_width = options[:min_image_width] || 0
  min_height = options[:min_image_height] || 0
  image[:width] >= min_width && image[:height] >= min_height
end

#images(content = nil, reload = false) ⇒ Object



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/dq-readability.rb', line 164

# Lists the URLs of images that satisfy #image_meets_criteria?.
#
# On a normal call the best candidate element is searched. When that yields
# nothing, the method calls itself once more with the whole document and
# reload = true.
#
# @param content [#css, nil] element to search; only honoured when +reload+
#   is true, otherwise replaced by the best candidate element
# @param reload [Boolean] true when searching something other than the best
#   candidate; also resets @best_candidate_has_image to false
# @return [Array<String>] accepted image URLs, in document order
# @raise [RuntimeError] when the fastimage gem cannot be loaded
def images(content=nil, reload=false)
  # Loaded lazily so the gem is only needed when this feature is used.
  begin
    require 'fastimage'
  rescue LoadError
    raise "Please install fastimage in order to use the #images feature."
  end

  @best_candidate_has_image = false if reload

  prepare_candidates
  list_images   = []
  tested_images = []
  content       = @best_candidate[:elem] unless reload

  return list_images if content.nil?

  content.css("img").map(&:attributes).each do |attrs|
    next unless attrs["src"]

    url = attrs["src"].value

    # Check for repeats before anything else so a duplicated URL never
    # triggers a second size probe over the network.
    if tested_images.include?(url)
      debug("Image was tested: #{url}")
      next
    end
    tested_images.push(url)

    height = attrs["height"].nil? ? 0 : attrs["height"].value.to_i
    width  = attrs["width"].nil?  ? 0 : attrs["width"].value.to_i

    # Only absolute http(s) URLs with a missing dimension are probed.
    if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
      image = get_image_size(url)
      next unless image
    else
      image = { :width => width, :height => height }
    end

    image[:format] = File.extname(url).gsub(".", "")

    if image_meets_criteria?(image)
      list_images << url
    else
      debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
    end
  end

  list_images.empty? && content != @html ? images(@html, true) : list_images
end

#images_with_fqdn_uris(document = @html.dup, source_uri) ⇒ Object



217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/dq-readability.rb', line 217

# Rewrites host-less <img> sources in +document+ to absolute URLs rooted at
# the scheme, host and port of +source_uri+, then returns what #images finds
# in that document.
#
# Images without a src, or with a src that is not a valid URI, are removed
# from +document+.
#
# @param document [#css] document to rewrite; defaults to a copy of @html
# @param source_uri [String] URL the page was fetched from
# @return [Array<String>] result of images(document, true)
def images_with_fqdn_uris(document = @html.dup, source_uri)
  uri = URI.parse(source_uri)
  # uri.port falls back to the scheme's default port when none is given.
  base = "#{uri.scheme}://#{uri.host}:#{uri.port}/"

  document.css("img").each do |elem|
    src = elem['src']
    if src.nil?
      elem.remove
      next
    end

    begin
      elem['src'] = URI.join(base, src).to_s if URI.parse(src).host.nil?
    rescue URI::InvalidURIError
      elem.remove
    end
  end

  images(document, true)
end

#images_with_fqdn_uris!(source_uri) ⇒ Object



213
214
215
# File 'lib/dq-readability.rb', line 213

# In-place variant of #images_with_fqdn_uris: passes @html itself rather than
# a copy, so the <img> rewrites and removals happen on the live document.
#
# @param source_uri [String] URL used to resolve host-less image sources
def images_with_fqdn_uris!(source_uri)
  images_with_fqdn_uris(@html, source_uri)
end

#make_html ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/dq-readability.rb', line 70

# (Re)builds @html from @input and normalises it before extraction:
# comments are dropped, h1/h2/h4 become h3, <img src> and <a href> values are
# made absolute against @url, <ol> is renamed to <p>, and "[edit]" spans are
# removed.
#
# Images whose src is missing or cannot be resolved are removed; links whose
# href is missing or cannot be resolved are left untouched.
def make_html
  @html = Nokogiri::HTML(@input, nil, @options[:encoding])
  # In case document has no body, such as from empty string or redirect
  @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0

  # Remove html comment tags
  @html.xpath('//comment()').each { |i| i.remove }

  # Give all headings the same level: h1, h2 and h4 become h3.
  @html.css("h1, h2, h4").each { |heading| heading.name = "h3" }

  # Parsed outside the rescue blocks: an unparsable page URL is an error.
  page_scheme = URI.parse(@url).scheme

  # changing img src
  @html.css("img").each do |elem|
    begin
      src = elem['src']
      if src.nil?
        elem.remove
      else
        elem['src'] = absolute_url(src, page_scheme)
      end
    rescue StandardError
      elem.remove
    end
  end

  # changing certain tags to <p> tags
  @html.css("ol").each { |list| list.name = "p" }
  debug("length of ol tag #{@html.css('ol').length}")

  # changing the 'a' href
  @html.css("a").each do |elem|
    begin
      href = elem['href']
      elem['href'] = absolute_url(href, page_scheme) unless href.nil?
    rescue StandardError
      # Best effort: keep the link as it was when it cannot be resolved.
    end
  end

  # removing edit spans
  @html.css('span').each do |elem|
    elem.remove if elem.text.downcase == "[edit]"
  end
end

# Resolves +ref+ against the page URL (@url).
#
# Protocol-relative references ("//host/...") get the page's scheme, or http
# when the page URL has none. References that already name a host are returned
# unchanged. Everything else is resolved with URI.join, which handles
# root-relative paths, "../" segments, fragment-only references and page URLs
# without a path.
#
# @param ref [String] raw attribute value
# @param page_scheme [String, nil] scheme of @url
# @return [String] the resolved URL
# @raise [URI::Error] when +ref+ or @url cannot be parsed or joined
def absolute_url(ref, page_scheme)
  return "#{page_scheme || 'http'}:#{ref}" if ref.start_with?('//')
  return ref if URI.parse(ref).host

  URI.join(@url, ref).to_s
end
private :absolute_url

#prepare_candidates ⇒ Object



61
62
63
64
65
66
67
68
# File 'lib/dq-readability.rb', line 61

# Prepares @html for extraction and selects the best candidate: drops
# <script>/<style>, optionally removes unlikely candidates, converts misused
# <div>s, then stores the scored candidates in @candidates and the winner in
# @best_candidate.
def prepare_candidates
  @html.css("script, style").each(&:remove)
  remove_unlikely_candidates! if @remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  min_length = options[:min_text_length]
  @candidates = score_paragraphs(min_length)
  @best_candidate = select_best_candidate(@candidates)
end

#remove_unlikely_candidates! ⇒ Object



471
472
473
474
475
476
477
478
479
# File 'lib/dq-readability.rb', line 471

# Removes every element, except <html> and <body>, whose concatenated
# class + id matches REGEXES[:unlikelyCandidatesRe] without also matching
# REGEXES[:okMaybeItsACandidateRe].
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    tag = elem.name.downcase
    next if tag == 'html' || tag == 'body'

    str = "#{elem[:class]}#{elem[:id]}"
    next unless str =~ REGEXES[:unlikelyCandidatesRe]
    next if str =~ REGEXES[:okMaybeItsACandidateRe]

    debug("Removing unlikely candidate - #{str}")
    elem.remove
  end
end

#sanitize(node, candidates, options = {}) ⇒ Object



501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
# File 'lib/dq-readability.rb', line 501

# Strips +node+ down to whitelisted markup and returns it as an HTML string
# with a <head> block prepended.
#
# Steps, in order: drop headers with negative class weight or link density
# above 0.33; drop form/object/iframe/embed; run #clean_conditionally on
# tables, lists and divs; strip attributes from whitelisted tags and replace
# every other element by its text; serialize; prepend the <head>.
#
# @param node [Nokogiri::XML::Node] article subtree (mutated in place)
# @param candidates [Hash] scored candidates, passed on to #clean_conditionally
# @param options [Hash] not read by this method; the whitelist settings come
#   from @options[:tags] and @options[:attributes]
# @return [String] the sanitized HTML
def sanitize(node, candidates, options = {})    
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
  end

  node.css("form, object, iframe, embed").each do |elem|
    elem.remove
  end

#      if @options[:remove_empty_nodes]
#        # remove <p> tags that have no text content - this will also remove p tags that contain only images.
#        node.css("p").each do |elem|
#          elem.remove if elem.content.strip.empty?
#        end
#      end

  # Conditionally clean <table>s, <ul>s, and <div>s
  clean_conditionally(node, candidates, "table, ul, div")

  # We'll sanitize all elements using a whitelist
  base_whitelist = @options[:tags] || %w[div p]
  # We'll add whitespace instead of block elements,
  # so a<br>b will have a nice space between them
  base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]

  # Use a hash for speed (don't want to make a million calls to include?)
  whitelist = Hash.new
  base_whitelist.each {|tag| whitelist[tag] = true }
  replace_with_whitespace = Hash.new
  base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }

  # The root node is visited first, followed by all of its descendants.
  ([node] + node.css("*")).each do |el|
    # If element is in whitelist, delete all its attributes
    # (except those listed in @options[:attributes])
    if whitelist[el.node_name]
      el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }

      # Otherwise, replace the element with its contents
    else
      # If element is root, replace the node as a text node
      # and stop: the whole result is then plain text.
      if el.parent.nil?
        node = Nokogiri::XML::Text.new(el.text, el.document)
        break
      else
        if replace_with_whitespace[el.node_name]
          el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
        else
          el.swap(Nokogiri::XML::Text.new(el.text, el.document))
        end
      end
    end

  end

  s = Nokogiri::XML::Node::SaveOptions
  save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML
  html = node.serialize(:save_with => save_opts)

  # Collapse runs of line breaks into one newline and prepend a <head>:
  # a plain charset declaration when @math is exactly false, otherwise one
  # that also configures and loads MathJax.
  if @math == false
		html = "<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'></head>" + "\n" + html.gsub(/[\r\n\f]+/, "\n" )
  else
		head = <<HTML
<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>
		<script type='text/x-mathjax-config'>
			MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\\\(','\\\\)']]}});
		</script>
		<script type='text/javascript'
			src='http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'>
		</script>
</head>
HTML
		
		html = head + "\n" + html.gsub(/[\r\n\f]+/, "\n" )
  end
  
  # Get rid of incompatible characters: every 'Â' in the output is replaced
  # by a space.
  # NOTE(review): presumably this targets mis-decoded non-breaking spaces; it
  # also replaces any legitimate 'Â' in the text - confirm that is acceptable.
  if html.encode('utf-8').include?('Â') 
		html = html.encode('utf-8').gsub('Â',' ')
  end
  
  return html
end

#score_node(elem) ⇒ Object



452
453
454
455
456
457
458
459
460
461
462
463
464
465
# File 'lib/dq-readability.rb', line 452

# Creates the initial candidate entry for +elem+: its class weight plus a
# fixed adjustment for a few tag names.
#
# @param elem [#name] element to score
# @return [Hash] { :content_score => Numeric, :elem => elem }
def score_node(elem)
  tag_adjustments = { "div" => 5, "blockquote" => 3, "form" => -3, "th" => -5 }
  base = class_weight(elem)
  adjustment = tag_adjustments.fetch(elem.name.downcase, 0)

  { :content_score => base + adjustment, :elem => elem }
end

#score_paragraphs(min_text_length) ⇒ Object



395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
# File 'lib/dq-readability.rb', line 395

# Scores the parents and grandparents of every <p> and <td> in @html.
#
# Each paragraph of at least +min_text_length+ characters earns 1 point, plus
# one per comma-separated chunk, plus one per 100 characters (capped at 3).
# The parent receives the full amount and the grandparent half of it. Finally
# every candidate's score is scaled by (1 - link density).
#
# @param min_text_length [Integer] paragraphs shorter than this are ignored
# @return [Hash] element => { :content_score => Numeric, :elem => element }
def score_paragraphs(min_text_length)
  scored = {}

  @html.css("p,td").each do |paragraph|
    parent = paragraph.parent
    grandparent = parent.respond_to?(:parent) ? parent.parent : nil
    body = paragraph.text

    # Too short to count at all.
    next if body.length < min_text_length

    scored[parent] ||= score_node(parent)
    scored[grandparent] ||= score_node(grandparent) if grandparent

    points = 1 + body.split(',').length + [(body.length / 100).to_i, 3].min

    scored[parent][:content_score] += points
    scored[grandparent][:content_score] += points / 2.0 if grandparent
  end

  # Good content should have a relatively small link density (5% or less)
  # and be mostly unaffected by this scaling.
  scored.each do |node, entry|
    entry[:content_score] *= (1 - get_link_density(node))
  end

  scored
end

#select_best_candidate(candidates) ⇒ Object



375
376
377
378
379
380
381
382
383
384
385
386
387
# File 'lib/dq-readability.rb', line 375

# Picks the highest-scoring candidate, logging the top five through #debug.
#
# @param candidates [Hash] element => { :content_score => Numeric, :elem => element }
# @return [Hash] the winning entry, or a zero-score entry for the first
#   <body> of @html when there are no candidates
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

  debug("Top 5 candidates:")
  ranked.first(5).each do |entry|
    node = entry[:elem]
    debug("Candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{entry[:content_score]}")
  end

  winner = ranked.first || { :elem => @html.css("body").first, :content_score => 0 }
  node = winner[:elem]
  debug("Best candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{winner[:content_score]}")

  winner
end

#title ⇒ Object



254
255
256
257
# File 'lib/dq-readability.rb', line 254

# Text of the first <title> element in @html.
#
# @return [String, nil] nil when the document has no <title>
def title
  node = @html.css("title").first
  return nil unless node

  node.text
end

#transform_misused_divs_into_paragraphs! ⇒ Object



481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
# File 'lib/dq-readability.rb', line 481

# Renames to <p> every <div> in @html whose inner HTML contains none of the
# block-level tags listed in REGEXES[:divToPElementsRe]. Other elements are
# left alone.
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    # Only <div>s are rewritten; wrapping bare text nodes of other elements
    # in <p> tags is not done.
    next unless elem.name.downcase == "div"
    next if elem.inner_html =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end