Class: LicenseMatcher::Preprocess

Inherits:

Object

Object
LicenseMatcher::Preprocess

show all

Defined in: lib/license_matcher/preprocess.rb

Class Method Summary collapse

Class Method Details

.clean_html(html_doc) ⇒ `Object`

# File 'lib/license_matcher/preprocess.rb', line 37

# Extracts the human-readable text from a parsed HTML document.
#
# Text is collected from a fixed set of content-bearing tags (paragraphs,
# headings, table cells, list items without id/class/link, etc.), joined with
# single spaces, and any single-line `<![CDATA[...]]>` section is replaced by a
# space. If nothing is left after that filtering, the text of the whole
# `//body` is used instead.
#
# @param html_doc [#xpath] parsed document; must answer `xpath(query)` with a
#   node collection that supports `to_a` and `text` (presumably the Nokogiri
#   document produced by `parse_html`)
# @return [String] extracted text with leading/trailing whitespace removed
def self.clean_html(html_doc)
  body_elements = html_doc.xpath(
    '//p | //h1 | //h2 | //h3 | //h4 | //h5 | //h6 | //em | //strong | //b | //td | //pre
    | //li[not(@id) and not(@class) and not(a)] | //section//section[@class="project-info"]
    | //blockquote | //textarea'
  ).to_a

  # extract text from each matched tag and separate the pieces by a space
  body_text = body_elements.map { |el| el.text.to_s }.join(' ')

  # remove XML CDATA like opensource.org pages have; strip AFTER the removal so
  # that a CDATA-only document counts as empty and reaches the fallback below
  body_text = body_text.gsub(/\<\!\[CDATA.+?\]\]\>/i, ' ').strip

  if body_text.empty?
    p "match_html: document didnt pass noise filter, will use whole body content"
    body_text = html_doc.xpath('//body').text.to_s.strip
  end

  body_text
end

.parse_html(html_text) ⇒ `Object`

# File 'lib/license_matcher/preprocess.rb', line 60

# Parses raw HTML text into a document object.
#
# The text is first passed through `safe_encode` and then handed to
# `Nokogiri.HTML`. Ordinary failures are logged and reported as `nil`;
# only StandardError is rescued so that interrupts, exit requests and other
# non-standard exceptions are not swallowed.
#
# @param html_text [Object] raw HTML content
# @return [Object, nil] whatever `Nokogiri.HTML` returns, or nil when parsing
#   raised a StandardError
def self.parse_html(html_text)
  Nokogiri.HTML(safe_encode(html_text))
rescue StandardError => e
  p "failed to parse html doc: \n #{html_text} - #{e.message}"
  nil
end

.preprocess_html(html_text) ⇒ `Object`

# File 'lib/license_matcher/preprocess.rb', line 22

# Turns raw HTML into plain text by parsing it with `parse_html` and then
# extracting the text with `clean_html`.
#
# @param html_text [Object] raw HTML content
# @return [Object] the result of `clean_html`, or an empty string when
#   `parse_html` returned nothing (a message is printed in that case)
def self.preprocess_html(html_text)
  parsed = parse_html(html_text)
  return clean_html(parsed) if parsed

  # parsing failed: report it and fall back to empty text
  p "match_html: failed to parse html document\n#{html_text}"
  ""
end

.preprocess_text(text) ⇒ `Object`

# File 'lib/license_matcher/preprocess.rb', line 5

# Normalizes free text before matching.
#
# The input goes through `safe_encode`, then a fixed sequence of substitutions
# is applied in order, and finally the result is stripped and every run of
# whitespace is collapsed to a single space.
#
# @param text [Object] raw text
# @return [String] normalized text
def self.preprocess_text(text)
  # [pattern, replacement] pairs, applied top to bottom
  substitutions = [
    [/\[.+?\]\(.+?\)/, ' '],          # markdown url tags: [label](target)
    [/\bTHE\b/i, ''],                 # spam word "the", any letter case
    [/\<\!\[CDATA.*?\]\]\>/, ' '],    # XML garbage: complete CDATA sections
    [/\<\!--.+?--\>/, ' '],           # XML garbage: comments
    [/<\!\[CDATA.+?\]>/, ' ']         # XML garbage: CDATA closed by a single bracket
  ]

  scrubbed = substitutions.reduce(safe_encode(text)) do |current, (pattern, filler)|
    current.gsub(pattern, filler)
  end

  scrubbed.strip.gsub(/\s+/, ' ')
end

.safe_encode(txt) ⇒ `Object`

# File 'lib/license_matcher/preprocess.rb', line 69

# Converts any value to a UTF-8 string that is safe for further processing.
#
# The value is stringified and transcoded to UTF-8 with 'binary' as the source
# encoding; bytes that are invalid or have no mapping are replaced with an
# empty string (in practice this drops non-ASCII bytes).
#
# @param txt [Object] value to encode; `to_s` is called on it
# @return [String] the encoded text, or an empty string when a StandardError
#   was raised (a message is printed in that case)
def self.safe_encode(txt)
  txt.to_s.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
rescue StandardError
  p "Failed to encode text:\n #{txt}"
  ""
end

Class: LicenseMatcher::Preprocess

Class Method Summary collapse

Class Method Details

.clean_html(html_doc) ⇒ Object

.parse_html(html_text) ⇒ Object

.preprocess_html(html_text) ⇒ Object

.preprocess_text(text) ⇒ Object

.safe_encode(txt) ⇒ Object

.clean_html(html_doc) ⇒ `Object`

.parse_html(html_text) ⇒ `Object`

.preprocess_html(html_text) ⇒ `Object`

.preprocess_text(text) ⇒ `Object`

.safe_encode(txt) ⇒ `Object`