Class: LicenseMatcher::Preprocess
- Inherits:
-
Object
- Object
- LicenseMatcher::Preprocess
- Defined in:
- lib/license_matcher/preprocess.rb
Class Method Summary
- .clean_html(html_doc) ⇒ Object
- .parse_html(html_text) ⇒ Object
- .preprocess_html(html_text) ⇒ Object
- .preprocess_text(text) ⇒ Object
- .safe_encode(txt) ⇒ Object
Class Method Details
.clean_html(html_doc) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/license_matcher/preprocess.rb', line 37
#
# Collects the text of the content-bearing elements of a parsed HTML
# document and returns it as a single stripped string.
#
# @param html_doc [#xpath] parsed document; `xpath` must return an object
#   that responds to `to_a` (elements responding to `text`) and to `text`
# @return [String] text of the selected elements joined by spaces, with XML
#   CDATA sections removed; when that is empty, the stripped text of `//body`
def self.clean_html(html_doc)
  body_elements = html_doc.xpath(
    '//p | //h1 | //h2 | //h3 | //h4 | //h5 | //h6 | //em | //strong | //b | //td | //pre | //li[not(@id) and not(@class) and not(a)] | //section//section[@class="project-info"] | //blockquote | //textarea'
  ).to_a

  # extract text from html tags and separate them by space
  body_text = body_elements.map { |el| el.text.to_s }.join(' ')

  # REMOVE XML CDATA like opensource.org pages have.
  # /m lets `.` cross newlines so multi-line CDATA sections are removed too.
  # Strip only after the removal, so text that consisted solely of CDATA
  # counts as empty and triggers the fallback below.
  body_text = body_text.gsub(/\<\!\[CDATA.+?\]\]\>/mi, ' ').strip

  if body_text.empty?
    p "match_html: document didnt pass noise filter, will use whole body content"
    body_text = html_doc.xpath('//body').text.to_s.strip
  end

  body_text
end
.parse_html(html_text) ⇒ Object
60 61 62 63 64 65 66 67 |
# File 'lib/license_matcher/preprocess.rb', line 60
#
# Parses HTML text into a document via Nokogiri after passing it through
# safe_encode.
#
# @param html_text [Object] raw HTML; handed to safe_encode before parsing
# @return [Object, nil] whatever Nokogiri.HTML returns, or nil when parsing
#   raised a StandardError (the failure is printed, not re-raised)
def self.parse_html(html_text)
  Nokogiri.HTML(safe_encode(html_text))
rescue StandardError => e
  # StandardError rather than Exception, so that Interrupt / SystemExit and
  # similar process-level signals are not swallowed here.
  p "failed to parse html doc: \n #{html_text} - #{e.message}"
  nil
end
.preprocess_html(html_text) ⇒ Object
22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/license_matcher/preprocess.rb', line 22
#
# Turns raw HTML into plain text: the input is parsed with parse_html and,
# when that yields a document, reduced to text with clean_html.
#
# @param html_text [Object] raw HTML text
# @return [String] result of clean_html, or "" when parse_html returned
#   nil/false (a message is printed in that case)
def self.preprocess_html(html_text)
  parsed_doc = parse_html(html_text)

  # Guard clause: the normal path ends here with the extracted text.
  return clean_html(parsed_doc) if parsed_doc

  p "match_html: failed to parse html document\n#{html_text}"
  ""
end
.preprocess_text(text) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/license_matcher/preprocess.rb', line 5
#
# Normalizes free-form text: runs it through safe_encode, drops markdown
# links, the word "the", XML CDATA sections and XML comments, then collapses
# all whitespace runs to single spaces.
#
# @param text [Object] input text; handed to safe_encode first
# @return [String] cleaned, stripped, single-spaced text
def self.preprocess_text(text)
  text = safe_encode(text)

  # remove markdown url tags (the whole `[label](target)` construct)
  text = text.gsub(/\[.+?\]\(.+?\)/, ' ')

  # remove spam words
  text = text.gsub(/\bTHE\b/i, '')

  # remove some XML garbage; /m lets `.` match newlines so that CDATA
  # sections and comments spanning several lines are removed as well
  text = text.gsub(/\<\!\[CDATA.*?\]\]\>/m, ' ')
  text = text.gsub(/\<\!--.+?--\>/m, ' ')
  text = text.gsub(/<\!\[CDATA.+?\]>/m, ' ')

  text.strip.gsub(/\s+/, ' ')
end
.safe_encode(txt) ⇒ Object
69 70 71 72 73 74 |
# File 'lib/license_matcher/preprocess.rb', line 69
#
# Converts any value to a UTF-8 string. The value's `to_s` result is
# transcoded from 'binary' to UTF-8 with invalid and undefined bytes
# replaced by the empty string, so bytes that have no UTF-8 mapping from
# binary (i.e. non-ASCII bytes) are dropped.
#
# @param txt [Object] value to encode; `to_s` is called on it
# @return [String] UTF-8 text, or "" when encoding raised a StandardError
#   (the failure is printed, not re-raised)
def self.safe_encode(txt)
  txt.to_s.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
rescue StandardError
  p "Failed to encode text:\n #{txt}"
  ""
end