Module: Emails

Includes:
UnescapeHtmlHelper
Included in:
BrilliantWebScraper
Defined in:
lib/parsers/emails.rb

Overview

Parses email addresses out of an HTML string.

Instance Method Summary collapse

Instance Method Details

#get_processed_emails(email_set) ⇒ Object



21
22
23
24
25
26
27
28
29
# File 'lib/parsers/emails.rb', line 21

# Unescapes every candidate string and keeps those that contain an
# email-like substring.
#
# The pattern is unanchored, so a candidate is kept in full as soon as any
# part of it matches. A match needs a local part, an "@", a domain whose
# first label is not a known placeholder (example, mail/email, domain,
# company, yourdomain/yourcompany/youremail, address, emailaddress, yyy,
# test) and a 2-3 letter final label that does not start with an asset
# extension (png, jpg/jpeg, tif, svg, css, js, ico, gif).
#
# @param email_set [Array<String>, nil] raw candidates, possibly HTML-escaped
# @return [Array<String>] unescaped candidates that look like real emails;
#   empty when +email_set+ is nil or empty
def get_processed_emails(email_set)
  return [] if email_set.nil? || email_set.empty?

  email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3}/im

  # unescape_html is not defined here; it is expected to come from the
  # UnescapeHtmlHelper mixin. Mapping a non-empty collection can never yield
  # an empty one, so no second emptiness check is needed before filtering.
  email_set.map { |email| unescape_html(email) }.grep(email_match_regex)
end

#grep_emails(response) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
# File 'lib/parsers/emails.rb', line 9

# Extracts email addresses from a response body.
#
# Two passes are made over the text: one for +mailto:+ targets and one for
# bare addresses bounded by quotes, whitespace or angle brackets. Both
# result sets are run through #get_processed_emails, merged, lower-cased
# and de-duplicated.
#
# @param response [String, nil] text to scan (typically an HTML document)
# @return [Array<String>, nil] unique lower-cased addresses, or nil when
#   +response+ is nil or empty
def grep_emails(response)
  return if response.nil? || response.empty?

  # Everything after "mailto:" up to the first query/quote/bracket/space.
  first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
  # The trailing boundary is a lookahead that also accepts end of input, so an
  # address that is the very last token of the response is not dropped.
  second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3})(?=["'\s><]|\z)}

  first_set = get_processed_emails(response.scan(first_regex).flatten.compact)
  second_set = get_processed_emails(response.scan(second_regex).flatten.compact)

  # Down-casing can create new duplicates, hence the uniq after the union.
  (first_set | second_set).compact.map(&:downcase).uniq
end