Class: Retriever::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/retriever/page.rb

Constant Summary collapse

# Matches text that begins (at a line start) with '#'.
HASH_RE =
Regexp.new(/^#/i).freeze
# Matches text that begins (at a line start) with 'http', case-insensitively.
HTTP_RE =
Regexp.new(/^http/i).freeze
# Captures the contents of the first attribute-less <h1> element on a line.
# The capture is lazy so a second <h1> on the same line is not swallowed.
H1_RE =
Regexp.new(/<h1>(.*?)<\/h1>/i).freeze
# Captures the contents of the first attribute-less <h2> element on a line
# (lazy, for the same reason as H1_RE).
H2_RE =
Regexp.new(/<h2>(.*?)<\/h2>/i).freeze
# Captures the contents of the first <title> element on a line; lazy so a
# later <title> (e.g. inside inline SVG) does not extend the capture.
TITLE_RE =
Regexp.new(/<title>(.*?)<\/title>/i).freeze
# Captures (group 1) the double-quoted content attribute of a
# <meta name="description" ...> tag. The name attribute may use single or
# double quotes and must appear before content. Inside a character class
# '|' is a literal bar, not alternation, so it is not listed as a quote.
DESC_RE =
Regexp.new(/<meta[^>]*name=["']description["']
[^>]*content="
(
  [^"]*
)
"
[^>]*>
/ix).freeze
# Captures (group 1) the value of a quoted href attribute: one non-space
# character followed by one or more characters from the allowed URL set.
# The attribute must be preceded by whitespace and the closing quote must be
# followed by a non-word character. Inside a character class '|' is a
# literal bar, so it is not listed as a quote; [\s|\W] was equivalent to \W.
HREF_CONTENTS_RE =
Regexp.new(/\shref=
['"]
(
  [^\s]
  [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
)
['"]
\W
/ix).freeze
# Matches URLs whose path ends in a known non-page file extension. The
# lookahead requires the extension to be followed by the end of the string or
# by a query/fragment delimiter, so that hosts or longer extensions that merely
# contain one of these (www.cssbeauty.com, index.jsp) are not treated as files.
NONPAGE_EXT_RE =
Regexp.new(/\.
(?:css|js|png|gif|jpg|mp4|
wmv|flv|mp3|wav|doc|txt|ico|xml)
(?=[?\#]|\z)
/ix).freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, source, t) ⇒ Page

Returns a new instance of Page.



38
39
40
41
42
43
# File 'lib/retriever/page.rb', line 38

# Builds a page from a URL, its raw source and a target object.
#
# @param url [Object] stored as-is and exposed through #url
# @param source [#encode_utf8_and_replace] raw page source; the result of
#   calling encode_utf8_and_replace on it is what gets stored
# @param t [Object] stored as-is; other methods of this class call #host and
#   #file_re on it
def initialize(url, source, t)
  @url, @t = url, t
  @source = source.encode_utf8_and_replace
  # Starts out empty; nothing in this constructor populates it.
  @links = nil
end

Instance Attribute Details

Receives the page source as a string and returns an array of unique href links.



47
48
49
# File 'lib/retriever/page.rb', line 47

# Reader for the @links instance variable, which #initialize sets to nil.
# As shown here it returns the stored value without computing anything.
def links
  @links
end

#source ⇒ Object (readonly)

Returns the value of attribute source.



36
37
38
# File 'lib/retriever/page.rb', line 36

# Reader for @source: the value #initialize obtained by calling
# encode_utf8_and_replace on the source it was given.
def source
  @source
end

#t ⇒ Object (readonly)

Returns the value of attribute t.



36
37
38
# File 'lib/retriever/page.rb', line 36

# Reader for @t, the third argument given to #initialize. Other methods of
# this class call #host and #file_re on it.
def t
  @t
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



36
37
38
# File 'lib/retriever/page.rb', line 36

# Reader for @url, the first argument given to #initialize (stored unchanged).
def url
  @url
end

Instance Method Details

#desc ⇒ Object



82
83
84
# File 'lib/retriever/page.rb', line 82

# Extracts the page's meta description from the stored source.
#
# @return [String] capture group 1 of DESC_RE passed through
#   String#decode_html, or '' when DESC_RE does not match
def desc
  # Match once and reuse the MatchData instead of scanning the source twice.
  found = DESC_RE.match(@source)
  found ? found[1].decode_html : ''
end

#h1 ⇒ Object



86
87
88
# File 'lib/retriever/page.rb', line 86

# Extracts the text of the page's <h1> heading from the stored source.
#
# @return [String] capture group 1 of H1_RE passed through
#   String#decode_html, or '' when H1_RE does not match
def h1
  # Match once and reuse the MatchData instead of scanning the source twice.
  found = H1_RE.match(@source)
  found ? found[1].decode_html : ''
end

#h2 ⇒ Object



90
91
92
# File 'lib/retriever/page.rb', line 90

# Extracts the text of the page's <h2> heading from the stored source.
#
# @return [String] capture group 1 of H2_RE passed through
#   String#decode_html, or '' when H2_RE does not match
def h2
  # Match once and reuse the MatchData instead of scanning the source twice.
  found = H2_RE.match(@source)
  found ? found[1].decode_html : ''
end

#parse_by_css(selector) ⇒ Object



73
74
75
76
# File 'lib/retriever/page.rb', line 73

# Parses the stored source with Nokogiri and returns the text of the nodes
# matching a CSS selector.
#
# @param selector [String] CSS selector handed to the document's #css
# @return [Object] whatever #text returns for the matched node set
def parse_by_css(selector)
  Nokogiri::HTML(@source).css(selector).text
end

#parse_files(arr = parse_internal) ⇒ Object



69
70
71
# File 'lib/retriever/page.rb', line 69

# Keeps only the entries that match the target's file pattern.
#
# @param arr [Enumerable] candidate links; defaults to #parse_internal
# @return [Array] the entries for which `@t.file_re =~ entry` is truthy
def parse_files(arr = parse_internal)
  arr.select do |link|
    @t.file_re =~ link
  end
end

#parse_internal ⇒ Object



59
60
61
62
63
# File 'lib/retriever/page.rb', line 59

# Selects the links whose host equals the target's host.
#
# Each link is run through Addressable::URI.encode and then parsed; the
# parsed host is compared against @t.host.
#
# @return [Array] the matching subset of #links
def parse_internal
  links.select do |href|
    target_host = @t.host
    encoded = Addressable::URI.encode(href)
    target_host == Addressable::URI.parse(encoded).host
  end
end

#parse_internal_visitable ⇒ Object



65
66
67
# File 'lib/retriever/page.rb', line 65

# Internal links minus those that match NONPAGE_EXT_RE.
#
# @return [Array] entries of #parse_internal that NONPAGE_EXT_RE does not match
def parse_internal_visitable
  parse_internal.reject do |link|
    NONPAGE_EXT_RE =~ link
  end
end

#parse_seo ⇒ Object



94
95
96
# File 'lib/retriever/page.rb', line 94

# Collects the page's SEO fields.
#
# @return [Array] the results of #title, #desc, #h1 and #h2, in that order
def parse_seo
  %i[title desc h1 h2].map { |field| send(field) }
end

#title ⇒ Object



78
79
80
# File 'lib/retriever/page.rb', line 78

# Extracts the text of the page's <title> element from the stored source.
#
# @return [String] capture group 1 of TITLE_RE passed through
#   String#decode_html, or '' when TITLE_RE does not match
def title
  # Match once and reuse the MatchData instead of scanning the source twice.
  found = TITLE_RE.match(@source)
  found ? found[1].decode_html : ''
end