Class: Retriever::Page
- Inherits:
-
Object
- Object
- Retriever::Page
- Defined in:
- lib/retriever/page.rb
Constant Summary collapse
- HASH_RE =
Regexp.new(/^#/i).freeze
- HTTP_RE =
Regexp.new(/^http/i).freeze
- H1_RE =
Regexp.new(/<h1>(.*)<\/h1>/i).freeze
- H2_RE =
Regexp.new(/<h2>(.*)<\/h2>/i).freeze
- TITLE_RE =
Regexp.new(/<title>(.*)<\/title>/i).freeze
- DESC_RE =
Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'] [^>]*content=[\"] ( [^\"]* ) [\"] [^>] *> /ix).freeze
- HREF_CONTENTS_RE =
Regexp.new(/\shref= ['|"] ( [^\s] [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+ ) ['|"] [\s|\W] /ix).freeze
- NONPAGE_EXT_RE =
Regexp.new(/\. (?:css|js|png|gif|jpg|mp4| wmv|flv|mp3|wav|doc|txt|ico|xml) /ix).freeze
Instance Attribute Summary collapse
-
#links ⇒ Object
readonly
Receives the page source as a string and returns an array of unique href links.
-
#source ⇒ Object
readonly
Returns the value of attribute source.
-
#t ⇒ Object
readonly
Returns the value of attribute t.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #desc ⇒ Object
- #h1 ⇒ Object
- #h2 ⇒ Object
-
#initialize(url, source, t) ⇒ Page
constructor
Returns a new instance of Page.
- #parse_by_css(selector) ⇒ Object
- #parse_files(arr = parse_internal) ⇒ Object
- #parse_internal ⇒ Object
- #parse_internal_visitable ⇒ Object
- #parse_seo ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(url, source, t) ⇒ Page
Returns a new instance of Page.
38 39 40 41 42 43 |
# File 'lib/retriever/page.rb', line 38 def initialize(url, source, t) @url = url @t = t @source = source.encode_utf8_and_replace @links = nil end |
Instance Attribute Details
#links ⇒ Object (readonly)
Receives the page source as a string and returns an array of unique href links.
47 48 49 |
# File 'lib/retriever/page.rb', line 47 def links @links end |
#source ⇒ Object (readonly)
Returns the value of attribute source.
36 37 38 |
# File 'lib/retriever/page.rb', line 36 def source @source end |
#t ⇒ Object (readonly)
Returns the value of attribute t.
36 37 38 |
# File 'lib/retriever/page.rb', line 36 def t @t end |
#url ⇒ Object (readonly)
Returns the value of attribute url.
36 37 38 |
# File 'lib/retriever/page.rb', line 36 def url @url end |
Instance Method Details
#desc ⇒ Object
82 83 84 |
# File 'lib/retriever/page.rb', line 82 def desc DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : '' end |
#h1 ⇒ Object
86 87 88 |
# File 'lib/retriever/page.rb', line 86 def h1 H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : '' end |
#h2 ⇒ Object
90 91 92 |
# File 'lib/retriever/page.rb', line 90 def h2 H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : '' end |
#parse_by_css(selector) ⇒ Object
73 74 75 76 |
# File 'lib/retriever/page.rb', line 73 def parse_by_css(selector) nokogiri_doc = Nokogiri::HTML(@source) nokogiri_doc.css(selector).text end |
#parse_files(arr = parse_internal) ⇒ Object
69 70 71 |
# File 'lib/retriever/page.rb', line 69 def parse_files(arr = parse_internal) arr.select { |x| @t.file_re =~ x } end |
#parse_internal ⇒ Object
59 60 61 62 63 |
# File 'lib/retriever/page.rb', line 59 def parse_internal links.select do |x| @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host end end |
#parse_internal_visitable ⇒ Object
65 66 67 |
# File 'lib/retriever/page.rb', line 65 def parse_internal_visitable parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) } end |
#parse_seo ⇒ Object
94 95 96 |
# File 'lib/retriever/page.rb', line 94 def parse_seo [title, desc, h1, h2] end |