Class: Terrier::HtmlData

Inherits:
Object
  • Object
show all
Includes:
HTTParty
Defined in:
lib/terrier/html_data.rb

Constant Summary collapse

# <meta> tag names tried for the journal / publication name.
PUBLICATION_META_TAGS =
["citation_journal_title", "dc.publisher", "prism.publicationName"].freeze
# <meta> tag names tried for the article title.
TITLE_META_TAGS =
["citation_title", "dc.title", "prism.title"].freeze
# <meta> tag names tried for author names.
AUTHOR_META_TAGS =
["citation_author", "dc.creator", "citation_authors", "Authors", "AUTHOR", "creator"].freeze
# <meta> tag names tried for the publication date.
# NOTE(review): "publisher" and "dc.publisher" read like publisher-name tags
# rather than date tags ("dc.publisher" is also listed in
# PUBLICATION_META_TAGS) — confirm these entries are intended.
PUBLICATION_DATE_META_TAGS =
["citation_publication_date", "publisher", "dc.publisher"].freeze
# <meta> tag names tried for the DOI.
DOI_META_TAGS =
["citation_doi", "dc.identifier"].freeze
# <meta> tag names for licensing information.
LICENSING_TAGS =
["dc.rights"].freeze
# <meta> tag names for the ISSN.
ISSN_TAGS =
["prism.issn"].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ HtmlData

Returns a new instance of HtmlData.

Raises:

  • (Terrier::UrlError) — when the given url fails the uri? check



13
14
15
16
17
18
# File 'lib/terrier/html_data.rb', line 13

# Validates +url+, then fetches it and parses the response straight away.
#
# @param url the address to fetch; it must pass the +uri?+ check
# @raise [Terrier::UrlError] when +uri?(url)+ is falsy
def initialize(url)
  unless uri?(url)
    raise Terrier::UrlError, "bad url given"
  end

  @url = url
  # Fetch through the class-level +get+ and keep the raw response around.
  @raw = self.class.get(url)
  @parsed_html = Nokogiri::HTML(@raw)
end

Instance Attribute Details

#url ⇒ Object (readonly)

Returns the value of attribute url.



3
4
5
# File 'lib/terrier/html_data.rb', line 3

# Reader for the +url+ attribute (read-only).
#
# @return the value currently held in +@url+
def url
  @url
end

Instance Method Details

#data ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/terrier/html_data.rb', line 20

def data
  return @_data if @_data
  @_data = {
    url: url,
    journal: (@parsed_html, PUBLICATION_META_TAGS).first,
    title: ( @parsed_html, TITLE_META_TAGS).first,
    authors: ( @parsed_html, AUTHOR_META_TAGS).uniq,
    publication_date: (@parsed_html, PUBLICATION_DATE_META_TAGS).first,
    doi: (@parsed_html, DOI_META_TAGS).first,
    issn: nil,
    zenodo_pdf: zenodo_pdf
  }

  @_data.merge(bibliography: bibliography(@_data))
end