Class: ArxivSync::XMLParser

Inherits:

Ox::Sax

Object
Ox::Sax
ArxivSync::XMLParser

show all

Defined in: lib/arxivsync/parser.rb

Instance Attribute Summary collapse

#papers ⇒ Object

Returns the value of attribute papers.

Instance Method Summary collapse

#clean(str) ⇒ Object
#decode(string) ⇒ Object
#end_element(name) ⇒ Object
#initialize ⇒ XMLParser constructor

A new instance of XMLParser.
#latex_decode(str) ⇒ Object

Like LaTeX.decode, but skips the punctuation decoding step.
#start_element(name, attributes = []) ⇒ Object
#text(str) ⇒ Object

Constructor Details

#initialize ⇒ `XMLParser`

Returns a new instance of XMLParser.



31
32
33

# File 'lib/arxivsync/parser.rb', line 31

# Creates the HTMLEntities decoder that #decode later uses through @entities.
# No other state is set up here: @papers, @model and @version are only
# assigned once #start_element sees the corresponding elements.
def initialize
  @entities = HTMLEntities.new
end

Instance Attribute Details

#papers ⇒ `Object`

Returns the value of attribute papers.



29
30
31

# File 'lib/arxivsync/parser.rb', line 29

# Reader for @papers, the list that #end_element appends a paper to each time
# a :metadata element closes. #start_element assigns a fresh array when a
# :ListRecords element opens; nothing visible in this class assigns it before
# then, so it is presumably nil until that point (verify against callers).
def papers
  @papers
end

Instance Method Details

#clean(str) ⇒ `Object`



48
49
50

# File 'lib/arxivsync/parser.rb', line 48

# Normalizes whitespace in +str+: every run of whitespace characters is
# collapsed to a single space, then leading and trailing whitespace is removed.
# Returns a new string; +str+ itself is left untouched.
def clean(str)
  collapsed = str.gsub(/\s+/, ' ')
  collapsed.strip
end

#decode(string) ⇒ `Object`

# File 'lib/arxivsync/parser.rb', line 70

# Decodes +string+ with the HTMLEntities decoder held in @entities, then runs
# #latex_decode on every part of the result that is not inside a $...$
# equation.
#
# The text is scanned one character at a time and split on '$'. Segments
# outside an equation are passed through #latex_decode; an equation, together
# with both of its '$' delimiters, is copied to the result unchanged. If a '$'
# is never closed, the trailing segment (which then starts with that '$') is
# passed to #latex_decode like ordinary text.
#
# Returns the accumulated, decoded string.
def decode(string)
  str = @entities.decode(string)

  decoded = ""
  equation = false # true while the scan is between an opening and a closing '$'
  segment = ""

  # each_char is used instead of the deprecated block form of String#chars.
  str.each_char do |ch|
    if ch == '$'
      if equation
        # Closing delimiter: emit the equation verbatim, delimiters included.
        decoded << segment << ch
        segment = ""
      else
        # Opening delimiter: flush the text collected so far and start the
        # equation segment with the '$' itself.
        decoded << latex_decode(segment)
        segment = ch
      end

      equation = !equation
    else
      segment << ch
    end
  end

  # Whatever is left over (plain text, or an unterminated equation).
  decoded << latex_decode(segment)
end

#end_element(name) ⇒ `Object`

# File 'lib/arxivsync/parser.rb', line 156

# SAX callback invoked when an element closes.
#
# Closing a :version element stores the version under construction on the
# current paper; closing a :metadata element (the end of one paper entry) adds
# that paper to the result list. In every case the current-element marker is
# cleared, so #text matches no branch until another element opens.
def end_element(name)
  if name == :version
    @model.versions.push(@version)
  elsif name == :metadata
    @papers.push(@model)
  end

  @el = nil
end

#latex_decode(str) ⇒ `Object`

Like LaTeX.decode, but skips the punctuation decoding step.

# File 'lib/arxivsync/parser.rb', line 53

# Like LaTeX.decode, but with the Punctuation decoder left out.
#
# Operates on a copy of +str+, so the argument is not modified. The copy is
# handed to Base.normalize, then to each decoder below in order, then to
# Base.strip_braces; the value returned is whatever LaTeX.normalize_C gives
# back for it.
def latex_decode(str)
  working = str.dup

  LaTeX::Decode::Base.normalize(working)

  # Order matters; LaTeX::Decode::Punctuation is intentionally not in the list.
  [
    LaTeX::Decode::Maths,
    LaTeX::Decode::Accents,
    LaTeX::Decode::Diacritics,
    LaTeX::Decode::Symbols
  ].each { |decoder| decoder.decode!(working) }

  LaTeX::Decode::Base.strip_braces(working)

  LaTeX.normalize_C(working)
end

#start_element(name, attributes = []) ⇒ `Object`

# File 'lib/arxivsync/parser.rb', line 35

# SAX callback invoked when an element opens.
#
# Records +name+ as the current element so #text knows where incoming
# character data belongs, and prepares state for the container elements:
# :ListRecords starts a new result list, :metadata starts a new Paper with an
# empty versions list, and :version starts a new Version.
# +attributes+ is accepted but not used.
def start_element(name, attributes=[])
  @el = name

  if name == :ListRecords
    @papers = []
  elsif name == :metadata
    @model = Paper.new
    @model.versions = []
  elsif name == :version
    @version = Version.new
  end
end

#text(str) ⇒ `Object`

# File 'lib/arxivsync/parser.rb', line 96

# SAX callback invoked with the character data of the current element.
#
# Dispatches on @el (set by #start_element, cleared by #end_element) and
# stores the value on the paper (@model) or version (@version) being built.
# Every value is whitespace-normalized with #clean; submitter, title, authors,
# abstract and comments are additionally passed through #decode. Text that
# belongs to any other element is ignored.
def text(str)
  case @el
  # Necessary elements
  when :id
    @model.id = clean(str)
  when :submitter
    @model.submitter = decode(clean(str))
  when :title
    @model.title = decode(clean(str))
  when :authors
    # Author strings may contain strange metadata
    # Non-regex parsing to handle nested parens
    @model.author_str = decode(clean(str))

    depth = 0
    no_parens = ""

    # each_char is used instead of the deprecated block form of String#chars.
    @model.author_str.each_char do |ch|
      case ch
      when '('
        depth += 1
      when ')'
        # An unmatched ')' must not drive the depth negative, otherwise every
        # following character (and so every later author) would be dropped.
        depth -= 1 if depth > 0
      else
        no_parens << ch if depth == 0
      end
    end

    # "the" is only a separator as a whole word (\b); without the boundary the
    # pattern also matched the tail of names such as "Agathe" and split them.
    @model.authors = no_parens.split(/,|:|;|\sand\s|\bthe\s/i)
      .map { |s| clean(s) }
      .reject { |s| s.empty? }
  when :categories
    @model.categories = clean(str).split(/\s/)
  when :abstract
    @model.abstract = decode(clean(str))

  # Optional elements
  when :comments
    @model.comments = decode(clean(str))
  when :"msc-class"
    @model.msc_class = clean(str)
  when :"report-no"
    @model.report_no = clean(str)
  when :"journal-ref"
    @model.journal_ref = clean(str)
  when :doi
    @model.doi = clean(str)
  when :proxy
    @model.proxy = clean(str)
  when :license
    @model.license = clean(str)

  # Versions
  when :date
    @version.date = Time.parse(clean(str))
  when :size
    @version.size = clean(str)
  end
end

Class: ArxivSync::XMLParser

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ XMLParser

Instance Attribute Details

#papers ⇒ Object

Instance Method Details

#clean(str) ⇒ Object

#decode(string) ⇒ Object

#end_element(name) ⇒ Object

#latex_decode(str) ⇒ Object

#start_element(name, attributes = []) ⇒ Object

#text(str) ⇒ Object

#initialize ⇒ `XMLParser`

#papers ⇒ `Object`

#clean(str) ⇒ `Object`

#decode(string) ⇒ `Object`

#end_element(name) ⇒ `Object`

#latex_decode(str) ⇒ `Object`

#start_element(name, attributes = []) ⇒ `Object`

#text(str) ⇒ `Object`