Class: Opener::PropertyTagger::Processor

Inherits:
Object
  • Object
show all
Defined in:
lib/opener/property_tagger/processor.rb

Overview

Class that applies property tagging to a given input KAF file.

Constant Summary collapse

FILE_ASPECTS_CACHE =

Global cache used for storing loaded aspects.

FileAspectsCache.new
REMOTE_ASPECTS_CACHE =
RemoteAspectsCache.new
MAX_NGRAM =

Use of n-grams to determine if a unigram (1 lemma) or bigram (2 lemmas) belongs to a property.

2

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file, params: {}, url: nil, path: nil, timestamp: true, pretty: false) ⇒ Processor

Returns a new instance of Processor.

Parameters:

  • file (String|IO)

    The KAF file/input to process.

  • path (String) (defaults to: nil)

    Path to the aspects.

  • timestamp (TrueClass|FalseClass) (defaults to: true)

    Add timestamps to the KAF.

  • pretty (TrueClass|FalseClass) (defaults to: false)

    Enable pretty formatting, disabled by default due to the performance overhead.



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/opener/property_tagger/processor.rb', line 28

# Parses the KAF input and selects the lexicons used for tagging.
#
# @param file [String, IO] the KAF input, handed to Nokogiri.XML.
# @param params [Hash] extra options; only +:cache_keys+ is read here.
# @param url [String, nil] stored as +aspects_url+; when non-nil the lexicons
#   come from REMOTE_ASPECTS_CACHE, otherwise from FILE_ASPECTS_CACHE.
# @param path [String, nil] stored as +aspects_path+.
# @param timestamp [Boolean] stored as +timestamp+.
# @param pretty [Boolean] stored as +pretty+.
# @raise [RuntimeError] when +is_kaf?+ is falsy for the parsed input.
def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
  @document     = Nokogiri.XML file
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
  @timestamp    = timestamp
  @pretty       = pretty

  @params       = params
  @remote       = !url.nil?
  @aspects_path = path
  @aspects_url  = url

  # Non-destructive merge so the caller's params[:cache_keys] hash is never
  # modified; the document language is added to a copy.
  @cache_keys   = (params[:cache_keys] || {}).merge(lang: @document.root.attr('xml:lang'))

  @lexicons =
    if @remote
      REMOTE_ASPECTS_CACHE[**@cache_keys].aspects
    else
      FILE_ASPECTS_CACHE[aspects_file]
    end
end

Instance Attribute Details

#aspectsObject

Returns the value of attribute aspects.



10
11
12
# File 'lib/opener/property_tagger/processor.rb', line 10

# Reader for the +@aspects+ instance variable.
#
# @return [Object] the value currently stored in +@aspects+.
def aspects; @aspects; end

#aspects_pathObject

Returns the value of attribute aspects_path.



9
10
11
# File 'lib/opener/property_tagger/processor.rb', line 9

# Reader for the +@aspects_path+ instance variable.
#
# @return [Object] the value currently stored in +@aspects_path+.
def aspects_path; @aspects_path; end

#aspects_urlObject

Returns the value of attribute aspects_url.



9
10
11
# File 'lib/opener/property_tagger/processor.rb', line 9

# Reader for the +@aspects_url+ instance variable.
#
# @return [Object] the value currently stored in +@aspects_url+.
def aspects_url; @aspects_url; end

#documentObject

Returns the value of attribute document.



8
9
10
# File 'lib/opener/property_tagger/processor.rb', line 8

# Reader for the +@document+ instance variable.
#
# @return [Object] the value currently stored in +@document+.
def document; @document; end

#lexiconsObject

Returns the value of attribute lexicons.



10
11
12
# File 'lib/opener/property_tagger/processor.rb', line 10

# Reader for the +@lexicons+ instance variable.
#
# @return [Object] the value currently stored in +@lexicons+.
def lexicons; @lexicons; end

#prettyObject

Returns the value of attribute pretty.



11
12
13
# File 'lib/opener/property_tagger/processor.rb', line 11

# Reader for the +@pretty+ instance variable.
#
# @return [Object] the value currently stored in +@pretty+.
def pretty; @pretty; end

#timestampObject

Returns the value of attribute timestamp.



11
12
13
# File 'lib/opener/property_tagger/processor.rb', line 11

# Reader for the +@timestamp+ instance variable.
#
# @return [Object] the value currently stored in +@timestamp+.
def timestamp; @timestamp; end

Instance Method Details

#add_features_layerObject

Remove the features layer from the KAF file if it exists and add a new one.



127
128
129
130
131
132
133
# File 'lib/opener/property_tagger/processor.rb', line 127

# Replaces the features layer of the document with a fresh one.
#
# Whatever node +document.at_xpath('KAF/features')+ finds is removed first;
# a new +features+ node is then requested under +KAF+ via +new_node+.
#
# @return [Object] the value returned by +new_node+.
def add_features_layer
  if (stale_layer = document.at_xpath('KAF/features'))
    stale_layer.remove
  end

  new_node 'features', 'KAF'
end

#add_linguistic_processorObject



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/opener/property_tagger/processor.rb', line 164

# Appends a +linguisticProcessors+ entry for this tagger to the KAF header.
#
# The entry is tagged with layer +features+ and holds a single +lp+ node
# carrying the tool name, a "<date>-<version>" string and a timestamp. When
# +timestamp+ is falsy the timestamp attribute is the placeholder "*".
#
# @return [String] the value written to the +timestamp+ attribute.
def add_linguistic_processor
  tool_name    = 'VUA property tagger'
  release_date = '16jan2015'
  release      = '2.0'

  processors = new_node('linguisticProcessors', 'KAF/kafHeader')
  processors['layer'] = 'features'

  lp = new_node('lp', processors)
  lp['version'] = [release_date, release].join('-')
  lp['name']    = tool_name

  # Attribute order (version, name, timestamp) is kept as-is.
  lp['timestamp'] = timestamp ? Time.now.strftime('%Y-%m-%dT%H:%M:%S%Z') : '*'
end

#add_properties_layerObject

Add the properties layer as a child to the features layer.



137
138
139
# File 'lib/opener/property_tagger/processor.rb', line 137

# Requests a +properties+ node under +KAF/features+ via +new_node+.
#
# @return [Object] the value returned by +new_node+.
def add_properties_layer
  new_node 'properties', 'KAF/features'
end

#add_property(lemma, values, index) ⇒ Object



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/opener/property_tagger/processor.rb', line 141

# Writes one +property+ node, with its references, into the properties layer.
#
# For every entry in +values+ an XML comment holding the matched n-gram is
# added, followed by a +span+ with one +target+ per term id. A target only
# gets a +lexicon-id+ attribute when the entry's lexicon has a truthy id.
#
# @param lemma [#to_s] written to the +lemma+ attribute.
# @param values [Enumerable] entries responding to +ngram+, +term_ids+ and
#   +lexicon+.
# @param index [#to_s] numeric suffix of the +pid+ attribute ("p<index>").
# @return [Enumerable] +values+, as returned by +each+.
def add_property lemma, values, index
  property = new_node('property', 'KAF/features/properties')
  property['lemma'] = lemma.to_s
  property['pid']   = "p#{index}"

  references = new_node('references', property)

  values.each do |match|
    references.add_child Nokogiri::XML::Comment.new(references, " #{match.ngram} ")

    span = new_node('span', references)

    match.term_ids.each do |term_id|
      target = new_node('target', span)
      target['id'] = term_id.to_s

      lexicon_id = match.lexicon.id
      target['lexicon-id'] = lexicon_id if lexicon_id
    end
  end
end

#extract_aspectsHash

Check which terms belong to an aspect (property). Text has priority over lemmas, overriding if there is a conflict.

Returns:

  • (Hash)


88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/opener/property_tagger/processor.rb', line 88

# Finds every n-gram of consecutive terms that has lexicon entries and groups
# the matches by aspect (property).
#
# The terms are scanned twice, first on +:lemma+ and then on +:text+. A match
# is skipped when the aspect already holds an entry with the same term ids, so
# for an identical span the entry recorded first (the lemma one) is kept.
#
# @return [Hash{Symbol => Array<Hashie::Mash>}] aspects in sorted key order;
#   each entry carries +term_ids+, +ngram+ and +lexicon+.
def extract_aspects
  all_term_ids = terms.keys
  term_values  = terms.values
  term_count   = term_values.size
  uniq_aspects = Hash.new { |hash, aspect| hash[aspect] = [] }

  %i[lemma text].each do |key|
    term_count.times do |first|
      # The range is inclusive, so spans cover 1 up to MAX_NGRAM + 1 terms.
      (0..MAX_NGRAM).each do |extra|
        last = first + extra
        # Stop once the span would run past the final term: an out-of-range
        # slice is silently truncated and would only repeat the previous,
        # shorter n-gram.
        break if last >= term_count

        ngram    = term_values[first..last].map { |term| term[key] }.join(' ').downcase
        term_ids = all_term_ids[first..last]

        @lexicons[ngram.to_sym]&.each do |lexicon|
          properties = lexicon.aspects.present? ? lexicon.aspects : [lexicon.aspect]

          properties.each do |property|
            next if property.blank?

            matches = uniq_aspects[property.to_sym]
            next if matches.find { |match| match.term_ids == term_ids }

            matches << Hashie::Mash.new(
              term_ids: term_ids,
              ngram:    ngram,
              lexicon:  lexicon,
            )
          end
        end
      end
    end
  end

  uniq_aspects.sort.to_h
end

#languageObject



63
64
65
# File 'lib/opener/property_tagger/processor.rb', line 63

# Language of the document, read from the +xml:lang+ attribute of the node
# found at +KAF+. A truthy result is cached in +@language+.
#
# @return [Object] the value of the +xml:lang+ attribute.
def language
  return @language if @language

  @language = document.at_xpath('KAF').attr('xml:lang')
end

#pretty_print(document) ⇒ String

Format the output document properly.

TODO: this should be handled by Oga in a nice way.

Returns:

  • (String)


193
194
195
196
197
198
199
200
201
202
# File 'lib/opener/property_tagger/processor.rb', line 193

# Re-serialises a document as indented XML using REXML.
#
# @param document [#to_xml] the document to format.
# @return [String] the formatted XML with surrounding whitespace stripped.
def pretty_print(document)
  parsed = REXML::Document.new(document.to_xml)
  # Emit attribute values in double quotes.
  parsed.context[:attribute_quote] = :quote

  printer = REXML::Formatters::Pretty.new
  printer.compact = true

  buffer = +''
  printer.write(parsed, buffer)
  buffer.strip
end

#processString

Processes the input and returns the new KAF output.

Returns:

  • (String)


48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/opener/property_tagger/processor.rb', line 48

# Runs the full tagging pipeline on the document.
#
# Rebuilds the features and properties layers, adds one property per
# extracted aspect (numbered from 1), records the linguistic processor and
# serialises the result.
#
# @return [String] +pretty_print(document)+ when +pretty+ is truthy,
#   otherwise +document.to_xml+.
def process
  add_features_layer
  add_properties_layer

  extract_aspects.each.with_index(1) do |(lemma, values), pid|
    add_property lemma, values, pid
  end

  add_linguistic_processor

  return pretty_print(document) if pretty

  document.to_xml
end

#termsObject



67
68
69
70
71
72
73
74
75
76
77
# File 'lib/opener/property_tagger/processor.rb', line 67

# Terms of the document, keyed by term id and built once on first use.
#
# Every node matched by +KAF/terms/term+ contributes one entry whose key is
# the +tid+ attribute as a Symbol and whose value holds the +lemma+ and
# +text+ attributes.
#
# @return [Hash{Symbol => Hash}] term id => +{ lemma:, text: }+.
def terms
  return @terms if @terms

  # Assigned before the scan; each_with_object fills and returns this hash.
  @terms = {}
  document.xpath('KAF/terms/term').each_with_object(@terms) do |node, by_id|
    by_id[node.attr('tid').to_sym] = { lemma: node.attr('lemma'), text: node.attr('text') }
  end
end