Class: Opener::PropertyTagger::Processor
- Inherits:
-
Object
- Object
- Opener::PropertyTagger::Processor
- Defined in:
- lib/opener/property_tagger/processor.rb
Overview
Class that applies property tagging to a given input KAF file.
Constant Summary collapse
- FILE_ASPECTS_CACHE =
Global cache used for storing loaded aspects.
FileAspectsCache.new
- REMOTE_ASPECTS_CACHE =
RemoteAspectsCache.new
- MAX_NGRAM =
Use of n-grams to determine whether a unigram (1 lemma) or bigram (2 lemmas) belongs to a property.
2
Instance Attribute Summary collapse
-
#aspects ⇒ Object
Returns the value of attribute aspects.
-
#aspects_path ⇒ Object
Returns the value of attribute aspects_path.
-
#aspects_url ⇒ Object
Returns the value of attribute aspects_url.
-
#document ⇒ Object
Returns the value of attribute document.
-
#lexicons ⇒ Object
Returns the value of attribute lexicons.
-
#pretty ⇒ Object
Returns the value of attribute pretty.
-
#timestamp ⇒ Object
Returns the value of attribute timestamp.
Instance Method Summary collapse
-
#add_features_layer ⇒ Object
Remove the features layer from the KAF file if it exists and add a new one.
- #add_linguistic_processor ⇒ Object
-
#add_properties_layer ⇒ Object
Add the properties layer as a child to the features layer.
- #add_property(lemma, values, index) ⇒ Object
-
#extract_aspects ⇒ Hash
Check which terms belong to an aspect (property). Text has priority over lemmas, overriding if there is a conflict.
-
#initialize(file, params: {}, url: nil, path: nil, timestamp: true, pretty: false) ⇒ Processor
constructor
A new instance of Processor.
- #language ⇒ Object
-
#pretty_print(document) ⇒ String
Format the output document properly.
-
#process ⇒ String
Processes the input and returns the new KAF output.
- #terms ⇒ Object
Constructor Details
#initialize(file, params: {}, url: nil, path: nil, timestamp: true, pretty: false) ⇒ Processor
Returns a new instance of Processor.
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/opener/property_tagger/processor.rb', line 28
#
# Parses the input as XML and selects the aspect lexicons to tag it with.
#
# @param file [String] KAF document, handed to `Nokogiri.XML`.
# @param params [Hash] extra options; only `:cache_keys` is read here.
# @param url [String, nil] stored as `aspects_url`; when it is not nil the
#   lexicons are taken from REMOTE_ASPECTS_CACHE instead of FILE_ASPECTS_CACHE.
# @param path [String, nil] stored as `aspects_path`.
# @param timestamp [Boolean] stored in `@timestamp`.
# @param pretty [Boolean] stored in `@pretty`.
#
# @raise [RuntimeError] when `is_kaf?` rejects the parsed document.
#
def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
  @document = Nokogiri.XML file
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?

  # Each option is stored on its own; chaining the assignments would make
  # @timestamp silently follow the value of `pretty`.
  @timestamp = timestamp
  @pretty    = pretty

  @params       = params
  @remote       = !url.nil?
  @aspects_path = path
  @aspects_url  = url

  # Non-destructive merge: the caller's :cache_keys hash is left untouched.
  @cache_keys = (params[:cache_keys] || {}).merge lang: @document.root.attr('xml:lang')

  @lexicons =
    if @remote
      REMOTE_ASPECTS_CACHE[**@cache_keys].aspects
    else
      FILE_ASPECTS_CACHE[aspects_file]
    end
end
Instance Attribute Details
#aspects ⇒ Object
Returns the value of attribute aspects.
10 11 12 |
# File 'lib/opener/property_tagger/processor.rb', line 10
#
# Reader for the `aspects` attribute.
#
# @return [Object] the current value of `@aspects`
def aspects
  @aspects
end
#aspects_path ⇒ Object
Returns the value of attribute aspects_path.
9 10 11 |
# File 'lib/opener/property_tagger/processor.rb', line 9
#
# Reader for the `aspects_path` attribute.
#
# @return [Object] the current value of `@aspects_path`
def aspects_path
  @aspects_path
end
#aspects_url ⇒ Object
Returns the value of attribute aspects_url.
9 10 11 |
# File 'lib/opener/property_tagger/processor.rb', line 9
#
# Reader for the `aspects_url` attribute.
#
# @return [Object] the current value of `@aspects_url`
def aspects_url
  @aspects_url
end
#document ⇒ Object
Returns the value of attribute document.
8 9 10 |
# File 'lib/opener/property_tagger/processor.rb', line 8
#
# Reader for the `document` attribute.
#
# @return [Object] the current value of `@document`
def document
  @document
end
#lexicons ⇒ Object
Returns the value of attribute lexicons.
10 11 12 |
# File 'lib/opener/property_tagger/processor.rb', line 10
#
# Reader for the `lexicons` attribute.
#
# @return [Object] the current value of `@lexicons`
def lexicons
  @lexicons
end
#pretty ⇒ Object
Returns the value of attribute pretty.
11 12 13 |
# File 'lib/opener/property_tagger/processor.rb', line 11
#
# Reader for the `pretty` attribute.
#
# @return [Object] the current value of `@pretty`
def pretty
  @pretty
end
#timestamp ⇒ Object
Returns the value of attribute timestamp.
11 12 13 |
# File 'lib/opener/property_tagger/processor.rb', line 11
#
# Reader for the `timestamp` attribute.
#
# @return [Object] the current value of `@timestamp`
def timestamp
  @timestamp
end
Instance Method Details
#add_features_layer ⇒ Object
Remove the features layer from the KAF file if it exists and add a new one.
127 128 129 130 131 132 133 |
# File 'lib/opener/property_tagger/processor.rb', line 127
#
# Replaces the features layer: removes the node found at `KAF/features`
# (if there is one) and creates a new `features` node under `KAF`.
#
# @return [Object] whatever `new_node` returns for the new layer
def add_features_layer
  # at_xpath yields nil when the layer is absent, hence the safe navigation.
  document.at_xpath('KAF/features')&.remove

  new_node('features', 'KAF')
end
#add_linguistic_processor ⇒ Object
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/opener/property_tagger/processor.rb', line 164
#
# Records this tagger in the KAF header: creates a `linguisticProcessors`
# node for the `features` layer under `KAF/kafHeader`, with an `lp` child
# carrying the tagger's name, version and timestamp.
#
# @return [String] the value written to the `timestamp` attribute
def add_linguistic_processor
  description = 'VUA property tagger'
  last_edited = '16jan2015'
  version     = '2.0'

  node = new_node('linguisticProcessors', 'KAF/kafHeader')
  node['layer'] = 'features'

  lp_node = new_node('lp', node)
  lp_node['version'] = "#{last_edited}-#{version}"
  lp_node['name']    = description

  # The `timestamp` attribute decides between the current time and the '*'
  # placeholder; testing the format string itself would always be truthy.
  lp_node['timestamp'] =
    if timestamp
      Time.now.strftime('%Y-%m-%dT%H:%M:%S%Z')
    else
      '*'
    end
end
#add_properties_layer ⇒ Object
Add the properties layer as a child to the features layer.
137 138 139 |
# File 'lib/opener/property_tagger/processor.rb', line 137
#
# Creates the `properties` node under `KAF/features`.
#
# @return [Object] whatever `new_node` returns for the new layer
def add_properties_layer
  new_node('properties', 'KAF/features')
end
#add_property(lemma, values, index) ⇒ Object
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
# File 'lib/opener/property_tagger/processor.rb', line 141
#
# Adds one `property` node under `KAF/features/properties` together with a
# `references` child that holds, per matched value, an XML comment with the
# n-gram and a `span` listing the matched term ids as `target` nodes.
#
# @param lemma [#to_s] written to the `lemma` attribute.
# @param values [Enumerable] matches; each must respond to `ngram`,
#   `term_ids` and `lexicon` (the lexicon must respond to `id`).
# @param index [Integer] used to build the `pid` attribute ("p<index>").
#
# @return [Enumerable] the `values` argument
def add_property lemma, values, index
  property_node = new_node('property', 'KAF/features/properties')
  property_node['lemma'] = lemma.to_s
  property_node['pid']   = "p#{index}"

  references_node = new_node('references', property_node)

  values.each do |v|
    comment = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
    references_node.add_child comment

    span_node  = new_node 'span', references_node
    lexicon_id = v.lexicon.id

    v.term_ids.each do |id|
      target_node = new_node 'target', span_node
      target_node['id'] = id.to_s
      # The attribute is only written when the lexicon actually has an id.
      target_node['lexicon-id'] = lexicon_id if lexicon_id
    end
  end
end
#extract_aspects ⇒ Hash
Check which terms belong to an aspect (property). Text has priority over lemmas, overriding if there is a conflict.
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/opener/property_tagger/processor.rb', line 88
#
# Matches n-grams of the document's terms against the loaded lexicons and
# groups the hits per aspect (property).
#
# Two passes are made over the terms: first on the :lemma value of every
# term, then on its :text value. Each window of consecutive terms is joined
# with single spaces, downcased and looked up in @lexicons as a Symbol.
#
# NOTE(review): the range 0..MAX_NGRAM is inclusive, so windows span
# 1 to MAX_NGRAM + 1 terms - confirm this is the intended maximum.
# NOTE(review): a hit is skipped when the aspect already holds an entry with
# the same term ids, so a lemma hit is kept over a later text hit for the
# same terms - confirm against the documented "text has priority" rule.
#
# @return [Hash] aspect name (Symbol) => Array of Hashie::Mash entries with
#   `term_ids`, `ngram` and `lexicon`; keys are in sorted order.
def extract_aspects
  all_term_ids = terms.keys
  lemmas       = terms.values
  uniq_aspects = Hash.new { |hash, aspect| hash[aspect] = [] }

  [:lemma, :text].each do |k|
    lemmas.each_index do |current_token|
      (0..MAX_NGRAM).each do |tam_ngram|
        last_token = current_token + tam_ngram
        # A window reaching past the last term would only repeat a shorter
        # window that was already looked up, so stop growing it here.
        break if last_token >= lemmas.size

        window = current_token..last_token
        ngram  = lemmas[window].map { |a| a[k] }.join(' ').downcase

        @lexicons[ngram.to_sym]&.each do |l|
          properties = l.aspects.present? ? l.aspects : [l.aspect]

          properties.each do |p|
            next if p.blank?

            term_ids = all_term_ids[window]
            entries  = uniq_aspects[p.to_sym]
            # One entry per aspect and term span; the first hit wins.
            next if entries.any? { |v| v.term_ids == term_ids }

            entries << Hashie::Mash.new(
              term_ids: term_ids,
              ngram:    ngram,
              lexicon:  l,
            )
          end
        end
      end
    end
  end

  uniq_aspects.sort.to_h
end
#language ⇒ Object
63 64 65 |
# File 'lib/opener/property_tagger/processor.rb', line 63
#
# Reads the `xml:lang` attribute of the node found at `KAF` and memoizes it
# in `@language`.
#
# @return [Object] whatever `attr('xml:lang')` returned on the first call
def language
  @language ||= document.at_xpath('KAF').attr('xml:lang')
end
#pretty_print(document) ⇒ String
Format the output document properly.
TODO: this should be handled by Oga in a nice way.
193 194 195 196 197 198 199 200 201 202 |
# File 'lib/opener/property_tagger/processor.rb', line 193
#
# Re-serializes a document with REXML's pretty formatter (compact mode,
# attribute values in double quotes).
#
# @param document [#to_xml] the document to format.
#
# @return [String] the formatted XML with surrounding whitespace stripped
def pretty_print(document)
  doc = REXML::Document.new document.to_xml
  doc.context[:attribute_quote] = :quote

  formatter = REXML::Formatters::Pretty.new
  formatter.compact = true

  # Unary plus yields a mutable buffer even when string literals are frozen.
  out = +''
  formatter.write(doc, out)
  out.strip
end
#process ⇒ String
Processes the input and returns the new KAF output.
48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/opener/property_tagger/processor.rb', line 48
#
# Runs the tagger: rebuilds the features and properties layers, adds one
# property per extracted aspect, records the linguistic processor and
# serializes the document.
#
# @return [String] the result of `pretty_print(document)` when `pretty` is
#   set, otherwise `document.to_xml`
def process
  add_features_layer
  add_properties_layer

  # Property indexes are 1-based.
  extract_aspects.each.with_index(1) do |(lemma, values), index|
    add_property lemma, values, index
  end

  add_linguistic_processor

  pretty ? pretty_print(document) : document.to_xml
end
#terms ⇒ Object
67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/opener/property_tagger/processor.rb', line 67
#
# Collects the nodes matching `KAF/terms/term` into a Hash and memoizes it
# in `@terms`.
#
# The cache is only assigned once every term has been read, so an error
# halfway through does not leave a partial result behind.
#
# @return [Hash] term id (the `tid` attribute as a Symbol) =>
#   `{ lemma: <lemma attribute>, text: <text attribute> }`, in document order
def terms
  @terms ||= document.xpath('KAF/terms/term').each_with_object({}) do |term, found|
    found[term.attr('tid').to_sym] = {
      lemma: term.attr('lemma'),
      text:  term.attr('text')
    }
  end
end