Class: Opener::Stanza::Processor

Inherits:
Object
  • Object
show all
Defined in:
lib/opener/stanza/processor.rb

Constant Summary collapse

DESC =
'Tokenizer / POS by Stanza'
VERSION =
'1.0'
BASE_URL =
ENV['STANZA_SERVER']
LANGUAGES_CACHE =
Opener::ChainedDaemon::LanguagesCache.new
RTL_LANGUAGES =
%w[
  ar ara arc ae ave egy he heb nqo pal phn sam
  syc syr fa per fas ku kur ur urd
]
POS =
{
  'DET'   => 'D',
  'ADJ'   => 'G',
  'NOUN'  => 'N',
  'VERB'  => 'V',
  'AUX'   => 'V',
  'ADV'   => 'A',
  'CCONJ' => 'J',
  'PUNCT' => '.',
  'ADP'   => 'P',
  'PRON'  => 'Q',
  'PROPN' => 'R',
  'PART'  => 'P',
  'NUM'   => 'I',
  'X'     => 'O',
  'SYM'   => 'I',
  'SCONJ' => 'P',
  'INTJ'  => 'I',
}
POS_OPEN =
%w[N R G V A O]

Instance Method Summary collapse

Instance Method Details

#run(input, params) ⇒ Object

Raises:

  • (Core::UnsupportedLanguageError)


38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/opener/stanza/processor.rb', line 38

# Tokenizes and POS-tags the raw text of a KAF document via a Stanza HTTP
# server and returns the enriched document serialized as XML.
#
# @param input  [String] a KAF/NAF XML document
# @param params [#cache_keys] request parameters; `cache_keys.environment`
#   and `cache_keys.merged` drive the language-support checks
# @return [String] the KAF document as XML, with word forms and terms added
# @raise [Core::UnsupportedLanguageError] when the document language is not
#   enabled for the requested environment (or the server answers 406)
# @raise [RuntimeError] when the Stanza server is unconfigured, replies with
#   an error status, offsets are missing, or an unmapped UPOS tag appears
def run input, params
  raise 'missing Stanza server' if ENV['STANZA_SERVER'].blank?

  kaf  = KAF::Document.from_xml input
  lang = LANGUAGES_CACHE.get[kaf.language]
  env  = params.cache_keys.environment
  # A language is usable when enabled for the requested environment, or --
  # for merged cache keys -- when enabled for production.
  unless lang&.environments&.include?(env) || (params.cache_keys.merged && lang&.environments&.include?('production'))
    raise Core::UnsupportedLanguageError.new kaf.language
  end
  if env == 'production' && !lang.supported_by_opener
    raise Core::UnsupportedLanguageError.new kaf.language
  end

  input    = kaf.raw
  # Insert a space after commas glued to the next character so the
  # tokenizer splits them correctly.
  input    = input.gsub(/\,[^\ ]/, ', ')
  response = ChainedDaemon.http.post BASE_URL, {lang: kaf.language, input: input}.to_query
  raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
  raise response.body if response.status >= 400

  sentences = JSON.parse response.body
  sentences.each{ |s| s.map!{ |t| Hashie::Mash.new t } }

  w_index = 0

  # Multi-word tokens (MWT) carry an Array id spanning several word ids and
  # hold the char offsets; remember those offsets for each covered word id
  # so the individual words can fall back to them below.
  miscs = {}
  sentences.each.with_index do |s, i|
    miscs[i] = {}
    s.each do |word|
      if word.id.is_a?(Array)
        (word.id.min..word.id.max).each { |id| miscs[i][id] = word.slice(:start_char, :end_char) }
      end
    end
  end

  # `each` (not `map`) -- the reversal is in-place and the result is unused.
  sentences.each{ |s| s.reverse! } if RTL_LANGUAGES.include? kaf.language
  sentences.each.with_index do |s, s_index|
    s.each do |word|
      w_index += 1
      # MWT container tokens were captured above; skip them here.
      next if word.id.is_a? Array

      misc = word.slice(:start_char, :end_char).presence || miscs[s_index][word.id]

      # Safe navigation: `misc` may be nil entirely; report with full
      # context via Rollbar and abort instead of a bare NoMethodError.
      Rollbar.scoped({ input: input, params: params, sentences: sentences, word: word }) do
        raise 'Missing misc'
      end unless misc&.start_char && misc.end_char

      offset = misc.start_char
      length = misc.end_char - offset

      u_pos  = word.upos
      pos    = POS[u_pos]
      raise "Didn't find a map for #{u_pos}" if pos.nil?
      type   = POS_OPEN.include?(pos) ? 'open' : 'close'

      # Use a dedicated local -- reassigning `params` here would clobber the
      # method argument that Rollbar.scoped above reports on later tokens.
      token = Hashie::Mash.new(
        wid:        w_index,
        sid:        s_index + 1,
        tid:        w_index,
        para:       1,
        offset:     offset,
        length:     length,
        text:       word.text,
        lemma:      word.lemma,
        morphofeat: u_pos,
        pos:        pos,
        type:       type,
        head:       word.head,
        xpos:       word.xpos.to_s
      )

      kaf.add_word_form token
      kaf.add_term token
    end
  end

  kaf.add_linguistic_processor DESC, VERSION, 'text', timestamp: true

  kaf.to_xml
end