Class: Opener::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/opener/tokenizer.rb,
lib/opener/tokenizer/cli.rb,
lib/opener/tokenizer/server.rb,
lib/opener/tokenizer/version.rb

Overview

Primary tokenizer class that delegates the work to the various language specific tokenizers.

Defined Under Namespace

Classes: CLI, Server

Constant Summary collapse

DEFAULT_LANGUAGE =

The default language to use when no custom one is specified.

Returns:

  • (String)
'en'.freeze
DEFAULT_OPTIONS =

Hash containing the default options to use.

Returns:

  • (Hash)
{
  :args     => [],
  :kaf      => true,
  :language => DEFAULT_LANGUAGE
}.freeze
VERSION =
'2.2.0'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.

Parameters:

  • options (Hash) (defaults to: {})

Options Hash (options):

  • :args (Array)

    Collection of arbitrary arguments to pass to the individual tokenizer commands.

  • :language (String)

    The language to use for the tokenization process.

  • :kaf (TrueClass|FalseClass)

    When set to `true` the input is assumed to be KAF.



52
53
54
# File 'lib/opener/tokenizer.rb', line 52

##
# Builds the tokenizer's options by layering the given options on top of
# DEFAULT_OPTIONS.
#
# @param [Hash] options
#
# @option options [Array] :args Collection of arbitrary arguments to pass
#  to the individual tokenizer commands.
#
# @option options [String] :language The language to use for the
#  tokenization process.
#
# @option options [TrueClass|FalseClass] :kaf When set to `true` the input
#  is assumed to be KAF.
#
def initialize(options = {})
  @options = DEFAULT_OPTIONS.merge(options)

  # Hash#merge is shallow: without a caller supplied :args every instance
  # would hold the very same Array object as DEFAULT_OPTIONS, so mutating one
  # instance's arguments would silently change the defaults for all others.
  # Only the shared default is copied; an Array passed in by the caller is
  # kept as-is.
  if @options[:args].equal?(DEFAULT_OPTIONS[:args])
    @options[:args] = DEFAULT_OPTIONS[:args].dup
  end
end

Instance Attribute Details

#options ⇒ Hash (readonly)

Returns:

  • (Hash)


19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/opener/tokenizer.rb', line 19

class Tokenizer
  attr_reader :options

  ##
  # The default language to use when no custom one is specified.
  #
  # @return [String]
  #
  DEFAULT_LANGUAGE = 'en'.freeze

  ##
  # Hash containing the default options to use.
  #
  # Only the Hash itself is frozen, the `:args` Array is not. The constructor
  # therefore hands every instance its own copy of that Array.
  #
  # @return [Hash]
  #
  DEFAULT_OPTIONS = {
    :args     => [],
    :kaf      => true,
    :language => DEFAULT_LANGUAGE
  }.freeze

  ##
  # @param [Hash] options
  #
  # @option options [Array] :args Collection of arbitrary arguments to pass
  #  to the individual tokenizer commands.
  #
  # @option options [String] :language The language to use for the
  #  tokenization process.
  #
  # @option options [TrueClass|FalseClass] :kaf When set to `true` the input
  #  is assumed to be KAF.
  #
  def initialize(options = {})
    @options = DEFAULT_OPTIONS.merge(options)

    # Hash#merge is shallow: without a caller supplied :args every instance
    # would share the Array stored in DEFAULT_OPTIONS, so mutating one
    # instance's arguments would leak into the defaults. Only the shared
    # default is copied; an Array passed in by the caller is kept as-is.
    if @options[:args].equal?(DEFAULT_OPTIONS[:args])
      @options[:args] = DEFAULT_OPTIONS[:args].dup
    end
  end

  ##
  # Tokenizes the input and returns the results as a KAF document.
  #
  # The language is read from the KAF document when the `:kaf` option is
  # set, otherwise the `:language` option is used. The text is then written
  # to the standard input of the language specific tokenizer command and the
  # command's standard output is returned.
  #
  # @param [String] input
  # @param [Hash] params Accepted but not used by this method.
  # @return [String]
  #
  # @raise [Core::UnsupportedLanguageError] When no tokenizer is available
  #  for the language.
  # @raise [RuntimeError] When the tokenizer command exits unsuccessfully,
  #  the message is whatever the command wrote to standard error.
  #
  def run(input, params = {})
    if options[:kaf]
      language, input = kaf_elements(input)
    else
      language = options[:language]
    end

    unless valid_language?(language)
      raise Core::UnsupportedLanguageError, language
    end

    kernel = language_constant(language).new(:args => options[:args])

    # The command is split into an argument vector so that it is executed
    # directly instead of being interpreted by a shell.
    stdout, stderr, process = Open3.capture3(
      *kernel.command.split(" "),
      :stdin_data => input
    )

    raise stderr unless process.success?

    stdout
  end

  alias tokenize run

  ##
  # Returns an Array containing the language and input from a KAF document.
  #
  # The language is the `xml:lang` attribute of the `KAF` element, the input
  # is the text of the `raw` element.
  #
  # @param [String] input The KAF document.
  # @return [Array]
  #
  def kaf_elements(input)
    document = Nokogiri::XML(input)
    language = document.at('KAF').attr('xml:lang')
    text     = document.at('raw').text

    [language, text]
  end

  private

  ##
  # Returns the tokenizer class registered for the given language.
  #
  # @param [String] language
  # @return [Class]
  #
  def language_constant(language)
    name = Core::LanguageCode.constant_name(language)

    # `false` limits the lookup to constants of Tokenizers itself, mirroring
    # the check performed by #valid_language?.
    Tokenizers.const_get(name, false)
  end

  ##
  # Returns whether a tokenizer class exists for the given language.
  #
  # @param [String] language
  # @return [TrueClass|FalseClass]
  #
  def valid_language?(language)
    name = Core::LanguageCode.constant_name(language)

    # Without `false` the lookup on a Module also searches Object, which
    # would report any top-level constant with a matching name as a
    # supported language.
    Tokenizers.const_defined?(name, false)
  end
end

Instance Method Details

#kaf_elements(input) ⇒ Array

Returns an Array containing the language and input from a KAF document.

Parameters:

  • input (String)

    The KAF document.

Returns:

  • (Array)


93
94
95
96
97
98
99
# File 'lib/opener/tokenizer.rb', line 93

##
# Extracts the language and the raw text from a KAF document.
#
# The language is taken from the `xml:lang` attribute of the `KAF` element
# and the text from the `raw` element.
#
# @param [String] input The KAF document.
# @return [Array] The language followed by the text.
#
def kaf_elements(input)
  kaf = Nokogiri::XML(input)

  [kaf.at('KAF').attr('xml:lang'), kaf.at('raw').text]
end

#run(input, params = {}) ⇒ String Also known as: tokenize

Tokenizes the input and returns the results as a KAF document.

Parameters:

  • input (String)

Returns:

  • (String)


62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/opener/tokenizer.rb', line 62

##
# Tokenizes the input and returns the results as a KAF document.
#
# When the `:kaf` option is set the language and the text are taken from the
# KAF input, otherwise the `:language` option is used and the input is passed
# on unchanged. The text is written to the standard input of the language
# specific tokenizer command and the command's standard output is returned.
#
# @param [String] input
# @param [Hash] params Accepted but not used by this method.
# @return [String]
#
def run(input, params = {})
  language, input =
    options[:kaf] ? kaf_elements(input) : [options[:language], input]

  raise Core::UnsupportedLanguageError, language unless valid_language?(language)

  tokenizer = language_constant(language).new(:args => options[:args])
  argv      = tokenizer.command.split(" ")

  output, errors, status = Open3.capture3(*argv, :stdin_data => input)

  # A failed command is reported using whatever it wrote to standard error.
  raise errors unless status.success?

  output
end