Class: Genebrand::PosParser

Inherits:
Object
  • Object
show all
Defined in:
lib/genebrand/posparser.rb

Instance Method Summary collapse

Constructor Details

#initialize ⇒ PosParser

Fills the parts-of-speech table.



7
8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/genebrand/posparser.rb', line 7

# Builds the part-of-speech lookup tables.
#
# @parsed maps a readable category name to its word list, and @table maps the
# one-letter tag to the very same Array object, so a word pushed through one
# hash is visible through the other.
def initialize
  @parsed = {}
  @table = {}

  {
    'N' => 'noun',          # noun
    'P' => 'plur',          # plural
    'V' => 'verb_part',     # verb: participle
    't' => 'verb_trans',    # verb: transitive
    'i' => 'verb_intrans',  # verb: intransitive
    'A' => 'adj'            # adjective
  }.each do |tag, category|
    words = []
    @parsed[category] = words
    @table[tag] = words
  end
end

Instance Method Details

#getparts(data) ⇒ Object



44
45
46
47
48
# File 'lib/genebrand/posparser.rb', line 44

# Files one dictionary entry under every part of speech it is tagged with.
#
# @param data [Array<String>] one split line: data[0] is the word, data[1]
#   is the string of one-character part-of-speech tags
# @return [Array<String>] the individual tag characters that were examined
def getparts(data)
  # A line without a tab (for example a blank line) has no tag field; treat
  # it as "no tags" instead of raising NoMethodError on nil.
  data[1].to_s.chars.each do |partofsp|
    # Characters with no entry in @table (unknown tags, the trailing newline)
    # are skipped.
    @table[partofsp].push(data[0].downcase) if @table.key?(partofsp)
  end
end

#is_numeric?(obj) ⇒ Boolean

Returns:

  • (Boolean)


120
121
122
# File 'lib/genebrand/posparser.rb', line 120

# Tells whether +obj+, converted with to_s, reads as a decimal number: an
# optional sign, one or more digits, and an optional fractional part. The
# \Z anchor also accepts a single trailing newline.
#
# @param obj [Object] value to test
# @return [Boolean]
def is_numeric?(obj)
  text = obj.to_s
  return false if text.match(/\A[+-]?\d+?(\.\d+)?\Z/).nil?

  true
end

#parse(filename) ⇒ Hash

Parses a file of tab-separated lines (word\tpartofspeech).

Parameters:

  • filename (String)

    that should be parsed

Returns:

  • (Hash)

    of partofspeech => words



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/genebrand/posparser.rb', line 26

# Parses a tab-separated dictionary file and collects its words by part of
# speech.
#
# Calls init first (defined outside this listing; presumably it resets
# @parsed and @table -- confirm), then passes every split line to getparts.
#
# @param filename [String] path of the file to parse
# @return [Hash] @parsed after every line has been processed
# @raise [RuntimeError] if +filename+ does not exist
def parse(filename)
  init

  raise "File not found: #{filename}" unless File.exist?(filename)

  puts 'Seeding'
  # Block form so the file handle is closed even if getparts raises.
  File.open(filename, 'r') do |file|
    file.each_line { |line| getparts(line.split("\t")) }
  end

  @parsed
end

#parse_top(filename, top) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/genebrand/posparser.rb', line 50

# Parses a tab-separated dictionary file, keeping only the words that also
# appear in a "top" word list.
#
# Calls init first (defined outside this listing; presumably it resets
# @parsed and @table -- confirm). Prints progress to stdout: the number of
# lines in the top list and a running count every 10,000 dictionary lines.
#
# @param filename [String] path of the dictionary file
# @param top [String] path of the word list, one word per line
# @return [Hash] @parsed after every line has been processed
# @raise [RuntimeError] if +filename+ does not exist
def parse_top(filename, top)
  init

  raise "File not found: #{filename}" unless File.exist?(filename)

  puts 'Load top'
  # Block form so the handle is closed once the list is read.
  toparr = File.open(top, 'r') do |file|
    file.each_line.map { |line| line.strip.downcase }
  end
  puts toparr.count

  # Hash membership keeps the per-line check constant-time instead of
  # scanning the whole top list for every dictionary line.
  wanted = {}
  toparr.each { |word| wanted[word] = true }

  puts 'Seeding'
  processed = 0
  File.open(filename, 'r') do |file|
    file.each_line do |line|
      data = line.split("\t")

      # NOTE(review): the top list is downcased but data[0] is compared as
      # written, so capitalised dictionary words never match -- confirm that
      # this is intended.
      getparts(data) if wanted.key?(data[0])
      processed += 1
      puts processed if (processed % 10_000).zero?
    end
  end

  @parsed
end

#parseandsave(filename, to) ⇒ Object



100
101
102
103
# File 'lib/genebrand/posparser.rb', line 100

# Parses +filename+ and writes the result to +to+ as JSON.
#
# @param filename [String] path of the dictionary file handed to parse
# @param to [String] path of the JSON file to write
# @return [Integer] number of bytes written
def parseandsave(filename, to)
  FileUtils.mkdir_p 'lib/data'
  # Parse before opening the destination: opening with 'w+' truncates it, so
  # a failing parse would otherwise wipe an existing output file.
  json = parse(filename).to_json
  File.open(to, 'w+') { |f| f.write(json) }
end

#parseandsave_preseed(filename, to) ⇒ Object



105
106
107
108
109
110
111
112
113
# File 'lib/genebrand/posparser.rb', line 105

# Runs preseed on +filename+ and writes the lines it returns to +to+,
# unchanged and in order.
#
# @param filename [String] path of the dictionary file handed to preseed
# @param to [String] path of the file to write
# @return [Array<String>] the lines that were written
def parseandsave_preseed(filename, to)
  FileUtils.mkdir_p 'lib/data'
  # Collect the lines before opening the destination: opening with 'w+'
  # truncates it, so a failing preseed would otherwise wipe an existing file.
  lines = preseed(filename)
  File.open(to, 'w+') do |f|
    lines.each { |line| f.write(line) }
  end
end

#parseandsave_top(filename, top, to) ⇒ Object



115
116
117
118
# File 'lib/genebrand/posparser.rb', line 115

# Runs parse_top on +filename+ and +top+ and writes the result to +to+ as
# JSON.
#
# @param filename [String] path of the dictionary file handed to parse_top
# @param top [String] path of the top-word list handed to parse_top
# @param to [String] path of the JSON file to write
# @return [Integer] number of bytes written
def parseandsave_top(filename, top, to)
  FileUtils.mkdir_p 'lib/data'
  # Parse before opening the destination: opening with 'w+' truncates it, so
  # a failing parse would otherwise wipe an existing output file.
  json = parse_top(filename, top).to_json
  File.open(to, 'w+') { |f| f.write(json) }
end

#preseed(filename) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/genebrand/posparser.rb', line 78

# Selects the lines of a tab-separated dictionary file whose first field is
# a usable word: 2 to 10 ASCII letters or digits, and not a number according
# to is_numeric?.
#
# Calls init first (defined outside this listing; presumably it resets
# @parsed and @table -- confirm).
#
# @param filename [String] path of the dictionary file
# @return [Array<String>] the matching lines, unmodified and in file order
# @raise [RuntimeError] if +filename+ does not exist
def preseed(filename)
  init

  raise "File not found: #{filename}" unless File.exist?(filename)

  puts 'Preseed'
  # Block form so the file handle is closed once the lines are collected.
  File.open(filename, 'r') do |file|
    file.each_line.select do |line|
      word = line.split("\t")[0]
      !is_numeric?(word) && /\A[a-zA-Z0-9]{2,10}\z/.match(word)
    end
  end
end