Module: TextDataTools::Column

Defined in:
lib/text-data-tools.rb

Overview

Tools for extracting data from text files where the data appears in columns with or without headers for each column.

Defined Under Namespace

Classes: DataFile, NotFoundError

Class Method Summary collapse

Class Method Details

.column_index_from_headers(line, column_header, header_match) ⇒ Object

Raises:

  • (ArgumentError)


144
145
146
147
148
149
150
151
152
# File 'lib/text-data-tools.rb', line 144

# Find the zero-based position of the single header in +line+ that matches
# +column_header+.
#
# line::          the header line of the file (a String).
# column_header:: a Regexp, or a String which is matched literally
#                 (as a substring) against each header.
# header_match::  a Regexp passed to String#scan to split +line+ into headers.
#
# Returns the Integer index of the matching header.
# Raises ArgumentError if no header, or more than one header, matches.
def self.column_index_from_headers(line, column_header, header_match)
  headers = line.scan(header_match)
  # Build the matcher once rather than once per header.
  matcher = column_header.kind_of?(Regexp) ? column_header : Regexp.new(Regexp.escape(column_header))
  matching = headers.each_index.select{|i| headers[i] =~ matcher}
  raise ArgumentError.new("column_header: #{column_header.inspect} does not match any columns in #{headers.inspect}") if matching.empty?
  raise ArgumentError.new("column_header: #{column_header.inspect} matches more than 1 column in #{headers.inspect}") if matching.size > 1
  matching.first
end

.get_1d_array(filename, has_header_line, column_header, match = /\S+/, header_match = /\S+/) ⇒ Object

Return a one-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column, or an integer
giving the zero-based column number.

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/text-data-tools.rb', line 28

# Read one column of a text file and return its entries as an array of strings.
#
# filename::        path of the file to read.
# has_header_line:: when true, the first line is consumed as a header line.
# column_header::   an Integer (zero-based column number), or a String/Regexp
#                   that is looked up in the header line.
# match::           Regexp used with String#scan to split each data line.
# header_match::    Regexp used to split the header line.
#
# A line with no item at the requested column contributes nil.
# Raises ArgumentError for an unsupported column_header type, and a
# RuntimeError when a header search is requested without a header line.
def self.get_1d_array(filename, has_header_line, column_header, match=/\S+/, header_match=/\S+/)
  unless [String, Regexp, Integer].any?{|klass| column_header.kind_of?(klass)}
    raise ArgumentError.new("column_header header should be a string, regex or integer")
  end
  File.open(filename) do |io|
    header_row = has_header_line ? io.gets : nil
    if column_header.kind_of?(String) || column_header.kind_of?(Regexp)
      raise ("Header search given but has_header_line = false") unless has_header_line
      column_header = column_index_from_headers(header_row, column_header, header_match)
    end
    # The value of the block (the collected column) is the method's result.
    io.each_line.map{|row| row.scan(match)[column_header]}
  end
end

.get_1d_array_float(*args) ⇒ Object

Calls get_1d_array and converts all data elements to floats



48
49
50
# File 'lib/text-data-tools.rb', line 48

# Forward all arguments to get_1d_array and convert each element with to_f.
def self.get_1d_array_float(*args)
  strings = get_1d_array(*args)
  strings.map(&:to_f)
end

.get_1d_array_integer(*args) ⇒ Object



51
52
53
# File 'lib/text-data-tools.rb', line 51

# Forward all arguments to get_1d_array and convert each element with to_i.
def self.get_1d_array_integer(*args)
  strings = get_1d_array(*args)
  strings.map(&:to_i)
end

.get_2d_array(filename, has_header_line, column_header, index_header = nil, match = /\S+/, header_match = /\S+/) ⇒ Object

Return a two-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column, or an integer
giving the zero-based column number.

It is assumed that two-dimensional array is in one column. 
If index_header is nil, data is assumed to be separated by blank lines.
E.g.
    1.2
    4.2
    7.2

    8.2
    4.2
    2.2
If index_header is an integer or string or regexp, it selects a column
in the manner of column_header, and the data is divided by values of this
column.
E.g. 
    1  5.5
    1  3.2
    1  2.6
    2  3.2
    2  2.2
    2  6.3

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/text-data-tools.rb', line 86

# Read one column of a text file and return it as an array of groups, each
# group being an array of strings.
#
# filename::        path of the file to read.
# has_header_line:: when true, the first line is consumed as a header line.
# column_header::   an Integer (zero-based column number), or a String/Regexp
#                   that is looked up in the header line; selects the data column.
# index_header::    nil, or a column selector of the same kinds as column_header.
#                   * nil: groups are separated by blank lines. Blank lines
#                     before the first data line are ignored; every later
#                     blank line starts a new (possibly empty) group.
#                   * otherwise: blank lines are skipped and a new group starts
#                     whenever the value in the index column changes.
# match::           Regexp used with String#scan to split each data line.
# header_match::    Regexp used to split the header line.
#
# Raises ArgumentError for an unsupported column_header or index_header type,
# and a RuntimeError when a header search is requested without a header line.
def self.get_2d_array(filename, has_header_line, column_header, index_header=nil, match=/\S+/, header_match=/\S+/)
  raise ArgumentError.new("column_header header should be a string, regex or integer") unless [String, Regexp, Integer].any?{|cls| column_header.kind_of? cls}
  # Validate index_header itself; this check previously re-tested column_header.
  raise ArgumentError.new("index_header should be a string, regex, integer or nil") unless [String, Regexp, Integer, NilClass].any?{|cls| index_header.kind_of? cls}
  array = []
  File.open(filename) do |file|
    headers = file.gets if has_header_line
    if [String, Regexp].any?{|cls| column_header.kind_of? cls}
      raise ("Header search given but has_header_line = false") if not has_header_line
      column_header = column_index_from_headers(headers, column_header, header_match)
    end
    if [String, Regexp].any?{|cls| index_header.kind_of? cls}
      raise ("Header search given but has_header_line = false") if not has_header_line
      index_header = column_index_from_headers(headers, index_header, header_match)
    end
    index_value = nil
    while line = file.gets
      if line =~ /^\s*$/
        # A blank line only separates groups in blank-line mode, and only
        # once the first group exists.
        array.push [] if index_header.nil? && !array.empty?
        next
      end
      values = line.scan(match)
      if array.empty?
        # First data line: open the first group.
        array.push []
        index_value = values[index_header] unless index_header.nil?
      elsif !index_header.nil? && index_value != values[index_header]
        # The index column changed: open the next group.
        array.push []
        index_value = values[index_header]
      end
      # New data always belongs to the most recently opened group.
      array.last.push values[column_header]
    end
  end
  array
end

.get_2d_array_float(*args) ⇒ Object

Calls get_2d_array and converts all data elements to floats



134
135
136
# File 'lib/text-data-tools.rb', line 134

# Forward all arguments to get_2d_array and convert every element of every
# group with to_f.
def self.get_2d_array_float(*args)
  groups = get_2d_array(*args)
  groups.map{|group| group.map(&:to_f)}
end

.get_2d_array_integer(*args) ⇒ Object



137
138
139
# File 'lib/text-data-tools.rb', line 137

# Forward all arguments to get_2d_array and convert every element of every
# group with to_i.
def self.get_2d_array_integer(*args)
  groups = get_2d_array(*args)
  groups.map{|group| group.map(&:to_i)}
end