Module: TextDataTools::Column

Defined in:
lib/text-data-tools.rb

Overview

Tools for extracting data from text files where the data appears in columns with or without headers for each column.

Defined Under Namespace

Classes: DataFile, NotFoundError

Class Method Summary collapse

Class Method Details

.column_index_from_headers(line, column_header, header_match) ⇒ Object

Raises:

  • (ArgumentError)


150
151
152
153
154
155
156
157
158
# File 'lib/text-data-tools.rb', line 150

# Return the zero-based position of the one header in +line+ that matches
# +column_header+.
#
# +line+ is split into header strings by scanning it with +header_match+.
# A Regexp +column_header+ is used as given; any other value is escaped and
# matched literally (as a substring of the header).
#
# Raises ArgumentError if no header matches, or if more than one does.
def self.column_index_from_headers(line, column_header, header_match)
	headers = line.scan(header_match)
	pattern = nil
	matching = headers.each_index.select do |i|
		# Built on first use only, so an empty header list never touches column_header
		pattern ||= column_header.kind_of?(Regexp) ? column_header : Regexp.new(Regexp.escape(column_header))
		headers[i] =~ pattern
	end
	if matching.empty?
		raise ArgumentError.new("column_header: #{column_header.inspect} does not match any columns in #{headers.inspect}")
	end
	if matching.size > 1
		raise ArgumentError.new("column_header: #{column_header.inspect} matches more than 1 column in #{headers.inspect}")
	end
	matching.first
end

.get_1d_array(filename, has_header_line, column_header, match = /\S+/, header_match = /\S+/) ⇒ Object

Return a one-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column, or an integer
giving the zero-based column number.

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/text-data-tools.rb', line 34

# Read a single column of the text file +filename+ and return it as a flat
# array of strings, one entry per line after the optional header line.
#
# +column_header+ selects the column: an Integer is used directly as a
# zero-based index into the items of each line; a String or Regexp is
# resolved to an index by matching it against the header line (which
# requires +has_header_line+ to be true).
#
# +match+ is the regexp used to split each data line into items and
# +header_match+ the one used to split the header line. A line with no item
# at the selected index contributes nil.
#
# Raises ArgumentError if +column_header+ is not a String, Regexp or Integer,
# and RuntimeError if a header search is requested without a header line.
def self.get_1d_array(filename, has_header_line, column_header, match=/\S+/, header_match=/\S+/)
	unless [String, Regexp, Integer].any?{|cls| column_header.kind_of?(cls)}
		raise ArgumentError.new("column_header header should be a string, regex or integer")
	end
	File.open(filename) do |file|
		# The header line is consumed first so that it is never returned as data
		headers = has_header_line ? file.gets : nil
		if column_header.kind_of?(String) || column_header.kind_of?(Regexp)
			raise "Header search given but has_header_line = false" unless has_header_line
			column_header = column_index_from_headers(headers, column_header, header_match)
		end
		file.each_line.map{|line| line.scan(match)[column_header]}
	end
end

.get_1d_array_float(*args) ⇒ Object

Calls get_1d_array and converts all data elements to floats



54
55
56
# File 'lib/text-data-tools.rb', line 54

# Forward all arguments unchanged to get_1d_array and convert every element
# of the resulting array with to_f.
def self.get_1d_array_float(*args)
	strings = get_1d_array(*args)
	strings.map(&:to_f)
end

.get_1d_array_integer(*args) ⇒ Object



57
58
59
# File 'lib/text-data-tools.rb', line 57

# Forward all arguments unchanged to get_1d_array and convert every element
# of the resulting array with to_i.
def self.get_1d_array_integer(*args)
	strings = get_1d_array(*args)
	strings.map(&:to_i)
end

.get_2d_array(filename, has_header_line, column_header, index_header = nil, match = /\S+/, header_match = /\S+/) ⇒ Object

Return a two-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column, or an integer
giving the zero-based column number.

It is assumed that the two-dimensional data is stored in a single column.
If index_header is nil, data is assumed to be separated by blank lines.
E.g.
		1.2
		4.2
		7.2

		8.2
		4.2
		2.2
If index_header is an integer or string or regexp, it selects a column
in the manner of column_header, and the data is divided by values of this
column.
E.g. 
		1  5.5
		1  3.2
		1  2.6
		2  3.2
		2  2.2
		2  6.3

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/text-data-tools.rb', line 92

# Read a single column of the text file +filename+ and return it as an array
# of arrays of strings, splitting the column into consecutive blocks.
#
# +column_header+ selects the data column: an Integer is a zero-based index
# into the items of each line; a String or Regexp is resolved to an index by
# matching it against the header line (which requires +has_header_line+).
#
# +index_header+ decides where one block ends and the next begins:
# * nil - blocks are separated by blank lines. Blank lines before the first
#   data line are ignored; every later blank line opens a new block, so
#   consecutive or trailing blank lines yield empty blocks.
# * Integer, String or Regexp - selects a column in the same way as
#   +column_header+; a new block starts whenever the value in that column
#   differs from the value on the previous data line. Blank lines are skipped.
#
# +match+ is the regexp used to split each data line into items and
# +header_match+ the one used to split the header line. A line with no item
# at the selected index contributes nil.
#
# Raises ArgumentError if +column_header+ or +index_header+ has an
# unsupported type, and RuntimeError if a header search is requested while
# +has_header_line+ is false.
def self.get_2d_array(filename, has_header_line, column_header, index_header=nil, match=/\S+/, header_match=/\S+/)
	raise ArgumentError.new("column_header header should be a string, regex or integer") unless [String, Regexp, Integer].find{|cls| column_header.kind_of? cls}
	# This check must look at index_header itself; testing column_header here
	# would let an unsupported index_header through.
	raise ArgumentError.new("index_header should be a string, regex, integer or nil") unless [String, Regexp, Integer, NilClass].find{|cls| index_header.kind_of? cls}
	array = []
	File.open(filename) do |file|
		headers = file.gets if has_header_line
		if [String, Regexp].find{|cls| column_header.kind_of? cls}
			raise ("Header search given but has_header_line = false") if not has_header_line
			column_header = column_index_from_headers(headers, column_header, header_match)
		end
		if [String, Regexp].find{|cls| index_header.kind_of? cls}
			raise ("Header search given but has_header_line = false") if not has_header_line
			index_header = column_index_from_headers(headers, index_header, header_match)
		end
		# Index-column value of the block currently being filled
		index_value = false
		while line = file.gets
			blank = line =~ /^\s*$/
			if index_header.nil?
				if blank
					# Leading blank lines are ignored; later ones open a new block
					array.push [] unless array.empty?
					next
				end
				# Open the first block on the first data line (a comparison, not
				# an assignment to array.size, which Array does not support)
				array.push [] if array.empty?
				values = line.scan(match)
			else
				next if blank
				values = line.scan(match)
				# Open a block on the first data line and on every index change
				if array.empty? || index_value != values[index_header]
					array.push []
					index_value = values[index_header]
				end
			end
			# The block being filled is always the most recently opened one
			array.last.push values[column_header]
		end
	end
	array
end

.get_2d_array_float(*args) ⇒ Object

Calls get_2d_array and converts all data elements to floats



140
141
142
# File 'lib/text-data-tools.rb', line 140

# Forward all arguments unchanged to get_2d_array and convert every element
# of every inner array with to_f.
def self.get_2d_array_float(*args)
	blocks = get_2d_array(*args)
	blocks.map{|block| block.map(&:to_f)}
end

.get_2d_array_integer(*args) ⇒ Object



143
144
145
# File 'lib/text-data-tools.rb', line 143

# Forward all arguments unchanged to get_2d_array and convert every element
# of every inner array with to_i.
def self.get_2d_array_integer(*args)
	blocks = get_2d_array(*args)
	blocks.map{|block| block.map(&:to_i)}
end