Class: SimpleXlsxReader::Loader::SheetParser

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Extended by:
Forwardable
Defined in:
lib/simple_xlsx_reader/loader/sheet_parser.rb

Defined Under Namespace

Classes: HyperlinksParser

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_io:, loader:) ⇒ SheetParser

Returns a new instance of SheetParser.



17
18
19
20
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 17

def initialize(file_io:, loader:)
  @file_io = file_io
  @loader = loader
end

Instance Attribute Details

Returns the value of attribute hyperlinks_by_cell.



11
12
13
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 11

def hyperlinks_by_cell
  @hyperlinks_by_cell
end

#load_errorsObject (readonly)

Returns the value of attribute load_errors.



13
14
15
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 13

def load_errors
  @load_errors
end

#xrels_fileObject

Returns the value of attribute xrels_file.



10
11
12
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 10

def xrels_file
  @xrels_file
end

Instance Method Details

#cell_idxObject



240
241
242
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 240

def cell_idx
  column_letter_to_number(@cell_name.scan(/[A-Z]+/).first) - 1
end

#characters(string) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 79

def characters(string)
  if @function
    # the only "function" we support is a hyperlink
    @url = string.slice(/HYPERLINK\("(.*?)"/, 1)
  end

  return unless @capture

  captured =
    begin
      SimpleXlsxReader::Loader.cast(
        string, @type, @style,
        url: @url || hyperlinks_by_cell&.[](@cell_name),
        shared_strings: shared_strings,
        base_date: base_date
      )
    rescue StandardError => e
      column, row = @cell_name.match(/([A-Z]+)([0-9]+)/).captures
      col_idx = column_letter_to_number(column) - 1
      row_idx = row.to_i - 1

      if !SimpleXlsxReader.configuration.catch_cell_load_errors
        error = CellLoadError.new(
          "Row #{row_idx}, Col #{col_idx}: #{e.message}"
        )
        error.set_backtrace(e.backtrace)
        raise error
      else
        @load_errors[[row_idx, col_idx]] = e.message

        string
      end
    end

  # For some reason I can't figure out in a reasonable timeframe,
  # SAX parsing some workbooks captures separate strings in the same cell
  # when we encounter UTF-8, although I can't get workbooks made in my
  # own version of excel to repro it. Our fix is just to keep building
  # the string in this case, although maybe there's a setting in Nokogiri
  # to make it not do this (looked, couldn't find it).
  #
  # Loading the workbook test/chunky_utf8.xlsx repros the issue.
  @captured = @captured ? @captured + (captured || '') : captured
end

#column_lengthObject



234
235
236
237
238
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 234

def column_length
  return 0 unless @dimension

  @column_length ||= column_letter_to_number(last_cell_letter)
end

#column_letter_to_number(column_letter) ⇒ Object

formula fits an exponential factorial function of the form: ‘A’ = 1 ‘B’ = 2 ‘Z’ = 26 ‘AA’ = 26 * 1 + 1 ‘AZ’ = 26 * 1 + 26 ‘BA’ = 26 * 2 + 1 ‘ZA’ = 26 * 26 + 1 ‘ZZ’ = 26 * 26 + 26 ‘AAA’ = 26 * 26 * 1 + 26 * 1 + 1 ‘AAZ’ = 26 * 26 * 1 + 26 * 1 + 26 ‘ABA’ = 26 * 26 * 1 + 26 * 2 + 1 ‘BZA’ = 26 * 26 * 2 + 26 * 26 + 1



265
266
267
268
269
270
271
272
273
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 265

def column_letter_to_number(column_letter)
  pow = column_letter.length - 1
  result = 0
  column_letter.each_byte do |b|
    result += 26**pow * (b - 64)
    pow -= 1
  end
  result
end

#column_number_to_letter(n) ⇒ Object



275
276
277
278
279
280
281
282
283
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 275

def column_number_to_letter(n)
  result = []
  loop do
    result.unshift((n % 26 + 65).chr)
    n = (n / 26) - 1
    break if n < 0
  end
  result.join
end

#end_element_namespace(name, _prefix, _uri) ⇒ Object



124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 124

def end_element_namespace(name, _prefix, _uri)
  case name
  when 'row'
    if @headers == true # ya a little funky
      @headers = @current_row
    elsif @headers.is_a?(Hash)
      test_headers_hash_against_current_row
      # in case there were empty rows before finding the header
      @last_seen_row_idx = @current_row_num - 1
    elsif @headers.respond_to?(:call)
      @headers = @current_row if @headers.call(@current_row)
      # in case there were empty rows before finding the header
      @last_seen_row_idx = @current_row_num - 1
    elsif @headers
      possibly_yield_empty_rows(headers: true)
      yield_row(@current_row, headers: true)
    else
      possibly_yield_empty_rows(headers: false)
      yield_row(@current_row, headers: false)
    end

    @last_seen_row_idx += 1

    # Note that excel writes a '/worksheet/dimension' node we can get
    # this from, but some libs (ex. simple_xlsx_writer) don't record it.
    # In that case, we assume the data is of uniform column length and
    # store the column name of the last header row we see. Obviously this
    # isn't the most robust strategy, but it likely fits 99% of use cases
    # considering it's not a problem with actual excel docs.
    @dimension = "A1:#{@cell_name}" if @dimension.nil?
  when 'v', 't'
    @current_row[cell_idx] = @captured
    @capture = false
    @captured = nil
  when 'f' then @function = false
  when 'c' then @url = nil
  end
end

#last_cell_letterObject

Returns the last column name, ex. ‘E’



246
247
248
249
250
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 246

def last_cell_letter
  return unless @dimension

  @dimension.scan(/:([A-Z]+)/)&.first&.first || 'A'
end

This sax-parses the whole sheet, just to extract hyperlink refs at the end.



197
198
199
200
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 197

def load_gui_hyperlinks
  self.hyperlinks_by_cell =
    HyperlinksParser.parse(@file_io, xrels: xrels)
end

#parse(headers: false, &block) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 22

def parse(headers: false, &block)
  raise 'parse called without a block; what should this do?'\
    unless block_given?

  @headers = headers
  @each_callback = block
  @load_errors = {}
  @current_row_num = nil
  @last_seen_row_idx = 0
  @url = nil # silence warnings
  @function = nil # silence warnings
  @capture = nil # silence warnings
  @captured = nil # silence warnings
  @dimension = nil # silence warnings
  @column_index = 0

  @file_io.rewind # if it's IO from IO.read, we need to rewind it

  # In this project this is only used for GUI-made hyperlinks (as opposed
  # to FUNCTION-based hyperlinks). Unfortunately the're needed to parse
  # the spreadsheet, and they come AFTER the sheet data. So, solution is
  # to just stream-parse the file twice, first for the hyperlinks at the
  # bottom of the file, then for the file itself. In the future it would
  # be clever to use grep to extract the xml into its own smaller file.
  if xrels_file
    if xrels_file.grep(/hyperlink/).any?
      xrels_file.rewind
      load_gui_hyperlinks # represented as hyperlinks_by_cell
    end
    @file_io.rewind # we've already parsed this once
  end

  Nokogiri::XML::SAX::Parser.new(self).parse(@file_io)
end

#possibly_yield_empty_rows(headers:) ⇒ Object



181
182
183
184
185
186
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 181

def possibly_yield_empty_rows(headers:)
  while @current_row_num && @current_row_num > @last_seen_row_idx + 1
    @last_seen_row_idx += 1
    yield_row(Array.new(column_length), headers: headers)
  end
end

#start_element_namespace(name, attrs = [], _prefix, _uri, _ns) ⇒ Object

SAX document hooks



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 60

def start_element_namespace(name, attrs = [], _prefix, _uri, _ns)
  case name
  when 'dimension'
    @dimension = attrs.last.value
  when 'row'
    @current_row_num = attrs.find {|attr| attr.localname == 'r'}&.value&.to_i
    @current_row = Array.new(column_length)
    @column_index = 0
  when 'c'
    attrs = attrs.inject({}) {|acc, attr| acc[attr.localname] = attr.value; acc}
    @cell_name = attrs['r'] || column_number_to_letter(@column_index)
    @type = attrs['t']
    @style = attrs['s'] && style_types[attrs['s'].to_i]
    @column_index += 1
  when 'f' then @function = true
  when 'v', 't' then @capture = true
  end
end

#test_headers_hash_against_current_rowObject

/End SAX hooks



166
167
168
169
170
171
172
173
174
175
176
177
178
179
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 166

def test_headers_hash_against_current_row
  found = false

  @current_row.each_with_index do |cell, cell_idx|
    @headers.each_pair do |key, search|
      if search.is_a?(String) ? cell == search : cell&.match?(search)
        found = true
        @current_row[cell_idx] = key
      end
    end
  end

  @headers = @current_row if found
end

#xrelsObject



230
231
232
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 230

def xrels
  @xrels ||= Nokogiri::XML(xrels_file.read) if xrels_file
end

#yield_row(row, headers:) ⇒ Object



188
189
190
191
192
193
194
# File 'lib/simple_xlsx_reader/loader/sheet_parser.rb', line 188

def yield_row(row, headers:)
  if headers
    @each_callback.call(Hash[@headers.zip(row)])
  else
    @each_callback.call(row)
  end
end