Class: String
Direct Known Subclasses
Defined Under Namespace
Classes: CheckError
Instance Method Summary collapse
- #ascii ⇒ Object
- #camel_to_hyphy ⇒ Object
- #check ⇒ Object
-
#decode_utf7 ⇒ Object
Decodes UTF-7 and returns the resulting decoded string as UTF-8.
- #display_length ⇒ Object
- #each(&b) ⇒ Object
- #find_all_positions(x) ⇒ Object
-
#fix_encoding! ⇒ Object
Fix the damn string! make sure it is valid utf-8, then convert to user encoding.
- #normalize_whitespace ⇒ Object
- #ord ⇒ Object
- #slice_by_display_length(len) ⇒ Object
-
#split_on_commas ⇒ Object
A very complicated regex, found on the internet, that splits on commas unless they occur within double quotes.
-
#split_on_commas_with_remainder ⇒ Object
ok, here we do it the hard way.
-
#to_set_of_symbols(split_on = nil) ⇒ Object
Takes a list of words and returns a set of symbols.
-
#transcode(to_encoding, from_encoding) ⇒ Object
Transcode the string if the original encoding is known; fix it if it is broken.
- #wrap(len) ⇒ Object
Instance Method Details
#ascii ⇒ Object
405 406 407 408 409 410 411 412 413 414 415 416 |
# File 'lib/sup/util.rb', line 405
# Returns a copy of the receiver in which every byte with the high bit set
# is spelled out as a literal "\xNN" escape (lower-case hex) and every other
# byte is kept as-is. The result is then passed through #fix_encoding!.
def ascii
  escaped = each_byte.each_with_object("") do |byte, out|
    out << (byte >= 128 ? "\\x#{byte.to_s(16)}" : byte.chr)
  end
  # Only 7-bit characters remain at this point; normalise the encoding tag.
  escaped.fix_encoding!
end
#camel_to_hyphy ⇒ Object
215 216 217 |
# File 'lib/sup/util.rb', line 215
# Converts camelCase to hyphen-separated lower case: a hyphen is inserted
# between a lower-case ASCII letter and a following upper-case letter or
# digit, then the whole string is downcased.
def camel_to_hyphy
  hyphenated = gsub(/([a-z])([A-Z0-9])/) { "#{$1}-#{$2}" }
  hyphenated.downcase
end
#check ⇒ Object
396 397 398 399 400 401 402 403 |
# File 'lib/sup/util.rb', line 396
# Verifies that the receiver is tagged as UTF-8 or ASCII and that its bytes
# are valid for that encoding.
#
# Returns nil when the string passes both checks.
# Raises CheckError, carrying the message of the failed check, otherwise.
def check
  if respond_to?(:encoding) && !(encoding == Encoding::UTF_8 || encoding == Encoding::ASCII)
    fail "unexpected encoding #{encoding}"
  end
  fail "invalid encoding" if respond_to?(:valid_encoding?) && !valid_encoding?
rescue => e
  # Re-wrap so callers only have to rescue CheckError; keep the original text.
  raise CheckError.new(e.message)
end
#decode_utf7 ⇒ Object
Decodes UTF-7 and returns the resulting decoded string as UTF-8.
Ruby doesn’t supply a UTF-7 encoding natively. There is Net::IMAP::decode_utf7 which only handles the IMAP “modified UTF-7” encoding. This implementation is inspired by that one but handles standard UTF-7 shift characters and not the IMAP-specific variation.
362 363 364 365 366 367 368 369 370 |
# File 'lib/sup/util.rb', line 362
# Decodes standard UTF-7 shift sequences ("+<base64>-") and returns the
# result as a new string. Each base64 payload is decoded and converted from
# UTF-16BE to UTF-8; the empty sequence "+-" becomes a literal "+".
def decode_utf7
  gsub(/\+([^-]+)?-/) do
    payload = $1
    next "+" unless payload

    # Over-pad so the decoder accepts payloads whose "=" padding was omitted.
    utf16_bytes = (payload + "===").unpack("m").first
    utf16_bytes.encode(Encoding::UTF_8, Encoding::UTF_16BE)
  end
end
#display_length ⇒ Object
203 204 205 |
# File 'lib/sup/util.rb', line 203
# Returns the display width of the receiver as reported by
# Unicode::DisplayWidth.of.
#
# The width is computed on every call rather than memoized in an instance
# variable: strings are mutable, so a cached value goes stale after an
# in-place change, and writing the cache raises FrozenError on frozen strings.
def display_length
  Unicode::DisplayWidth.of(self)
end
#each(&b) ⇒ Object
383 384 385 |
# File 'lib/sup/util.rb', line 383
# Line-wise iteration: forwards the given block to #each_line and returns
# whatever #each_line returns.
def each(&block)
  each_line(&block)
end
#find_all_positions(x) ⇒ Object
219 220 221 222 223 224 225 226 227 228 229 |
# File 'lib/sup/util.rb', line 219
# Returns the start indices of every occurrence of x, in ascending order.
# x is handed to String#index. The search resumes one character after each
# hit, so overlapping occurrences are all reported.
def find_all_positions(x)
  positions = []
  offset = 0
  while offset < length && (hit = index(x, offset))
    positions << hit
    offset = hit + 1
  end
  positions
end
#fix_encoding! ⇒ Object
Fix the damn string! Make sure it is valid UTF-8, then convert it to the user's encoding.
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 |
# File 'lib/sup/util.rb', line 314
# Repairs the receiver in place: makes it valid UTF-8, then converts it to
# the encoding named by the global $encoding. Characters that are invalid or
# cannot be represented are replaced instead of raising.
#
# Returns self. Raises RuntimeError if the string is still invalid after
# either stage.
def fix_encoding!
  lossy = { :invalid => :replace, :undef => :replace }

  # Stage 1: convert from whatever the current encoding is to UTF-8.
  encode!('UTF-8', **lossy)

  # The string may already be tagged UTF-8 yet contain bad bytes, so round-trip
  # through UTF-16 to make sure invalid sequences get replaced.
  encode!('UTF-16', 'UTF-8', **lossy)
  encode!('UTF-8', 'UTF-16', **lossy)

  unless valid_encoding?
    fail "Could not create valid UTF-8 string out of: '#{self.to_s}'."
  end

  # Stage 2: convert to the configured target encoding.
  encode!($encoding, **lossy)

  unless valid_encoding?
    fail "Could not create valid #{$encoding.inspect} string out of: '#{self.to_s}'."
  end

  self
end
#normalize_whitespace ⇒ Object
372 373 374 |
# File 'lib/sup/util.rb', line 372
# Returns a copy of the receiver with every tab replaced by a space and
# every carriage return removed.
def normalize_whitespace
  tr("\t", " ").delete("\r")
end
#ord ⇒ Object
377 378 379 |
# File 'lib/sup/util.rb', line 377
# Returns the element at index 0 of the receiver.
# NOTE(review): on Ruby 1.9+ String#[] yields a one-character String, not an
# Integer, so this looks like a Ruby 1.8 compatibility shim -- confirm that
# the definition is guarded so it does not replace the built-in String#ord.
def ord
  slice(0)
end
#slice_by_display_length(len) ⇒ Object
207 208 209 210 211 212 213 |
# File 'lib/sup/util.rb', line 207
# Returns the longest leading run of characters whose summed display width
# (per Unicode::DisplayWidth.of, measured one character at a time) does not
# exceed len.
def slice_by_display_length(len)
  budget = len
  taken = ""
  each_char do |char|
    budget -= Unicode::DisplayWidth.of(char)
    break if budget < 0 # this character would overflow the budget
    taken << char
  end
  taken
end
#split_on_commas ⇒ Object
A very complicated regex, found on the internet, that splits on commas unless they occur within double quotes.
233 234 235 |
# File 'lib/sup/util.rb', line 233
# Splits the whitespace-normalized receiver on commas (plus any whitespace
# following them), ignoring commas that sit inside a double-quoted section.
def split_on_commas
  # A comma counts only when the rest of the string holds balanced pairs of
  # double quotes, i.e. the comma itself is not inside an open quote.
  unquoted_comma = /,\s*(?=(?:[^"]*"[^"]*")*(?![^"]*"))/
  normalize_whitespace.split(unquoted_comma)
end
#split_on_commas_with_remainder ⇒ Object
ok, here we do it the hard way. got to have a remainder for purposes of tab-completing full email addresses
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 |
# File 'lib/sup/util.rb', line 239
# Splits the receiver on commas with a small state machine that tracks
# double-quoted sections and backslash escapes.
#
# Returns [pieces, remainder]. pieces holds the comma-separated fields with
# leading and trailing whitespace removed. remainder is the unfinished tail
# (leading whitespace removed) when the string ends inside an open quote,
# and nil otherwise.
def split_on_commas_with_remainder
  # Next state after a double quote / after a backslash, keyed by current state.
  after_quote = {
    :outstring => :instring,
    :instring => :outstring,
    :escaped_instring => :instring,
    :escaped_outstring => :outstring,
  }
  after_backslash = {
    :instring => :escaped_instring,
    :outstring => :escaped_outstring,
    :escaped_instring => :instring,
    :escaped_outstring => :outstring,
  }

  pieces = []
  state = :outstring
  cursor = 0
  piece_start = 0

  # cursor is allowed to reach length so end-of-string gets processed once.
  while cursor <= length
    escaped = state == :escaped_instring || state == :escaped_outstring
    # Right after a backslash, examine the very next character; otherwise
    # jump ahead to the next comma, quote or backslash.
    hit = escaped ? cursor : index(/[,"\\]/, cursor)
    char = hit ? self[hit] : nil
    hit ||= length

    case char
    when ?"
      state = after_quote[state]
    when ?\\
      state = after_backslash[state]
    when ?,, nil
      if state == :outstring || state == :escaped_outstring
        # Field boundary (or end of input) outside quotes: emit the field.
        pieces << self[piece_start ... hit].gsub(/^\s+|\s+$/, "")
        piece_start = hit + 1
        state = :outstring
      else
        state = :instring
      end
    end

    cursor = hit + 1
  end

  remainder = state == :instring ? self[piece_start .. -1].gsub(/^\s+/, "") : nil
  [pieces, remainder]
end
#to_set_of_symbols(split_on = nil) ⇒ Object
Takes a list of words and returns a set of symbols. Typically used in Sup for translating Xapian’s representation of a list of labels (a string) to a set of label symbols.
split_on will be passed to String#split, so you can leave this nil for space.
393 |
# File 'lib/sup/util.rb', line 393
# Splits the receiver with String#split(split_on), strips each piece, and
# returns the pieces as a Set of symbols. With split_on left nil the string
# is split on whitespace.
def to_set_of_symbols(split_on = nil)
  symbols = split(split_on).map { |word| word.strip.to_sym }
  Set.new(symbols)
end
#transcode(to_encoding, from_encoding) ⇒ Object
Transcode the string if the original encoding is known; fix it if it is broken.
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 |
# File 'lib/sup/util.rb', line 336
# Converts the receiver in place from from_encoding to to_encoding, replacing
# invalid or unrepresentable characters instead of raising.
#
# If the result is still invalid, the conversion is retried through UTF-16 as
# an intermediate. If no converter exists for the requested pair, a debug
# message is logged and #fix_encoding! is used instead.
#
# Returns self. Raises RuntimeError if no valid string could be produced.
def transcode(to_encoding, from_encoding)
  lossy = { :invalid => :replace, :undef => :replace }

  begin
    encode!(to_encoding, from_encoding, **lossy)

    if !valid_encoding?
      # Still broken: force every character through a real conversion step.
      encode!('UTF-16', from_encoding, **lossy)
      encode!(to_encoding, 'UTF-16', **lossy)
    end
  rescue Encoding::ConverterNotFoundError
    debug "Encoding converter not found for #{from_encoding.inspect} or #{to_encoding.inspect}, fixing string: '#{self.to_s}', but expect weird characters."
    fix_encoding!
  end

  unless valid_encoding?
    fail "Could not create valid #{to_encoding.inspect} string out of: '#{self.to_s}'."
  end

  self
end
#wrap(len) ⇒ Object
295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
# File 'lib/sup/util.rb', line 295
# Breaks the receiver into lines whose display width does not exceed len and
# returns them as an array of strings.
#
# Each line is cut at the last whitespace character that fits (that
# character is dropped); when no whitespace fits, the line is cut hard at
# len columns. If not even one character fits in len (a wide character with
# a small len, or len <= 0), a single character is emitted on its own line so
# that the loop always makes progress instead of spinning forever.
def wrap(len)
  lines = []
  rest = self
  while rest.display_length > len
    chunk = rest.slice_by_display_length(len)

    if chunk.empty?
      # Nothing fits. A lone over-wide character is left for the final append
      # below; otherwise peel off one character and keep going.
      break if rest.length <= 1
      lines << rest[0, 1]
      rest = rest[1 .. -1]
      next
    end

    cut = chunk.rindex(/\s/)
    if cut
      lines << rest[0 ... cut]
      rest = rest[(cut + 1) .. -1]
    else
      lines << chunk
      rest = rest[chunk.length .. -1]
    end
  end
  lines << rest
end