Class: String

Inherits:

Object

Object
String

show all

Defined in: lib/sup/util.rb

Direct Known Subclasses

Ncurses::CharCode

Defined Under Namespace

Classes: CheckError

Instance Method Summary collapse

#ascii ⇒ Object
#camel_to_hyphy ⇒ Object
#check ⇒ Object
#decode_utf7 ⇒ Object

Decodes UTF-7 and returns the resulting decoded string as UTF-8.
#display_length ⇒ Object
#each(&b) ⇒ Object
#find_all_positions(x) ⇒ Object
#fix_encoding! ⇒ Object

Fix the damn string! make sure it is valid utf-8, then convert to user encoding.
#normalize_whitespace ⇒ Object
#ord ⇒ Object
#slice_by_display_length(len) ⇒ Object
#split_on_commas ⇒ Object

A very complicated regex found on the internet that splits on commas unless they occur within double quotes.
#split_on_commas_with_remainder ⇒ Object

ok, here we do it the hard way.
#to_set_of_symbols(split_on = nil) ⇒ Object

Takes a list of words and returns a set of symbols.
#transcode(to_encoding, from_encoding) ⇒ Object

Transcode the string if the original encoding is known; fix it if broken.
#wrap(len) ⇒ Object

Instance Method Details

#ascii ⇒ `Object`

# File 'lib/sup/util.rb', line 405

# Builds a new string from the receiver's bytes: each byte with the high bit
# set is written as a literal backslash-x hex escape, every other byte is
# copied as-is. The result is passed through fix_encoding! and returned.
def ascii
  escaped = ""
  each_byte do |byte|
    high_bit_set = (byte & 128) != 0
    escaped << (high_bit_set ? "\\x#{byte.to_s 16}" : byte.chr)
  end
  # only bytes below 128 were appended above, so this should just settle
  # the encoding of an all-ASCII string
  escaped.fix_encoding!
end

#camel_to_hyphy ⇒ `Object`



215
216
217

# File 'lib/sup/util.rb', line 215

# Converts camelCase to hyphen-separated lowercase: a hyphen is inserted
# between a lowercase ASCII letter and a following uppercase ASCII letter or
# digit, then the whole result is downcased.
def camel_to_hyphy
  hyphenated = gsub(/([a-z])([A-Z0-9])/) { "#{$1}-#{$2}" }
  hyphenated.downcase
end

#check ⇒ `Object`

# File 'lib/sup/util.rb', line 396

# Sanity-checks the receiver's encoding.
#
# Raises CheckError when the encoding is neither UTF-8 nor ASCII, or when the
# bytes are not valid for the current encoding. Any StandardError raised while
# checking is re-raised as a CheckError carrying the same message.
# Returns nil when both checks pass.
def check
  # the respond_to? guards skip a check on objects lacking these methods
  if respond_to?(:encoding) && ![Encoding::UTF_8, Encoding::ASCII].include?(encoding)
    fail "unexpected encoding #{encoding}"
  end
  fail "invalid encoding" if respond_to?(:valid_encoding?) && !valid_encoding?
rescue => e
  raise CheckError.new(e.message)
end

#decode_utf7 ⇒ `Object`

Decodes UTF-7 and returns the resulting decoded string as UTF-8.

Ruby doesn’t supply a UTF-7 encoding natively. There is Net::IMAP::decode_utf7 which only handles the IMAP “modified UTF-7” encoding. This implementation is inspired by that one but handles standard UTF-7 shift characters and not the IMAP-specific variation.

# File 'lib/sup/util.rb', line 362

# Replaces every "+...-" shift sequence in the receiver and returns the new
# string. An empty sequence ("+-") becomes a literal "+"; otherwise the text
# between "+" and "-" is base64-decoded and the resulting bytes are
# transcoded from UTF-16BE to UTF-8.
def decode_utf7
  gsub(/\+([^-]+)?-/) do
    payload = Regexp.last_match(1)
    next "+" unless payload

    # extra "=" padding lets unpack("m") accept input of any length
    raw = (payload + "===").unpack("m").first
    raw.encode(Encoding::UTF_8, Encoding::UTF_16BE)
  end
end

#display_length ⇒ `Object`



203
204
205

# File 'lib/sup/util.rb', line 203

# Returns the display width of the string as reported by
# Unicode::DisplayWidth.of.
#
# The width is memoized in @display_length on first use. A frozen string
# cannot take a new instance variable (the assignment would raise), so for
# frozen receivers any previously cached value is reused and otherwise the
# width is computed without caching.
#
# NOTE(review): the cached value is not invalidated when the string is later
# mutated in place — callers presumably treat measured strings as read-only;
# confirm.
def display_length
  return @display_length || Unicode::DisplayWidth.of(self) if frozen?
  @display_length ||= Unicode::DisplayWidth.of(self)
end

#each(&b) ⇒ `Object`



383
384
385

# File 'lib/sup/util.rb', line 383

# Line-wise iteration: hands the given block straight to each_line and
# returns whatever each_line returns.
def each(&b)
  each_line(&b)
end

#find_all_positions(x) ⇒ `Object`

# File 'lib/sup/util.rb', line 219

# Returns the indices of every match of x (anything String#index accepts)
# in the receiver, in ascending order. The search resumes one character
# after each hit, so overlapping matches are all reported.
def find_all_positions x
  positions = []
  cursor = 0
  while cursor < length && (hit = index(x, cursor))
    positions << hit
    cursor = hit + 1
  end
  positions
end

#fix_encoding! ⇒ `Object`

Fix the damn string! make sure it is valid utf-8, then convert to user encoding.

# File 'lib/sup/util.rb', line 314

# Forces the receiver, in place, into a valid string in the encoding named by
# the global $encoding, replacing anything invalid or unrepresentable.
#
# Raises a RuntimeError if the string is still invalid after either
# conversion stage. Returns self.
def fix_encoding!
  lossy = { :invalid => :replace, :undef => :replace }

  # stage 1: get to UTF-8 from whatever the string currently claims to be
  encode!('UTF-8', **lossy)

  # a string already labelled UTF-8 may still hold bad bytes; a round trip
  # through UTF-16 makes sure those get replaced
  encode!('UTF-16', 'UTF-8', **lossy)
  encode!('UTF-8', 'UTF-16', **lossy)

  fail "Could not create valid UTF-8 string out of: '#{self.to_s}'." unless valid_encoding?

  # stage 2: convert to the encoding named by $encoding
  encode!($encoding, **lossy)

  fail "Could not create valid #{$encoding.inspect} string out of: '#{self.to_s}'." unless valid_encoding?

  self
end

#normalize_whitespace ⇒ `Object`



372
373
374

# File 'lib/sup/util.rb', line 372

# Returns a copy of the string with each tab expanded to four spaces and
# every carriage return removed.
def normalize_whitespace
  # neither replacement contains a tab or CR, so one pass gives the same
  # result as two consecutive substitutions
  gsub(/[\t\r]/, { "\t" => "    ", "\r" => "" })
end

#ord ⇒ `Object`



377
378
379

# File 'lib/sup/util.rb', line 377

# Returns the element at index 0 of the string (nil when the string is
# empty).
# NOTE(review): presumably a fallback for Rubies whose String lacks #ord —
# confirm how the surrounding file guards this definition.
def ord
  slice(0)
end

#slice_by_display_length(len) ⇒ `Object`

# File 'lib/sup/util.rb', line 207

# Returns the longest leading run of characters whose summed display widths
# (per Unicode::DisplayWidth.of, measured one character at a time) do not
# exceed len.
def slice_by_display_length len
  prefix = ""
  budget = len
  each_char do |char|
    budget -= Unicode::DisplayWidth.of(char)
    break if budget < 0
    prefix << char
  end
  prefix
end

#split_on_commas ⇒ `Object`

A very complicated regex found on the internet that splits on commas unless they occur within double quotes.



233
234
235

# File 'lib/sup/util.rb', line 233

# Normalizes whitespace, then splits on commas (plus any whitespace after
# them) that are followed by an even number of double quotes — i.e. commas
# that are not inside a quoted section.
def split_on_commas
  unquoted_comma = /,\s*(?=(?:[^"]*"[^"]*")*(?![^"]*"))/
  cleaned = normalize_whitespace
  cleaned.split(unquoted_comma)
end

#split_on_commas_with_remainder ⇒ `Object`

ok, here we do it the hard way. got to have a remainder for purposes of tab-completing full email addresses

# File 'lib/sup/util.rb', line 239

# Splits the string on commas that lie outside double-quoted sections, using
# a small state machine that also tracks backslash escapes.
#
# Returns a two-element array [fields, remainder]:
# * fields    - the comma-separated pieces, each with whitespace at line
#               starts and line ends removed;
# * remainder - when the scan ends inside an unterminated double-quoted
#               section, the text from the start of the unfinished field
#               onward (leading whitespace removed); nil otherwise.
#
# States: :outstring (not in quotes), :instring (in quotes), and
# :escaped_outstring / :escaped_instring (the previous special character was
# a backslash seen in the respective state).
def split_on_commas_with_remainder
  ret = []
  state = :outstring
  pos = 0
  region_start = 0
  # pos is allowed to reach length so that end-of-string is processed once
  # as a terminator (char == nil below)
  while pos <= length
    # after a backslash, look at the very next character; otherwise jump to
    # the next comma, double quote or backslash
    newpos = case state
      when :escaped_instring, :escaped_outstring then pos
      else index(/[,"\\]/, pos)
    end

    if newpos
      char = self[newpos]
    else
      # no more special characters: treat end-of-string as the terminator
      char = nil
      newpos = length
    end

    # any other character (possible only right after a backslash) matches no
    # branch below and leaves the state unchanged
    case char
    when ?"
      # an unescaped quote toggles in/out of a string; an escaped one just
      # ends the escape
      state = case state
        when :outstring then :instring
        when :instring then :outstring
        when :escaped_instring then :instring
        when :escaped_outstring then :outstring
      end
    when ?,, nil
      # a comma or end-of-string outside quotes closes the current field;
      # inside quotes it is ordinary text
      state = case state
        when :outstring, :escaped_outstring then
          ret << self[region_start ... newpos].gsub(/^\s+|\s+$/, "")
          region_start = newpos + 1
          :outstring
        when :instring then :instring
        when :escaped_instring then :instring
      end
    when ?\\
      # a backslash starts an escape; a backslash following a backslash
      # ends it
      state = case state
        when :instring then :escaped_instring
        when :outstring then :escaped_outstring
        when :escaped_instring then :instring
        when :escaped_outstring then :outstring
      end
    end
    pos = newpos + 1
  end

  # ending inside quotes means the last field was never closed: hand it back
  # separately instead of as a field
  remainder = case state
    when :instring
      self[region_start .. -1].gsub(/^\s+/, "")
    else
      nil
    end

  [ret, remainder]
end

#to_set_of_symbols(split_on = nil) ⇒ `Object`

Takes a list of words and returns a set of symbols. Typically used in Sup for translating Xapian’s representation of a list of labels (a string) to a set of label symbols.

split_on will be passed to String#split, so you can leave this nil for space.

# File 'lib/sup/util.rb', line 393

# Splits the string with String#split (split_on is passed through; nil
# splits on whitespace), strips each piece, interns it, and returns the
# symbols collected in a Set.
def to_set_of_symbols split_on=nil
  Set.new(split(split_on).map { |word| word.strip.intern })
end

#transcode(to_encoding, from_encoding) ⇒ `Object`

Transcode the string if the original encoding is known; fix it if broken.

# File 'lib/sup/util.rb', line 336

# Converts the receiver in place from from_encoding to to_encoding, replacing
# invalid or unrepresentable characters.
#
# If no converter exists for the requested pair, a debug message is logged
# and fix_encoding! is used instead. Raises a RuntimeError if the string is
# still not valid afterwards. Returns self.
def transcode to_encoding, from_encoding
  lossy = { :invalid => :replace, :undef => :replace }

  begin
    encode!(to_encoding, from_encoding, **lossy)

    if !valid_encoding?
      # still broken: retry the conversion by way of UTF-16
      encode!('UTF-16', from_encoding, **lossy)
      encode!(to_encoding, 'UTF-16', **lossy)
    end
  rescue Encoding::ConverterNotFoundError
    debug "Encoding converter not found for #{from_encoding.inspect} or #{to_encoding.inspect}, fixing string: '#{self.to_s}', but expect weird characters."
    fix_encoding!
  end

  fail "Could not create valid #{to_encoding.inspect} string out of: '#{self.to_s}'." unless valid_encoding?

  self
end

#wrap(len) ⇒ `Object`

# File 'lib/sup/util.rb', line 295

# Breaks the string into pieces whose display width does not exceed len,
# preferring to break at the last whitespace character that fits (that
# whitespace character is dropped). Returns the array of pieces; the final
# element is whatever is left once the rest fits.
#
# When not even one character fits in len (a character wider than len, or
# len <= 0), a single character is emitted per piece so the loop always makes
# progress; a lone remaining character is returned as-is even if it is wider
# than len.
def wrap len
  ret = []
  s = self
  while s.display_length > len
    slice = s.slice_by_display_length(len)
    if slice.empty?
      # nothing fits: without this the remainder would never shrink
      break if s.length <= 1
      slice = s[0, 1]
    end
    cut = slice.rindex(/\s/)
    if cut
      ret << s[0 ... cut]
      s = s[(cut + 1) .. -1]
    else
      ret << slice
      s = s[slice.length .. -1]
    end
  end
  ret << s
end

Class: String

Direct Known Subclasses

Defined Under Namespace

Instance Method Summary collapse

Instance Method Details

#ascii ⇒ Object

#camel_to_hyphy ⇒ Object

#check ⇒ Object

#decode_utf7 ⇒ Object

#display_length ⇒ Object

#each(&b) ⇒ Object

#find_all_positions(x) ⇒ Object

#fix_encoding! ⇒ Object

#normalize_whitespace ⇒ Object

#ord ⇒ Object

#slice_by_display_length(len) ⇒ Object

#split_on_commas ⇒ Object

#split_on_commas_with_remainder ⇒ Object

#to_set_of_symbols(split_on = nil) ⇒ Object

#transcode(to_encoding, from_encoding) ⇒ Object

#wrap(len) ⇒ Object