Author: ntk
License:
The MIT License, Copyright (c) 2007 ntk
Description: some basic UTF8-aware string methods for Ruby's String class (Ruby 1.8.6)
Requirements: save this snippet to a UTF-8 encoded file and set the character set encoding of Terminal.app
to UTF-8 (on Mac OS X: Terminal menu -> Window Settings -> Display -> Character Set Encoding; to enable additional features see
here)
Further tools:
-
rbuconv, a pure Ruby library for Unicode translation
-
unicode, a library for Unicode Normalization (sudo gem install unicode); for a Windows version see
Unicode in Ruby on Rails
-
ICU4R, a Ruby C-extension binding for the
ICU library
-
Msort, a command-line sorting program
-
punycode4r, a pure Ruby implementation of Punycode (RFC 3492; sudo gem install punycode4r)
-
utf8proc, library for processing UTF-8 encoded Unicode strings, (sudo gem install utf8proc)
-
Oniguruma, Ruby's regular expression engine; cf.
Secure UTF-8 Input in Rails and
Migrating your Rails application to Unicode
-
character-encodings, seamless integration of character encodings into Ruby's String class, (sudo gem install character-encodings)
class String
require 'iconv'
require 'open-uri'
# Matches a string that consists entirely of well-formed UTF-8 byte sequences.
# The ASCII alternative only accepts tab, LF, CR and the printable range
# 0x20-0x7E; the multi-byte alternatives exclude overlong forms and surrogates.
# Flags: m = "." also matches newline, n = no character encoding (the pattern
# is written as raw bytes), x = extended syntax, which allows the inline
# comments and the layout over several lines.
UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*\z/mnx
# Turns a list of "U+XXXX" labels into UTF-8 encoded strings. A label whose
# encoded form does not match UTF8REGEX is replaced by nil so that the sanity
# checks below can detect it.
codepoints_to_utf8 = lambda do |labels|
  labels.map do |label|
    encoded = [label[2..-1].hex].pack("U*")
    encoded =~ UTF8REGEX ? encoded : nil
  end
end

# Lower-case letters; position i corresponds to position i of the capital list.
@small_letters_utf8 = codepoints_to_utf8.call(["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"])
# Upper-case counterparts, in the same order as the lower-case list.
@capital_letters_utf8 = codepoints_to_utf8.call(["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"])
# Further code points that take no part in the case tables.
@other_letters_utf8 = codepoints_to_utf8.call(["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"])

# A nil entry means one of the code points failed validation: abort loading.
raise "Invalid UTF-8 char in @small_letters_utf8!" if @small_letters_utf8.size != @small_letters_utf8.nitems
raise "Invalid UTF-8 char in @capital_letters_utf8!" if @capital_letters_utf8.size != @capital_letters_utf8.nitems
raise "Invalid UTF-8 char in @other_letters_utf8!" if @other_letters_utf8.size != @other_letters_utf8.nitems

@unicode_array = []
@letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8
# capital => small and small => capital lookup tables built from the paired lists.
@downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten]
@upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]
# Every known letter as a key (all values are nil).
@letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten]
# Singleton-level readers and writers for the enclosing object's own instance
# variables (e.g. @small_letters_utf8, @downcase_table_utf8).
class << self
  attr_accessor :small_letters_utf8, :capital_letters_utf8, :other_letters_utf8,
                :letters_utf8, :letters_utf8_hash, :unicode_array,
                :downcase_table_utf8, :upcase_table_utf8
end
# Yields each character of the receiver, in order, to the given block.
# Characters are found with /./mu, so newlines are yielded as well.
def each_utf8_char
  scan(/./mu) do |character|
    yield character
  end
end
# Yields each character of the receiver together with its zero-based
# character position.
def each_utf8_char_with_index
  position = 0
  scan(/./mu) do |character|
    current = position
    position += 1
    yield(character, current)
  end
end
# Returns the number of characters (not bytes) in the receiver, counting
# every match of /./mu.
def length_utf8
  scan(/./mu).size
end
# size_utf8 is a synonym for length_utf8.
alias :size_utf8 :length_utf8
# Returns a new string with the receiver's characters in reverse order.
def reverse_utf8
  characters = split(//mu)
  characters.reverse.join
end
# Reverses the receiver's characters in place and returns the receiver.
#
# The previous implementation only reversed a temporary array and returned a
# new string, so the receiver was never modified despite the "!" in the name.
def reverse_utf8!
  # Nothing to do for an empty string; also avoids replacing it with the
  # result of joining an empty array.
  return self if empty?
  replace(split(//mu).reverse.join)
end
# Returns a copy of the receiver with the case of every character swapped.
# Characters listed in String.downcase_table_utf8 or String.upcase_table_utf8
# (checked in that order) are replaced by their table entry; every other
# character goes through String#swapcase.
def swapcase_utf8
  gsub(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?
    raised = String.upcase_table_utf8[char]
    raised.nil? ? char.swapcase : raised
  end
end
# In-place variant of swapcase_utf8: replaces every character of the receiver
# by its String.downcase_table_utf8 entry, else its String.upcase_table_utf8
# entry, else the result of String#swapcase.
# Returns the value of gsub!, i.e. nil when the receiver has no characters.
def swapcase_utf8!
  gsub!(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?
    raised = String.upcase_table_utf8[char]
    raised.nil? ? char.swapcase : raised
  end
end
# Returns a lower-cased copy of the receiver. A character found in
# String.downcase_table_utf8 is replaced by its table entry; any other
# character is passed through String#downcase.
def downcase_utf8
  gsub(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil?
      char.downcase
    else
      mapped
    end
  end
end
# In-place variant of downcase_utf8. Each character is replaced by its
# String.downcase_table_utf8 entry when present, otherwise by String#downcase.
# Returns the value of gsub!, i.e. nil when the receiver has no characters.
def downcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil?
      char.downcase
    else
      mapped
    end
  end
end
# Returns an upper-cased copy of the receiver. A character found in
# String.upcase_table_utf8 is replaced by its table entry; any other
# character is passed through String#upcase.
def upcase_utf8
  gsub(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil?
      char.upcase
    else
      mapped
    end
  end
end
# In-place variant of upcase_utf8. Each character is replaced by its
# String.upcase_table_utf8 entry when present, otherwise by String#upcase.
# Returns the value of gsub!, i.e. nil when the receiver has no characters.
def upcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil?
      char.upcase
    else
      mapped
    end
  end
end
# Counts the characters of the receiver that belong to the set +c+.
# +c+ is placed verbatim inside a regexp character class, so character-class
# syntax such as ranges ("a-z") or a leading "^" is interpreted by the regexp.
# Returns nil (not 0) when +c+ is empty.
def count_utf8(c)
  if c.empty?
    nil
  else
    char_class = %r{[#{c}]}mu
    scan(char_class).length
  end
end
# Returns a copy of the receiver with every character belonging to the set
# +c+ removed. +c+ is placed verbatim inside a regexp character class.
# Returns the receiver itself when +c+ is empty.
def delete_utf8(c)
  return self if c.empty?
  gsub(%r{[#{c}]}mu, '')
end
# Removes, in place, every character of the receiver that belongs to the set
# +c+ (+c+ is placed verbatim inside a regexp character class).
# Returns the receiver when +c+ is empty, otherwise the value of gsub!
# (nil when nothing was removed).
def delete_utf8!(c)
  return self if c.empty?
  gsub!(%r{[#{c}]}mu, '')
end
# Returns the first character of the receiver, or nil when it is empty.
def first_utf8
  slice(/\A./mu)
end
# Returns the last character of the receiver, or nil when it is empty.
def last_utf8
  slice(/.\z/mu)
end
# Returns a copy of the receiver in which every space-separated word has its
# first character upper-cased (upcase_utf8) and all remaining characters
# lower-cased (downcase_utf8).
#
# Words are split on single spaces (\x20); runs of spaces inside the text are
# preserved, trailing spaces are dropped. A receiver that is empty or consists
# only of whitespace is returned as is.
#
# Fixes: the previous version discarded the result of the non-destructive
# gsub and therefore returned the text uncapitalized; it also never consulted
# the downcase table for characters after the first one.
def capitalize_utf8
  return self if self =~ /\A[[:space:]]*\z/m
  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  words.join(' ')
end
# Capitalizes the receiver in place and returns it: every space-separated
# word gets its first character upper-cased (upcase_utf8) and all remaining
# characters lower-cased (downcase_utf8).
#
# Words are split on single spaces (\x20); runs of spaces inside the text are
# preserved, trailing spaces are dropped. A receiver that is empty or consists
# only of whitespace is returned unchanged.
#
# Fixes: the previous version built the capitalized text from copies of the
# words and returned it without ever modifying the receiver; it also never
# consulted the downcase table for characters after the first one.
def capitalize_utf8!
  return self if self =~ /\A[[:space:]]*\z/m
  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  replace(words.join(' '))
end
# Returns the character (not byte) index of the first match of +s+ in the
# receiver, or nil when there is no match.
#
# s - a String, which is matched literally, or a Regexp, which is rebuilt
#     with the "u" option added.
#
# Returns nil when the receiver is empty, when +s+ is neither a String nor a
# Regexp, or when +s+ is an empty String or a Regexp with an empty source.
#
# Fixes over the previous version:
# * a match at the very start now yields 0 (an empty pre-match used to be
#   mistaken for "no match" and produced nil);
# * String arguments are escaped, so "." or "(" are searched for literally;
# * the Regexp source is no longer spliced into the evaluated code, so a
#   pattern containing "}" or "#{...}" cannot break or inject into the eval.
def index_utf8(s)
  return nil if empty?
  case s
  when Regexp
    # Recover the option letters that follow the closing delimiter in the
    # regexp's inspect form, then make sure "u" is among them.
    opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
    opts += "u" unless opts.include?("u")
    str = s.source
    return nil if str.empty?
    # NOTE(review): eval is kept only to attach the option letters. The code
    # evaluated is the fixed text %r{#{str}} plus those letters; the pattern
    # itself is interpolated from the local +str+ when the literal is built.
    r = eval('%r{#{str}}' + opts)
  when String
    return nil if s.empty?
    r = %r{#{Regexp.escape(s)}}mu
  else
    return nil
  end
  found = r.match(self)
  found ? found.pre_match.length_utf8 : nil
end
# Returns the character (not byte) index of the last match of +s+ in the
# receiver, or nil when there is no match. Matches are found with scan, i.e.
# left to right without overlapping.
#
# s - a String, which is matched literally, or a Regexp, which is rebuilt
#     with the "u" option added.
#
# Returns nil when the receiver is empty, when +s+ is neither a String nor a
# Regexp, or when +s+ is an empty String or a Regexp with an empty source.
#
# Fixes over the previous version:
# * a last match at the very start now yields 0 (an empty pre-match used to
#   be mistaken for "no match" and produced nil);
# * String arguments are escaped, so "." or "(" are searched for literally;
# * the Regexp source is no longer spliced into the evaluated code, so a
#   pattern containing "}" or "#{...}" cannot break or inject into the eval.
def rindex_utf8(s)
  return nil if empty?
  case s
  when Regexp
    # Recover the option letters that follow the closing delimiter in the
    # regexp's inspect form, then make sure "u" is among them.
    opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
    opts += "u" unless opts.include?("u")
    str = s.source
    return nil if str.empty?
    # NOTE(review): eval is kept only to attach the option letters. The code
    # evaluated is the fixed text %r{#{str}} plus those letters; the pattern
    # itself is interpolated from the local +str+ when the literal is built.
    r = eval('%r{#{str}}' + opts)
  when String
    return nil if s.empty?
    r = %r{#{Regexp.escape(s)}}mu
  else
    return nil
  end
  # nil means "never matched"; an empty string means "matched at index 0".
  last_prefix = nil
  scan(r) { last_prefix = $` }
  last_prefix && last_prefix.length_utf8
end
# Returns the part of the receiver matched by +regex+ (as String#slice does),
# after rebuilding +regex+ with the "u" option added. Returns nil when there
# is no match.
#
# regex - a Regexp; its source and its option letters are reused.
#
# Fix: the pattern used to be spliced into the evaluated code as text, so a
# source containing "}" raised a SyntaxError and "#{...}" would have been
# executed. The pattern is now interpolated from a local variable when the
# evaluated regexp literal is built.
def slice_utf8(regex)
  # Recover the option letters that follow the closing delimiter in the
  # regexp's inspect form, then make sure "u" is among them.
  opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  opts += "u" unless opts.include?("u")
  str = regex.source
  # NOTE(review): eval is kept only to attach the option letters; the code
  # evaluated is the fixed text %r{#{str}} plus those letters.
  slice(eval('%r{#{str}}' + opts))
end
def slice_utf8!(regex)
opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
if opts.count('u') == 0 then opts = opts + "u" end
s = regex.source
str = "%r{#{s}}" + opts
r = eval(str)
slice!(r