Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

About this user

« Newer Snippets
Older Snippets »
Showing 1-5 of 5 total  RSS 

Punycoded URLs in Ruby

This is just a proof-of-concept snippet for how to internationalize domain names using punycode4r (sudo gem install punycode4r).

For more information please see:
- Punycode
- Internationalized domain name



#!/usr/local/bin/ruby -Ku

# NOTE: The following is not the complete source code by Kazuhiro NISHIYAMA.
#       For the full source code with more features, comments & test cases please see: 
#       open -e `gem environment gemdir`/gems/punycode4r-0.2.0/lib/punycode.rb
#
# This is pure Ruby implementing Punycode (RFC 3492).
# (original ANSI C code (C89) implementing Punycode is in RFC 3492)
#
# copyright (c) 2005 Kazuhiro NISHIYAMA
# You can redistribute it and/or modify it under the same terms as Ruby.


require "unicode"     # sudo gem install unicode

module Punycode

  # Result and error types used by the Punycode routines.
  module Status
    # Common ancestor of every Punycode failure.
    class Error < StandardError
    end

    # Marker class returned by punycode_encode / punycode_decode on success.
    class PunycodeSuccess
    end

    # Input is invalid.
    class PunycodeBadInput < Error
    end

    # Output would exceed the space provided.
    class PunycodeBigOutput < Error
    end

    # Input needs wider integers to process.
    class PunycodeOverflow < Error
    end
  end
  include Status


  # Bootstring parameters used by adapt, punycode_encode and punycode_decode.
  # NOTE(review): values look like the Punycode parameters of RFC 3492 - confirm.
  BASE = 36
  TMIN = 1
  TMAX = 26
  SKEW = 38
  DAMP = 700
  INITIAL_BIAS = 72
  # First code point that basic() does not accept.
  INITIAL_N = 0x80
  # ASCII hyphen-minus, written between the basic code points and the digits.
  DELIMITER = 0x2D

  module_function

  # Returns true when the code point +cp+ is below 0x80 (a "basic" code
  # point that is copied to the output unchanged), false otherwise.
  def basic(cp)
    cp < 0x80
  end

  # Returns true when the code point +cp+ equals DELIMITER.
  def delim(cp)
    cp == DELIMITER
  end

  # Returns the numeric value of the digit whose ASCII code is +cp+:
  # 'A'..'Z' and 'a'..'z' map to 0..25, '0'..'9' map to 26..35.
  # Any other code yields BASE, which callers treat as "not a digit".
  #
  # The previous version relied on the C idiom `cp - 48 < 10`, which only
  # works with unsigned arithmetic; with Ruby integers it accepted every
  # code below '0' (and the codes between '9' and 'A') as a valid digit.
  def decode_digit(cp)
    case cp
    when 48..57  then cp - 22   # '0'..'9'
    when 65..90  then cp - 65   # 'A'..'Z'
    when 97..122 then cp - 97   # 'a'..'z'
    else BASE
    end
  end

  # Returns the ASCII code for digit value +d+: values below 26 start at
  # 'a' (97), the remaining values start at '0' (48 for d == 26).
  # A truthy +flag+ lowers the result by 32, which turns a lowercase
  # letter into its uppercase form.
  def encode_digit(d, flag)
    offset = d < 26 ? 97 : 22
    code = d + offset
    flag ? code - 32 : code
  end

  # Returns true when the basic code point +bcp+ is an uppercase ASCII
  # letter ('A'..'Z'), false otherwise.
  def flagged(bcp)
    distance = bcp - 65
    distance >= 0 && distance < 26
  end

  # Forces the case of the basic code point +bcp+: ASCII letters become
  # uppercase when +flag+ is truthy and lowercase otherwise. Codes that
  # are not ASCII letters are returned unchanged.
  def encode_basic(bcp, flag)
    # Fold lowercase letters to uppercase first ...
    upper = (97...123).include?(bcp) ? bcp - 32 : bcp
    return upper if flag

    # ... then bring letters back down when no uppercase was requested.
    (65...91).include?(upper) ? upper + 32 : upper
  end

  # Ceiling (2**64) used by the overflow guards in punycode_encode and
  # punycode_decode, and as the initial "minimum" in the encoder's search.
  MAXINT = 1 << 64


  # Computes the bias to use for the next delta.
  #
  # delta::     the delta that was just encoded or decoded
  # numpoints:: number of code points handled so far (divisor, must be > 0)
  # firsttime:: truthy for the first delta, which is scaled by DAMP
  #             instead of being halved
  def adapt(delta, numpoints, firsttime)
    scaled = firsttime ? delta / DAMP : delta >> 1
    scaled += scaled / numpoints

    span = BASE - TMIN
    threshold = (span * TMAX) / 2

    k = 0
    until scaled <= threshold
      scaled /= span
      k += BASE
    end

    k + (span + 1) * scaled / (scaled + SKEW)
  end

  # Encodes an array of code points as Punycode.
  #
  # input_length::  number of leading entries of +input+ to encode
  # input::         array of integer code points
  # case_flags::    nil, or an array parallel to +input+; a truthy entry
  #                 requests the uppercase form of the corresponding basic
  #                 code point / final digit
  # output_length:: one-element array; element 0 holds the output capacity
  #                 on entry and the number of values written on return
  # output::        array that receives the encoded ASCII codes
  #
  # Returns PunycodeSuccess. Raises PunycodeBigOutput when the capacity is
  # exceeded and PunycodeOverflow when a delta would exceed MAXINT.
  def punycode_encode(input_length, input, case_flags, output_length, output)

    n = INITIAL_N
    delta = out = 0
    max_out = output_length[0]
    bias = INITIAL_BIAS

    # Pass 1: copy the basic code points to the output in order.
    input_length.times do |j|
      if basic(input[j])
        raise PunycodeBigOutput if max_out - out < 2
        output[out] =
          if case_flags
            encode_basic(input[j], case_flags[j])
          else
            input[j]
          end
        out+=1
      # elsif (input[j] < n)
      #   raise PunycodeBadInput
      # (not needed for Punycode with unsigned code points)
      end
    end

    # h counts the code points handled so far, b the basic ones.
    h = b = out

    # A delimiter follows the basic code points whenever there are any.
    if b > 0
      output[out] = DELIMITER
      out+=1
    end

   # Main loop: handle the remaining code points in increasing order.
   while h < input_length

      # Find the smallest code point that is >= n.
      m = MAXINT
      input_length.times do |j|
        # next if basic(input[j])
        # (not needed for Punycode)
        m = input[j] if (n...m) === input[j]
      end

      # Advance the decoder state <n, i> to <m, 0>, guarding against overflow.
      raise PunycodeOverflow if m - n > (MAXINT - delta) / (h + 1)
      delta += (m - n) * (h + 1)
      n = m

      input_length.times do |j|
        # Punycode does not need to check whether input[j] is basic:
        if input[j] < n # || basic(input[j])
          delta+=1
          raise PunycodeOverflow if delta == 0
        end

        if input[j] == n

          # Write delta as a variable-length integer, one digit per pass.
          q = delta; k = BASE
          while true
            raise PunycodeBigOutput if out >= max_out
            t = if k <= bias # + TMIN # +TMIN not needed
                  TMIN
                elsif k >= bias + TMAX
                  TMAX
                else
                  k - bias
                end
            break if q < t
            output[out] = encode_digit(t + (q - t) % (BASE - t), false)
            out+=1
            q = (q - t) / (BASE - t)
            k += BASE
          end

          # The final digit carries the case flag of this code point.
          output[out] = encode_digit(q, case_flags && case_flags[j])
          out+=1
          bias = adapt(delta, h + 1, h == b)
          delta = 0
          h+=1
        end
      end

      delta+=1; n+=1
    end

    # Report the number of values written through the in/out parameter.
    output_length[0] = out
    return PunycodeSuccess
  end

  # Decodes an array of Punycode ASCII codes into code points.
  #
  # input_length::  number of leading entries of +input+ to decode
  # input::         array of integer ASCII codes
  # output_length:: one-element array; element 0 holds the output capacity
  #                 on entry and the number of code points written on return
  # output::        array that receives the decoded code points
  # case_flags::    nil, or an array that receives one flag per output code
  #                 point (true when the deciding input character was an
  #                 uppercase ASCII letter, see flagged)
  #
  # Returns PunycodeSuccess. Raises PunycodeBadInput for malformed input,
  # PunycodeBigOutput when the capacity is exceeded and PunycodeOverflow
  # when an intermediate value would exceed MAXINT.
  def punycode_decode(input_length, input, output_length, output, case_flags)

    n = INITIAL_N

    out = i = 0
    max_out = output_length[0]
    bias = INITIAL_BIAS

    # b becomes the index of the last delimiter, or 0 when there is none.
    b = 0
    input_length.times do |j|
      b = j if delim(input[j])
    end
    raise PunycodeBigOutput if b > max_out

    # Everything before the last delimiter is copied verbatim.
    b.times do |j|
      case_flags[out] = flagged(input[j]) if case_flags
      raise PunycodeBadInput unless basic(input[j])
      output[out] = input[j]
      out+=1
    end

    # Main loop: start just after the last delimiter if any basic code
    # points were copied, otherwise at the beginning.
    in_ = b > 0 ? b + 1 : 0
    while in_ < input_length

      # Decode one variable-length integer into i (added to the old i).
      oldi = i; w = 1; k = BASE
      while true
        raise PunycodeBadInput if in_ >= input_length
        digit = decode_digit(input[in_])
        in_+=1
        raise PunycodeBadInput if digit >= BASE
        raise PunycodeOverflow if digit > (MAXINT - i) / w
        i += digit * w
        t = if k <= bias # + TMIN # +TMIN not needed
              TMIN
            elsif k >= bias + TMAX
              TMAX
            else
              k - bias
            end
        break if digit < t
        raise PunycodeOverflow if w > MAXINT / (BASE - t)
        w *= BASE - t
        k += BASE
      end

      bias = adapt(i - oldi, out + 1, oldi == 0)

      # i was supposed to wrap around from out+1 to 0, incrementing n
      # each time, so fix that now.
      raise PunycodeOverflow if i / (out + 1) > MAXINT - n
      n += i / (out + 1)
      i %= out + 1

      # not needed for Punycode:
      # raise PUNYCODE_INVALID_INPUT if decode_digit(n) <= base
      raise PunycodeBigOutput if out >= max_out

      if case_flags
        #memmove(case_flags + i + 1, case_flags + i, out - i)
        case_flags[i + 1, out - i] = case_flags[i, out - i]

        # Case of last character determines uppercase flag:
        case_flags[i] = flagged(input[in_ - 1])
      end

      # Insert n at position i: shift the tail right by one, then store.
      #memmove(output + i + 1, output + i, (out - i) * sizeof *output)
      output[i + 1, out - i] = output[i, out - i]
      output[i] = n
      i+=1

      out+=1
    end

    # Report the number of code points written through the in/out parameter.
    output_length[0] = out
    return PunycodeSuccess
  end

  # Encodes a UTF-8 string as Punycode and returns the result as a String.
  #
  # unicode_string::   text to encode (unpacked with 'U*')
  # case_flags::       optional per-code-point flags passed to punycode_encode
  # print_ascii_only:: when truthy, every output code is replaced by its
  #                    PRINT_ASCII entry
  #
  # Raises the errors of punycode_encode, Error when an output value lies
  # outside 0..127 and PunycodeBadInput when PRINT_ASCII has no entry for it.
  def encode(unicode_string, case_flags=nil, print_ascii_only=false)
    code_points = unicode_string.unpack('U*')
    buffer = Array.new(ACE_MAX_LENGTH + 1, 0)
    written = [ACE_MAX_LENGTH]

    punycode_encode(code_points.size, code_points, case_flags, written, buffer)

    count = written[0]
    (0...count).each do |idx|
      code = buffer[idx]
      if code < 0 || code > 127
        raise Error, "assertion error: invalid output char"
      end
      printable = PRINT_ASCII[code]
      raise PunycodeBadInput unless printable
      buffer[idx] = printable if print_ascii_only
    end

    # The slice includes one untouched zero slot; the trailing NULs are
    # removed again before the string is returned.
    buffer[0..count].map { |unit| unit.chr }.join('').sub(/\0+\z/, '')
  end

  # Decodes a Punycode string and returns the result as a UTF-8 String.
  #
  # punycode::   ASCII text to decode
  # case_flags:: array that receives the per-code-point case flags
  #
  # Raises PunycodeBigOutput when the input is longer than twice
  # ACE_MAX_LENGTH, PunycodeBadInput for bytes above 127, and whatever
  # punycode_decode raises.
  def decode(punycode, case_flags=[])
    raise PunycodeBigOutput if punycode.size > ACE_MAX_LENGTH * 2

    ascii_codes = []
    punycode.each_byte do |byte|
      raise PunycodeBadInput if byte < 0 || byte > 127
      ascii_codes << byte
    end

    code_points = []
    capacity = [UNICODE_MAX_LENGTH]
    Punycode.punycode_decode(ascii_codes.length, ascii_codes, capacity,
                             code_points, case_flags)
    code_points.pack('U*')
  end

  # Capacity passed to punycode_decode by decode (maximum decoded code points).
  UNICODE_MAX_LENGTH = 256
  # Capacity passed to punycode_encode by encode; decode rejects input
  # longer than twice this value.
  ACE_MAX_LENGTH = 256

  # The following string is used to convert printable
  # characters between ASCII and the native charset:
  # it has 128 entries indexed by ASCII code; the control codes 0..31
  # and code 127 are mapped to "\n".

  PRINT_ASCII =
    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
    " !\"\#$%&'()*+,-./" \
    "0123456789:;<=>?" \
    "@ABCDEFGHIJKLMNO" \
    "PQRSTUVWXYZ[\\]^_" \
    "`abcdefghijklmno" \
    "pqrstuvwxyz{|}~\n"
end



# cf. http://snippets.dzone.com/posts/show/4527

# Matches a string that consists entirely of well-formed UTF-8 byte
# sequences; of the single-byte range only tab, LF, CR and 0x20-0x7E are
# accepted. The /n option makes the pattern operate on raw bytes.
UTF8REGEX = /\A(?:                                                            
              [\x09\x0A\x0D\x20-\x7E]            # ASCII
            | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
            |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
            |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
            |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
            |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
            )*\z/mnx


# Matches a single well-formed multi-byte (2 to 4 byte) UTF-8 sequence
# anywhere in a string; used below to detect non-ASCII content.
UTF8_REGEX_MBYTE = /(?:                                 
                 [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
               |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
               | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
               |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
               |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
               | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
               |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
               )/mnx



# cf. http://demo.icu-project.org/icu-bin/idnbrowser (samples)
# on Mac OS X you can check the Ruby conversions with the GUI app PunyCode, http://software.dibomedia.de/products/show/2

# Sample inputs. Every assignment overwrites the previous one, so only the
# last line takes effect; reorder the lines to try another sample.
str = "http://www.ﺱﺲﺷ.com/"
str = "www.сделат картинки.com"
str = "http://www.сделаткартинки.com/"
str = "http://tūdaliņ.lv/"
str = "http://www.zürich.com/"
str = "http://www.hören.at/"
str = "http://www.žlutý kůň.com/"
str = "www.färgbolaget.nu"
str = "www.brændendekærlighed.com"
str = "www.mäkitorppa.com"
str = "www.färjestadsbk.net"
str = "あーるいん.com"
str = "www.예비교사.com"
str = "www.ハンドボールサムズ.com"
str = "www.日本平.jp"
str = "www.räksmörgås.se"
str = "www.różyczka.pl/"
str = "理容ナカムラ.com"
str = "http://Bücher.ch/"
str = "tūdaliņ.lv"


# Only proceed when the input is valid UTF-8 and contains at least one
# multi-byte character.
if str =~ UTF8REGEX && str =~ UTF8_REGEX_MBYTE

   # Split the input into four pieces with one substitution each:
   # s1: leading "http://www." or "http://" (empty when absent)
   s1 = str.gsub(/^(http:\/\/www\.|http:\/\/|).*?\.[^\.\/]+\/?$/n, '\1')
   # s2: a "www." that follows the prefix consumed above (empty when absent)
   s2 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|)(www\.|).*?\.[^\.\/]+\/?$/n, '\1')
   # s3: the text between the prefix and the final ".suffix"
   s3 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|)(.*?)\.[^\.\/]+\/?$/n, '\1')
   # s4: the final ".suffix" including an optional trailing slash
   s4 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|).*?(\.[^\.\/]+\/?)$/n, '\1')

   # Default to plain http when the input carried no scheme.
   if s1.empty? then s1 = 'http://' end

   # Lowercase, normalize (NFKC) and Punycode-encode the middle part.
   s3 = Punycode.encode(Unicode::normalize_KC(Unicode::downcase(s3)))

   punycoded_url = s1 << s2 << "xn--" << s3 << s4

   puts punycoded_url

   # NOTE(review): the URL is interpolated into a shell command line; this is
   # only safe because str is a hard-coded sample - do not feed it untrusted input.
   %x{ /usr/bin/open "#{punycoded_url}" }

end


Convert Unicode codepoints to UTF-8 characters with Module#const_missing

From: http://www.davidflanagan.com/blog/2007_08.html#000136
Author: David Flanagan


# This module lazily defines constants of the form Uxxxx for all Unicode
# codepoints from U0000 to U10FFFF. The value of each constant is the
# UTF-8 string for the codepoint.
# Examples:
#   copyright = Unicode::U00A9
#   euro = Unicode::U20AC
#   infinity = Unicode::U221E
#
module Unicode
  # Called by Ruby when an undefined constant of this module is referenced.
  # Names of the form U0000 .. U10FFFF are turned into a frozen UTF-8
  # string, stored as a real constant and returned; any other name raises
  # NameError.
  def self.const_missing(name)
    matched = /^U([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/.match(name.to_s)

    # Raise an error for constants that are not Unicode codepoints.
    unless matched
      raise NameError, "Uninitialized constant: Unicode::#{name}"
    end

    # Defining the constant means this hook runs only once per codepoint.
    codepoint = matched[1].to_i(16)
    const_set(name, [codepoint].pack("U").freeze)
  end
end


# Demo: the first reference to each constant goes through const_missing,
# later references read the constant that was defined on the first access.
copyright = Unicode::U00A9
puts copyright
euro = Unicode::U20AC
puts euro
euro = Unicode::U20AC
puts euro
infinity = Unicode::U221E
puts infinity
puts Unicode.const_get(:U221E)

# List the constants defined so far, then print each one's value.
defined_names = Unicode.constants
p defined_names
puts Unicode.constants
Unicode.constants.each do |constant_name|
  puts Unicode.const_get(constant_name)
end


UTF8-aware string methods in Ruby

Author: ntk
License: The MIT License, Copyright (c) 2007 ntk
Description: some basic UTF8-aware string methods for Ruby's String class (Ruby 1.8.6)
Requirements: save this snippet to a UTF-8 encoded file and set the character set encoding of Terminal.app
to UTF-8 (on Mac OS X: Terminal menu -> Window Settings -> Display -> Character Set Encoding; to enable additional features see here)


Further tools:
- rbuconv, a pure Ruby library for Unicode translation
- unicode, a library for Unicode Normalization (sudo gem install unicode); for a Windows version see Unicode in Ruby on Rails
- ICU4R, a Ruby C-extension binding for the ICU library
- Msort, a command-line sorting program
- punycode4r, a pure Ruby implementation of Punycode (RFC 3492; sudo gem install punycode4r)
- utf8proc, library for processing UTF-8 encoded Unicode strings, (sudo gem install utf8proc)
- Oniguruma, Ruby's regular expression engine; cf. Secure UTF-8 Input in Rails and Migrating your Rails application to Unicode
- character-encodings, seamless integration of character encodings into Ruby's String class, (sudo gem install character-encodings)



class String

   require 'iconv' 
   require 'open-uri'      # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html

   # Matches a string that consists entirely of well-formed UTF-8 byte
   # sequences (of the single-byte range only tab, LF, CR and 0x20-0x7E
   # are accepted); used below to validate the character tables.
   # taken from: http://www.w3.org/International/questions/qa-forms-utf-8
   UTF8REGEX = /\A(?:                               # ?: non-capturing group (grouping with no back references)
                 [\x09\x0A\x0D\x20-\x7E]            # ASCII
               | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
               |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
               | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
               |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
               |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
               | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
               |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
               )*\z/mnx


#  create UTF-8 character arrays (as class instance variables)
#
#  mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
#                  - http://unicode.org/Public/UNIDATA/UnicodeData.txt 
#                  - http://unicode.org/Public/UNIDATA/CaseFolding.txt
#                  - http://www.decodeunicode.org 
#                  - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
#                  - http://camomile.sourceforge.net
#                  - Character Palette (Mac OS X)


   # test data
   # Converts "U+XXXX" labels into UTF-8 strings; a label whose encoding
   # does not match UTF8REGEX is replaced by nil.
   utf8_from_labels = lambda do |labels|
      labels.map do |label|
         encoded = [label[2..-1].hex].pack("U*")
         encoded =~ UTF8REGEX ? encoded : nil
      end
   end

   @small_letters_utf8 = utf8_from_labels.call(
      %w[U+00F1 U+00F4 U+00E6 U+00F8 U+00E0 U+00E1 U+00E2 U+00E4
         U+00E5 U+00E7 U+00E8 U+00E9 U+00EA U+00EB U+0153])

   @capital_letters_utf8 = utf8_from_labels.call(
      %w[U+00D1 U+00D4 U+00C6 U+00D8 U+00C0 U+00C1 U+00C2 U+00C4
         U+00C5 U+00C7 U+00C8 U+00C9 U+00CA U+00CB U+0152])

   @other_letters_utf8 = utf8_from_labels.call(
      %w[U+03A3 U+0639 U+0041 U+F8D0 U+F8FF U+4E2D U+F4EE U+00FE U+10FFFF U+00A9
         U+20AC U+221E U+20AC U+FEFF U+FFFD U+00FF U+00FE U+FFFE U+FEFF])

   # A nil entry means one of the labels above failed the UTF-8 check.
   raise "Invalid UTF-8 char in @small_letters_utf8!" if @small_letters_utf8.size != @small_letters_utf8.nitems
   raise "Invalid UTF-8 char in @capital_letters_utf8!" if @capital_letters_utf8.size != @capital_letters_utf8.nitems
   raise "Invalid UTF-8 char in @other_letters_utf8!" if @other_letters_utf8.size != @other_letters_utf8.nitems


   @unicode_array = []
   #open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } }  end
   #open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
   #   f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/  ?  ( line.scan(/^[^;]+/) { |u| @unicode_array << u } )  :  next  end
   #end

   #@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact   # code points from UnicodeData.txt
   # test data only
   @letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8

   # Lookup tables, built as Hash[*array_with_keys.zip(array_with_values).flatten]:
   # capital => small, small => capital, and letter => nil.
   downcase_pairs = @capital_letters_utf8.zip(@small_letters_utf8)
   upcase_pairs = @small_letters_utf8.zip(@capital_letters_utf8)
   letter_pairs = @letters_utf8.zip([])

   @downcase_table_utf8 = Hash[*downcase_pairs.flatten]
   @upcase_table_utf8 = Hash[*upcase_pairs.flatten]
   @letters_utf8_hash = Hash[*letter_pairs.flatten]    #=> ... "\341\272\242"=>nil ...

   # Class-level readers and writers for the character tables stored in
   # the class instance variables of the same names.
   class << self
      attr_accessor :small_letters_utf8, :capital_letters_utf8, :other_letters_utf8,
                    :letters_utf8, :letters_utf8_hash, :unicode_array,
                    :downcase_table_utf8, :upcase_table_utf8
   end


   # Yields every UTF-8 character of the string (newlines included) in order.
   def each_utf8_char
      scan(/./mu) do |utf8_char|
         yield utf8_char
      end
   end

   # Yields every UTF-8 character together with its zero-based character
   # position.
   def each_utf8_char_with_index
      position = 0
      scan(/./mu) do |utf8_char|
         yield(utf8_char, position)
         position += 1
      end
   end

   # Returns the number of UTF-8 characters (not bytes) in the string.
   # Also available as size_utf8.
   def length_utf8
      scan(/./mu).length
   end
   alias :size_utf8 :length_utf8

   # Returns a new string with the UTF-8 characters in reverse order.
   def reverse_utf8
      characters = split(//mu)
      characters.reverse!
      characters.join
   end

   # Reverses the UTF-8 characters of the string in place and returns self.
   #
   # The previous version only reversed a temporary array and returned a
   # new string, leaving the receiver untouched.
   def reverse_utf8!
      replace(split(//mu).reverse.join)
   end

   # Returns a copy with the case of every character swapped. Characters
   # found in the class-level downcase/upcase tables use their mapped
   # counterpart; all others fall back to String#swapcase.
   def swapcase_utf8
      gsub(/./mu) do |char|
         lowered = String.downcase_table_utf8[char]
         next lowered unless lowered.nil?

         raised = String.upcase_table_utf8[char]
         raised.nil? ? char.swapcase : raised
      end
   end

   # In-place variant of swapcase_utf8. Returns what gsub! returns: the
   # receiver, or nil when nothing was substituted (empty string).
   def swapcase_utf8!
      gsub!(/./mu) do |char|
         lowered = String.downcase_table_utf8[char]
         next lowered unless lowered.nil?

         raised = String.upcase_table_utf8[char]
         raised.nil? ? char.swapcase : raised
      end
   end

   # Returns a lowercased copy. Characters listed in the class-level
   # downcase table use their mapping; all others use String#downcase.
   def downcase_utf8
      gsub(/./mu) do |char|
         mapped = String.downcase_table_utf8[char]
         if mapped.nil?
            char.downcase
         else
            mapped
         end
      end
   end

   # In-place variant of downcase_utf8. Returns what gsub! returns: the
   # receiver, or nil when nothing was substituted (empty string).
   def downcase_utf8!
      gsub!(/./mu) do |char|
         mapped = String.downcase_table_utf8[char]
         if mapped.nil?
            char.downcase
         else
            mapped
         end
      end
   end

   # Returns an uppercased copy. Characters listed in the class-level
   # upcase table use their mapping; all others use String#upcase.
   def upcase_utf8
      gsub(/./mu) do |char|
         mapped = String.upcase_table_utf8[char]
         if mapped.nil?
            char.upcase
         else
            mapped
         end
      end
   end

   # In-place variant of upcase_utf8. Returns what gsub! returns: the
   # receiver, or nil when nothing was substituted (empty string).
   def upcase_utf8!
      gsub!(/./mu) do |char|
         mapped = String.upcase_table_utf8[char]
         if mapped.nil?
            char.upcase
         else
            mapped
         end
      end
   end

   # Counts the characters of the string that belong to the set +c+.
   # +c+ is placed inside a regexp character class, so class syntax such
   # as "a-z" or a leading "^" is interpreted. Returns nil for an empty +c+.
   def count_utf8(c)
      if c.empty?
         nil
      else
         char_class = %r{[#{c}]}mu
         scan(char_class).length
      end
   end

   # Returns a copy with every character of the set +c+ removed. +c+ is
   # placed inside a regexp character class. Returns self (not a copy)
   # when +c+ is empty.
   def delete_utf8(c)
      if c.empty?
         self
      else
         char_class = %r{[#{c}]}mu
         gsub(char_class, '')
      end
   end

   # Removes every character of the set +c+ from the string in place.
   # Returns self when +c+ is empty; otherwise returns what gsub! returns
   # (the receiver, or nil when nothing was removed).
   def delete_utf8!(c)
      if c.empty?
         self
      else
         char_class = %r{[#{c}]}mu
         gsub!(char_class, '')
      end
   end

   # Returns the first UTF-8 character of the string, or nil when the
   # string is empty.
   def first_utf8
      slice(/\A./mu)
   end

   # Returns the last UTF-8 character of the string, or nil when the
   # string is empty.
   def last_utf8
      slice(/.\z/mu)
   end

   # Returns a copy in which every space-separated word starts with an
   # uppercase character followed by lowercase characters. Case mapping is
   # delegated to upcase_utf8 / downcase_utf8. A blank string is returned
   # as is (self).
   #
   # Fixes over the previous version:
   # - the result of the non-bang gsub was discarded, so the text came
   #   back uncapitalized;
   # - the downcase branch consulted the upcase table, so capital letters
   #   from the tables were never lowered;
   # - trailing spaces were dropped (split is now called with limit -1).
   def capitalize_utf8
     return self if self =~ /\A[[:space:]]*\z/m

     words = split(/\x20/, -1).map do |word|
        first_done = false
        word.gsub(/./mu) do |char|
           if first_done
              char.downcase_utf8
           else
              first_done = true
              char.upcase_utf8
           end
        end
     end
     words.join(' ')
   end

   # In-place variant of capitalize_utf8: every space-separated word gets
   # an uppercase first character followed by lowercase characters.
   # Returns self.
   #
   # The previous version built the capitalized text but never stored it
   # in the receiver; it also consulted the upcase table when lowering
   # characters and dropped trailing spaces.
   def capitalize_utf8!
     return self if self =~ /\A[[:space:]]*\z/m

     words = split(/\x20/, -1).map do |word|
        first_done = false
        word.gsub(/./mu) do |char|
           if first_done
              char.downcase_utf8
           else
              first_done = true
              char.upcase_utf8
           end
        end
     end
     replace(words.join(' '))
   end


   # Returns the character (not byte) position of the first occurrence of
   # +s+ in the string, or nil when there is none.
   #
   # s:: a String (matched literally) or a Regexp (re-created with the
   #     'u' option added). Any other argument type yields nil, as does an
   #     empty receiver, an empty string or an empty pattern.
   #
   # Fixes over the previous version:
   # - a match at position 0 returned nil, because an empty prefix was
   #   mistaken for "no match";
   # - string arguments were interpolated into a regexp unescaped, so
   #   characters such as "+" or "(" raised RegexpError or matched as
   #   metacharacters.
   def index_utf8(s)
      return nil if empty?

      pattern =
         case s
         when Regexp
            return nil if s.source.empty?
            opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
            opts = opts + "u" if opts.count('u') == 0
            # NOTE(review): eval of the pattern source executes any "#{...}"
            # it contains - never pass a Regexp built from untrusted text.
            eval("%r{#{s.source}}" + opts)
         when String
            return nil if s.empty?
            %r{#{Regexp.escape(s)}}mu
         else
            return nil
         end

      found = pattern.match(self)
      # The prefix before the match, counted in characters, is the index.
      found ? found.pre_match.length_utf8 : nil
   end


   # Returns the character (not byte) position of the last occurrence of
   # +s+ in the string (as found by a left-to-right scan), or nil when
   # there is none.
   #
   # s:: a String (matched literally) or a Regexp (re-created with the
   #     'u' option added). Any other argument type yields nil, as does an
   #     empty receiver, an empty string or an empty pattern.
   #
   # Fixes over the previous version:
   # - a single match at position 0 returned nil, because an empty prefix
   #   was mistaken for "no match";
   # - string arguments were interpolated into a regexp unescaped.
   def rindex_utf8(s)
      return nil if empty?

      pattern =
         case s
         when Regexp
            return nil if s.source.empty?
            opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
            opts = opts + "u" if opts.count('u') == 0
            # NOTE(review): eval of the pattern source executes any "#{...}"
            # it contains - never pass a Regexp built from untrusted text.
            eval("%r{#{s.source}}" + opts)
         when String
            return nil if s.empty?
            %r{#{Regexp.escape(s)}}mu
         else
            return nil
         end

      # Remember the text to the left of each match; the last one wins.
      prefix = nil
      scan(pattern) { prefix = Regexp.last_match.pre_match }
      prefix ? prefix.length_utf8 : nil
   end


   # note that the i option does not work in special cases with back references
   # example: "àÀ".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
   def slice_utf8(regex)   
      opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
      if  opts.count('u') == 0 then opts = opts + "u" end
      s = regex.source
      str = "%r{#{s}}" + opts
      r = eval(str)
      slice(r)