For more information please see:
- Punycode
- Internationalized domain name
#!/usr/local/bin/ruby -Ku
# NOTE: The following is not the complete source code by Kazuhiro NISHIYAMA.
# For the full source code with more features, comments & test cases please see:
# open -e `gem environment gemdir`/gems/punycode4r-0.2.0/lib/punycode.rb
#
# This is pure Ruby implementing Punycode (RFC 3492).
# (original ANSI C code (C89) implementing Punycode is in RFC 3492)
#
# copyright (c) 2005 Kazuhiro NISHIYAMA
# You can redistribute it and/or modify it under the same terms as Ruby.

# The "unicode" gem is only used by the demo at the bottom of this file
# (case folding + NFKC normalization). When it is not installed we fall back
# to String#unicode_normalize, which ships with Ruby >= 2.2 (full Unicode
# String#downcase needs Ruby >= 2.4). If neither is available the original
# LoadError is re-raised.
begin
  require "unicode" # sudo gem install unicode
rescue LoadError
  raise unless "".respond_to?(:unicode_normalize)
end

# Punycode (RFC 3492) encoder/decoder, a close port of the reference C code.
#
# The low-level entry points (punycode_encode / punycode_decode) keep the C
# calling convention: code points are passed as Arrays of Integers, the
# caller supplies the output Array, and +output_length+ is a one-element
# Array used as an in/out parameter. Punycode.encode and Punycode.decode are
# the String-based convenience wrappers.
module Punycode
  # Result/exception classes mirroring the status codes of the C reference.
  module Status
    class Error < StandardError; end
    class PunycodeSuccess; end
    # Input is invalid.
    class PunycodeBadInput < Error; end
    # Output would exceed the space provided.
    class PunycodeBigOutput < Error; end
    # Input needs wider integers to process.
    class PunycodeOverflow < Error; end
  end
  include Status

  # Bootstring parameters for Punycode (RFC 3492).
  BASE = 36
  TMIN = 1
  TMAX = 26
  SKEW = 38
  DAMP = 700
  INITIAL_BIAS = 72
  INITIAL_N = 0x80
  DELIMITER = 0x2D

  # Upper bound used by the overflow checks inherited from the C code.
  MAXINT = 1 << 64

  module_function

  # True when +cp+ is a basic (ASCII) code point.
  def basic(cp)
    cp < 0x80
  end

  # True when +cp+ is the delimiter code point ("-").
  def delim(cp)
    cp == DELIMITER
  end

  # Returns the numeric value (0...BASE) of the basic code point +cp+, or
  # BASE when +cp+ does not represent a digit.
  #
  # "a".."z" and "A".."Z" map to 0..25, "0".."9" map to 26..35.
  #
  # The C reference relies on unsigned wrap-around (`cp - 48 < 10`); with
  # Ruby's signed Integers that test is also true for every cp < 48, so the
  # ranges are spelled out explicitly here. Without this, characters such as
  # "!" or ":" were silently accepted as (possibly negative) digits.
  def decode_digit(cp)
    case cp
    when 48..57  then cp - 22 # "0".."9"
    when 65..90  then cp - 65 # "A".."Z"
    when 97..122 then cp - 97 # "a".."z"
    else BASE
    end
  end

  # Returns the basic code point whose value is +d+ (0...BASE).
  # A truthy +flag+ selects the uppercase form; digits 26..35 always map to
  # "0".."9" when +flag+ is false.
  def encode_digit(d, flag)
    d + 22 + 75 * ((d < 26) ? 1 : 0) - ((flag ? 1 : 0) << 5)
  end

  # True when the basic code point +bcp+ is an uppercase letter ("A".."Z").
  def flagged(bcp)
    (0...26) === (bcp - 65)
  end

  # Forces the case of the basic code point +bcp+: uppercase when +flag+ is
  # truthy, lowercase otherwise. Caseless code points are returned unchanged.
  def encode_basic(bcp, flag)
    # C: bcp -= (bcp - 97 < 26) << 5;
    bcp -= 1 << 5 if (0...26) === (bcp - 97)
    # C: return bcp + ((!flag && (bcp - 65 < 26)) << 5);
    bcp += 1 << 5 if !flag && (0...26) === (bcp - 65)
    bcp
  end

  # Bias adaptation function of RFC 3492. Returns the new bias.
  #
  # delta::     the delta that was just encoded/decoded
  # numpoints:: number of code points handled so far (including this one)
  # firsttime:: true for the very first delta
  def adapt(delta, numpoints, firsttime)
    delta = firsttime ? delta / DAMP : delta >> 1
    delta += delta / numpoints
    k = 0
    while delta > ((BASE - TMIN) * TMAX) / 2
      delta /= BASE - TMIN
      k += BASE
    end
    k + (BASE - TMIN + 1) * delta / (delta + SKEW)
  end

  # Converts Unicode code points to Punycode (C-style interface).
  #
  # input_length::  number of code points to read from +input+
  # input::         Array of Integer code points
  # case_flags::    nil, or an Array of booleans (truthy = uppercase the
  #                 corresponding output letter)
  # output_length:: one-element Array; on entry the maximum number of code
  #                 points that may be written, on return the number written
  # output::        Array receiving the ASCII code points
  #
  # Returns PunycodeSuccess. Raises PunycodeBigOutput or PunycodeOverflow.
  def punycode_encode(input_length, input, case_flags, output_length, output)
    n = INITIAL_N
    delta = out = 0
    max_out = output_length[0]
    bias = INITIAL_BIAS

    # Handle the basic code points: copy them to the output in order.
    input_length.times do |j|
      if basic(input[j])
        raise PunycodeBigOutput if max_out - out < 2
        output[out] =
          if case_flags
            encode_basic(input[j], case_flags[j])
          else
            input[j]
          end
        out += 1
        # elsif (input[j] < n)
        #   raise PunycodeBadInput
        # (not needed for Punycode with unsigned code points)
      end
    end

    # h is the number of code points that have been handled,
    # b is the number of basic code points.
    h = b = out
    if b > 0
      output[out] = DELIMITER
      out += 1
    end

    # Main encoding loop.
    while h < input_length
      # All non-basic code points < n have been handled already.
      # Find the next larger one.
      m = MAXINT
      input_length.times do |j|
        # next if basic(input[j])
        # (not needed for Punycode)
        m = input[j] if (n...m) === input[j]
      end

      # Increase delta enough to advance the decoder's <n,i> state to <m,0>.
      raise PunycodeOverflow if m - n > (MAXINT - delta) / (h + 1)
      delta += (m - n) * (h + 1)
      n = m

      input_length.times do |j|
        # Punycode does not need to check whether input[j] is basic:
        if input[j] < n # || basic(input[j])
          delta += 1
          raise PunycodeOverflow if delta == 0
        end

        if input[j] == n
          # Represent delta as a generalized variable-length integer.
          q = delta
          k = BASE
          while true
            raise PunycodeBigOutput if out >= max_out
            t =
              if k <= bias # + TMIN # +TMIN not needed
                TMIN
              elsif k >= bias + TMAX
                TMAX
              else
                k - bias
              end
            break if q < t
            output[out] = encode_digit(t + (q - t) % (BASE - t), false)
            out += 1
            q = (q - t) / (BASE - t)
            k += BASE
          end

          output[out] = encode_digit(q, case_flags && case_flags[j])
          out += 1
          bias = adapt(delta, h + 1, h == b)
          delta = 0
          h += 1
        end
      end

      delta += 1
      n += 1
    end

    output_length[0] = out
    return PunycodeSuccess
  end

  # Converts Punycode to Unicode code points (C-style interface).
  #
  # input_length::  number of code points to read from +input+
  # input::         Array of Integer (ASCII) code points
  # output_length:: one-element Array; on entry the maximum number of code
  #                 points that may be written, on return the number written
  # output::        Array receiving the decoded code points
  # case_flags::    nil, or an Array that receives one boolean per output
  #                 code point (true = the input letter was uppercase)
  #
  # Returns PunycodeSuccess. Raises PunycodeBadInput, PunycodeBigOutput or
  # PunycodeOverflow.
  def punycode_decode(input_length, input, output_length, output, case_flags)
    n = INITIAL_N
    out = i = 0
    max_out = output_length[0]
    bias = INITIAL_BIAS

    # Handle the basic code points: let b be the number of input code points
    # before the last delimiter, or 0 if there is none, then copy the first
    # b code points to the output.
    b = 0
    input_length.times do |j|
      b = j if delim(input[j])
    end
    raise PunycodeBigOutput if b > max_out

    b.times do |j|
      case_flags[out] = flagged(input[j]) if case_flags
      raise PunycodeBadInput unless basic(input[j])
      output[out] = input[j]
      out += 1
    end

    # Main decoding loop: start just after the last delimiter if any basic
    # code points were copied; start at the beginning otherwise.
    in_ = b > 0 ? b + 1 : 0
    while in_ < input_length
      # Decode a generalized variable-length integer into delta, which gets
      # added to i. The overflow checking is easier if we increase i as we
      # go, then subtract off its starting value at the end to obtain delta.
      oldi = i
      w = 1
      k = BASE
      while true
        raise PunycodeBadInput if in_ >= input_length
        digit = decode_digit(input[in_])
        in_ += 1
        raise PunycodeBadInput if digit >= BASE
        raise PunycodeOverflow if digit > (MAXINT - i) / w
        i += digit * w
        t =
          if k <= bias # + TMIN # +TMIN not needed
            TMIN
          elsif k >= bias + TMAX
            TMAX
          else
            k - bias
          end
        break if digit < t
        raise PunycodeOverflow if w > MAXINT / (BASE - t)
        w *= BASE - t
        k += BASE
      end

      bias = adapt(i - oldi, out + 1, oldi == 0)

      # i was supposed to wrap around from out+1 to 0,
      # incrementing n each time, so we'll fix that now:
      raise PunycodeOverflow if i / (out + 1) > MAXINT - n
      n += i / (out + 1)
      i %= out + 1

      # Insert n at position i of the output:
      # not needed for Punycode:
      # raise PUNYCODE_INVALID_INPUT if decode_digit(n) <= base
      raise PunycodeBigOutput if out >= max_out

      if case_flags
        # memmove(case_flags + i + 1, case_flags + i, out - i)
        case_flags[i + 1, out - i] = case_flags[i, out - i]
        # Case of last character determines uppercase flag:
        case_flags[i] = flagged(input[in_ - 1])
      end

      # memmove(output + i + 1, output + i, (out - i) * sizeof *output)
      output[i + 1, out - i] = output[i, out - i]
      output[i] = n
      i += 1
      out += 1
    end

    output_length[0] = out
    return PunycodeSuccess
  end

  # Encodes the UTF-8 String +unicode_string+ as a Punycode String
  # (without the "xn--" ACE prefix).
  #
  # case_flags::       optional Array of booleans, see punycode_encode
  # print_ascii_only:: when true, every output character is mapped through
  #                    PRINT_ASCII
  #
  # Raises PunycodeBigOutput when the result would exceed ACE_MAX_LENGTH.
  def encode(unicode_string, case_flags=nil, print_ascii_only=false)
    input = unicode_string.unpack('U*')
    output = [0] * (ACE_MAX_LENGTH + 1)
    output_length = [ACE_MAX_LENGTH]
    punycode_encode(input.size, input, case_flags, output_length, output)

    outlen = output_length[0]
    outlen.times do |j|
      c = output[j]
      unless c >= 0 && c <= 127
        raise Error, "assertion error: invalid output char"
      end
      raise PunycodeBadInput unless PRINT_ASCII[c]
      output[j] = PRINT_ASCII[c] if print_ascii_only
    end

    # The slice includes the zero-filled slot after the last output code
    # point; the trailing NUL it produces is stripped again here.
    output[0..outlen].map { |x| x.chr }.join('').sub(/\0+\z/, '')
  end

  # Decodes the Punycode String +punycode+ (without the "xn--" ACE prefix)
  # and returns the result as a UTF-8 String.
  #
  # case_flags:: Array that receives the per-character uppercase flags,
  #              see punycode_decode
  #
  # Raises PunycodeBigOutput for over-long input and PunycodeBadInput for
  # input that is not valid Punycode.
  def decode(punycode, case_flags=[])
    input = []
    output = []
    raise PunycodeBigOutput if ACE_MAX_LENGTH * 2 < punycode.size

    punycode.each_byte do |c|
      raise PunycodeBadInput unless c >= 0 && c <= 127
      input.push(c)
    end

    output_length = [UNICODE_MAX_LENGTH]
    Punycode.punycode_decode(input.length, input, output_length,
                             output, case_flags)
    output.pack('U*')
  end

  UNICODE_MAX_LENGTH = 256
  ACE_MAX_LENGTH = 256

  # The following string is used to convert printable
  # characters between ASCII and the native charset:
  PRINT_ASCII =
    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
    " !\"\#$%&'()*+,-./" \
    "0123456789:;<=>?" \
    "@ABCDEFGHIJKLMNO" \
    "PQRSTUVWXYZ[\\]^_" \
    "`abcdefghijklmno" \
    "pqrstuvwxyz{|}~\n"
end

# cf. http://snippets.dzone.com/posts/show/4527
# Matches a byte string that consists entirely of well-formed UTF-8.
UTF8REGEX = /\A(?:
    [\x09\x0A\x0D\x20-\x7E]            # ASCII
  | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
  | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
  | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
  | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
  | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
  | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
  )*\z/mnx

# Matches one well-formed multi-byte UTF-8 sequence anywhere in a byte string.
UTF8_REGEX_MBYTE = /(?:
    [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
  | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
  | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
  | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
  | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
  | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
  )/mnx

# cf. http://demo.icu-project.org/icu-bin/idnbrowser (samples)
# on Mac OS X you can check the Ruby conversions with the GUI app PunyCode, http://software.dibomedia.de/products/show/2

# Sample inputs; only the last assignment takes effect.
str = "http://www.ﺱﺲﺷ.com/"
str = "www.сделат картинки.com"
str = "http://www.сделаткартинки.com/"
str = "http://tūdaliņ.lv/"
str = "http://www.zürich.com/"
str = "http://www.hören.at/"
str = "http://www.žlutý kůň.com/"
str = "www.färgbolaget.nu"
str = "www.brændendekærlighed.com"
str = "www.mäkitorppa.com"
str = "www.färjestadsbk.net"
str = "あーるいん.com"
str = "www.예비교사.com"
str = "www.ハンドボールサムズ.com"
str = "www.日本平.jp"
str = "www.räksmörgås.se"
str = "www.różyczka.pl/"
str = "理容ナカムラ.com"
str = "http://Bücher.ch/"
str = "tūdaliņ.lv"

# UTF8REGEX / UTF8_REGEX_MBYTE are byte-oriented (/n). On Ruby >= 1.9 they
# can only be matched against a binary string, so validate a binary copy;
# on Ruby 1.8 strings are already plain byte sequences.
raw = str.respond_to?(:force_encoding) ? str.dup.force_encoding("BINARY") : str

if raw =~ UTF8REGEX && raw =~ UTF8_REGEX_MBYTE
  # Split the URL into: scheme/"www." prefix, optional bare "www.",
  # the (non-ASCII) host label, and the ".tld" suffix with optional "/".
  s1 = str.gsub(/^(http:\/\/www\.|http:\/\/|).*?\.[^\.\/]+\/?$/, '\1')
  s2 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|)(www\.|).*?\.[^\.\/]+\/?$/, '\1')
  s3 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|)(.*?)\.[^\.\/]+\/?$/, '\1')
  s4 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|).*?(\.[^\.\/]+\/?)$/, '\1')
  s1 = 'http://' if s1.empty?

  # Lowercase and NFKC-normalize the label before encoding it.
  s3 =
    if defined?(Unicode)
      Unicode::normalize_KC(Unicode::downcase(s3))
    else
      s3.downcase.unicode_normalize(:nfkc)
    end
  s3 = Punycode.encode(s3)

  punycoded_url = "#{s1}#{s2}xn--#{s3}#{s4}"
  puts punycoded_url
  # Pass the URL as a separate argument so that no shell ever parses it.
  system("/usr/bin/open", punycoded_url)
end