# This is a proof-of-concept snippet showing how to internationalize domain names
# using punycode4r (sudo gem install punycode4r).
# For more information, please see:
# - Punycode
# - Internationalized domain name
require "unicode"
# Punycode encoder/decoder working on arrays of integer code points.
#
# The low-level entry points (+punycode_encode+ / +punycode_decode+) keep a
# C-like calling convention: lengths are passed explicitly and the output
# length is returned through a one-element array. The structure appears to
# follow the sample implementation published with RFC 3492. The +encode+ /
# +decode+ helpers wrap them for ordinary Ruby strings.
module Punycode
  # Result marker and error classes used by the encoder and decoder.
  module Status
    # Common base class for every Punycode failure.
    class Error < StandardError; end
    # Returned (as a class object) by the low-level routines on success.
    class PunycodeSuccess; end
    # The input contains something that cannot be encoded/decoded.
    class PunycodeBadInput < Error; end
    # The caller-supplied output limit is too small for the result.
    class PunycodeBigOutput < Error; end
    # An intermediate value exceeded MAXINT.
    class PunycodeOverflow < Error; end
  end
  include Status

  # Bootstring parameters used throughout the algorithm.
  BASE = 36
  TMIN = 1
  TMAX = 26
  SKEW = 38
  DAMP = 700
  INITIAL_BIAS = 72
  INITIAL_N = 0x80
  DELIMITER = 0x2D # '-'

  # Upper bound used by the overflow guards below.
  MAXINT = 1 << 64

  # Limits applied by the string-level +encode+ / +decode+ helpers.
  UNICODE_MAX_LENGTH = 256
  ACE_MAX_LENGTH = 256

  # Lookup table indexed by ASCII code (0..127). Printable characters map to
  # themselves; control characters and DEL map to "\n".
  PRINT_ASCII =
    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
    " !\"\#$%&'()*+,-./" \
    "0123456789:;<=>?" \
    "@ABCDEFGHIJKLMNO" \
    "PQRSTUVWXYZ[\\]^_" \
    "`abcdefghijklmno" \
    "pqrstuvwxyz{|}~\n"

  module_function

  # Returns true when +cp+ is a basic code point (below 0x80).
  def basic(cp)
    cp < 0x80
  end

  # Returns true when +cp+ is the delimiter code point ('-').
  def delim(cp)
    cp == DELIMITER
  end

  # Maps the code point of a digit character to its numeric value:
  # 'A'..'Z' and 'a'..'z' give 0..25, '0'..'9' give 26..35.
  # Any other code point yields BASE, which callers treat as "not a digit".
  #
  # Explicit ranges are required here: Ruby integers do not wrap around, so a
  # C-style test such as `cp - 48 < 10` would also be true for every code
  # point below '0' and would produce bogus (even negative) digit values.
  def decode_digit(cp)
    case cp
    when 48..57  then cp - 22 # '0'..'9'
    when 65..90  then cp - 65 # 'A'..'Z'
    when 97..122 then cp - 97 # 'a'..'z'
    else BASE
    end
  end

  # Maps digit value +d+ (0..35) to the code point of its character.
  # 0..25 become letters and 26..35 become '0'..'9'. A truthy +flag+ subtracts
  # 32, which selects the uppercase form for letters.
  def encode_digit(d, flag)
    return d + 22 + 75 * ((d < 26) ? 1 : 0) - ((flag ? 1 : 0) << 5)
  end

  # Returns true when the basic code point +bcp+ is an uppercase letter.
  def flagged(bcp)
    (0...26) === (bcp - 65)
  end

  # Forces the case of the basic code point +bcp+: uppercase when +flag+ is
  # truthy, lowercase otherwise. Non-letters are returned unchanged.
  def encode_basic(bcp, flag)
    # First normalise lowercase letters to uppercase ...
    if (0...26) === (bcp - 97)
      bcp -= 1 << 5
    end
    # ... then drop back to lowercase unless uppercase was requested.
    if !flag && (0...26) === (bcp - 65)
      bcp += 1 << 5
    end
    bcp
  end

  # Bias adaptation: computes the next bias from the +delta+ just processed,
  # the number of code points handled so far (+numpoints+) and whether this
  # is the first adaptation (+firsttime+).
  def adapt(delta, numpoints, firsttime)
    delta = firsttime ? delta / DAMP : delta >> 1
    delta += delta / numpoints
    k = 0
    while delta > ((BASE - TMIN) * TMAX) / 2
      delta /= BASE - TMIN
      k += BASE
    end
    k + (BASE - TMIN + 1) * delta / (delta + SKEW)
  end

  # Low-level encoder.
  #
  # input_length  - number of code points of +input+ to encode.
  # input         - Array of integer code points.
  # case_flags    - nil, or an Array parallel to +input+; a truthy entry asks
  #                 for the uppercase form of the corresponding output.
  # output_length - one-element Array; on entry the maximum number of output
  #                 code points, on return the number actually written.
  # output        - Array that receives the ASCII code points.
  #
  # Returns PunycodeSuccess. Raises PunycodeBigOutput when the output limit
  # is too small and PunycodeOverflow when an intermediate value is too big.
  def punycode_encode(input_length, input, case_flags, output_length, output)
    n = INITIAL_N
    delta = out = 0
    max_out = output_length[0]
    bias = INITIAL_BIAS

    # Copy the basic code points straight through.
    input_length.times do |j|
      if basic(input[j])
        raise PunycodeBigOutput if max_out - out < 2
        output[out] =
          if case_flags
            encode_basic(input[j], case_flags[j])
          else
            input[j]
          end
        out += 1
      end
    end

    # h counts the code points handled so far, b the basic ones.
    h = b = out
    if b > 0
      output[out] = DELIMITER
      out += 1
    end

    while h < input_length
      # Find the smallest code point >= n that is still unhandled.
      m = MAXINT
      input_length.times do |j|
        m = input[j] if (n...m) === input[j]
      end

      raise PunycodeOverflow if m - n > (MAXINT - delta) / (h + 1)
      delta += (m - n) * (h + 1)
      n = m

      input_length.times do |j|
        if input[j] < n
          delta += 1
          # Guard kept from fixed-width arithmetic; Ruby integers do not wrap,
          # so delta cannot become 0 after being incremented here.
          raise PunycodeOverflow if delta == 0
        end
        if input[j] == n
          # Emit delta as a generalized variable-length integer.
          q = delta; k = BASE
          while true
            raise PunycodeBigOutput if out >= max_out
            t = if k <= bias
                  TMIN
                elsif k >= bias + TMAX
                  TMAX
                else
                  k - bias
                end
            break if q < t
            output[out] = encode_digit(t + (q - t) % (BASE - t), false)
            out += 1
            q = (q - t) / (BASE - t)
            k += BASE
          end
          # The final digit carries the case flag of this code point.
          output[out] = encode_digit(q, case_flags && case_flags[j])
          out += 1
          bias = adapt(delta, h + 1, h == b)
          delta = 0
          h += 1
        end
      end
      delta += 1; n += 1
    end

    output_length[0] = out
    return PunycodeSuccess
  end

  # Low-level decoder.
  #
  # input_length  - number of code points of +input+ to decode.
  # input         - Array of ASCII code points.
  # output_length - one-element Array; on entry the maximum number of output
  #                 code points, on return the number actually written.
  # output        - Array that receives the decoded code points.
  # case_flags    - nil/false, or an Array that receives one boolean per
  #                 output code point (true when the source character that
  #                 produced it was uppercase).
  #
  # Returns PunycodeSuccess. Raises PunycodeBadInput for malformed input,
  # PunycodeBigOutput when the output limit is too small and
  # PunycodeOverflow when an intermediate value is too big.
  def punycode_decode(input_length, input, output_length, output, case_flags)
    n = INITIAL_N
    out = i = 0
    max_out = output_length[0]
    bias = INITIAL_BIAS

    # b ends up as the index of the last delimiter (0 when there is none).
    b = 0
    input_length.times do |j|
      b = j if delim(input[j])
    end
    raise PunycodeBigOutput if b > max_out

    # Everything before the last delimiter is copied literally.
    b.times do |j|
      case_flags[out] = flagged(input[j]) if case_flags
      raise PunycodeBadInput unless basic(input[j])
      output[out] = input[j]
      out += 1
    end

    # Start decoding just after the last delimiter, if there was one.
    in_ = b > 0 ? b + 1 : 0
    while in_ < input_length
      # Decode one generalized variable-length integer into i.
      oldi = i; w = 1; k = BASE
      while true
        raise PunycodeBadInput if in_ >= input_length
        digit = decode_digit(input[in_])
        in_ += 1
        raise PunycodeBadInput if digit >= BASE
        raise PunycodeOverflow if digit > (MAXINT - i) / w
        i += digit * w
        t = if k <= bias
              TMIN
            elsif k >= bias + TMAX
              TMAX
            else
              k - bias
            end
        break if digit < t
        raise PunycodeOverflow if w > MAXINT / (BASE - t)
        w *= BASE - t
        k += BASE
      end

      bias = adapt(i - oldi, out + 1, oldi == 0)

      # i was supposed to wrap around from out+1 to 0, incrementing n each
      # time, so fix that up now.
      raise PunycodeOverflow if i / (out + 1) > MAXINT - n
      n += i / (out + 1)
      i %= out + 1

      # not needed for Punycode:
      # raise PUNYCODE_INVALID_INPUT if decode_digit(n) <= base
      raise PunycodeBigOutput if out >= max_out

      # Insert n at position i, shifting the tail one slot to the right.
      if case_flags
        case_flags[i + 1, out - i] = case_flags[i, out - i]
        case_flags[i] = flagged(input[in_ - 1])
      end
      output[i + 1, out - i] = output[i, out - i]
      output[i] = n
      i += 1
      out += 1
    end

    output_length[0] = out
    return PunycodeSuccess
  end

  # Encodes +unicode_string+ (UTF-8) and returns the Punycode ASCII string,
  # without any "xn--" prefix.
  #
  # case_flags       - optional Array of per-code-point uppercase flags,
  #                    passed through to punycode_encode.
  # print_ascii_only - when true, each output character is replaced by its
  #                    PRINT_ASCII entry.
  #
  # Raises the same errors as punycode_encode; the output limit is
  # ACE_MAX_LENGTH code points.
  def encode(unicode_string, case_flags=nil, print_ascii_only=false)
    input = unicode_string.unpack('U*')
    output = [0] * (ACE_MAX_LENGTH + 1)
    output_length = [ACE_MAX_LENGTH]
    punycode_encode(input.size, input, case_flags, output_length, output)
    outlen = output_length[0]
    outlen.times do |j|
      c = output[j]
      unless c >= 0 && c <= 127
        raise Error, "assertion error: invalid output char"
      end
      unless PRINT_ASCII[c]
        raise PunycodeBadInput
      end
      output[j] = PRINT_ASCII[c] if print_ascii_only
    end
    # Take exactly the code points that were written; no padding to strip.
    output[0, outlen].map { |x| x.chr }.join('')
  end

  # Decodes the Punycode ASCII string +punycode+ (without "xn--" prefix) and
  # returns the UTF-8 string.
  #
  # case_flags - Array that receives the per-code-point uppercase flags.
  #
  # Raises PunycodeBigOutput when the input is longer than ACE_MAX_LENGTH * 2
  # bytes, PunycodeBadInput for non-ASCII bytes, plus whatever
  # punycode_decode raises.
  def decode(punycode, case_flags=[])
    input = []
    output = []
    if ACE_MAX_LENGTH * 2 < punycode.size
      raise PunycodeBigOutput
    end
    punycode.each_byte do |c|
      unless c >= 0 && c <= 127
        raise PunycodeBadInput
      end
      input.push(c)
    end
    output_length = [UNICODE_MAX_LENGTH]
    Punycode.punycode_decode(input.length, input, output_length,
                             output, case_flags)
    output.pack('U*')
  end
end
# Matches a string that consists, from start (\A) to end (\z), only of
# well-formed UTF-8 byte sequences. The single-byte alternative accepts only
# tab, LF, CR and 0x20..0x7E, so other control bytes make the match fail.
# Flags: x = extended syntax (whitespace and # comments ignored),
# n = byte-oriented matching, m = multiline.
# NOTE(review): on Ruby 1.9+ a /n regexp containing high-byte escapes may
# raise Encoding::CompatibilityError when matched against a UTF-8 string that
# contains non-ASCII characters — confirm against the targeted Ruby version.
UTF8REGEX = /\A(?:
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*\z/mnx
# Unanchored companion of UTF8REGEX without the single-byte alternative:
# it matches as soon as the string contains at least one well-formed
# multi-byte UTF-8 sequence. The same flags and the same encoding caveat
# apply.
UTF8_REGEX_MBYTE = /(?:
[\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)/mnx
# Sample inputs. Every assignment overwrites the previous one, so only the
# last value is actually processed; the others are kept as alternative test
# cases that can be moved to the end to try them.
str = "http://www.ﺱﺲﺷ.com/"
str = "www.сделат картинки.com"
str = "http://www.сделаткартинки.com/"
str = "http://tūdaliņ.lv/"
str = "http://www.zürich.com/"
str = "http://www.hören.at/"
str = "http://www.žlutý kůň.com/"
str = "www.färgbolaget.nu"
str = "www.brændendekærlighed.com"
str = "www.mäkitorppa.com"
str = "www.färjestadsbk.net"
str = "あーるいん.com"
str = "www.예비교사.com"
str = "www.ハンドボールサムズ.com"
str = "www.日本平.jp"
str = "www.räksmörgås.se"
str = "www.różyczka.pl/"
str = "理容ナカムラ.com"
str = "http://Bücher.ch/"
str = "tūdaliņ.lv"

# Only convert strings that pass the UTF-8 check and contain at least one
# multi-byte character.
if str =~ UTF8REGEX && str =~ UTF8_REGEX_MBYTE
  # Split the input into four parts by reducing it to one capture group each:
  #   s1 - leading "http://www." or "http://" (may be empty)
  #   s2 - "www." when the input has it without an "http://www." prefix
  #   s3 - the host name in front of the last dot-separated part
  #   s4 - the last dot-separated part, including an optional trailing slash
  s1 = str.gsub(/^(http:\/\/www\.|http:\/\/|).*?\.[^\.\/]+\/?$/n, '\1')
  s2 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|)(www\.|).*?\.[^\.\/]+\/?$/n, '\1')
  s3 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|)(.*?)\.[^\.\/]+\/?$/n, '\1')
  s4 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|).*?(\.[^\.\/]+\/?)$/n, '\1')

  # Default to plain http when the input carried no scheme.
  s1 = 'http://' if s1.empty?

  # Lowercase and NFKC-normalize the host name before Punycode-encoding it.
  s3 = Punycode.encode(Unicode::normalize_KC(Unicode::downcase(s3)))

  punycoded_url = "#{s1}#{s2}xn--#{s3}#{s4}"
  puts punycoded_url

  # Pass the URL as a separate argument so no shell ever parses it; quotes,
  # "$" or backticks in the URL can then not be interpreted as shell syntax.
  system('/usr/bin/open', punycoded_url)
end