<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <!-- Channel-level metadata: the feed's display title, the URL of the site the feed
         belongs to, the channel publication date (RFC 822 format, GMT) and a short
         description (here identical to the title). -->
    <title>DZone Snippets: idn code</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Fri, 25 Jul 2008 02:25:25 GMT</pubDate>
    <description>DZone Snippets: idn code</description>
    <item>
      <!-- Item heading: title of the post and the URL it links to. The description
           element that follows carries the post body as entity-escaped HTML. -->
      <title>Punycoded URLs in Ruby</title>
      <link>http://snippets.dzone.com/posts/show/4575</link>
      <description>This is just a proof-of-concept snippet for how to internationalize domain names using &lt;a href="http://raa.ruby-lang.org/project/punycode4r/"&gt;punycode4r&lt;/a&gt; (sudo gem install punycode4r).&lt;br /&gt;&lt;br /&gt;For more information please see:&lt;br /&gt;- &lt;a href="http://en.wikipedia.org/wiki/Punycode"&gt;Punycode&lt;/a&gt;&lt;br /&gt;- &lt;a href="http://en.wikipedia.org/wiki/Internationalizing_Domain_Names_in_Applications"&gt;Internationalized domain name&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;&lt;br /&gt;#!/usr/local/bin/ruby -Ku&lt;br /&gt;&lt;br /&gt;# NOTE: The following is not the complete source code by Kazuhiro NISHIYAMA.&lt;br /&gt;#       For the full source code with more features, comments &amp; test cases please see: &lt;br /&gt;#       open -e `gem environment gemdir`/gems/punycode4r-0.2.0/lib/punycode.rb&lt;br /&gt;#&lt;br /&gt;# This is pure Ruby implementing Punycode (RFC 3492).&lt;br /&gt;# (original ANSI C code (C89) implementing Punycode is in RFC 3492)&lt;br /&gt;#&lt;br /&gt;# copyright (c) 2005 Kazuhiro NISHIYAMA&lt;br /&gt;# You can redistribute it and/or modify it under the same terms as Ruby.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;require "unicode"     # sudo gem install unicode&lt;br /&gt;&lt;br /&gt;module Punycode&lt;br /&gt;&lt;br /&gt;  module Status&lt;br /&gt;    class Error &lt; StandardError; end&lt;br /&gt;    class PunycodeSuccess; end&lt;br /&gt;    # Input is invalid.&lt;br /&gt;    class PunycodeBadInput &lt; Error; end&lt;br /&gt;    # Output would exceed the space provided.&lt;br /&gt;    class PunycodeBigOutput&lt; Error; end&lt;br /&gt;    # Input needs wider integers to process.
&lt;br /&gt;    class PunycodeOverflow &lt; Error; end&lt;br /&gt;  end&lt;br /&gt;  include Status&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;  BASE = 36; TMIN = 1; TMAX = 26; SKEW = 38; DAMP = 700&lt;br /&gt;  INITIAL_BIAS = 72; INITIAL_N = 0x80; DELIMITER = 0x2D&lt;br /&gt;&lt;br /&gt;  module_function&lt;br /&gt;&lt;br /&gt;  def basic(cp)&lt;br /&gt;    cp &lt; 0x80&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def delim(cp)&lt;br /&gt;    cp == DELIMITER&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def decode_digit(cp)&lt;br /&gt;    cp - 48 &lt; 10 ? cp - 22 :  cp - 65 &lt; 26 ? cp - 65 :&lt;br /&gt;      cp - 97 &lt; 26 ? cp - 97 : BASE&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def encode_digit(d, flag)&lt;br /&gt;    return d + 22 + 75 * ((d &lt; 26) ? 1 : 0) - ((flag ? 1 : 0) &lt;&lt; 5)&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def flagged(bcp)&lt;br /&gt;    (0...26) === (bcp - 65)&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def encode_basic(bcp, flag)&lt;br /&gt;    # bcp -= (bcp - 97 &lt; 26) &lt;&lt; 5;&lt;br /&gt;    if (0...26) === (bcp - 97)&lt;br /&gt;      bcp -= 1 &lt;&lt; 5&lt;br /&gt;    end&lt;br /&gt;    # return bcp + ((!flag &amp;&amp; (bcp - 65 &lt; 26)) &lt;&lt; 5);&lt;br /&gt;    if !flag and (0...26) === (bcp - 65)&lt;br /&gt;      bcp += 1 &lt;&lt; 5&lt;br /&gt;    end&lt;br /&gt;    bcp&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  MAXINT = 1 &lt;&lt; 64&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;  def adapt(delta, numpoints, firsttime)&lt;br /&gt;    delta = firsttime ? 
delta / DAMP : delta &gt;&gt; 1&lt;br /&gt;    delta += delta / numpoints&lt;br /&gt;&lt;br /&gt;    k = 0&lt;br /&gt;    while delta &gt; ((BASE - TMIN) * TMAX) / 2&lt;br /&gt;      delta /= BASE - TMIN&lt;br /&gt;      k += BASE&lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;    k + (BASE - TMIN + 1) * delta / (delta + SKEW)&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def punycode_encode(input_length, input, case_flags, output_length, output)&lt;br /&gt;&lt;br /&gt;    n = INITIAL_N&lt;br /&gt;    delta = out = 0&lt;br /&gt;    max_out = output_length[0]&lt;br /&gt;    bias = INITIAL_BIAS&lt;br /&gt;&lt;br /&gt;    input_length.times do |j|&lt;br /&gt;      if basic(input[j])&lt;br /&gt;        raise PunycodeBigOutput if max_out - out &lt; 2&lt;br /&gt;        output[out] =&lt;br /&gt;          if case_flags&lt;br /&gt;            encode_basic(input[j], case_flags[j])&lt;br /&gt;          else&lt;br /&gt;            input[j]&lt;br /&gt;          end&lt;br /&gt;        out+=1&lt;br /&gt;      # elsif (input[j] &lt; n)&lt;br /&gt;      #   raise PunycodeBadInput&lt;br /&gt;      # (not needed for Punycode with unsigned code points)&lt;br /&gt;      end&lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;    h = b = out&lt;br /&gt;&lt;br /&gt;    if b &gt; 0&lt;br /&gt;      output[out] = DELIMITER&lt;br /&gt;      out+=1&lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;   while h &lt; input_length&lt;br /&gt;&lt;br /&gt;      m = MAXINT&lt;br /&gt;      input_length.times do |j|&lt;br /&gt;        # next if basic(input[j])&lt;br /&gt;        # (not needed for Punycode)&lt;br /&gt;        m = input[j] if (n...m) === input[j]&lt;br /&gt;      end&lt;br /&gt;&lt;br /&gt;      raise PunycodeOverflow if m - n &gt; (MAXINT - delta) / (h + 1)&lt;br /&gt;      delta += (m - n) * (h + 1)&lt;br /&gt;      n = m&lt;br /&gt;&lt;br /&gt;      input_length.times do |j|&lt;br /&gt;        # Punycode does not need to check whether input[j] is basic:
&lt;br /&gt;        if input[j] &lt; n # || basic(input[j])&lt;br /&gt;          delta+=1&lt;br /&gt;          raise PunycodeOverflow if delta == 0&lt;br /&gt;        end&lt;br /&gt;&lt;br /&gt;        if input[j] == n&lt;br /&gt;&lt;br /&gt;          q = delta; k = BASE&lt;br /&gt;          while true&lt;br /&gt;            raise PunycodeBigOutput if out &gt;= max_out&lt;br /&gt;            t = if k &lt;= bias # + TMIN # +TMIN not needed&lt;br /&gt;                  TMIN&lt;br /&gt;                elsif k &gt;= bias + TMAX&lt;br /&gt;                  TMAX&lt;br /&gt;                else&lt;br /&gt;                  k - bias&lt;br /&gt;                end&lt;br /&gt;            break if q &lt; t&lt;br /&gt;            output[out] = encode_digit(t + (q - t) % (BASE - t), false)&lt;br /&gt;            out+=1&lt;br /&gt;            q = (q - t) / (BASE - t)&lt;br /&gt;            k += BASE&lt;br /&gt;          end&lt;br /&gt;&lt;br /&gt;          output[out] = encode_digit(q, case_flags &amp;&amp; case_flags[j])&lt;br /&gt;          out+=1&lt;br /&gt;          bias = adapt(delta, h + 1, h == b)&lt;br /&gt;          delta = 0&lt;br /&gt;          h+=1&lt;br /&gt;        end&lt;br /&gt;      end&lt;br /&gt;&lt;br /&gt;      delta+=1; n+=1&lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;    output_length[0] = out&lt;br /&gt;    return PunycodeSuccess&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def punycode_decode(input_length, input, output_length, output, case_flags)&lt;br /&gt;&lt;br /&gt;    n = INITIAL_N&lt;br /&gt;&lt;br /&gt;    out = i = 0&lt;br /&gt;    max_out = output_length[0]&lt;br /&gt;    bias = INITIAL_BIAS&lt;br /&gt;&lt;br /&gt;    b = 0&lt;br /&gt;    input_length.times do |j|&lt;br /&gt;      b = j if delim(input[j])&lt;br /&gt;    end&lt;br /&gt;    raise PunycodeBigOutput if b &gt; max_out&lt;br /&gt;&lt;br /&gt;    b.times do |j|&lt;br /&gt;      case_flags[out] = flagged(input[j]) if case_flags&lt;br 
/&gt;      raise PunycodeBadInput unless basic(input[j])&lt;br /&gt;      output[out] = input[j]&lt;br /&gt;      out+=1&lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;    in_ = b &gt; 0 ? b + 1 : 0&lt;br /&gt;    while in_ &lt; input_length&lt;br /&gt;&lt;br /&gt;      oldi = i; w = 1; k = BASE&lt;br /&gt;      while true&lt;br /&gt;        raise PunycodeBadInput if in_ &gt;= input_length&lt;br /&gt;        digit = decode_digit(input[in_])&lt;br /&gt;        in_+=1&lt;br /&gt;        raise PunycodeBadInput if digit &gt;= BASE&lt;br /&gt;        raise PunycodeOverflow if digit &gt; (MAXINT - i) / w&lt;br /&gt;        i += digit * w&lt;br /&gt;        t = if k &lt;= bias # + TMIN # +TMIN not needed&lt;br /&gt;              TMIN&lt;br /&gt;            elsif k &gt;= bias + TMAX&lt;br /&gt;              TMAX&lt;br /&gt;            else&lt;br /&gt;              k - bias&lt;br /&gt;            end&lt;br /&gt;        break if digit &lt; t&lt;br /&gt;        raise PunycodeOverflow if w &gt; MAXINT / (BASE - t)&lt;br /&gt;        w *= BASE - t&lt;br /&gt;        k += BASE&lt;br /&gt;      end&lt;br /&gt;&lt;br /&gt;      bias = adapt(i - oldi, out + 1, oldi == 0)&lt;br /&gt;&lt;br /&gt;      raise PunycodeOverflow if i / (out + 1) &gt; MAXINT - n&lt;br /&gt;      n += i / (out + 1)&lt;br /&gt;      i %= out + 1&lt;br /&gt;&lt;br /&gt;      # not needed for Punycode:&lt;br /&gt;      # raise PUNYCODE_INVALID_INPUT if decode_digit(n) &lt;= base&lt;br /&gt;      raise PunycodeBigOutput if out &gt;= max_out&lt;br /&gt;&lt;br /&gt;      if case_flags&lt;br /&gt;        #memmove(case_flags + i + 1, case_flags + i, out - i)&lt;br /&gt;        case_flags[i + 1, out - i] = case_flags[i, out - i]&lt;br /&gt;&lt;br /&gt;        # Case of last character determines uppercase flag:&lt;br /&gt;        case_flags[i] = flagged(input[in_ - 1])&lt;br /&gt;      end&lt;br /&gt;&lt;br /&gt;      #memmove(output + i + 1, output + i, (out - i) * sizeof 
*output)&lt;br /&gt;      output[i + 1, out - i] = output[i, out - i]&lt;br /&gt;      output[i] = n&lt;br /&gt;      i+=1&lt;br /&gt;&lt;br /&gt;      out+=1&lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;    output_length[0] = out&lt;br /&gt;    return PunycodeSuccess&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def encode(unicode_string, case_flags=nil, print_ascii_only=false)&lt;br /&gt;    input = unicode_string.unpack('U*')&lt;br /&gt;    output = [0] * (ACE_MAX_LENGTH+1)&lt;br /&gt;    output_length = [ACE_MAX_LENGTH]&lt;br /&gt;&lt;br /&gt;    punycode_encode(input.size, input, case_flags, output_length, output)&lt;br /&gt;&lt;br /&gt;    outlen = output_length[0]&lt;br /&gt;    outlen.times do |j|&lt;br /&gt;      c = output[j]&lt;br /&gt;      unless c &gt;= 0 &amp;&amp; c &lt;= 127&lt;br /&gt;        raise Error, "assertion error: invalid output char"&lt;br /&gt;      end&lt;br /&gt;      unless PRINT_ASCII[c]&lt;br /&gt;        raise PunycodeBadInput&lt;br /&gt;      end&lt;br /&gt;      output[j] = PRINT_ASCII[c] if print_ascii_only&lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;    output[0..outlen].map{|x|x.chr}.join('').sub(/\0+\z/, '')&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def decode(punycode, case_flags=[])&lt;br /&gt;    input = []&lt;br /&gt;    output = []&lt;br /&gt;&lt;br /&gt;    if ACE_MAX_LENGTH*2 &lt; punycode.size&lt;br /&gt;      raise PunycodeBigOutput&lt;br /&gt;    end&lt;br /&gt;    punycode.each_byte do |c|&lt;br /&gt;      unless c &gt;= 0 &amp;&amp; c &lt;= 127&lt;br /&gt;        raise PunycodeBadInput&lt;br /&gt;      end&lt;br /&gt;      input.push(c)&lt;br /&gt;    end&lt;br /&gt;&lt;br /&gt;    output_length = [UNICODE_MAX_LENGTH]&lt;br /&gt;    Punycode.punycode_decode(input.length, input, output_length,&lt;br /&gt;                             output, case_flags)&lt;br /&gt;    output.pack('U*')&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  UNICODE_MAX_LENGTH = 256&lt;br /&gt;  
ACE_MAX_LENGTH = 256&lt;br /&gt;&lt;br /&gt;  # The following string is used to convert printable&lt;br /&gt;  # characters between ASCII and the native charset:&lt;br /&gt;&lt;br /&gt;  PRINT_ASCII =&lt;br /&gt;    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \&lt;br /&gt;    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \&lt;br /&gt;    " !\"\#$%&amp;'()*+,-./" \&lt;br /&gt;    "0123456789:;&lt;=&gt;?" \&lt;br /&gt;    "@ABCDEFGHIJKLMNO" \&lt;br /&gt;    "PQRSTUVWXYZ[\\]^_" \&lt;br /&gt;    "`abcdefghijklmno" \&lt;br /&gt;    "pqrstuvwxyz{|}~\n"&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;# cf. http://snippets.dzone.com/posts/show/4527&lt;br /&gt;&lt;br /&gt;UTF8REGEX = /\A(?:                                                            &lt;br /&gt;              [\x09\x0A\x0D\x20-\x7E]            # ASCII&lt;br /&gt;            | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte&lt;br /&gt;            |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs&lt;br /&gt;            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte&lt;br /&gt;            |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates&lt;br /&gt;            |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3&lt;br /&gt;            | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15&lt;br /&gt;            |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16&lt;br /&gt;            )*\z/mnx&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;UTF8_REGEX_MBYTE = /(?:                                 &lt;br /&gt;                 [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte&lt;br /&gt;               |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs&lt;br /&gt;               | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte&lt;br /&gt;               |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates&lt;br /&gt;               |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3&lt;br /&gt;               | [\xF1-\xF3][\x80-\xBF]{3}          # 
planes 4-15&lt;br /&gt;               |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16&lt;br /&gt;               )/mnx&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;# cf. http://demo.icu-project.org/icu-bin/idnbrowser (samples)&lt;br /&gt;# on Mac OS X you can check the Ruby conversions with the GUI app PunyCode, http://software.dibomedia.de/products/show/2&lt;br /&gt;&lt;br /&gt;str = "http://www.&#65201;&#65202;&#65207;.com/"&lt;br /&gt;str = "www.&#1089;&#1076;&#1077;&#1083;&#1072;&#1090; &#1082;&#1072;&#1088;&#1090;&#1080;&#1085;&#1082;&#1080;.com"&lt;br /&gt;str = "http://www.&#1089;&#1076;&#1077;&#1083;&#1072;&#1090;&#1082;&#1072;&#1088;&#1090;&#1080;&#1085;&#1082;&#1080;.com/"&lt;br /&gt;str = "http://t&#363;dali&#326;.lv/"&lt;br /&gt;str = "http://www.z&#252;rich.com/"&lt;br /&gt;str = "http://www.h&#246;ren.at/"&lt;br /&gt;str = "http://www.&#382;lut&#253; k&#367;&#328;.com/"&lt;br /&gt;str = "www.f&#228;rgbolaget.nu"&lt;br /&gt;str = "www.br&#230;ndendek&#230;rlighed.com"&lt;br /&gt;str = "www.m&#228;kitorppa.com"&lt;br /&gt;str = "www.f&#228;rjestadsbk.net"&lt;br /&gt;str = "&#12354;&#12540;&#12427;&#12356;&#12435;.com"&lt;br /&gt;str = "www.&#50696;&#48708;&#44368;&#49324;.com"&lt;br /&gt;str = "www.&#12495;&#12531;&#12489;&#12508;&#12540;&#12523;&#12469;&#12512;&#12474;.com"&lt;br /&gt;str = "www.&#26085;&#26412;&#24179;.jp"&lt;br /&gt;str = "www.r&#228;ksm&#246;rg&#229;s.se"&lt;br /&gt;str = "www.r&#243;&#380;yczka.pl/"&lt;br /&gt;str = "&#29702;&#23481;&#12490;&#12459;&#12512;&#12521;.com"&lt;br /&gt;str = "http://B&#252;cher.ch/"&lt;br /&gt;str = "t&#363;dali&#326;.lv"&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;if str =~ UTF8REGEX &amp;&amp; str =~ UTF8_REGEX_MBYTE&lt;br /&gt;&lt;br /&gt;   s1 = str.gsub(/^(http:\/\/www\.|http:\/\/|).*?\.[^\.\/]+\/?$/n, '\1')&lt;br /&gt;   s2 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|)(www\.|).*?\.[^\.\/]+\/?$/n, '\1')&lt;br /&gt;   s3 = 
str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|)(.*?)\.[^\.\/]+\/?$/n, '\1')&lt;br /&gt;   s4 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|).*?(\.[^\.\/]+\/?)$/n, '\1')&lt;br /&gt;&lt;br /&gt;   if s1.empty? then s1 = 'http://' end&lt;br /&gt;&lt;br /&gt;   s3 = Punycode.encode(Unicode::normalize_KC(Unicode::downcase(s3)))&lt;br /&gt;&lt;br /&gt;   punycoded_url = s1 &lt;&lt; s2 &lt;&lt; "xn--" &lt;&lt; s3 &lt;&lt; s4&lt;br /&gt;&lt;br /&gt;   puts punycoded_url&lt;br /&gt;&lt;br /&gt;   %x{ /usr/bin/open "#{punycoded_url}" }&lt;br /&gt;&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;</description>
      <!-- Item publication date (RFC 822 format, GMT) and its identifier. The guid has no
           isPermaLink attribute, so it defaults to a permalink; it equals the item link. -->
      <pubDate>Wed, 26 Sep 2007 21:00:18 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/4575</guid>
      <!-- RSS 2.0 defines author as an e-mail address, but "ntk ()" carries only a user
           name. The legacy element is kept unchanged for existing consumers; the name is
           additionally exposed through dc:creator, using the Dublin Core namespace that
           is already declared on the root element. -->
      <author>ntk ()</author>
      <dc:creator>ntk</dc:creator>
    </item>
  </channel>
</rss>
