<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DZone Snippets: tags code</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Sun, 27 Jul 2008 00:59:50 GMT</pubDate>
    <description>DZone Snippets: tags code</description>
    <item>
      <title>Parse Technorati-style Tags from HTML</title>
      <link>http://snippets.dzone.com/posts/show/1758</link>
      <description>An enhancement to the String class that parses out Technorati-style tags.  Uses Typo's strip_html method to remove img tags and other markup.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;class String&lt;br /&gt;  # Strips any html markup from a string&lt;br /&gt;  TYPO_TAG_KEY = TYPO_ATTRIBUTE_KEY = /[\w:_-]+/&lt;br /&gt;  TYPO_ATTRIBUTE_VALUE = /(?:[A-Za-z0-9]+|(?:'[^']*?'|"[^"]*?"))/&lt;br /&gt;  TYPO_ATTRIBUTE = /(?:#{TYPO_ATTRIBUTE_KEY}(?:\s*=\s*#{TYPO_ATTRIBUTE_VALUE})?)/&lt;br /&gt;  TYPO_ATTRIBUTES = /(?:#{TYPO_ATTRIBUTE}(?:\s+#{TYPO_ATTRIBUTE})*)/&lt;br /&gt;  TAG = %r{&lt;[!/?\[]?(?:#{TYPO_TAG_KEY}|--)(?:\s+#{TYPO_ATTRIBUTES})?\s*(?:[!/?\]]+|--)?&gt;}&lt;br /&gt;  def strip_html&lt;br /&gt;    self.gsub(TAG, '').gsub(/\s+/, ' ').strip&lt;br /&gt;  end&lt;br /&gt;&lt;br /&gt;  def tags&lt;br /&gt;    scan(/&lt;a\s+[^&gt;]*\s*rel=\s*(.?)tag\1[^&gt;]*&gt;(.+?)&lt;\/a&gt;/i).&lt;br /&gt;    map { |match| match.last.strip_html rescue nil }.&lt;br /&gt;    compact.select { |s| !s.strip.empty? }&lt;br /&gt;  end&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;# Example usage&lt;br /&gt;&lt;br /&gt;s = %{&lt;a href="http://www.docstrangelove.com/tag/civil-war" rel="tag"&gt;civil war&lt;/a&gt; &lt;a href="http://www.technorati.com/tag/civil+war" rel="tag"&gt;&lt;img src="http://www.docstrangelove.com/wp-content/plugins/UltimateTagWarrior/technoratiicon.jpg" alt="Technorati tag page for civil war"/&gt;&lt;/a&gt; &lt;a href="http://www.docstrangelove.com/tag/iraq" rel="tag"&gt;iraq&lt;/a&gt; &lt;a href="http://www.technorati.com/tag/iraq" rel="tag"&gt;&lt;img src="http://www.docstrangelove.com/wp-content/plugins/UltimateTagWarrior/technoratiicon.jpg" alt="Technorati tag page for iraq"/&gt;&lt;/a&gt;}&lt;br /&gt;&lt;br /&gt;s.tags&lt;br /&gt;# =&gt; ["civil war", "iraq"]&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Sat, 25 Mar 2006 02:02:46 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1758</guid>
      <author>canadaduane (Duane Johnson)</author>
    </item>
  </channel>
</rss>
