# Extensions to String for stripping HTML markup and extracting the text of
# rel="tag" links from a fragment of HTML.
class String
  # Building blocks for TAG, the pattern that matches a single markup tag.
  # Tag names and attribute names share the same character set.
  TYPO_TAG_KEY = TYPO_ATTRIBUTE_KEY = /[\w:_-]+/
  # An attribute value is either a bare alphanumeric run or a quoted string.
  TYPO_ATTRIBUTE_VALUE = /(?:[A-Za-z0-9]+|(?:'[^']*?'|"[^"]*?"))/
  # One attribute: a key, optionally followed by `= value`.
  TYPO_ATTRIBUTE = /(?:#{TYPO_ATTRIBUTE_KEY}(?:\s*=\s*#{TYPO_ATTRIBUTE_VALUE})?)/
  # One or more whitespace-separated attributes.
  TYPO_ATTRIBUTES = /(?:#{TYPO_ATTRIBUTE}(?:\s+#{TYPO_ATTRIBUTE})*)/
  # A whole tag: optional leading `!`, `/`, `?` or `[`, then a tag name or
  # `--`, optional attributes, and an optional closing `/`, `?`, `!`, `]` or `--`.
  TAG = %r{<[!/?\[]?(?:#{TYPO_TAG_KEY}|--)(?:\s+#{TYPO_ATTRIBUTES})?\s*(?:[!/?\]]+|--)?>}

  # Strips any HTML markup from the string.
  #
  # Returns a new String with every TAG match removed, runs of whitespace
  # collapsed to a single space, and leading/trailing whitespace trimmed.
  def strip_html
    gsub(TAG, '').gsub(/\s+/, ' ').strip
  end

  # Extracts the visible text of every <a ... rel="tag">...</a> link.
  #
  # The rel value may be written as "tag", 'tag' or bare tag. Links whose
  # content has no text once markup is stripped (e.g. an icon-only link)
  # are omitted.
  #
  # Returns an Array of Strings, in document order.
  def tags
    # - `rel` must start an attribute (be preceded by whitespace), so
    #   attributes such as data-rel="tag" are not mistaken for it.
    # - The value must be tag wrapped in a matching pair of real quote
    #   characters, or bare tag followed by whitespace, `/` or `>`.
    # - The final capture group is the link's inner HTML.
    tag_link = %r{<a\s+(?:[^>]*\s)?rel\s*=\s*(?:(["'])tag\1|tag(?=[\s/>]))[^>]*>(.+?)</a>}i

    scan(tag_link).
      map { |match| match.last.strip_html }.
      reject(&:empty?)
  end
end

# Example usage
s = %{<a href="http://www.docstrangelove.com/tag/civil-war" rel="tag">civil war</a> <a href="http://www.technorati.com/tag/civil+war" rel="tag"><img src="http://www.docstrangelove.com/wp-content/plugins/UltimateTagWarrior/technoratiicon.jpg" alt="Technorati tag page for civil war"/></a> <a href="http://www.docstrangelovecom/tag/iraq" rel="tag">iraq</a> <a href="http://www.technorati.com/tag/iraq" rel="tag"><img src="http://www.docstrangelove.com/wp-content/plugins/UltimateTagWarrior/technoratiicon.jpg" alt="Technorati tag page for iraq"/></a>}
s.tags # => ["civil war", "iraq"]
You need to create an account or log in to post comments to this site.