<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DZone Snippets: tokenize code</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Sun, 27 Jul 2008 02:07:30 GMT</pubDate>
    <description>DZone Snippets: tokenize code</description>
    <item>
      <title>Space-Separated Tag Parser</title>
      <link>http://snippets.dzone.com/posts/show/1625</link>
      <description>Here is a function that accepts a string containing tags and returns an array of extracted tags. (Updated to ignore duplicates)&lt;br /&gt;&lt;code&gt;&lt;br /&gt;/**&lt;br /&gt; * Parses a String of Tags&lt;br /&gt; *&lt;br /&gt; * Tags are space delimited. Either single or double quotes mark a phrase.&lt;br /&gt; * Odd quotes will cause everything on their right to reflect as one single&lt;br /&gt; * tag or phrase. All white-space within a phrase is converted to single&lt;br /&gt; * space characters. Quotes buried within tags are ignored! Duplicate tags&lt;br /&gt; * are ignored, even duplicate phrases that are equivalent.&lt;br /&gt; *&lt;br /&gt; * Returns an array of tags.&lt;br /&gt; */&lt;br /&gt;function ParseTagString($sTagString)&lt;br /&gt;{&lt;br /&gt;	$arTags = array();		// Array of Output&lt;br /&gt;	$cPhraseQuote = null;	// Record of the quote that opened the current phrase&lt;br /&gt;	$sPhrase = null;		// Temp storage for the current phrase we are building&lt;br /&gt;	&lt;br /&gt;	// Define some constants&lt;br /&gt;	static $sTokens = " \r\n\t";	// Space, Return, Newline, Tab&lt;br /&gt;	static $sQuotes = "'\"";		// Single and Double Quotes&lt;br /&gt;	&lt;br /&gt;	// Start the State Machine&lt;br /&gt;	do&lt;br /&gt;	{&lt;br /&gt;		// Get the next token, which may be the first&lt;br /&gt;		$sToken = isset($sToken)? strtok($sTokens) : strtok($sTagString, $sTokens);&lt;br /&gt;		&lt;br /&gt;		// Are there more tokens?&lt;br /&gt;		if ($sToken === false)&lt;br /&gt;		{&lt;br /&gt;			// Ensure that the last phrase is marked as ended&lt;br /&gt;			$cPhraseQuote = null;&lt;br /&gt;		}&lt;br /&gt;		else&lt;br /&gt;		{		&lt;br /&gt;			// Are we within a phrase or not?&lt;br /&gt;			if ($cPhraseQuote !== null)&lt;br /&gt;			{&lt;br /&gt;				// Will the current token end the phrase?
&lt;br /&gt;				if (substr($sToken, -1, 1) === $cPhraseQuote)&lt;br /&gt;				{&lt;br /&gt;					// Trim the last character and add to the current phrase, with a single leading space if necessary&lt;br /&gt;					if (strlen($sToken) &gt; 1) $sPhrase .= ((strlen($sPhrase) &gt; 0)? ' ' : null) . substr($sToken, 0, -1);&lt;br /&gt;					$cPhraseQuote = null;&lt;br /&gt;				}&lt;br /&gt;				else&lt;br /&gt;				{&lt;br /&gt;					// If not, add the token to the phrase, with a single leading space if necessary&lt;br /&gt;					$sPhrase .= ((strlen($sPhrase) &gt; 0)? ' ' : null) . $sToken;&lt;br /&gt;				}&lt;br /&gt;			}&lt;br /&gt;			else&lt;br /&gt;			{&lt;br /&gt;				// Will the current token start a phrase?&lt;br /&gt;				if (strpos($sQuotes, $sToken[0]) !== false)&lt;br /&gt;				{&lt;br /&gt;					// Will the current token end the phrase?&lt;br /&gt;					if ((strlen($sToken) &gt; 1) &amp;&amp; ($sToken[0] === substr($sToken, -1, 1)))&lt;br /&gt;					{&lt;br /&gt;						// The current token begins AND ends the phrase, trim the quotes&lt;br /&gt;						$sPhrase = substr($sToken, 1, -1);&lt;br /&gt;					}&lt;br /&gt;					else&lt;br /&gt;					{&lt;br /&gt;						// Remove the leading quote&lt;br /&gt;						$sPhrase = substr($sToken, 1);&lt;br /&gt;						$cPhraseQuote = $sToken[0];&lt;br /&gt;					}&lt;br /&gt;				}&lt;br /&gt;				else&lt;br /&gt;					$sPhrase = $sToken;&lt;br /&gt;			}&lt;br /&gt;		}&lt;br /&gt;		&lt;br /&gt;		// If, at this point, we are not within a phrase, the prepared phrase is complete and can be added to the array&lt;br /&gt;		if (($cPhraseQuote === null) &amp;&amp; ($sPhrase != null))&lt;br /&gt;		{&lt;br /&gt;			$sPhrase = strtolower($sPhrase);&lt;br /&gt;			if (!in_array($sPhrase, $arTags)) $arTags[] = $sPhrase;&lt;br /&gt;			$sPhrase = null;&lt;br /&gt;		}&lt;br /&gt;	}&lt;br /&gt;	while ($sToken !== false);	// Stop when we receive FALSE from strtok()&lt;br /&gt;	return $arTags;&lt;br /&gt;}&lt;br 
/&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;The string can be recreated from the array with the use of this reverse function:&lt;br /&gt;&lt;code&gt;&lt;br /&gt;/**&lt;br /&gt; * Reverses ParseTagString()&lt;br /&gt; */&lt;br /&gt;function CreateTagString($arTags)&lt;br /&gt;{&lt;br /&gt;	// Prepare each tag to be imploded&lt;br /&gt;	for ($i = 0; $i &lt; sizeof($arTags); $i++)&lt;br /&gt;	{&lt;br /&gt;		// Record findings&lt;br /&gt;		$bContainsWhitespace = false;	// Was whitespace found?&lt;br /&gt;		$cRequiredQuote = '"';			// Use double-quote by default&lt;br /&gt;		$cLastChar = null;&lt;br /&gt;	&lt;br /&gt;		// Search the tag&lt;br /&gt;		for ($j = 0; $j &lt; strlen($arTags[$i]); $j++)&lt;br /&gt;		{&lt;br /&gt;			$c = $arTags[$i][$j];&lt;br /&gt;			&lt;br /&gt;			// If the current character is a space&lt;br /&gt;			if ($c === ' ')&lt;br /&gt;			{&lt;br /&gt;				$bContainsWhitespace = true;&lt;br /&gt;				&lt;br /&gt;				// If the previous char was a double quote, we require single quotes round our phrase&lt;br /&gt;				if ($cLastChar === '"')&lt;br /&gt;				{&lt;br /&gt;					$cRequiredQuote = "'";&lt;br /&gt;					break;	// There is no more point in continuing our search, we can't handle double-mixed quotes&lt;br /&gt;				}&lt;br /&gt;			}&lt;br /&gt;			&lt;br /&gt;			// Record this char as the last char&lt;br /&gt;			$cLastChar = $c;&lt;br /&gt;		}&lt;br /&gt;		&lt;br /&gt;		// Quote if necessary&lt;br /&gt;		if ($bContainsWhitespace) $arTags[$i] = $cRequiredQuote . $arTags[$i] . 
$cRequiredQuote;&lt;br /&gt;	}&lt;br /&gt;	return implode(' ', $arTags);&lt;br /&gt;}&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;To test the whole system, use the following array of test cases:&lt;br /&gt;&lt;code&gt;&lt;br /&gt;$arTestInputs = array(&lt;br /&gt;	"this test ensures that words are correctly split",&lt;br /&gt;	"in this test \"phrases\" and \"multi-word phrases\" are tested",&lt;br /&gt;	"this test shows the behaviour if an \"odd quote is detected",&lt;br /&gt;	"this test shows that 'different quotes' work too",&lt;br /&gt;	"but mixed quotes fail: \"test phrase' does not stop on the quote",&lt;br /&gt;	"which can be usefull in some cases where \"the systems' requirements\" state that it is necessary",&lt;br /&gt;	"quotes need not be attached to \" their phrase \"",&lt;br /&gt;	"embedded\"quotes are ignored!",&lt;br /&gt;	"this is also usefull and demonstrates the system's coolness",&lt;br /&gt;	"redundant   white-space is   removed from \"  tags    and phrases\"",&lt;br /&gt;	"\"\"double quotes\"\" will result in single quotes!",&lt;br /&gt;	"remember that 'double-quotes\" may be nested within single quotes'",&lt;br /&gt;	"TaGs ArE NOT case SENsITiVE!",&lt;br /&gt;	"a duplicate tag will be removed from the tag list",&lt;br /&gt;	"even a \" complex phrase\" that is equivalent to another 'compleX   PHrASe   '"&lt;br /&gt;);&lt;br /&gt;&lt;br /&gt;foreach ($arTestInputs as $sTest)&lt;br /&gt;{&lt;br /&gt;	print ("&lt;pre&gt;$sTest&lt;/pre&gt;");&lt;br /&gt;	print "&lt;pre&gt;";&lt;br /&gt;	print_r (ParseTagString($sTest));&lt;br /&gt;	print "&lt;/pre&gt;";&lt;br /&gt;	print "&lt;pre&gt;";&lt;br /&gt;	print CreateTagString(ParseTagString($sTest));&lt;br /&gt;	print "&lt;/pre&gt;";&lt;br /&gt;	print "&lt;hr /&gt;";&lt;br /&gt;}&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;2006-03-09 0.1.0 - 0.2.0 Duplicate phrases are now ignored.
&lt;br /&gt;&lt;br /&gt;-- &lt;br /&gt;Version 0.2.0 - 2006-03-09&lt;br /&gt;STEM: The STEM Cells of PHP&lt;br /&gt;This work is licensed under a Creative Commons Attribution-ShareAlike 2.5 License&lt;br /&gt;http://creativecommons.org/licenses/by-sa/2.5/</description>
      <pubDate>Fri, 03 Mar 2006 16:58:01 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/1625</guid>
      <dc:creator>Charlie (Stephen Martindale)</dc:creator>
    </item>
  </channel>
</rss>
