Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

« Newer Snippets
Older Snippets »
Showing 1-10 of 14 total  RSS 

Split Apache logs according to GeoIP country

// Split Apache logs according to GeoIP country

#!/usr/bin/perl

# $Id$

# Split Apache logs according to GeoIP country

use strict;
use warnings;

## no critic (ValuesAndExpressions::RequireInterpolationOfMetachars)
our ($VERSION) = '$Revision$' =~ m{ \$Revision: \s+ (\S+) }xms;
## use critic

use Geo::IP;

my $gi = Geo::IP->open('/usr/local/share/GeoIP/GeoIPCity.dat', GEOIP_STANDARD);

my @logs = @ARGV;

my %record_for;

# Split every log named on the command line into one output file per GeoIP
# country: "<log minus .gz>.<country>.out". Lines whose host cannot be
# parsed or located are written to the "unknown" file.
foreach my $log (@logs) {
    die "Can't read $log\n" if !-r $log;

    # Output handles are per input log, keyed by normalised country name.
    my %fh_for;
    my $num_lines_parsed = 0;

    my $log_fh;
    if ($log =~ m/ \.gz \z /xms) {
        # List-form pipe open: gzip is executed directly, without a shell,
        # so spaces or metacharacters in the file name are passed verbatim
        # instead of being interpreted as shell syntax.
        open $log_fh, '-|', 'gzip', '-cd', '--', $log
            or die "Can't open gzip pipe for $log: $!\n";
    }
    else {
        open $log_fh, '<', $log or die "Can't open $log: $!\n";
    }

    my $log_base = $log;
    $log_base =~ s/ \.gz \z //xms;

    while (my $line = <$log_fh>) {
        $num_lines_parsed++;
        if (!($num_lines_parsed % 1000)) {
            print STDERR "Parsed $num_lines_parsed lines of $log\n";
        }

        # The client host is the first whitespace-delimited field.
        my ($host) = $line =~ m/ \A (\S+) \s /xms;

        my $country = 'unknown';

        # A line with no parseable host (e.g. blank) stays "unknown" and
        # never reaches the lookup, so undef is not used as a hash key.
        if (defined $host) {
            # Cache every lookup, including misses (stored as 0), so each
            # distinct host is resolved at most once across all logs.
            if (!exists $record_for{$host}) {
                $record_for{$host} = $gi->record_by_name($host) || 0;
            }

            if ($record_for{$host}) {
                my $name = $record_for{$host}->country_name();

                # Guard against a record without a country name; it would
                # otherwise yield an empty file-name component.
                if (defined $name && length $name) {
                    $country = lc $name;
                    $country =~ s/\W+/_/gxms;
                }
            }
        }

        if (!exists $fh_for{$country}) {
            open $fh_for{$country}, '>', "$log_base.$country.out"
                or die "Can't write to $log_base.$country.out: $!\n";
        }

        print {$fh_for{$country}} $line;
    }

    # Buffered write errors (e.g. a full disk) only surface at close time.
    foreach my $country (sort keys %fh_for) {
        close $fh_for{$country}
            or die "Can't close $log_base.$country.out: $!\n";
    }

    # For a pipe, close also waits for gzip; a false return with $! unset
    # means gzip itself exited with a non-zero status.
    close $log_fh
        or warn $! ? "Error closing $log: $!\n"
                   : "gzip exited with status $? while reading $log\n";
}

Splitting large Scriptella ETL files

The following example demonstrates how to split a large Scriptella ETL file into several parts. This example is based on a traditional XML parsed entities approach:

<!DOCTYPE etl SYSTEM "http://scriptella.javaforge.com/dtd/etl.dtd"
[
    <!-- Internal DTD subset: one external parsed entity is declared per file part -->
    <!-- Declaring the first external parsed entity to include -->
    <!ENTITY part1 SYSTEM "part1.xml">
    
    <!-- Declaring the second external parsed entity to include -->
    <!ENTITY part2 SYSTEM "part2.xml">
]>
<etl>
    <!-- Connection declared in the main file, using the "text" driver -->
    <connection driver="text"/>

    <!-- Including file #1 -->
    <!-- The entity reference below stands for the content of part1.xml -->
    &part1;

    <!-- Script defined inline in the main file, between the two included parts -->
    <script>
        content of the script
    </script>
    
    <!-- Including file #2 -->
    <!-- The entity reference below stands for the content of part2.xml -->
    &part2;

</etl>

Java - Splitta una stringa

	/**
	 * Splits a string around every occurrence of a literal delimiter.
	 *
	 * The delimiter is matched as one whole substring; it is neither a set
	 * of characters nor a regular expression. Empty fields (between adjacent
	 * delimiters, or at either end of the input) are kept.
	 *
	 * @param str    the text to split; may be null
	 * @param delims the literal delimiter sequence
	 * @return null when str is null; a one-element array holding str when
	 *         str is empty or delims is null or empty; otherwise the fields
	 *         in their original order
	 */
	private String[] splitString(String str, String delims)
	{
		if (str == null) {
			return null;
		}
		if (str.equals("") || delims == null || delims.length() == 0) {
			return new String[]{ str };
		}

		final int step = delims.length();
		Vector<String> fields = new Vector<String>();

		int from = 0;
		for (int hit = str.indexOf(delims, from); hit != -1; hit = str.indexOf(delims, from)) {
			fields.addElement(str.substring(from, hit));
			// Resume scanning just past the delimiter that was found.
			from = hit + step;
		}
		// Whatever follows the last delimiter (possibly empty) is the final field.
		fields.addElement(str.substring(from));

		String[] result = new String[fields.size()];
		fields.copyInto(result);
		return result;
	}

fractionfiles.py

// Splits a file into smaller ones, and joins them together.

#!/usr/bin/env python

"""Splits and joins files. Helpful when media can't fit a file.
Be prepared for a lot of output files!"""

__author__="Andrew Pennebaker (andrew.pennebaker@gmail.com)"
__date__="6 Jan 3006 - 12 Feb 2006"
__copyright__="Copyright 2006 Andrew Pennebaker"
__license__="GPL"
__version__="0.3"
__URL__="http://snippets.dzone.com/posts/show/3541"

import sys, os
from getopt import getopt

SPLIT_MODE="SPLIT"
JOIN_MODE="JOIN"

def splitFile(name, length, number):
	"""Split the file `name` into numbered parts `name.0`, `name.1`, ...

	name   -- path of the file to split; the parts are written next to it.
	length -- maximum number of bytes per part, or None to derive the part
	          size from `number`.
	number -- maximum number of parts; only consulted when `length` is None.

	Raises ValueError (a subclass of Exception) when the limit in effect is
	missing or smaller than one. I/O errors propagate unchanged. An empty
	input file produces no parts.
	"""
	if length is None:
		if number is None or number<1:
			raise ValueError("number must be at least one when length is not given")

		size=os.path.getsize(name)
		if size==0:
			return

		# Ceiling division, so that at most `number` parts are produced.
		length=(size+number-1)//number
	elif length<1:
		raise ValueError("length must be at least one")

	# Copy through a bounded buffer so a huge part size does not have to
	# fit in memory at once.
	bufsize=64*1024

	infile=open(name, "rb")
	try:
		part=0
		while True:
			written=0
			outfile=None
			try:
				while written<length:
					data=infile.read(min(bufsize, length-written))
					if not data:
						break

					# Open lazily: no empty trailing part is created when
					# the input ends exactly on a part boundary.
					if outfile is None:
						outfile=open("%s.%d" % (name, part), "wb")

					outfile.write(data)
					written+=len(data)
			finally:
				if outfile is not None:
					outfile.close()

			if written<length:
				break # input exhausted

			part+=1
	finally:
		infile.close()

def joinFiles(filenames):
	"""Concatenate split parts back into the file they came from.

	filenames -- paths of the parts, e.g. ["big.iso.0", "big.iso.1", ...],
	             in any order. The caller's list is not modified.

	Parts are joined in numeric suffix order, so "x.10" follows "x.9". The
	output path is the first part's name with its ".<number>" suffix removed.

	Raises ValueError (a subclass of Exception) when `filenames` is empty.
	I/O errors propagate unchanged.
	"""
	if len(filenames)<1:
		raise ValueError("at least one file name is required")

	def partOrder(filename):
		# Numbered parts sort by their integer suffix; anything else sorts
		# after them, by name.
		root, ext=os.path.splitext(filename)
		if ext[1:].isdigit():
			return (0, root, int(ext[1:]))
		return (1, filename, 0)

	ordered=sorted(filenames, key=partOrder)

	root, ext=os.path.splitext(ordered[0])
	if ext[1:].isdigit():
		origFilename=root
	else:
		# Legacy fallback: assume a two-character suffix such as ".0".
		origFilename=ordered[0][0:-2]

	bufsize=64*1024

	origFile=open(origFilename, "wb")
	try:
		for filename in ordered:
			# Never read from the file being written (e.g. when a whole
			# directory listing, including an old copy, is passed in).
			if filename==origFilename:
				continue

			smallFile=open(filename, "rb")
			try:
				data=smallFile.read(bufsize)
				while data:
					origFile.write(data)
					data=smallFile.read(bufsize)
			finally:
				smallFile.close()
	finally:
		origFile.close()

def usage():
	"""Print the command-line help to standard output and exit.

	Always terminates the program via sys.exit(), so it never returns.
	"""
	# Single-argument print(...) emits the same text whether print is a
	# statement (Python 2) or a function (Python 3).
	print("Usage: %s [options] [files]" % (sys.argv[0]))
	print("\n--split <file1 file 2 file 3...>")
	print("--join <dir1 dir2 dir3 ...>")
	print("--maxlength <bytes>")
	print("--maxfiles <number>")
	print("--help (usage)")

	sys.exit()

def main():
	"""Command-line entry point: parse the options, then split or join.

	--split (default mode) splits every named file; --join joins the parts
	found in every named directory. --maxlength and --maxfiles are mutually
	exclusive part limits: whichever appears last on the command line wins.
	At least one option and one file/directory argument are required;
	otherwise the usage text is printed and the program exits.

	Raises ValueError when a limit is not an integer of at least one.
	"""
	mode=SPLIT_MODE
	maxlength=1024
	maxfiles=None

	optlist=[]
	args=[]

	try:
		# No short options are defined, hence the empty shortopts string.
		optlist, args=getopt(sys.argv[1:], "", ["split", "join", "maxlength=", "maxfiles=", "help"])
	except Exception:
		usage()

	if len(optlist)<1 or len(args)<1:
		usage()

	for option, value in optlist:
		if option=="--help":
			usage()

		elif option=="--split":
			mode=SPLIT_MODE
		elif option=="--join":
			mode=JOIN_MODE
		elif option=="--maxlength":
			try:
				maxlength=int(value)
			except ValueError:
				raise ValueError("Length must be at least one")
			if maxlength<1:
				raise ValueError("Length must be at least one")
			# A byte limit replaces any earlier part-count limit.
			maxfiles=None
		elif option=="--maxfiles":
			try:
				maxfiles=int(value)
			except ValueError:
				raise ValueError("Number must be at least one")
			if maxfiles<1:
				raise ValueError("Number must be at least one")
			# A part-count limit replaces the byte limit (including the default).
			maxlength=None

	filenames=args

	if mode==SPLIT_MODE:
		for filename in filenames:
			splitFile(filename, maxlength, maxfiles)

	elif mode==JOIN_MODE:
		for directory in filenames:
			# Every entry of the directory is handed to joinFiles as a part.
			files=[os.path.join(directory, entry) for entry in os.listdir(directory)]
			joinFiles(files)

if __name__=="__main__":
	main()

Split array into smaller arrays of equal size

Split an array of elements into a set of smaller arrays of equal size. Extra elements are preferentially assigned to earlier arrays. If there are no elements in a given returned array it will be [] (empty array)

# use as standalone function
# Splits +array+ into +pieces+ consecutive sub-arrays whose sizes differ by
# at most one; the larger sub-arrays come first. When there are fewer
# elements than pieces, the trailing sub-arrays are empty ([]).
#
# array  - the collection to split
# pieces - how many sub-arrays to return (default 2)
#
# Returns an Array holding +pieces+ sub-arrays.
def chunk_array(array, pieces=2)
  base, extra = array.length.divmod(pieces)
  offset = 0
  (1..pieces).map do |nth|
    # The first +extra+ chunks each take one of the leftover elements.
    size = nth <= extra ? base + 1 : base
    part = array[offset, size]
    offset += size
    part
  end
end

# use as array.chunk
class Array
  # Splits the receiver into +pieces+ consecutive sub-arrays whose sizes
  # differ by at most one; the larger sub-arrays come first. Surplus
  # pieces are returned as empty arrays.
  #
  # NOTE: on Ruby versions that provide Enumerable#chunk, this definition
  # takes precedence over it for arrays.
  #
  # pieces - how many sub-arrays to return (default 2)
  #
  # Returns an Array holding +pieces+ sub-arrays.
  def chunk(pieces=2)
    base, extra = length.divmod(pieces)
    offset = 0
    (1..pieces).map do |nth|
      # The first +extra+ chunks each take one of the leftover elements.
      size = nth <= extra ? base + 1 : base
      part = self[offset, size]
      offset += size
      part
    end
  end
end



Examples of use:

>> chunk_array [1,2,3,4,5,6], 2
=> [[1, 2, 3], [4, 5, 6]]

>> chunk_array [1,2,3,4,5,6], 3
=> [[1, 2], [3, 4], [5, 6]]

>> chunk_array [1,2,3,4,5,6], 4
=> [[1, 2], [3, 4], [5], [6]]

>> chunk_array [1,2,3,4,5,6,7,8,9,10], 4
=> [[1, 2, 3], [4, 5, 6], [7, 8], [9, 10]]

>> chunk_array [1,2,3], 4
=> [[1], [2], [3], []]

>> chunk_array [], 2
=> [[], []]


If you prefer the second form (more Ruby-ish, but not always appropriate):

>> [1,2,3,4,5,6,7,8,9,10].chunk
=> [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]

>> [1,2,3,4,5,6,7,8,9,10].chunk 3
=> [[1, 2, 3, 4], [5, 6, 7], [8, 9, 10]]


This is handy when used with a splat because you can do things like:

left, right = *chunk_array(all,2)

SPLIT-UNIQUE - split a block into unique and duplicate values

    ; SPLIT-UNIQUE partitions BLOCK into first occurrences and repeats.
    ; Returns a block of two blocks, [uniq dupe]: UNIQ holds each distinct
    ; value once, in order of first appearance; DUPE holds every later
    ; repetition, in the order encountered.
    split-unique: func [block [any-block!] /local uniq dupe dest] [
        uniq: copy []
        dupe: copy []
        foreach item block [
            ; An item already present in UNIQ is a repeat and goes to DUPE.
            ; The /only refinements treat a block-valued item as one value.
            dest: either find/only uniq item [dupe] [uniq]
            append/only dest item
        ]
        reduce [uniq dupe]
    ]

GROUP - group like elements in a block

    ; GROUP collects equal items of BLOCK together.
    ; The result is a flat block of key/values pairs:
    ;     [key1 [items equal to key1] key2 [items equal to key2] ...]
    ; with keys in order of first appearance.
    group: func [
        {Returns a block of sub-blocks with items partitioned by value.}
        block  [any-block!]
        /local result
    ][
        result: copy []
        ; First, build up a list of keys, with a place for values
        ; to go with each key.
        foreach item block [
            ; /skip 2 restricts the search to the key positions of the
            ; key/values pairs already stored in RESULT.
            if not find/only/skip result item 2 [
                repend result [item copy []]
            ]
        ]
        ; Add items to the block associated with each key.
        ; NOTE(review): this SELECT uses neither /only nor /skip, unlike the
        ; FIND above - confirm it behaves as intended for block-valued items.
        foreach item block [append/only select result item item]
        result
    ]

Split String into roughly equal-sized chunks.

Split a string into an array of roughly equal sized chunks based on a string or regular expression delimiter.
Delimiter is preserved in output.

class String
  # Splits the receiver into chunks of roughly +average_segment_size+
  # characters, breaking near occurrences of the delimiter. Delimiters are
  # preserved, so joining the chunks yields the original string.
  #
  # average_segment_size - target chunk length in characters (default 40)
  # sclice_on            - String or Regexp to break on (default /\s+/)
  #
  # Returns an Array of Strings; an empty receiver yields [].
  def chunk_string(average_segment_size = 40, sclice_on = /\s+/)
    # Nothing to split. Without this guard slice_count would be 0 and the
    # division below would raise ZeroDivisionError.
    return [] if empty?

    out = []
    # Number of chunks: size / average_segment_size, rounded up.
    whole, remainder = size.divmod(average_segment_size)
    slice_count = remainder > 0 ? whole + 1 : whole
    slice_guess = size / slice_count
    previous_slice_location = 0
    (1..slice_count - 1).each do |i|
      slice_location = nearest_split(slice_guess * i, sclice_on)
      out << slice(previous_slice_location..slice_location)
      previous_slice_location = slice_location + 1
    end
    # The final chunk takes whatever remains.
    out << slice(previous_slice_location..size)
    out
  end

  # Picks the index at which to end a chunk, near +slice_start+.
  #
  # The candidates are the last delimiter at or before +slice_start+ and
  # the character just before the first delimiter after it; the closer of
  # the two wins, with ties going to the right-hand candidate.
  #
  # NOTE(review): when no delimiter is found, nil.to_i makes the left
  # candidate 0 and the right candidate +slice_start+ itself, so the split
  # may fall mid-word - confirm this fallback is intended.
  def nearest_split(slice_start, slice_on)
    left_scan_location  = slice(0..slice_start).rindex(slice_on).to_i
    right_scan_location = slice((slice_start + 1)..size).index(slice_on).to_i + slice_start
    if (slice_start - left_scan_location) < (right_scan_location - slice_start)
      left_scan_location
    else
      right_scan_location
    end
  end
end

Space-Separated Tag Parser

Here is a function that accepts a string containing tags and returns an array of extracted tags. (Updated to ignore duplicates)
/**
 * Parses a String of Tags
 *
 * Tags are space delimited. Either single or double quotes mark a phrase.
 * Odd quotes will cause everything on their right to reflect as one single
 * tag or phrase. All white-space within a phrase is converted to single
 * space characters. Quotes buried within tags are ignored! Tags are
 * lower-cased. Duplicate tags are ignored, even duplicate phrases that are
 * equivalent; duplicates are detected by exact string comparison, so
 * numeric-looking tags such as "10" and "1e1" remain distinct.
 *
 * Note: this function uses strtok(), which keeps a single global cursor;
 * do not interleave it with another strtok() scan.
 *
 * @param string $sTagString Raw, user-entered tag string.
 * @return array List of unique tags in order of first appearance.
 */
function ParseTagString($sTagString)
{
	$arTags = array();		// Array of Output
	$cPhraseQuote = null;	// Record of the quote that opened the current phrase
	$sPhrase = null;		// Temp storage for the current phrase we are building
	
	// Define some constants
	static $sTokens = " \r\n\t";	// Space, Return, Newline, Tab
	static $sQuotes = "'\"";		// Single and Double Quotes
	
	// Start the State Machine
	do
	{
		// Get the next token, which may be the first
		$sToken = isset($sToken)? strtok($sTokens) : strtok($sTagString, $sTokens);
		
		// Are there more tokens?
		if ($sToken === false)
		{
			// Ensure that the last phrase is marked as ended
			$cPhraseQuote = null;
		}
		else
		{
			// Are we within a phrase or not?
			if ($cPhraseQuote !== null)
			{
				// Will the current token end the phrase?
				if (substr($sToken, -1, 1) === $cPhraseQuote)
				{
					// Trim the last character and add to the current phrase, with a single leading space if necessary
					if (strlen($sToken) > 1) $sPhrase .= ((strlen($sPhrase) > 0)? ' ' : '') . substr($sToken, 0, -1);
					$cPhraseQuote = null;
				}
				else
				{
					// If not, add the token to the phrase, with a single leading space if necessary
					$sPhrase .= ((strlen($sPhrase) > 0)? ' ' : '') . $sToken;
				}
			}
			else
			{
				// Will the current token start a phrase?
				if (strpos($sQuotes, $sToken[0]) !== false)
				{
					// Will the current token end the phrase?
					if ((strlen($sToken) > 1) && ($sToken[0] === substr($sToken, -1, 1)))
					{
						// The current token begins AND ends the phrase, trim the quotes
						$sPhrase = (string) substr($sToken, 1, -1);
					}
					else
					{
						// Remove the leading quote; the cast covers substr() returning false for a lone quote
						$sPhrase = (string) substr($sToken, 1);
						$cPhraseQuote = $sToken[0];
					}
				}
				else
					$sPhrase = $sToken;
			}
		}
		
		// If, at this point, we are not within a phrase, the prepared phrase is complete and can be added to the array
		if (($cPhraseQuote === null) && ((string) $sPhrase !== ''))
		{
			$sPhrase = strtolower($sPhrase);
			// Strict comparison: loose in_array() would treat "10" and "1e1" as the same tag
			if (!in_array($sPhrase, $arTags, true)) $arTags[] = $sPhrase;
			$sPhrase = null;
		}
	}
	while ($sToken !== false);	// Stop when we receive FALSE from strtok()
	return $arTags;
}


The string can be recreated from the array with the use of this reverse function:
/**
 * Reverses ParseTagString()
 *
 * Joins the tags with single spaces. A tag that contains a space is wrapped
 * in double quotes, unless it contains a double quote directly followed by
 * a space (which would end a double-quoted phrase early when parsed again);
 * such a tag is wrapped in single quotes instead. Array keys do not matter:
 * every element is processed, in array order.
 *
 * @param array $arTags Tags as returned by ParseTagString().
 * @return string Space-separated tag string.
 */
function CreateTagString($arTags)
{
	// Prepare each tag to be imploded
	foreach ($arTags as $key => $sTag)
	{
		$sTag = (string) $sTag;
		
		// Tags without whitespace need no quoting
		if (strpos($sTag, ' ') === false) continue;
		
		// Use double-quote by default; a double quote before a space requires single quotes round our phrase
		$cRequiredQuote = (strpos($sTag, '" ') !== false)? "'" : '"';
		
		$arTags[$key] = $cRequiredQuote . $sTag . $cRequiredQuote;
	}
	return implode(' ', $arTags);
}


To test the whole system, use the following array of test cases:
// Sample inputs covering plain words, quoted phrases, unbalanced and mixed
// quotes, redundant white-space, case folding and duplicate removal.
$arTestInputs = array(
	"this test ensures that words are correctly split",
	"in this test \"phrases\" and \"multi-word phrases\" are tested",
	"this test shows the behaviour if an \"odd quote is detected",
	"this test shows that 'different quotes' work too",
	"but mixed quotes fail: \"test phrase' does not stop on the quote",
	"which can be usefull in some cases where \"the systems' requirements\" state that it is necessary",
	"quotes need not be attached to \" their phrase \"",
	"embedded\"quotes are ignored!",
	"this is also usefull and demonstrates the system's coolness",
	"redundant   white-space is   removed from \"  tags    and phrases\"",
	"\"\"double quotes\"\" will result in single quotes!",
	"remember that 'double-quotes\" may be nested within single quotes'",
	"TaGs ArE NOT case SENsITiVE!",
	"a duplicate tag will be removed from the tag list",
	"even a \" complex phrase\" that is equivalent to another 'compleX   PHrASe   '"
);

// For every input, print the raw string, the parsed tag array, and the tag
// string rebuilt from that array, each in its own <pre> block.
foreach ($arTestInputs as $sTest)
{
	print ("<pre>$sTest</pre>");
	print "<pre>";
	print_r (ParseTagString($sTest));
	print "</pre>";
	print "<pre>";
	print CreateTagString(ParseTagString($sTest));
	print "</pre>";
	print "<hr />";
}


2006-03-09 0.1.0 - 0.2.0 Duplicate phrases are now ignored.

--
Version 0.2.0 - 2006-03-09
STEM: The STEM Cells of PHP
This work is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
http://creativecommons.org/licenses/by-sa/2.5/

splitting a text list in sql

This goes along with my integer split procedure.
http://www.bigbold.com/snippets/posts/show/774

Oftentimes I have a list of integers I need to pass to the database to be worked on, such as checkboxes on a web page or some other list. I needed some T-SQL that would take a text string and split it by a separator, in this case a comma. The following is the result of that need.
The way I normally use it is in a stored procedure like the one below with several text type arguments. This is a variation designed to split a list of strings separated by a special character sequence. Imagine two lists, one of the ids and one of the data. You parse the first list to get a table of the ids and you parse the second list to get the data and insert/update as appropriate.
http://www.bigbold.com/snippets/posts/show/774

-- Drop any existing copy of the procedure so this script can be re-run.
IF EXISTS (SELECT * FROM dbo.sysobjects WHERE id = object_id(N'[dbo].[uspSplitTextList]') AND OBJECTPROPERTY(id, N'IsProcedure') = 1)
   DROP PROCEDURE [dbo].[uspSplitTextList]
GO
                                      
SET QUOTED_IDENTIFIER ON 
GO
SET ANSI_NULLS ON 
GO


/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
-- uspSplitTextList
--
-- Description:
--		splits a separated list of text items and returns the text items
--
-- Arguments:
--		@list_text				- list of text items
--		@Delimiter				- delimiter
--
-- Result set (one row per non-empty item, ordered by item_order):
--		item_text				- text of the item
--		item_order				- order of the item in the list, starting at 1
--		item_begpos				- position of the item's first character in @list_text
--		item_endpos				- position of the item's last character in @list_text
--		No result set is returned when @list_text is NULL.
--		Empty items are removed after numbering, so item_order can have gaps.
--
-- Notes:
-- 02/22/2006 - WSR : use DATALENGTH instead of LEN throughout because LEN doesn't count trailing blanks
--
-- History:
-- 02/22/2006 - WSR : revised algorithm to account for items crossing 8000 character boundary
--
CREATE PROCEDURE uspSplitTextList
	@list_text				text,
   @Delimiter				varchar(3)
AS

SET NOCOUNT ON

DECLARE @InputLen			integer			-- input text length
DECLARE @TextPos			integer			-- current position within input text
DECLARE @Chunk				varchar(8000)	-- chunk within input text
DECLARE @ChunkPos			integer			-- current position within chunk
DECLARE @DelimPos			integer			-- position of delimiter
DECLARE @ChunkLen			integer			-- chunk length
DECLARE @DelimLen			integer			-- delimiter length
DECLARE @ItemBegPos		integer			-- item starting position in text
DECLARE @ItemOrder		integer			-- item order in list
DECLARE @DelimChar		varchar(1)		-- first character of delimiter (simple delimiter)

-- create table to hold list items
-- actually their positions because we may want to scrub this list eliminating bad entries before substring is applied
CREATE TABLE #list_items ( item_order integer, item_begpos integer, item_endpos integer )

-- process list
IF @list_text IS NOT NULL
   BEGIN

	-- initialize
   SET @InputLen = DATALENGTH(@list_text)
   SET @TextPos = 1
	SET @DelimChar = SUBSTRING(@Delimiter, 1, 1)
	SET @DelimLen = DATALENGTH(@Delimiter)
   SET @ItemBegPos = 1
   SET @ItemOrder = 1
   -- non-zero so the chunk loop below is entered the first time
   SET @ChunkLen = 1

   -- cycle through input processing chunks
   WHILE @TextPos <= @InputLen AND @ChunkLen <> 0
      BEGIN

      -- get current chunk
      SET @Chunk = SUBSTRING(@list_text, @TextPos, 8000)

      -- setup initial variable values
      SET @ChunkPos = 1
      SET @ChunkLen = DATALENGTH(@Chunk)
      -- search for the first delimiter character only; the full delimiter is verified inside the loop
      SET @DelimPos = CHARINDEX(@DelimChar, @Chunk, @ChunkPos)

      -- loop over the chunk, until the last delimiter
      WHILE @ChunkPos <= @ChunkLen AND @DelimPos <> 0
         BEGIN

			-- see if this is a full delimiter
			-- (compared against the whole input rather than the chunk, so the delimiter may extend past the end of the chunk)
         IF SUBSTRING(@list_text, (@TextPos + @DelimPos - 1), @DelimLen) = @Delimiter
            BEGIN

				-- insert position
				-- (the item ends on the character just before the delimiter)
	         INSERT INTO #list_items (item_order, item_begpos, item_endpos)
	         VALUES (@ItemOrder, @ItemBegPos, (@TextPos + @DelimPos - 1) - 1)
	         
	         -- adjust positions
	         SET @ItemOrder = @ItemOrder + 1
	         SET @ItemBegPos = (@TextPos + @DelimPos - 1) + @DelimLen
	         SET @ChunkPos = @DelimPos + @DelimLen

				END
         ELSE
            BEGIN

            -- adjust positions
            -- (only the first delimiter character matched; resume the search one character later)
            SET @ChunkPos = @DelimPos + 1

            END
      
         -- find next delimiter      
         SET @DelimPos = CHARINDEX(@DelimChar, @Chunk, @ChunkPos)

         END

      -- adjust positions
      SET @TextPos = @TextPos + @ChunkLen

      END

	-- handle last item
	-- (the text after the final delimiter, if there is any)
   IF @ItemBegPos <= @InputLen
      BEGIN

      -- insert position
      INSERT INTO #list_items (item_order, item_begpos, item_endpos)
      VALUES (@ItemOrder, @ItemBegPos, @InputLen)

      END

	-- delete the bad items
	-- (empty items, e.g. between two adjacent delimiters, end before they begin)
   DELETE FROM #list_items
   WHERE item_endpos < item_begpos

   -- return list items
	SELECT SUBSTRING(@list_text, item_begpos, (item_endpos - item_begpos + 1)) AS item_text, item_order, item_begpos, item_endpos
   FROM #list_items
   ORDER BY item_order

   END

DROP TABLE #list_items

RETURN

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

GO
SET QUOTED_IDENTIFIER OFF 
GO
SET ANSI_NULLS ON 
GO
« Newer Snippets
Older Snippets »
Showing 1-10 of 14 total  RSS