DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

Divyanand has posted 1 posts at DZone. View Full User Profile

A Python Tagger App

01.19.2010
| 3611 views |
  • submit to reddit
        A Tagger application: a Python app that generates an index of links organized by the words that occur in their title strings (written as a self-teaching Python exercise).
Input: a file containing a list of links
Output: the links indexed by the words that occur in their titles, dumped to the console.


from BeautifulSoup import BeautifulSoup,SoupStrainer
from urllib2 import urlopen, URLError
from pysqlite2 import dbapi2 as sqlite
from sets import Set
import re

# HTML parser class
class MyHTMLParser():

    titledata = ''

    # browse to specified page & extract title string
    # borrowed exception handling code from:
    # http://www.voidspace.org.uk/python/articles/urllib2.shtml
    def visitURL(self,url):
        self.titledata = ''
        try:
            req = urlopen(url)
        except URLError, e:
            if hasattr(e, 'reason'):
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            elif hasattr(e, 'code'):
                print 'The server couldn\'t fulfill the request.'
                print 'Error code: ', e.code
        else:
            try:
                title = SoupStrainer('title')
                for tag in BeautifulSoup(req, parseOnlyThese=title):
                    s = tag.string.strip().lower()
                    self.titledata += s
            except:
                print "Error: HTML parse error"

    # get title string
    def getTitleData(self):
        return self.titledata



# Titleword2URL map
class TitleWords2LinkMap:
    # word to URL map
    word2URLmap = {}

    def storeData (self, word, url):
        if (self.word2URLmap.has_key(word)):
            self.word2URLmap[word].append(url)
        else:
            self.word2URLmap[word] = [url]

    def printDirectory(self):
        keys = self.word2URLmap.keys()
        for key in keys:
            print '\n',key, "===>", self.word2URLmap[key]





# Map URLs to title strings
class Link2TitleMap:

    # global to keep track of the categorizing title for a set of links
    title = ''

    # hashtable of links indexed by their categorizing title
    # (dictionary in python speak)
    weblinks = {}

    #HTML parser object
    htmlparser = MyHTMLParser()

    # remove these words from the title, these are not to be indexed
    common_words = Set(['in','the', 'of','it', 'on','a','an', 'with', 'to', 'for', 'you' 'your','my','mine'])

    # remove commonly occurring words in titles
    def sanitizeTitle (self):
        keys = self.weblinks.keys()

        for key in keys:
            title_word_set = Set()
            title_list = self.weblinks[key]

            # title is made up of a set of words...
            for title in title_list:
                title_words = title.split()
                for word in title_words:
                    if(word.isalpha()):
                        title_word_set.add(word)
            # ...with common words removed
            s = title_word_set.difference(self.common_words)
            self.weblinks[key] = s




    # store incoming input string into link Hash
    def storeData (self, data):
        if (data.startswith('http')):
            if (self.weblinks.has_key(data)):
                self.weblinks[data].append(self.title.lower())
            else:
                self.weblinks[data] = [self.title.lower()]
        else:
            self.title = data


    # print out contents of link2title map
    def printDirectory(self):
        keys = self.weblinks.keys()
        for key in keys:
            print '\n',key, "--->", self.weblinks[key]



    def printTitle(self):
        urllist = self.weblinks.keys()
        for url in urllist:
            urllist = self.weblinks[key]
            self.htmlparser.visitURL(url)
            print url, "--->", self.htmlparser.getTitleData()



    # iterate thru the linkDirectory hash, visiting each URL pointed
    # to by the keys & extract the title of each webpage
    def getPageTitle(self):
        urllist = self.weblinks.keys()
        for url in urllist:
            urllist = self.weblinks[url]
            self.htmlparser.visitURL(url)
            self.weblinks[url].append(self.htmlparser.getTitleData())

    # return the URL2titlewordmap
    def getMap(self):
        return self.weblinks






##############################################
# main program
# open file for reading
infile = open('Weblinks.txt','r')

# create a new LinkDirectory object
linkdir = Link2TitleMap()


title2url = TitleWords2LinkMap()

# read in file having the links (line by line - for efficiency)
# internalize file data into linkdir object
str = infile.readline()
while (str != ''):
    str = str.strip()
    if(str != '\n'):
       linkdir.storeData(str)
    str = infile.readline()

# close file handle
infile.close()

# populate the link hash  by visiting the links ( keys)
# and extracting the title
linkdir.getPageTitle()

# remove common words from title ( we dont index these)
linkdir.sanitizeTitle()

# generate tag -> URL mappling
url2words = linkdir.getMap()
urllist = url2words.keys()
for url in urllist:
    words = url2words[url]
    for word in words:
        title2url.storeData(word, url)

# print out tag-> mapping to console
title2url.printDirectory()