<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DZone Snippets: beautifulsoup code</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Sat, 26 Jul 2008 22:42:48 GMT</pubDate>
    <description>DZone Snippets: beautifulsoup code</description>
    <item>
      <title>Screen scrape heise.de newsticker (German)</title>
      <link>http://snippets.dzone.com/posts/show/886</link>
      <description>&lt;code&gt;&lt;br /&gt;#!/usr/bin/env python&lt;br /&gt;# -*- encoding: latin1 -*-&lt;br /&gt;&lt;br /&gt;import BeautifulSoup&lt;br /&gt;from PyRSS2Gen import RSSItem, Guid&lt;br /&gt;import ScrapeNFeed&lt;br /&gt;import urllib2&lt;br /&gt;import re&lt;br /&gt;&lt;br /&gt;debug = 0&lt;br /&gt;&lt;br /&gt;def fetch(url):&lt;br /&gt;    response = urllib2.urlopen(urllib2.Request(url))&lt;br /&gt;    return response.read(),response.info()&lt;br /&gt;&lt;br /&gt;class HeiFeed(ScrapeNFeed.ScrapedFeed):    &lt;br /&gt;    def HTML2RSS(self, headers, body):&lt;br /&gt;        items = []&lt;br /&gt;        soup = BeautifulSoup.BeautifulSoup(body)&lt;br /&gt;        for item in soup('a', {'href' : re.compile('^meldung.*')}):&lt;br /&gt;            link = 'http://www.heise.de/newsticker/' + item['href']&lt;br /&gt;            if not self.hasSeen(link):&lt;br /&gt;                title = item.contents[0].strip()&lt;br /&gt;                if debug:&lt;br /&gt;                    print "title: " + title&lt;br /&gt;                    print "link : " + link&lt;br /&gt;                response, headers = fetch(link)&lt;br /&gt;                s = BeautifulSoup.BeautifulSoup(response)&lt;br /&gt;                desc = s.fetch('div',{'class':'meldung_wrapper'})[0].prettify()&lt;br /&gt;                items.append(RSSItem(title=title, description=desc, link=link))&lt;br /&gt;            self.addRSSItems(items)&lt;br /&gt;&lt;br /&gt;HeiFeed.load("heise.de newsticker", 'http://www.heise.de/newsticker/',&lt;br /&gt;             "heise.de newsticker", 'heise_rss.xml', 'heise_rss.pickle',&lt;br /&gt;             managingEditor = 'tsch')&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Sun, 13 Nov 2005 03:04:25 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/886</guid>
      <author>tsch ()</author>
    </item>
    <item>
      <title>Create an RSS feed from an SQL query</title>
      <link>http://snippets.dzone.com/posts/show/885</link>
      <description>&lt;code&gt;&lt;br /&gt;#!/usr/bin/env python&lt;br /&gt;# -*- encoding: latin1 -*-&lt;br /&gt;&lt;br /&gt;import datetime,PyRSS2Gen,sqlobject&lt;br /&gt;from sqlobject.postgres import builder&lt;br /&gt;&lt;br /&gt;con = builder()(user = 'user', passwd = '', host = 'localhost', db='name')&lt;br /&gt;&lt;br /&gt;# set db encoding (maybe optional)&lt;br /&gt;con.queryOne("SET client_encoding TO 'latin1'; SELECT 1;")&lt;br /&gt;&lt;br /&gt;items = []&lt;br /&gt;for res in con.queryAll("""SELECT title,url,datum,description FROM table ORDER BY datum DESC LIMIT 30"""):&lt;br /&gt;    items.append(&lt;br /&gt;        PyRSS2Gen.RSSItem(&lt;br /&gt;        title = res[0], link = res[1],&lt;br /&gt;        description = """&lt;h2&gt;%s&lt;/h2&gt;on %s&lt;br/&gt;&lt;p&gt;%s&lt;/p&gt;"""%(res[0],res[2],res[]3),&lt;br /&gt;        guid = PyRSS2Gen.Guid(res[1]), pubDate = res[2]))&lt;br /&gt;&lt;br /&gt;    # generate rss feed&lt;br /&gt;PyRSS2Gen.RSS2(&lt;br /&gt;    title         = "sql2rss feed",&lt;br /&gt;    link          = "http://localhost/die URL",&lt;br /&gt;    description   = "The latest sql2rss news",&lt;br /&gt;    lastBuildDate = datetime.datetime.now(),&lt;br /&gt;    items         = items).write_xml(open("sql2rss.xml", "w"))&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Sun, 13 Nov 2005 02:51:47 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/885</guid>
      <author>tsch ()</author>
    </item>
    <item>
      <title>A simple Python class to browse the snippets website (with BeautifulSoup)</title>
      <link>http://snippets.dzone.com/posts/show/692</link>
      <description>if you got some path/enhancements, you can mail me at my pseudo at gmail.com, i'll update it.&lt;br /&gt;(you should install the marvellous beautifulsoup module, http://www.crummy.com/software/BeautifulSoup/documentation.html)&lt;br /&gt;&lt;br /&gt;the snippets.py file :&lt;br /&gt;&lt;code&gt;&lt;br /&gt;from BeautifulSoup import BeautifulSoup&lt;br /&gt;import urllib&lt;br /&gt;&lt;br /&gt;class Keyword: # top tags&lt;br /&gt;    def __init__(self,tag,nb):&lt;br /&gt;        self.tag=tag&lt;br /&gt;        self.nb=int(nb)&lt;br /&gt;    def __repr__(self):&lt;br /&gt;        return "&lt;Keyword '%s' : %d&gt;" % (self.tag,self.nb)&lt;br /&gt;&lt;br /&gt;class Snippet:&lt;br /&gt;    def __init__(self,title,code,tags):&lt;br /&gt;        self.title=title&lt;br /&gt;        self.code=code&lt;br /&gt;        self.tags = tags&lt;br /&gt;    def __repr__(self):&lt;br /&gt;        return "&lt;Snippet '%s' : tags %s&gt;" % (self.title,str(self.tags))&lt;br /&gt;&lt;br /&gt;class Snippets:&lt;br /&gt;    urlForTags = "http://www.bigbold.com/snippets/tags"&lt;br /&gt;    &lt;br /&gt;    def __init__(self,l=[]):&lt;br /&gt;        url = self.__getUrlForTags(l)&lt;br /&gt;        &lt;br /&gt;        #load the url&lt;br /&gt;        fu = urllib.urlopen(url)&lt;br /&gt;        content = fu.read()&lt;br /&gt;        fu.close()&lt;br /&gt;&lt;br /&gt;        self.tags = l&lt;br /&gt;        self.keywords,self.snippets = self.__extractContent(content)&lt;br /&gt;&lt;br /&gt;    def __repr__(self):&lt;br /&gt;        return "&lt;Snippets for tags:%s&gt;" % (str(self.tags))&lt;br /&gt;&lt;br /&gt;    def __getUrlForTags(self, l ):&lt;br /&gt;        assert type(l)==list&lt;br /&gt;        l = [Snippets.urlForTags] + l&lt;br /&gt;        return "/".join(l)&lt;br /&gt;    &lt;br /&gt;    def __extractContent(self,content):&lt;br /&gt;        &lt;br /&gt;        soup = BeautifulSoup( content ) &lt;br /&gt;            
&lt;br /&gt;        # get the keywords&lt;br /&gt;        tagTable=soup('div', {'id' : "sidebar"})[0].table&lt;br /&gt;        keywords=[]&lt;br /&gt;        for i in tagTable("tr"):&lt;br /&gt;            td = i("td")&lt;br /&gt;            &lt;br /&gt;            # add this keyword&lt;br /&gt;            try:&lt;br /&gt;                # extract from the empty selection page "/tags"&lt;br /&gt;                keywords.append( Keyword(td[1].span.a.string , td[0].string) )&lt;br /&gt;            except TypeError:&lt;br /&gt;                # extract from a selected selection page "/tag/something"&lt;br /&gt;                keywords.append( Keyword(td[2].span.a.string , td[1].string) )&lt;br /&gt;        &lt;br /&gt;        # get the snippets&lt;br /&gt;        postList=soup('div', {'class' : "post"})&lt;br /&gt;        snippets=[]&lt;br /&gt;        for i in postList:&lt;br /&gt;            divs = i("div")&lt;br /&gt;            &lt;br /&gt;            # get title and tags&lt;br /&gt;            title =  divs[0].h3.a.string # title&lt;br /&gt;            tags = [j.string for j in divs[1]("a")][:-1] #don't get the user ;-)&lt;br /&gt;&lt;br /&gt;            # get code of the snippet&lt;br /&gt;            list = [j for j in divs[0]][1:]# zap the first (h3)&lt;br /&gt;            code=""&lt;br /&gt;            for i in list: &lt;br /&gt;                try:&lt;br /&gt;                    if i.name == "pre":&lt;br /&gt;                        try:&lt;br /&gt;                            code+=i.string&lt;br /&gt;                        except TypeError:&lt;br /&gt;                            pass&lt;br /&gt;                except AttributeError:&lt;br /&gt;                    # transform "out-pre-text" in comment&lt;br /&gt;                    out = str(i).strip()&lt;br /&gt;                    if out:&lt;br /&gt;                        code+="#| "+out+"\n" &lt;br /&gt;            &lt;br /&gt;            # add this snippet&lt;br 
/&gt;            snippets.append( Snippet(title,code,tags) )&lt;br /&gt;            &lt;br /&gt;        return keywords,snippets&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;and an example (all returned "strings" are in utf-8):&lt;br /&gt;&lt;code&gt;&lt;br /&gt;from snippets import Snippets&lt;br /&gt;&lt;br /&gt;s = Snippets(["python","xml"])&lt;br /&gt;print s&lt;br /&gt;print s.keywords # the "top tags" column&lt;br /&gt;for i in s.snippets:&lt;br /&gt;    print i&lt;br /&gt;print s.snippets[6].title # the title of the 6th&lt;br /&gt;print s.snippets[6].code  # the code of the 6th&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Fri, 09 Sep 2005 19:31:22 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/692</guid>
      <author>manatlan (manatlan)</author>
    </item>
  </channel>
</rss>
