<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DZone Snippets: scrape code</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Wed, 20 Aug 2008 16:38:54 GMT</pubDate>
    <description>DZone Snippets: scrape code</description>
    <item>
      <title>Screen scrape heise.de newsticker (German)</title>
      <link>http://snippets.dzone.com/posts/show/886</link>
      <description>&lt;code&gt;&lt;br /&gt;#!/usr/bin/env python&lt;br /&gt;# -*- encoding: latin1 -*-&lt;br /&gt;&lt;br /&gt;import BeautifulSoup&lt;br /&gt;from PyRSS2Gen import RSSItem, Guid&lt;br /&gt;import ScrapeNFeed&lt;br /&gt;import urllib2&lt;br /&gt;import re&lt;br /&gt;&lt;br /&gt;debug = 0&lt;br /&gt;&lt;br /&gt;def fetch(url):&lt;br /&gt;    response = urllib2.urlopen(urllib2.Request(url))&lt;br /&gt;    return response.read(),response.info()&lt;br /&gt;&lt;br /&gt;class HeiFeed(ScrapeNFeed.ScrapedFeed):    &lt;br /&gt;    def HTML2RSS(self, headers, body):&lt;br /&gt;        items = []&lt;br /&gt;        soup = BeautifulSoup.BeautifulSoup(body)&lt;br /&gt;        for item in soup('a', {'href' : re.compile('^meldung.*')}):&lt;br /&gt;            link = 'http://www.heise.de/newsticker/' + item['href']&lt;br /&gt;            if not self.hasSeen(link):&lt;br /&gt;                title = item.contents[0].strip()&lt;br /&gt;                if debug:&lt;br /&gt;                    print "title: " + title&lt;br /&gt;                    print "link : " + link&lt;br /&gt;                response, headers = fetch(link)&lt;br /&gt;                s = BeautifulSoup.BeautifulSoup(response)&lt;br /&gt;                desc = s.fetch('div',{'class':'meldung_wrapper'})[0].prettify()&lt;br /&gt;                items.append(RSSItem(title=title, description=desc, link=link))&lt;br /&gt;            self.addRSSItems(items)&lt;br /&gt;&lt;br /&gt;HeiFeed.load("heise.de newsticker", 'http://www.heise.de/newsticker/',&lt;br /&gt;             "heise.de newsticker", 'heise_rss.xml', 'heise_rss.pickle',&lt;br /&gt;             managingEditor = 'tsch')&lt;br /&gt;&lt;/code&gt;</description>
      <pubDate>Sun, 13 Nov 2005 03:04:25 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/886</guid>
      <author>tsch ()</author>
    </item>
  </channel>
</rss>
