Sunday, March 8, 2009

Web::Scraper in Python

http://d.hatena.ne.jp/akkt/20070911/1189521960
http://www.packtpub.com/article/web-scraping-with-python
http://code.activestate.com/recipes/286269/
http://www.goldb.org/geo_maps/
#!/usr/bin/env python2.5
# Scrape title, furigana, and category from a Hatena keyword page
# using the Web::Scraper-style `scraper` module.
from scraper import scraper, process
import codecs, sys

# Force UTF-8 output so Japanese text prints regardless of terminal locale.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

s = scraper(
    process('span.title > a:first-child', title='TEXT', url='@href'),
    process('span.furigana', furigana='TEXT'),
    process('ul.list-circle > li:first-child > a', category='TEXT'),
)

result = s.scrape('http://d.hatena.ne.jp/keyword/%BA%B0%CC%EE%A4%A2%A4%B5%C8%FE')
# Print each scraped field, one per line, in the original order.
for field in ('category', 'furigana', 'title', 'url'):
    print(''.join(result[field]))
$ ./keyword.py
アイドル
こんのあさみ
紺野あさ美
/keyword/%ba%b0%cc%ee%a4%a2%a4%b5%c8%fe

Web::ScraperよりFlickrからサムネイルURLをとってくる例。

#!/usr/bin/env python2.5
# Fetch thumbnail image URLs from a Flickr photo-set page,
# mirroring the Web::Scraper example.
from scraper import scraper, process
import codecs, sys

# Force UTF-8 output regardless of the terminal's locale.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

s = scraper(
    process('a.image_link img', thumbs="@src"),
)

result = s.scrape('http://www.flickr.com/photos/bulknews/sets/72157601700510359/')

# One thumbnail URL per line.
print("\n".join(result['thumbs']))

scraper.py自体はこんな感じ

#!/usr/bin/env python2.5
# -*- coding: utf-8 -*-
from urllib import urlopen
from lxml import etree

def scraper(*funcs):
class Scraper(object):
def __init__(self, funcs):
self.funcs = funcs

def scrape(self, url):
from StringIO import StringIO
stash = {}
res = urlopen(url)
html = res.read().decode(res.headers.getparam('charset') or 'latin-1')
tree = etree.parse(StringIO(html), etree.HTMLParser())
for f in self.funcs:
xpath, attr = f()
for key, val in attr.iteritems():
if val.startswith('@'):
stash[key] = [e.attrib[val[1:]] for e in tree.xpath(xpath)]
elif val.upper() == "TEXT":
stash[key] = [e.text for e in tree.xpath(xpath)]
else:
print "Got an unknown thingy: ", what
return stash

return Scraper(funcs)


def create_process(func):
    """Decorator that defers evaluation of *func*.

    The decorated function is invoked as wrapper(selector, **options) and
    yields a thunk; calling the thunk later evaluates func(selector, options).
    """
    def wrapper(selector, **options):
        return lambda: func(selector, options)
    return wrapper


@create_process
def process(selector, kwargs):
    """Turn a CSS selector (or raw XPath) into an (xpath, kwargs) rule."""
    from lxml.cssselect import CSSSelector
    if selector.startswith('/'):
        # Already an XPath expression; use it verbatim.
        xpath = selector
    else:
        xpath = CSSSelector(selector).path
    return xpath, kwargs

1 comment:

atlas245 said...

I thought the post made some good points on web scrapers, I use python for simple web scrapers, data extraction can be a time consuming process but for larger projects like documents, the web, or files i tried "web scrapers" which worked great, they build quick custom screen scrapers, web scrapers, and data parsing programs