Sunday, March 8, 2009

Web::Scraper in Python

http://d.hatena.ne.jp/akkt/20070911/1189521960
http://www.packtpub.com/article/web-scraping-with-python
http://code.activestate.com/recipes/286269/
http://www.goldb.org/geo_maps/
#!/usr/bin/env python2.5
# Scrape title, furigana, and category from a Hatena keyword page
# using the Web::Scraper-style `scraper` module.
from scraper import scraper, process
import codecs, sys

# Force UTF-8 output so Japanese text prints regardless of terminal locale.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

s = scraper(
    process('span.title > a:first-child', title='TEXT', url='@href'),
    process('span.furigana', furigana='TEXT'),
    process('ul.list-circle > li:first-child > a', category='TEXT'),
)

result = s.scrape('http://d.hatena.ne.jp/keyword/%BA%B0%CC%EE%A4%A2%A4%B5%C8%FE')
# Print each scraped field, one per line, in the original order.
for field in ('category', 'furigana', 'title', 'url'):
    print(''.join(result[field]))
$ ./keyword.py
アイドル
こんのあさみ
紺野あさ美
/keyword/%ba%b0%cc%ee%a4%a2%a4%b5%c8%fe

Web::ScraperよりFlickrからサムネイルURLをとってくる例。

#!/usr/bin/env python2.5
# Fetch thumbnail image URLs from a Flickr photo-set page,
# mirroring the Web::Scraper example.
from scraper import scraper, process
import codecs, sys

# Force UTF-8 output regardless of the terminal's locale.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

s = scraper(
    process('a.image_link img', thumbs="@src"),
)

result = s.scrape('http://www.flickr.com/photos/bulknews/sets/72157601700510359/')

# One thumbnail URL per line.
print("\n".join(result['thumbs']))

scraper.py自体はこんな感じ

#!/usr/bin/env python2.5
# -*- coding: utf-8 -*-
from urllib import urlopen
from lxml import etree

def scraper(*funcs):
class Scraper(object):
def __init__(self, funcs):
self.funcs = funcs

def scrape(self, url):
from StringIO import StringIO
stash = {}
res = urlopen(url)
html = res.read().decode(res.headers.getparam('charset') or 'latin-1')
tree = etree.parse(StringIO(html), etree.HTMLParser())
for f in self.funcs:
xpath, attr = f()
for key, val in attr.iteritems():
if val.startswith('@'):
stash[key] = [e.attrib[val[1:]] for e in tree.xpath(xpath)]
elif val.upper() == "TEXT":
stash[key] = [e.text for e in tree.xpath(xpath)]
else:
print "Got an unknown thingy: ", what
return stash

return Scraper(funcs)


def create_process(func):
    """Decorator that defers evaluation of *func*.

    The decorated function is invoked as wrapper(selector, **options) and
    yields a thunk; calling the thunk later evaluates func(selector, options).
    """
    def wrapper(selector, **options):
        return lambda: func(selector, options)
    return wrapper


@create_process
def process(selector, kwargs):
    """Turn a CSS selector (or raw XPath) into an (xpath, kwargs) rule."""
    from lxml.cssselect import CSSSelector
    if selector.startswith('/'):
        # Already an XPath expression; use it verbatim.
        xpath = selector
    else:
        xpath = CSSSelector(selector).path
    return xpath, kwargs

1 comment:

atlas245 said...

I thought the post made some good points on web scrapers, I use python for simple web scrapers, data extraction can be a time consuming process but for larger projects like documents, the web, or files i tried "web scrapers" which worked great, they build quick custom screen scrapers, web scrapers, and data parsing programs