#!/usr/bin/env python2.5
from scraper import scraper, process
import codecs, sys
# Wrap stdout so the unicode strings printed below are emitted as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Each process() rule pairs a CSS selector with the result keys to fill:
# 'TEXT' collects the element text, '@href' collects the href attribute.
s = scraper(
    process('span.title > a:first-child', title='TEXT', url='@href'),
    process('span.furigana', furigana='TEXT'),
    process('ul.list-circle > li:first-child > a', category='TEXT')
)

# NOTE(review): the target URL is empty in this listing -- fill in the page
# to scrape before running.
result = s.scrape('')

# Every result value is a list of strings (one per matched element).
print(''.join(result['category']))
print(''.join(result['furigana']))
print(''.join(result['title']))
print(''.join(result['url']))
$ ./
#!/usr/bin/env python2.5
from scraper import scraper, process
import codecs, sys
# Wrap stdout so the unicode strings printed below are emitted as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Collect the src attribute of every image inside an a.image_link element.
s = scraper(
    process('a.image_link img', thumbs="@src")
)

# NOTE(review): the target URL is empty in this listing -- fill in the page
# to scrape before running.
result = s.scrape('')

# One thumbnail URL per line.
print("\n".join(result['thumbs']))
#!/usr/bin/env python2.5
# -*- coding: utf-8 -*-
from urllib import urlopen
from lxml import etree
def scraper(*funcs):
    """Build a scraper object from a set of extraction rules.

    Each positional argument is a zero-argument callable that returns an
    ``(xpath, attr)`` pair.  ``attr`` maps a result key to a directive:

    * ``'@name'`` -- collect the ``name`` attribute of every matched element;
    * ``'TEXT'`` (any letter case) -- collect the ``.text`` of every matched
      element.

    Returns an object whose ``scrape(url)`` method fetches ``url``, parses it
    as HTML and returns a dict mapping each result key to a list with one
    entry per matched element.  If two rules use the same key, the later
    rule's list replaces the earlier one.
    """
    class Scraper(object):
        def __init__(self, funcs):
            self.funcs = funcs

        def scrape(self, url):
            """Fetch ``url`` and apply every rule; return the result dict."""
            from StringIO import StringIO
            stash = {}
            res = urlopen(url)
            # NOTE(review): this line was truncated in the listing; it is
            # reconstructed as "decode the body with the charset announced in
            # the Content-Type header, falling back to latin-1" -- confirm.
            html = res.read().decode(
                res.headers.getparam('charset') or 'latin-1')
            tree = etree.parse(StringIO(html), etree.HTMLParser())
            for f in self.funcs:
                xpath, attr = f()
                # Evaluate the expression once per rule, not once per key.
                nodes = tree.xpath(xpath)
                for key, val in attr.items():
                    if val.startswith('@'):
                        name = val[1:]
                        stash[key] = [e.attrib[name] for e in nodes]
                    elif val.upper() == "TEXT":
                        stash[key] = [e.text for e in nodes]
                    else:
                        # Unrecognised directive: report it and carry on with
                        # the remaining keys.
                        print("Got an unknown thingy:  %s" % (val,))
            return stash

    return Scraper(funcs)
def create_process(func):
    """Turn ``func(selector, kwargs)`` into a deferred rule factory.

    The returned callable accepts ``(selector, **kwargs)`` and gives back a
    zero-argument function; calling that function runs
    ``func(selector, kwargs)`` with the keyword arguments packed into a dict.
    """
    def do(selector, **kwargs):
        def wrap():
            return func(selector, kwargs)
        return wrap
    return do


# The decorator is required: callers write process('sel', key='TEXT') and the
# scraper later invokes the returned object with no arguments.
@create_process
def process(selector, kwargs):
    """Resolve ``selector`` to an XPath and pair it with ``kwargs``.

    A selector starting with ``/`` is taken to be an XPath expression and is
    used unchanged; anything else is translated from CSS with
    ``lxml.cssselect.CSSSelector``.

    Returns the tuple ``(xpath, kwargs)``.
    """
    if selector.startswith('/'):
        xpath = selector
    else:
        # Imported only when a CSS selector actually needs translating.
        from lxml.cssselect import CSSSelector
        xpath = CSSSelector(selector).path
    return xpath, kwargs
1 comment:
I thought the post made some good points on web scrapers. I use Python for simple web scrapers; data extraction can be a time-consuming process. For larger projects involving documents, the web, or files, I tried "web scrapers", which worked great: they build quick custom screen scrapers, web scrapers, and data parsing programs.
Post a Comment