diff options
| author | Petter Reinholdtsen <pere@hungry.com> | 2015-11-23 07:40:00 +0100 | 
|---|---|---|
| committer | Petter Reinholdtsen <pere@hungry.com> | 2015-11-23 07:40:00 +0100 | 
| commit | d05e9377d1168c24d4d9096011a548059148b614 (patch) | |
| tree | 222903718ac44be1055fc024352e62f18f7392b4 | |
| parent | 0a8885848b1cf2f57e84a5440855300875c1718d (diff) | |
Start on new scraper for sio.no.
| -rw-r--r-- | scrapersources/postliste-sioa | 30 | 
1 files changed, 30 insertions, 0 deletions
```diff
diff --git a/scrapersources/postliste-sioa b/scrapersources/postliste-sioa
new file mode 100644
index 0000000..6fabcc1
--- /dev/null
+++ b/scrapersources/postliste-sioa
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+
+#import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urlparse
+import re
+import urllib2
+frontpage = 'https://sio.no/snarveier/om-sio/rapporter-og-referater'
+#scraperwiki.scrape(frontpage)
+
+#postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Samskipnaden i Oslo og Akershus'
+
+baseurl = 'https://sio.no'
+response = urllib2.urlopen(frontpage)
+html = response.read()
+root = lxml.html.fromstring(html)
+urls = root.cssselect("a.readmore")
+urllist = []
+for ahref in urls:
+    linktext = ahref.text_content()
+    if -1 != linktext.find('Postliste SiO'):
+        href = ahref.attrib['href']
+        print href
+        urllist.append(baseurl + href)
```
