-rw-r--r--  scrapersources/postliste-difi                          8
-rw-r--r--  scrapersources/postliste-hoegskolen-i-gjoevik         14
-rw-r--r--  scrapersources/postliste-hoegskolen-i-lillehammer      4
-rw-r--r--  scrapersources/postliste-hoegskolen-i-volda            3
-rw-r--r--  scrapersources/postliste-lenvik                      173
-rw-r--r--  scrapersources/postliste-met                           4
-rw-r--r--  scrapersources/postliste-naroy                         7
-rw-r--r--  scrapersources/postliste-oep                          34
-rw-r--r--  scrapersources/postliste-oep-deliverydates             4
-rw-r--r--  scrapersources/postliste-ruter                         1

10 files changed, 232 insertions, 20 deletions
diff --git a/scrapersources/postliste-difi b/scrapersources/postliste-difi
index dfc986f..459327b 100644
--- a/scrapersources/postliste-difi
+++ b/scrapersources/postliste-difi
@@ -54,17 +54,17 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.body a"):
+    for ahref in root.cssselect("div.sixcol a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
         if -1 != href.find("file://") or -1 == url.find(".pdf"):
-#            print "Skipping non-http URL " + url
+            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
-#            print "Will process " + url
+            #print "Will process " + url
             process_pdf(parser, url, errors)
 
 def test_small_pdfs(parser):
diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik
index d4f7931..cdf007e 100644
--- a/scrapersources/postliste-hoegskolen-i-gjoevik
+++ b/scrapersources/postliste-hoegskolen-i-gjoevik
@@ -60,13 +60,15 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.spalte-inner a"):
+    for ahref in root.cssselect("section a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href).replace(" ", "+")
+        #print url
         if -1 != href.find("file://") or -1 == url.find(".pdf"):
 #            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
+            #print "Scraped: %s" % url
             True
 #            print "Skipping already scraped " + url
         else:
@@ -98,6 +100,16 @@ endYear=datetime.datetime.now().year
 for year in range(startYear, endYear+1): # range goes from startyear to endYear-1
     process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors)
+
 process_page_queue(parser, errors)
 report_errors(errors)
+warningQuery = "recorddate as lastupdate from 'swdata' order by recorddate DESC limit 1";
+result = scraperwiki.sqlite.select(warningQuery)
+now=datetime.datetime.today()
+then=datetime.datetime.strptime(result[0]['lastupdate'],"20%y-%m-%dT%H:%M:%S")
+
+if (now-then).days > 14:
+    print "warning"
+    warningURL = "http://hild1.no/~hildenae/files/dynamic/run.php?scraper=postliste-hoegskolen-i-gjoevik&reason=7days";
+    scraperwiki.scrape(warningURL)
\ No newline at end of file
diff --git a/scrapersources/postliste-hoegskolen-i-lillehammer b/scrapersources/postliste-hoegskolen-i-lillehammer
index 5337521..5687ece 100644
--- a/scrapersources/postliste-hoegskolen-i-lillehammer
+++ b/scrapersources/postliste-hoegskolen-i-lillehammer
@@ -64,9 +64,9 @@ def process_journal_pdfs(parser, listurl, errors):
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
-#            print "Will process " + url
+            print "Will process " + url
             process_pdf(parser, url, errors)
 
 def test_small_pdfs(parser):
diff --git a/scrapersources/postliste-hoegskolen-i-volda b/scrapersources/postliste-hoegskolen-i-volda
index 0106cb7..d8f3686 100644
--- a/scrapersources/postliste-hoegskolen-i-volda
+++ b/scrapersources/postliste-hoegskolen-i-volda
@@ -53,11 +53,12 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.inside a"):
+    for ahref in root.cssselect("div#maincontent a"):
         if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"):
             continue
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
+        #print "found url %s" %url
         if -1 != href.find("file://"):
 #            print "Skipping non-http URL " + url
             continue
diff --git a/scrapersources/postliste-lenvik b/scrapersources/postliste-lenvik
new file mode 100644
index 0000000..66a502d
--- /dev/null
+++ b/scrapersources/postliste-lenvik
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+import scraperwiki
+import urllib2
+import lxml.html
+import re
+import dateutil.parser
+from dateutil.relativedelta import relativedelta
+import datetime
+import urlparse
+
+agency = "Lenvik kommune"
+
+# Point scraperwiki GUI to the start page
+scraperwiki.scrape("http://webway.lenvik.kommune.no/postjournal")
+
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+parser = postlistelib.JournalParser(agency=agency)
+
+def saver(unique_keys, data):
+#    return
+    #print "Not saving data"
+    scraperwiki.sqlite.save(unique_keys, data)
+
+def expand_year(year):
+    year = int(year)
+    if year > 50:
+        year = year + 1900
+    else:
+        year = year + 2000
+    return year
+
+#            <tr class=yang>
+#              <td>13/00563-001</td>
+#              <td>04.03.2013</td>
+#              <td style="text-align:center;">
+#                <div title="Inngående">I</div>
+#              </td>
+#              <td>Flytting av VPN-tunell </td>
+#                  <td>EVRY AS</td>
+#              <td>Jan-Eirik Nordahl</td>
+#                  <td>
+#                        <a href="/dokumentbestilling?jpid=13003566" title="Klikk for å bestille innsyn">Bestill</a>
+#                  </td>
+#                  <td></td>
+#
+#            </tr>
+#
+
+def fetch_postjournal_day(parser, url, html, saver):
+    root = lxml.html.fromstring(html.decode('utf-8'))
+
+    recorddate = None
+    for div in root.cssselect('div'):
+        divcontent = div.text_content()
+        if 0 == divcontent.find("Offentlig postjournal for "):
+            recorddate = dateutil.parser.parse(divcontent.replace("Offentlig postjournal for ",""), dayfirst=True)
+    print recorddate
+
+    # Make sure we save the entire URL or nothing at all
+    datastore = []
+    for tr in root.cssselect('tr.yang'):
+        tds = tr.cssselect("td")
+        docidstr = tds[0].text_content().strip()
+        docdate = tds[1].text_content().strip()
+        doctype = tds[2].text_content().strip()
+        docdesc = tds[3].text_content().strip()
+        fratil = tds[4].text_content().strip()
+        saksbehandler = tds[5].text_content().strip()
+        if -1 != tds[6].text_content().find("Bestill"):
+            exemption = None
+        else:
+            exemption = tds[6].text_content().strip()
+
+        docdate = dateutil.parser.parse(docdate, dayfirst=True)
+
+#        print doctype, docdesc
+        if not parser.is_valid_doctype(doctype):
+            doctype = {
+                '' : '?',
+                }[doctype]
+        if parser.is_sender_doctype(doctype):
+            fratilfield = 'sender'
+        elif parser.is_recipient_doctype(doctype):
+            fratilfield = 'recipient'
+
+        caseyear, caseseqnr = docidstr.split("/")
+        caseyear = expand_year(caseyear)
+        caseseqnr, casedocseq = caseseqnr.split("-")
+        caseid = "%d/%d" % (int(caseyear), int(caseseqnr))
+
+        data = {
+            'agency' : parser.agency,
+            'recorddate' : recorddate.date(),
+            'docdate' : docdate.date(),
+            'docdesc' : docdesc,
+            'casedesc' : docdesc, # FIXME fake value
+
+            'caseyear' : int(caseyear),
+            'caseseqnr' : int(caseseqnr),
+            'casedocseq' : int(casedocseq),
+            'caseid' : caseid,
+            'doctype' : doctype,
+
+#        'journalseqnr' : int(journalseqnr),
+#        'journalyear' : int(journalyear),
+#        'journalid' : journalid,
+            fratilfield : fratil,
+
+            'saksbehandler' : saksbehandler,
+#        'saksansvarlig' : saksansvarlig.strip(),
+#        'saksansvarligenhet' : saksansvarligenhet.strip(),
+
+            'docidstr' : docidstr,
+#        'laapenr' : laapenr,
+            'exemption' : exemption,
+
+            'scrapedurl' : url,
+            'scrapestamputc' : datetime.datetime.now()
+            }
+
+#        print data
+        parser.verify_entry(data)
+        datastore.append(data)
+
+    seenurl = {}
+    # Find next URL.  There are two on each page.
+    for ahref in root.cssselect('a.next_page'):
+        if 0 == ahref.text_content().find('Neste'):
+            nexturl = urlparse.urljoin(url, ahref.attrib['href'])
+            if nexturl not in seenurl:
+                seenurl[nexturl] = True;
+                print 'Fetching ' + nexturl
+                html = postlistelib.fetch_url_harder(nexturl)
+                mysaver = lambda unique_keys, data: datastore.extend(data)
+                fetch_postjournal_day(parser=parser, url=nexturl, html=html,
+                                      saver=mysaver)
+
+    saver(unique_keys=['docidstr'], data=datastore)
+
+def date2url(date):
+    return 'http://webway.lenvik.kommune.no/?date=%s' % date
+
+def gen_date_urls(urllist, startdate, step, count):
+    d = dateutil.parser.parse(startdate, dayfirst=False)
+    for n in xrange(1, step*(count+1), step):
+        next = (d + relativedelta(days=n)).strftime("%Y-%m-%d")
+        urllist.append(date2url(next))
+
+urllist = []
+today = datetime.date.today()
+try:
+    first = scraperwiki.sqlite.select("min(recorddate) as min from swdata")[0]['min']
+    last = scraperwiki.sqlite.select("max(recorddate) as max from swdata")[0]['max']
+except:
+    last = (today + relativedelta(days=-14)).strftime("%Y-%m-%d")
+    first = None
+
+print first, last
+
+# Parse back in time
+if first is not None:
+    gen_date_urls(urllist, first, -1, 100)
+
+# Parse forward in time
+if last is not None:
+    gen_date_urls(urllist, last, 1, 3)
+
+for dayurl in urllist:
+    print 'Fetching ' + dayurl
+    html = postlistelib.fetch_url_harder(dayurl)
+    fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)
+
diff --git a/scrapersources/postliste-met b/scrapersources/postliste-met
index 02c53ca..d769e97 100644
--- a/scrapersources/postliste-met
+++ b/scrapersources/postliste-met
@@ -58,11 +58,11 @@ def process_journal_pdfs(parser, listurl, errors):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
         if -1 != href.find("file://") or -1 == url.find("=File.getFile;"):
-#            print "Skipping non-http URL " + url
+            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
 #            print "Will process " + url
             process_pdf(parser, url, errors)
diff --git a/scrapersources/postliste-naroy b/scrapersources/postliste-naroy
index b8fa33b..f47adb3 100644
--- a/scrapersources/postliste-naroy
+++ b/scrapersources/postliste-naroy
@@ -59,7 +59,12 @@ def process_journal_pdfs(parser, listurl, errors):
             continue
         # Special case, file indicating no journal entries this day
         if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \
-            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url:
+            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url or \
+            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/777B497BB48936ACC1257A450033E1D4/$FILE/Postjournal+20.07.12.pdf" == url or \
+            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/1802A0FF57C08EFEC1257A4500337345/$FILE/Postjournal+16.07.12.pdf" == url or \
+            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/90373A38701C27E5C1257A45002F63FD/$FILE/Postjournal+12.07.12.pdf" == url or \
+            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/6B00A3BD92B3C2AEC1257A45002F4044/$FILE/Postjournal+10.07.12.pdf" == url or \
+            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/0141B5488D38B8FEC1257A44003756ED/$FILE/Postjournal+06.07.12.pdf" == url:
             continue
         if parser.is_already_scraped(url):
             True
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 360ab91..bcfde1b 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -9,7 +9,7 @@ import httplib
 import urllib2
 
 # Try several times as the database get bigger
-writetries = 6
+writetries = 8
 
 # http://www.oep.no/search/resultSingle.html?journalPostId=1000000
 # http://www.oep.no/search/resultSingle.html?journalPostId=3889259
@@ -102,23 +102,31 @@ def url_from_id(id):
     return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
 
 def save(data):
+    problem = False
     for run in range(0,writetries):
         try:
             scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
+            if problem:
+                print "Sqlite write succeeded"
             return
         except scraperwiki.sqlite.SqliteError, e:
-            print "Sqlite write error, trying again"
+            print "Sqlite write error, trying again: " + str(e)
             time.sleep(22)
+            problem = True
     raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times")
 
 def save_var(var, data):
+    problem = False
     for run in range(0,writetries):
         try:
             scraperwiki.sqlite.save_var(var, data)
+            if problem:
+                print "Sqlite write succeeded"
             return
         except scraperwiki.sqlite.SqliteError, e:
-            print "Sqlite write error, trying again"
+            print "Sqlite write error, trying again: " + str(e)
             time.sleep(22)
+            problem = True
     raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times")
 
 fieldmap = {
@@ -177,9 +185,8 @@ def fetch_oep_entry(id, datastorage):
 #    scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
     return 0
 
-def fetch_range(first, last, step):
+def fetch_range(datastorage, first, last, step):
     myskiplimit = skiplimit
-    datastorage = []
     skipped = 0
     fetched = 0
     min_id = first
@@ -312,6 +319,16 @@ def remove_original():
 print "Starting to fetch journal entries " + str(datetime.datetime.now())
 
 scraperwiki.scrape("http://www.oep.no/")
+datastorage = []
+
+# Update entries to handle <URL: https://rt.nuug.no:443/Ticket/Display.html?id=6342 >.
+# Used 2012-09-17
+#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638167")
+#fetch_oep_entry(638167, datastorage)
+#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638104")
+#fetch_oep_entry(638104, datastorage)
+#scraperwiki.sqlite.commit()
+
 count = 10000
 skiplimit = 500
 # Random value fairly close to the most recent ID when this project started 2012-05-03
@@ -320,20 +337,21 @@ try:
     max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"]
     if 0 < scraperwiki.sqlite.get_var('min_tested_id'):
         saved_min = scraperwiki.sqlite.get_var('min_tested_id')
+    else:
+        saved_min = 0
     sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"]
     print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min)
     if sql_min < saved_min:
         min = sql_min
     else:
         min = saved_min
-
     print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max)
 except scraperwiki.sqlite.SqliteError:
     pass
 
-fetched = fetch_range(max + 1, max + count, 1)
+fetched = fetch_range(datastorage, max + 1, max + count, 1)
 print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent())
 
 if min >= 0:
-    fetched = fetch_range(min, min - count, -1)
+    fetched = fetch_range(datastorage, min, min - count, -1)
     print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())
diff --git a/scrapersources/postliste-oep-deliverydates b/scrapersources/postliste-oep-deliverydates
index f04ce49..ebce253 100644
--- a/scrapersources/postliste-oep-deliverydates
+++ b/scrapersources/postliste-oep-deliverydates
@@ -30,7 +30,9 @@ def fetch_oep_deliverydates(url, datastorage):
     return 0
 
 datastorage = []
-fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+#fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+# New url before 2012-11-09
+fetch_oep_deliverydates("http://www.oep.no/pub/report.xhtml?reportId=3", datastorage)
 
 print datastorage
 scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)
diff --git a/scrapersources/postliste-ruter b/scrapersources/postliste-ruter
index 757d6be..51a2776 100644
--- a/scrapersources/postliste-ruter
+++ b/scrapersources/postliste-ruter
@@ -10,6 +10,7 @@ import resource
 import sys
 import urlparse
 import re
+scraperwiki.scrape('http://www2.ruter.no/verdt-a-vite/presse/offentlig-journal/')
 
 lazycache=scraperwiki.swimport('lazycache')
 postlistelib=scraperwiki.swimport('postliste-python-lib')
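Note: the save() and save_var() hunks in postliste-oep above grow the same retry loop in two places. A minimal sketch of how that pattern could be factored into one shared helper is shown below; the helper name retry_write is an assumption for illustration, while writetries, the 22 second sleep, the messages and the scraperwiki calls are taken from the diff.

# -*- coding: utf-8 -*-
# Sketch only (Python 2): a shared retry wrapper matching the pattern used by
# save() and save_var() in postliste-oep.  retry_write is not part of the
# commit; the constants and messages mirror the diff above.
import time
import scraperwiki

writetries = 8

def retry_write(writer, description):
    problem = False
    for run in range(0, writetries):
        try:
            writer()
            if problem:
                print "Sqlite write succeeded"
            return
        except scraperwiki.sqlite.SqliteError, e:
            print "Sqlite write error, trying again: " + str(e)
            time.sleep(22)
            problem = True
    raise scraperwiki.sqlite.SqliteError("Unable to write " + description +
                                         ", tried " + str(writetries) + " times")

# Equivalent of the save() in the diff:
def save(data):
    retry_write(lambda: scraperwiki.sqlite.save(unique_keys=['journalPostId'],
                                                data=data),
                "to database")

Passing the write as a callable keeps the retry policy in one place, so a tweak like this commit's writetries bump from 6 to 8 only needs to be made once.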

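Note: the new postliste-lenvik scraper builds its work list one URL per day, up to 100 days backwards from the oldest recorddate already stored and 3 days forwards from the newest. A condensed sketch of that date-window logic is below; make_day_urls is a hypothetical stand-in for the scraper's gen_date_urls()/date2url() pair, and the example start date is only illustrative.

# -*- coding: utf-8 -*-
# Sketch (Python 2) of postliste-lenvik's day-by-day URL generation:
# walk backwards from the oldest stored recorddate and forwards from
# the newest, producing one journal URL per day.
import dateutil.parser
from dateutil.relativedelta import relativedelta

def make_day_urls(startdate, step, count):
    # step=-1 walks back in time, step=1 walks forward, one URL per day.
    d = dateutil.parser.parse(startdate, dayfirst=False)
    urls = []
    for n in xrange(1, step * (count + 1), step):
        day = (d + relativedelta(days=n)).strftime("%Y-%m-%d")
        urls.append('http://webway.lenvik.kommune.no/?date=%s' % day)
    return urls

# Illustrative start date; the scraper itself uses min/max recorddate from swdata.
urllist = make_day_urls("2013-03-04", -1, 100) + make_day_urls("2013-03-04", 1, 3)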