diff options
| author | Anders Einar Hilden <hildenae@gmail.com> | 2015-01-17 01:39:55 +0100 | 
|---|---|---|
| committer | Anders Einar Hilden <hildenae@gmail.com> | 2015-01-17 01:39:55 +0100 | 
| commit | f5072f480d1db635e3612eead00a66395c5df2be (patch) | |
| tree | 77ef5ced6d37444d78089b502d5344430806b869 | |
| parent | 8ac5d9ec8290e281fbcb6f7c7f7790d5c1317feb (diff) | |
Add the correct libraryfile for dms2002
| -rw-r--r-- | scrapersources/postliste-python-lib-pdf-dms2002.py | 217 | 
1 files changed, 217 insertions, 0 deletions
# -*- coding: utf-8 -*-
#
# File: scrapersources/postliste-python-lib-pdf-dms2002.py
#
# Python library for parsing public post journals (postlister) in Norway
#
# This parser is for the format currently known as
# "DMS2002 - Software Innovation"
#
# Based on the scraper advanced-scraping-pdf and postliste-python-lib
#
# Possible sources using format:
# http://www.hig.no/om_hig/offentleg_journal (week 34 2014 and onwards)
# khib.no
# hbv.no
# www.bystyret.oslo.kommune.no
# www.spesialenheten.no
# www.frogn.kommune.no

# Google search to find more: "Offentlig journal" "Ansvarlig enhet" Arkivdel "Dok. dato" Avskrevet filetype:pdf


import scraperwiki
import string
import re
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import sys
jp=scraperwiki.swimport('postliste-python-lib')


class PDFJournalParser(jp.JournalParser):
    """Journal parser for PDFs in the "DMS2002 - Software Innovation" layout.

    Usage is two-phased: preprocess() splits a PDF into per-page XML and
    queues the pages in an SQL table; process_pages() later takes the pages
    off the queue one at a time and hands each to parse_page().
    """

    # Table holding pages queued by preprocess() and not yet parsed.
    pagetable = "unparsedpages"
    # Table receiving pages for which parse_page() raised ValueError.
    brokenpagetable = "brokenpages"
    # When true, preprocess() passes '-hidden' to scraperwiki.pdftoxml().
    hiddentext = False
    # NOTE(review): not read anywhere in this class - presumably used by
    # callers or the base class; confirm before removing.
    breakonfailure = True
    # When true, dprint() writes its messages to stdout.
    debug = False

    def __init__(self, agency, hiddentext=False, debug=False):
        """Store the parser options and initialise the base JournalParser.

        agency     -- forwarded to jp.JournalParser.__init__
        hiddentext -- ask pdftoxml for hidden text as well
        debug      -- enable dprint() output
        """
        self.hiddentext = hiddentext
        self.debug = debug
        jp.JournalParser.__init__(self, agency=agency)

    def sync(self):
        """Flush stdout and stderr so interleaved output stays in order."""
        sys.stdout.flush()
        sys.stderr.flush()

    def dprint(self, msg):
        """Print msg and flush the output streams, but only in debug mode."""
        if self.debug:
            print(msg)
            self.sync()

    def parse_page(self, pdfurl, pagenum, pagecontent):
        """Locate and sanity-check the journal entries on one page.

        pdfurl      -- URL of the PDF the page came from (used in messages)
        pagenum     -- page number within that PDF (used in messages)
        pagecontent -- XML for the page, as stored by preprocess()

        Raises ValueError when the page does not look as expected, and -
        because entry extraction is not finished yet - also at the very end
        of a page that passed every check.
        """
        self.sync()
        print("Scraping " + pdfurl + " page " + str(pagenum))
        soup = BeautifulSoup(pagecontent)

        # Collect every text fragment except the single-space ones.
        # linecount counts ALL <text> elements, including the skipped ones.
        text = []
        linecount = 0
        for t in soup.findAll('text'):
            if t.text != " ":
                text.append(t.text)
            linecount = linecount + 1

        # NOTE(review): as written this fires whenever the page contains at
        # least one " " fragment, because linecount also counts those. The
        # message suggests linecount was meant to count only the kept
        # fragments - confirm the intent before changing it.
        if len(text) < linecount:
            raise ValueError("[ERROR] Found %s interresting lines, but only saved %s?" % (linecount, len(text)))

        # Count how many entries to expect on this page, to be able to
        # verify that all of them were found.
        entrycount = text.count('Avskrevet:')
        self.dprint("We found %s entries on page %s ('Avskrevet:')" % (entrycount, pagenum))

        if entrycount > 6:
            self.dprint("[WARNING] We found %s entries on page %s, more that 6 is not normal" % (entrycount, pagenum))

        if entrycount < 1:
            raise ValueError("[ERROR] No entries found on page %s" % (pagenum))

        # An entry starts one fragment before 'Avsender:'/'Mottaker:' and
        # ends one fragment after 'Arkivdel:' (the slice end is exclusive).
        found_entries = 0
        entry_start = -1
        for i, fragment in enumerate(text):
            if fragment == 'Avsender:' or fragment == 'Mottaker:':
                entry_start = max(i - 1, 0)

            if fragment == 'Arkivdel:':
                if entry_start == -1:
                    self.dprint("[ERROR] Found end of entry (line %s) before start of entry on page %s" % (i, pagenum))
                    raise ValueError("[ERROR] Found end of entry before start of entry on page %s" % (pagenum))
                entry_end = min(i + 2, len(text))
                found_entries = found_entries + 1
                self.pdfparser(text[entry_start:entry_end], pdfurl, pagenum, found_entries)
                entry_start = -1

        if found_entries != entrycount:
            # Expected count first, found count second, matching the wording.
            msg = "[ERROR] We expected %s but found %s entries on page %s" % (entrycount, found_entries, pagenum)
            self.dprint(msg)
            raise ValueError(msg)
        self.dprint("We found %s of %s expected entries on page %s" % (found_entries, entrycount, pagenum))

        # NOTE(review): nothing is stored yet, so every page ends up in the
        # broken-pages table when run through process_pages().
        raise ValueError("parse_page not implemented")

    def pdfparser(self, entrytext, pdfurl, pagenum, num_entry):
        """Check that one entry has all field labels, in the usual order.

        entrytext -- list of text fragments making up the entry
        pdfurl    -- URL of the source PDF (currently unused here)
        pagenum   -- page number, used in messages
        num_entry -- 1-based index of the entry on the page, used in messages

        Returns None. Raises ValueError if a label shows up out of order or
        the number of labels found differs from FIELDS_IN_ENTRY.
        """
        FIELDS_IN_ENTRY = 10
        # Expected position of each label within an entry. 'Avsender:' and
        # 'Mottaker:' share position 1.
        field_order = {'Arkivdel:': 10, 'Arkivkode:': 7, 'Sak:': 2, 'Dok.:': 3, 'Tilg. kode:': 5, 'Dok. dato:': 9, 'Avskrevet:': 6, 'Avsender:': 1, 'Mottaker:': 1, 'Journaldato:': 4, 'Saksbehandler:': 8}
        num_fields_found = 0
        for text in entrytext:
            if text not in field_order:
                continue
            num_fields_found = num_fields_found + 1
            if field_order[text] != num_fields_found:  # Sanity check
                msg = "[ERROR] Field '%s' is normally field #%s, but was #%s on page %s, entry %s" % (text, field_order[text], num_fields_found, pagenum, num_entry)
                self.dprint(msg)
                raise ValueError(msg)
        self.dprint("All fields appeared in the expected order")
        if num_fields_found != FIELDS_IN_ENTRY:  # Sanity check
            msg = "[ERROR] Found %s fields, expected %s on page %s, entry %s!" % (num_fields_found, FIELDS_IN_ENTRY, pagenum, num_entry)
            self.dprint(msg)
            raise ValueError(msg)
        self.dprint("Found %s/%s fields in entry %s on page %s" % (num_fields_found, FIELDS_IN_ENTRY, num_entry, pagenum))

    def process_pages(self):
        """Parse every queued page, moving failures to the broken table.

        Pages are fetched one at a time from self.pagetable. A page is
        removed from the queue after parse_page() returns or raises
        ValueError; in the latter case it is first saved, with a timestamp,
        to self.brokenpagetable.

        Raises ValueError after the queue is drained if any page failed,
        and re-raises scraperwiki.sqlite.SqliteError after printing it.
        """
        brokenpages = 0
        sqlselect = "* from " + self.pagetable + " limit 1"
        # Values are bound as parameters so that URLs containing quotes
        # cannot break (or inject into) the statement.
        sqldelete = "delete from " + self.pagetable + " where scrapedurl = ? and pagenum = ?"
        try:
            pageref = scraperwiki.sqlite.select(sqlselect)
            while pageref:
                row = pageref[0]
                scrapedurl = row['scrapedurl']
                pagenum = row['pagenum']
                pagecontent = row['pagecontent']
                try:
                    self.parse_page(scrapedurl, pagenum, pagecontent)
                    self.sync()
                except ValueError:
                    brokenpage = {
                        'scrapedurl' : scrapedurl,
                        'pagenum' : pagenum,
                        'pagecontent' : pagecontent,
                        'failstamp' : datetime.datetime.now(),
                    }
                    brokenpages = brokenpages + 1
                    scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
                # Dequeue the page whether it parsed or was recorded as
                # broken; otherwise the loop would fetch it again forever.
                scraperwiki.sqlite.execute(sqldelete, [scrapedurl, pagenum])
                pageref = scraperwiki.sqlite.select(sqlselect)
        except scraperwiki.sqlite.SqliteError as e:
            print(str(e))
            raise
        if 0 < brokenpages:
            raise ValueError("Found %d pages with unsupported format" % brokenpages)

    def is_valid_page(self, pdfurl, pagenum, pagecontent):
        """Return True if the page contains a 'Dok.:' label, else False.

        An unrecognised page is only reported through dprint(); no
        exception is raised.
        """
        soup = BeautifulSoup(pagecontent)
        for t in soup.findAll('text'):
            if t.text == 'Dok.:':
                return True
        self.dprint("Unrecognized page format for " + pdfurl)
        return False

    # Split PDF content into pages and store in SQL table for later processing.
    # The process is split in two to better handle large PDFs (like 600 pages),
    # without running out of CPU time and without losing track of what is
    # left to parse.
    def preprocess(self, pdfurl, pdfcontent):
        """Convert a PDF to XML, split it into pages and queue the pages.

        pdfurl     -- URL the PDF was fetched from; stored with each page
        pdfcontent -- raw PDF data passed to scraperwiki.pdftoxml()

        Raises ValueError if pdfcontent is empty or no <page> elements are
        found in the converted XML.
        """
        print("Preprocessing PDF " + pdfurl)
        if not pdfcontent:
            raise ValueError("No pdf content passed for " + pdfurl)
        options = '-hidden' if self.hiddentext else ''
        xml = scraperwiki.pdftoxml(pdfcontent, options)
        pages = re.findall(r'(<page .+?</page>)', xml, flags=re.DOTALL)
        xml = None  # drop the reference to the full document early

        datastore = []
        for pagecount, page in enumerate(pages, 1):
            # Result is not acted upon; the call only logs unknown layouts.
            self.is_valid_page(pdfurl, pagecount, page)
            datastore.append({
                'scrapedurl' : pdfurl,
                'pagenum' : pagecount,
                'pagecontent' : page,
            })
        self.dprint("Found %s pages, %s added to database" % (len(pages), len(datastore)))
        if not datastore:
            raise ValueError("Unable to find any pages in " + pdfurl)
        scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable)
