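# Scraper for NRK's public postal journal ("offentlig journal") PDFs.
# Pipeline: download the PDF (cached) -> scraperwiki.pdftoxml -> walk the
# <text> elements with BeautifulSoup -> save one row per journal record
# (innhold, sakstittel, avsender_mottager) to the ScraperWiki sqlite store.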
import scraperwiki, re
from bs4 import BeautifulSoup
from collections import deque

lazycache = scraperwiki.swimport('lazycache')  # cached downloads
u = scraperwiki.swimport('hildenae_utils')     # cache + whitespace helpers
DEBUG = False  # flip to True for verbose parser tracing

def d(text):
    if DEBUG:
        print "DEBUG:", text
def process_pdf(pdfurl):
    pdfxml = u.findInCache(pdfurl, verbose=True)             # look for an earlier XML parse in the cache
    if pdfxml is None:                                       # no cached parse
        pdfdata = lazycache.lazycache(pdfurl, verbose=True)  # fetch the PDF (cached download)
        pdfxml = scraperwiki.pdftoxml(pdfdata, "-hidden")    # convert the PDF text to XML
        u.putInCache(pdfurl, pdfxml, verbose=True)           # cache the XML parse
    beautifulxml = BeautifulSoup(pdfxml)                     # wrap the XML in a BeautifulSoup(4) object

    # Page-range limits used while developing; widen them to scrape the whole document.
    FIRSTPAGE = 6
    LASTPAGE = 6
    for page in beautifulxml.find_all('page'):
        if int(page['number']) < FIRSTPAGE:
            continue
        if int(page['number']) == FIRSTPAGE:
            print "*******************************************"
            print "***** FIRSTPAGE #%d while developing ******" % FIRSTPAGE
            print "*******************************************"
        if int(page['number']) == LASTPAGE + 1:
            print "*******************************************"
            print "****** LASTPAGE #%d while developing ******" % LASTPAGE
            print "*******************************************"
            break
        print "*******************************************"
        print "********** Working on page #%s **********" % page['number']
        print "*******************************************"
        elementList = deque(page.find_all('text'))  # deque so we can popleft() through the page
        d(elementList)
        while True:
            try:
                currElement = elementList.popleft()
                if "Innhold:" in currElement.text and currElement.b:  # a bold "Innhold:" header starts a record
                    entry = parseDocumentRecord(currElement, elementList)
                    print entry
                    scraperwiki.sqlite.save(unique_keys=["innhold", "sakstittel"], data=entry)
                    d("back in process_pdf")
            except IndexError, e:
                d("No more text elements on page (%s)" % e)
                break
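# Each journal record appears in the PDF as a fixed sequence of labelled
# fields: "Innhold:" (contents), "Sakstittel:" (case title), "DokType"
# (document type), then a sender/recipient line matched via "mottaker:",
# which runs until the next "Innhold:" header or the "Side:" page footer.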
def parseDocumentRecord(currElement, elementList):
    # Called with currElement on the "Innhold:" header; consumes elements
    # until the next record (or the page footer) begins.
    d("starting parseDocumentRecord")
    entry = {}
    while True:
        try:
            d(elementList)
            if "Innhold:" in elementList[0].text:  # look ahead: next record starts, return to process_pdf
                break

            # --- Innhold: contents ---
            currElement = elementList.popleft()  # first text of innhold
            entry["innhold"] = ""
            while True:
                if "Sakstittel:" in currElement.text:  # sakstittel starts here
                    break
                entry["innhold"] += currElement.text
                currElement = elementList.popleft()
            entry["innhold"] = u.removeDoubleSpaces(entry["innhold"])

            # --- Sakstittel: case title ---
            currElement = elementList.popleft()  # first text of sakstittel
            entry["sakstittel"] = ""
            while True:
                if "DokType" in currElement.text:  # DokType starts here
                    break
                entry["sakstittel"] += currElement.text
                currElement = elementList.popleft()
            entry["sakstittel"] = u.removeDoubleSpaces(entry["sakstittel"])

            d("before spool to 'mottaker:'")
            # Note: pdftoxml sometimes seems unable to read DokType. So far this
            # has only been observed when DokType is U (even though it does read
            # some DokType U entries). Confirmed for most of June 18 and June 22.
            d(elementList)
            d("spool to 'mottaker:'")
            currElement = elementList.popleft()  # first text after DokType
            while True:
                # match "mottaker:" even when pdftoxml splits it, e.g. "motta ker:"
                # (happens on some last pages - nooooot pretty)
                if re.search(r'[t].*[t].*[a].*[k].*[e].*[r].*[:]', currElement.text):
                    d("found mottaker")
                    break
                currElement = elementList.popleft()
            d(elementList)

            # --- Avsender/mottaker: sender or recipient ---
            entry["avsender_mottager"] = ""
            while True:
                # look ahead: the next record or the page footer ends this field
                if ("Innhold:" in elementList[0].text) or ("Side:" in elementList[0].text):
                    entry["avsender_mottager"] = u.removeDoubleSpaces(entry["avsender_mottager"])
                    if re.match("^[*]+$", entry["avsender_mottager"]):  # all asterisks: redacted
                        entry["avsender_mottager"] = None
                    d("finished with record")
                    break
                entry["avsender_mottager"] += currElement.text
                currElement = elementList.popleft()
            break  # we are finished with this Innhold
        except IndexError, e:
            d("No more text elements on page (%s)" % e)
            break
    return entry
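# A minimal sanity check after a run - a sketch, assuming the rows land in
# ScraperWiki's default "swdata" table (where scraperwiki.sqlite.save writes
# when no table name is given):
#
#   for row in scraperwiki.sqlite.select("* from swdata limit 5"):
#       print row["innhold"], "|", row["avsender_mottager"]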
process_pdf("http://www.nrk.no/contentfile/file/1.8221353!offentlig22062012.pdf") # 4 records on last page
#process_pdf("http://www.nrk.no/contentfile/file/1.8217234!offentligjournal21062012.pdf") # 3 records on last page
#process_pdf("http://www.nrk.no/contentfile/file/1.8214156!offentligjournal20062012.pdf")
#process_pdf("http://www.nrk.no/contentfile/file/1.8212381!offentligjournal19062012.pdf")
# https://views.scraperwiki.com/run/pdf_to_html_preview_4/?url=http%3A%2F%2Fwww.nrk.no%2Fcontentfile%2Ffile%2F1.8209505%21offentligjournal18062012.pdf&hidden=1
#process_pdf("http://www.nrk.no/contentfile/file/1.8209505!offentligjournal18062012.pdf") # 1 record on last page