From 0b7136f645122445d92d726eaa1a94626aa46576 Mon Sep 17 00:00:00 2001 From: Petter Reinholdtsen Date: Sun, 2 Oct 2016 23:02:41 +0200 Subject: Improve handling of limited CPU resources. --- scrapersources/postliste-oslo-kommune-byraadsavdelingene | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scrapersources/postliste-oslo-kommune-byraadsavdelingene b/scrapersources/postliste-oslo-kommune-byraadsavdelingene index 8523e8b..b54d182 100644 --- a/scrapersources/postliste-oslo-kommune-byraadsavdelingene +++ b/scrapersources/postliste-oslo-kommune-byraadsavdelingene @@ -18,6 +18,7 @@ import re import resource import dateutil.parser import datetime +import sys from dateutil.relativedelta import relativedelta # Some example URLs @@ -134,13 +135,16 @@ def fetch_day(parser, day): # print count, dayurl if 0 == count: # print "Ending day at offset %d" % offset - return totalcount + break offset = offset + offsetstep scraperwiki.sqlite.save(unique_keys=['arkivsaksref'], data=datastore) datastore = [] + return totalcount except scraperwiki.CPUTimeExceededError, e: print "error: Ran out of time, abort scraping" - pass + # Not saving, to avoid saving partial day. Better to scrape + # the entire day the next run. + return 0 except Exception, e: # print html print e @@ -172,18 +176,18 @@ for n in xrange(skiplimit+1): day = newest + aday * n # print day totalcount = totalcount + fetch_day(parser, day) - if cpu_spent() > cpu_available() + 5: + if cpu_spent() > (cpu_available() - 3): print "Running short on CPU time, exiting" - os.exit(0) + sys.exit(0) # Scan backwards, one day before the oldest entry in the database for n in xrange(skiplimit): day = oldest - aday * (n+1) # print day totalcount = totalcount + fetch_day(parser, day) - if cpu_spent() > cpu_available() + 5: + if cpu_spent() > (cpu_available() - 3): print "Running short on CPU time, exiting" - os.exit(0) + sys.exit(0) print "Fetched %d journal entries" % totalcount -- cgit v1.2.3