aboutsummaryrefslogtreecommitdiffstats
path: root/python/etatsbasen.py
blob: 24e626765b40d35550ed4073064895f9215a5fe7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#! /usr/bin/env python3
import argparse
import sys
import os
from email.utils import parseaddr
import csv 
import re

# Version string shown by the -v command-line flag.
VERSION = "python-etatsbasen-v0.1"
# Comma-separated category ids used when -c is not given; each id is
# compared against a row's "orgstructid" column.
DEFAULT_CATEGORIES = "12,14,17,18,27,33,38,66,68,76"
# Input file used when -f is not given.
# NOTE(review): "etatsbasen.csv" was noted as an alternative here --
# presumably the full data set; confirm.
DEFAULT_FILENAME = "etatsbasen-small.csv"

def cleanup_email(string):
    """Try to salvage a usable address from a malformed e-mail field.

    Repairs attempted, in order:

    1. Strip surrounding whitespace and a leading ``mailto`` / ``mailto:``
       prefix, then accept the remainder if it validates.
    2. Otherwise treat the remainder as several addresses separated by
       ``,`` or ``/`` and accept the first part that validates.

    Args:
        string: Raw e-mail field that already failed ``valid_email()``.

    Returns:
        The repaired address, or ``False`` if nothing usable was found.
    """
    # Whitespace is stripped first so the ``^mailto`` anchor still matches
    # when the field has leading blanks.
    text = re.sub(r"^mailto:?", "", string.strip()).strip()
    if valid_email(text):
        return text
    # Split the prefix-stripped text, not the raw input: otherwise the first
    # address of "mailto:a@example.org / b@example.org" keeps its "mailto:"
    # prefix and can never validate.
    for part in re.split(r' ?[,/] ?', text):
        candidate = part.strip()
        if valid_email(candidate):
            return candidate
    return False

def valid_email(string):
    """Return *string* if it is a bare e-mail address, otherwise ``None``.

    The value is accepted only when ``email.utils.parseaddr`` hands the whole
    input back unchanged as the address part (so no display name, no extra
    text) and it contains an ``@``.
    """
    # Think about using https://pypi.python.org/pypi/validate_email ?
    _display_name, address = parseaddr(string)
    if address != string:
        return None
    if '@' not in address:
        return None
    return address

# Column renames: name as found in the row dicts (e.g. 'tailid', 'email')
# -> replacement name.
# NOTE(review): nothing in the code visible here reads this mapping yet;
# presumably it is meant for the output step -- confirm.
rename = {
    'tailid': 'id',
    'email': 'request_email',
    'name_nb': 'name',
    'name_nn': 'name.nn',
    'name_en': 'name.en',
}

def filter_orgstructid(row, categories):
    """Keep *row* only if its ``orgstructid`` is one of *categories*.

    Args:
        row: Mapping with at least ``tailid`` and ``orgstructid`` keys, or
            ``None`` when an earlier filter already rejected the row.
        categories: Container of integer category ids to keep.

    Returns:
        *row* itself when it is kept, otherwise ``None``. Every rejection of
        a non-``None`` row is reported on stderr.
    """
    if row is None:
        return None
    try:
        category = int(row["orgstructid"])
    except (TypeError, ValueError):
        # csv.DictReader yields "" for an empty cell and None for a cell
        # missing from a short line. Skip such rows with a message, as
        # filter_email() does for bad addresses, instead of aborting the run.
        print("Skipping tailid %s: Invalid orgstructid (%s)" % (row['tailid'], row['orgstructid']), file=sys.stderr)
        return None
    if category in categories:
        return row
    print("Skipping tailid %s: orgstructid not in selected categories (%s not in %s)" % (row['tailid'], row['orgstructid'], categories), file=sys.stderr)
    return None

def filter_email(row):
    """Keep *row* only if it carries a usable e-mail address.

    An address that fails ``valid_email()`` is passed to ``cleanup_email()``;
    if that produces a replacement, ``row['email']`` is overwritten in place.
    Rows with an empty or unrepairable address are dropped. Each drop and
    each replacement is reported on stderr.

    Args:
        row: Mapping with ``tailid`` and ``email`` keys, or ``None`` when an
            earlier filter already rejected the row.

    Returns:
        *row* (possibly with a corrected ``email``) or ``None``.
    """
    if row is None:
        return None

    address = row['email']
    if address == "":
        # No email, skip
        print("Skipping tailid %s: No email specified" % (row['tailid']), file=sys.stderr)
        return None
    if valid_email(address):
        return row

    repaired = cleanup_email(address)
    if not repaired:
        # Invalid email, skip
        print("Skipping tailid %s: Invalid email (%s)" % (row['tailid'], address), file=sys.stderr)
        return None

    print("Replaced email for tailid %s: \"%s\" -> \"%s\"" % (row['tailid'], address, repaired), file=sys.stderr)
    row['email'] = repaired
    return row



def printCSV(options):
    """Read the input CSV and collect the rows that pass both filters.

    Args:
        options: Dict with ``"inputfile"`` (path of the CSV to read) and
            ``"categories"`` (passed to ``filter_orgstructid()``).

    The ``options`` dict is echoed to stdout first. Each data row is then run
    through ``filter_orgstructid()`` and ``filter_email()``.

    NOTE(review): the surviving rows are only gathered in a local list;
    nothing is written out yet, so the function always returns ``None``.
    """
    print(options)
    kept = []
    with open(options["inputfile"], newline='') as source:
        for record in csv.DictReader(source, delimiter=',', quotechar='"'):
            record = filter_email(filter_orgstructid(record, options["categories"]))
            if record is not None:
                kept.append(record)


if __name__ == "__main__":
    # Command-line entry point: parse the arguments, validate them into an
    # ``options`` dict and hand that to printCSV().
    parser = argparse.ArgumentParser(description='Tool for exporting etatsbasen-data to a file that can be imported into alaveteli.')
    parser.add_argument('-c', metavar="all|c1[,c2,c3,..]", default=DEFAULT_CATEGORIES, help="Categories to include (default: \"%s\")" % (DEFAULT_CATEGORIES))
    parser.add_argument('-f', metavar="file", default=DEFAULT_FILENAME, help="File to read from (default: \"%s\")" % (DEFAULT_FILENAME))
    parser.add_argument('-o', metavar="h1[,h2,h3...] ", help="Include only these headers in output (id or name)")
    parser.add_argument('-v', help="Print version (%s) and exit" % (VERSION), action='store_true')
    args = parser.parse_args()

    options = {}

    if args.v:
        print("version: %s" % (VERSION))
        sys.exit(0)

    if os.path.isfile(args.f):
        options["inputfile"] = args.f
    else:
        print("%s: No such file" % (args.f), file=sys.stderr)
        # Non-zero status so shells and scripts can detect the failure.
        sys.exit(1)

    # -o is optional; None means no header restriction was requested.
    options["headers"] = args.o.split(',') if args.o else None

    # NOTE(review): the -c metavar advertises "all", but only integers are
    # accepted here, so "-c all" is rejected by the handler below.
    try:
        options["categories"] = [int(x) for x in args.c.split(',')]
    except ValueError:
        print("Failed to parse \"-c %s\"; Categories must comma separated list of only integers" % (args.c), file=sys.stderr)
        sys.exit(1)

    printCSV(options)