1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2003 Daniel Naber <daniel.naber@t-online.de>
# Based on Shlomo Yona's Perl module Lingua::EN::Sentence 0.25
import os
import string
import re
import sys
class SentenceSplitter:
    """Split plain text into sentences using regular-expression heuristics.

    The text is processed in three passes: a marker (EOS) is inserted at
    every position that looks like a sentence end, markers at positions
    that turn out not to be sentence ends are removed again, and finally a
    few extra boundaries are added. The text is then cut at the remaining
    markers. Adapted from Perl's Lingua-EN-Sentence-0.25 module.
    """

    # File with one abbreviation per line, looked up relative to sys.path[0].
    ABBR_FILE = os.path.join(sys.path[0], "data", "abbr.txt")

    # Marker inserted at presumed sentence boundaries; split() cuts here.
    EOS = "\001"
    #EOS = "<>" # for testing only

    P = r"""[\.!?]"""                     ## PUNCTUATION
    # \xbb is the closing guillemet; written as an escape so that the
    # source stays ASCII-only regardless of the file encoding.
    AP = r"""(?:'|"|\xbb|\)|\]|\})?"""    ## AFTER PUNCTUATION
    PAP = P + AP

    # Python 3 rejects re.LOCALE on str patterns, so it is only used where
    # the interpreter accepts it (Python 2).
    if sys.version_info[0] < 3:
        reFlags = re.DOTALL | re.LOCALE
    else:
        reFlags = re.DOTALL

    def __init__(self):
        """Init the object by loading the abbreviation list."""
        self.abbr = self.loadAbbreviations()

    def _uppercase(self):
        """Return the uppercase letters used in character classes.

        Uses string.uppercase where it exists and string.ascii_uppercase
        otherwise. Looked up on every call, like the original inline use.
        """
        return getattr(string, "uppercase", string.ascii_uppercase)

    def _lowercase(self):
        """Return the lowercase letters used in character classes.

        Uses string.lowercase where it exists and string.ascii_lowercase
        otherwise.
        """
        return getattr(string, "lowercase", string.ascii_lowercase)

    def _sub(self, pattern, replacement, text, flags=None):
        """Replace every match of `pattern` in `text` by `replacement`.

        `flags` defaults to self.reFlags; pass 0 to compile without flags.
        """
        if flags is None:
            flags = self.reFlags
        return re.compile(pattern, flags).sub(replacement, text)

    def loadAbbreviations(self):
        """Load the abbreviation list and return all words in a list.

        Lines are stripped of surrounding whitespace; empty lines are
        skipped. Raises whatever open() raises if ABBR_FILE is unreadable.
        """
        abbr = []
        f = open(self.ABBR_FILE, "r")
        try:
            for line in f:
                line = line.strip()
                if line:
                    abbr.append(line)
        finally:
            # Close the file even if reading fails half-way.
            f.close()
        return abbr

    def split(self, text):
        """Take a text and split it into sentences. Return the list
        of sentences. Adapted from Perl's Lingua-EN-Sentence-0.25 module.

        Returns an empty list if `text` is None. The pieces keep their
        trailing whitespace, so joining them gives back the input.
        """
        if text is None:
            return []
        marked_text = self.first_sentence_breaking(text)
        fixed_marked_text = self.remove_false_end_of_sentence(marked_text)
        fixed_marked_text = self.split_unsplit_stuff(fixed_marked_text)
        # EOS is a literal marker, so a plain string split is sufficient.
        return fixed_marked_text.split(self.EOS)

    def first_sentence_breaking(self, text):
        """Add a special break character at all places with typical sentence
        delimiters. Return the marked text."""
        keep_and_break = r"\1" + self.EOS
        upper = self._uppercase()
        # Double new-line means a new sentence:
        text = self._sub(r"(\n\s*\n)", keep_and_break, text)
        # Punctuation followed by whitespace means a new sentence:
        text = self._sub(r"(%s\s)" % self.PAP, keep_and_break, text)
        # New (compared to the perl module): Punctuation followed by uppercase followed
        # by non-uppercase character (except dot) means a new sentence:
        text = self._sub(r"(%s)([%s][^%s.])" % (self.PAP, upper, upper),
                         r"\1" + self.EOS + r"\2", text)
        # Break also when single letter comes before punctuation:
        text = self._sub(r"(\s\w%s)" % self.P, keep_and_break, text)
        return text

    def remove_false_end_of_sentence(self, text):
        """Repair some positions that don't require a split, i.e. remove the
        special break character. Return the repaired text."""
        lower = self._lowercase()
        # Don't split at e.g. "U. S. A.":
        text = self._sub(r"([^-\w]\w%s\s)%s" % (self.PAP, self.EOS), r"\1", text)
        # Don't split at e.g. "U.S.A.":
        text = self._sub(r"([^-\w]\w%s)%s" % (self.P, self.EOS), r"\1", text)
        # Don't split after a white-space followed by a single letter followed
        # by a dot followed by another whitespace.
        # e.g. " p. "
        text = self._sub(r"(\s\w\.\s+)%s" % self.EOS, r"\1", text)
        # Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?)
        text = self._sub(r"(\.\.\. )%s([%s])" % (self.EOS, lower), r"\1\2", text)
        # Don't split [.?!] when they're quoted (this rule has always been
        # compiled without reFlags, which is kept here):
        text = self._sub(r"""(['"]%s['"]\s+)%s""" % (self.P, self.EOS), r"\1", text,
                         flags=0)
        # Don't split at abbreviations:
        for abbr in self.abbr:
            # TODO: really ignore case?
            # NOTE(review): the abbreviation is inserted into the pattern
            # unescaped, so regex metacharacters in an entry are interpreted
            # as regex syntax -- confirm this is intended before escaping.
            text = self._sub(r"(\b%s%s\s)%s" % (abbr, self.PAP, self.EOS), r"\1", text,
                             flags=self.reFlags | re.IGNORECASE)
        # Don't break after quote unless there's a capital letter:
        # e.g.: "That's right!" he said.
        text = self._sub(r"""(["']\s*)%s(\s*[%s])""" % (self.EOS, lower), r"\1\2", text)
        # fixme? not sure where this should occur, leaving it commented out:
        # don't break: text . . some more text.
        #text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
        # Active rule: no break after punctuation that has whitespace on
        # both sides.
        text = self._sub(r"(\s%s\s)%s" % (self.PAP, self.EOS), r"\1", text)
        # extension by dnaber --commented out, doesn't help:
        #text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text)
        return text

    def split_unsplit_stuff(self, text):
        """Treat some more special cases that make up a sentence boundary. Insert
        the special break character at these positions. Return the marked text."""
        # Split at e.g. "no. 5 ":
        text = self._sub(r"(\D\d+)(%s)(\s+)" % self.P,
                         r"\1\2" + self.EOS + r"\3", text)
        # TODO: Not sure about this one, leaving out for now:
        #text = re.compile("(%s\s)(\s*\()" % self.PAP, self.reFlags).sub("\\1%s\\2" % self.EOS, text)
        # Split e.g.: He won't. #Really.
        text = self._sub(r"('\w%s)(\s)" % self.P,
                         r"\1" + self.EOS + r"\2", text)
        # Split e.g.: He won't say no. Not really.
        text = self._sub(r"(\sno\.)(\s+)(?!\d)",
                         r"\1" + self.EOS + r"\2", text,
                         flags=self.reFlags | re.IGNORECASE)
        # Split at "a.m." or "p.m." followed by a capital letter.
        text = self._sub(r"([ap]\.m\.\s+)([%s])" % self._uppercase(),
                         r"\1" + self.EOS + r"\2", text)
        return text
if __name__ == "__main__":
    # Running this module directly only prints a pointer to the test
    # script. A disabled manual example is kept for reference:
    #t = '"Do split me." Will you?'
    #print(t)
    #s = SentenceSplitter()
    #l = s.split(t)
    #print(l)
    # The parenthesised single-argument form prints the same text under
    # both Python 2 and Python 3.
    print("Please use ./SentenceSplitterTest.py for testing.")
|