author     Arno Teigseth <arno@teigseth.no>   2011-01-31 05:34:56 +0000
committer  Arno Teigseth <arno@teigseth.no>   2011-01-31 05:34:56 +0000
commit     1afa96100bcb613c86533698f8a9d1115e63391e (patch)
tree       07c754e874bcbc95eeaa21abc35d4bc84158f4fb /languagetool/src
parent     635a3c7c275c00748c56736b4eb593b651223edd (diff)
download   grammar-norwegian-1afa96100bcb613c86533698f8a9d1115e63391e.tar.gz
           grammar-norwegian-1afa96100bcb613c86533698f8a9d1115e63391e.tar.bz2
           grammar-norwegian-1afa96100bcb613c86533698f8a9d1115e63391e.tar.xz
Added very basic pre-beta version of LanguageTool. Builds, though :)
Diffstat (limited to 'languagetool/src')
-rw-r--r--  languagetool/src/.cvsignore                 1
-rw-r--r--  languagetool/src/Chunker.py               127
-rw-r--r--  languagetool/src/ChunkerTest.py            78
-rw-r--r--  languagetool/src/EnglishTest.py            62
-rw-r--r--  languagetool/src/Entities.py               68
-rwxr-xr-x  languagetool/src/GermanTest.py             41
-rwxr-xr-x  languagetool/src/HungarianTest.py          39
-rw-r--r--  languagetool/src/LanguageTest.py           68
-rw-r--r--  languagetool/src/Rules.py                 632
-rw-r--r--  languagetool/src/RulesTest.py             257
-rw-r--r--  languagetool/src/SentenceSplitter.py      132
-rw-r--r--  languagetool/src/SentenceSplitterEval.py  128
-rw-r--r--  languagetool/src/SentenceSplitterTest.py   91
-rw-r--r--  languagetool/src/TagInfo.py               276
-rw-r--r--  languagetool/src/Tagger.py               1108
-rw-r--r--  languagetool/src/TaggerTest.py            168
-rw-r--r--  languagetool/src/Tools.py                  58
-rwxr-xr-x  languagetool/src/Wfdeu.py                  70
-rwxr-xr-x  languagetool/src/Wfhun.py                  88
-rw-r--r--  languagetool/src/Wfinder.py               568
-rw-r--r--  languagetool/src/client.py                 28
-rw-r--r--  languagetool/src/query.py                 249
-rw-r--r--  languagetool/src/socket_server.py         218
-rw-r--r--  languagetool/src/tag.py                   152
24 files changed, 4707 insertions, 0 deletions
diff --git a/languagetool/src/.cvsignore b/languagetool/src/.cvsignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/languagetool/src/.cvsignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/languagetool/src/Chunker.py b/languagetool/src/Chunker.py
new file mode 100644
index 0000000..fc0cfd3
--- /dev/null
+++ b/languagetool/src/Chunker.py
@@ -0,0 +1,127 @@
+# -*- coding: iso-8859-1 -*-
+# Assign chunks to a tagged text
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import os
+import re
+import sys
+
+class Chunker:
+ """Assign chunks (like "noun phrase") to a tagged text."""
+
+ def __init__(self):
+ return
+
+ def setRules(self, rules):
+ """Use the rules from this Rules object for the chunk() method."""
+ self.rules = rules
+ return
+
+ def chunk(self, tagged_text):
+ """Take a POS tagged text and find all its chunks. Returns
+ a list of (from, to, chunk_name) tuples where the from/to positions
+ refer to the list position. Only parts of the list may be
+ covered by chunks. There are no overlappings."""
+ l = []
+
+ tagged_text_pos = 0
+ while 1:
+ if tagged_text_pos >= len(tagged_text):
+ break
+ word, norm_word, tag = tagged_text[tagged_text_pos]
+
+ for rule in self.rules.rules:
+ #print "### %s" % rule.name
+ match_start = None
+ match_end = None
+ pattern_pos = 0
+ pos_corr = 0
+
+ rule_match = 1
+ cont = 1
+
+ while 1:
+ #print " %d,%d,%d" % (tagged_text_pos,pattern_pos,pos_corr)
+ try:
+ tag = tagged_text[tagged_text_pos+pattern_pos+pos_corr][2]
+ except IndexError:
+ #print "index error"
+ break
+ #print "%s ?= %s (pp=%d, ttp=%d)" % (tag, rule.pattern[pattern_pos], pattern_pos, tagged_text_pos)
+ if pattern_pos == 0 and tag == None:
+ cont = 0
+ break
+ if tag == None:
+ # ignore whitespace
+ pos_corr = pos_corr + 1
+ continue
+ if tag != rule.pattern[pattern_pos]:
+ rule_match = 0
+ break
+ if match_start == None:
+ match_start = tagged_text_pos
+
+ pattern_pos = pattern_pos + 1
+ if pattern_pos == len(rule.pattern):
+ #print "match (%s)! tagged_text_pos=%d" % (rule.name, tagged_text_pos)
+ match_end = match_start + pattern_pos + pos_corr - 1
+ l.append((match_start, match_end, rule.name))
+ tagged_text_pos = tagged_text_pos + (match_end - match_start)
+ cont = 0
+ break
+ if not rule_match:
+ continue # next rule
+ if not cont:
+ break # next word
+ tagged_text_pos = tagged_text_pos + 1
+
+ #print l
+ return l
+
+class Rules:
+ """A container for chunking rules."""
+
+ chunk_rules = os.path.join(sys.path[0], "data", "chunks.txt")
+
+ def __init__(self):
+ """Read the chunking rules from data/chunks.txt. The rules
+ can then be accessed via Rules.rules."""
+ self.rules = []
+ f = open(self.chunk_rules)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ if line.startswith("#"): # ignore comments
+ continue
+ rule = Rule(line.strip())
+ self.rules.append(rule)
+ return
+
+class Rule:
+ """A chunking rule, consisting of a name and a pattern. The
+ pattern is a list of POS tags."""
+
+ def __init__(self, line):
+ """Parse a chunk rule in this format:
+ name: tag1 tag2..."""
+ parts = re.split("\s+", line.strip())
+ name = parts[0]
+ self.name = name[0:len(name)-1] # cut off colon
+ self.pattern = parts[1:]
+ return
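For orientation, a minimal sketch of driving the Chunker above with one hand-built rule (the AT0/NN1 tags are just examples taken from the tests below; in the real setup Chunker.Rules would read the patterns from data/chunks.txt, one "name: tag1 tag2 ..." line per rule):

    import Chunker

    class DemoRules:                    # stand-in for Chunker.Rules, which reads data/chunks.txt
        def __init__(self, rule_list):
            self.rules = rule_list

    chunker = Chunker.Chunker()
    chunker.setRules(DemoRules([Chunker.Rule("NP: AT0 NN1")]))

    # A tagged text is a list of (word, normalized_word, tag) triples;
    # whitespace entries carry a tag of None and are skipped.
    tagged = [("the", "the", "AT0"), (" ", None, None), ("house", "house", "NN1")]
    print chunker.chunk(tagged)         # -> [(0, 2, 'NP')]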
diff --git a/languagetool/src/ChunkerTest.py b/languagetool/src/ChunkerTest.py
new file mode 100644
index 0000000..eb8889e
--- /dev/null
+++ b/languagetool/src/ChunkerTest.py
@@ -0,0 +1,78 @@
+# -*- coding: iso-8859-1 -*-
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import re
+import unittest
+
+import Chunker
+
+class LocalRules:
+
+ def __init__(self, rule_list):
+ self.rules = rule_list
+ return
+
+class ChunkerTestCase(unittest.TestCase):
+
+ def testChunking(self):
+ c = Chunker.Chunker()
+ r1 = Chunker.Rule("NP1: AT0 NN1 NN1")
+ r2 = Chunker.Rule("NP2: AT0 NN1")
+ rules = LocalRules([r1, r2])
+ c.setRules(rules)
+
+ tagged_text = self._makeList("Blah/XX the/AT0 house/NN1 foo/YY")
+ chunks = c.chunk(tagged_text)
+ self.assertEqual(chunks, [(2, 4, 'NP2')])
+
+ tagged_text = self._makeList("Blah/XX house/NN1 foo/YY")
+ chunks = c.chunk(tagged_text)
+ self.assertEqual(chunks, [])
+
+ tagged_text = self._makeList("the/AT0 summer/NN1 house/NN1 foo/YY2")
+ chunks = c.chunk(tagged_text)
+ self.assertEqual(chunks, [(0, 4, 'NP1')])
+
+ # more than one chunk:
+
+ tagged_text = self._makeList("the/AT0 summer/NN1 is/VB a/AT0 hit/NN1")
+ chunks = c.chunk(tagged_text)
+ self.assertEqual(chunks, [(0, 2, 'NP2'), (6, 8, 'NP2')])
+
+ tagged_text = self._makeList("the/AT0 summer/NN1 a/AT0 hit/NN1")
+ chunks = c.chunk(tagged_text)
+ self.assertEqual(chunks, [(0, 2, 'NP2'), (4, 6, 'NP2')])
+
+ return
+
+ def _makeList(self, s):
+ parts = re.split("(\s+)", s)
+ l = []
+ for part in parts:
+ word = None
+ word_norm = None
+ tag = None
+ pair = re.split("/", part)
+ if len(pair) == 2:
+ word, tag = pair
+ word_norm = word
+ else:
+ word = pair[0]
+ l.append((word, word_norm, tag))
+ return l
diff --git a/languagetool/src/EnglishTest.py b/languagetool/src/EnglishTest.py
new file mode 100644
index 0000000..358d26c
--- /dev/null
+++ b/languagetool/src/EnglishTest.py
@@ -0,0 +1,62 @@
+# -*- coding: iso-8859-1 -*-
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import unittest
+import TextChecker
+import LanguageTest
+from LanguageTest import ExpMatch
+
+class EnglishTestCase(LanguageTest.LanguageTest):
+
+ def setUp(self):
+ self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \
+ words=None, builtin=None, textlanguage="en", mothertongue="de", \
+ max_sentence_length=20, debug_mode=0)
+ return
+
+ def testSomeRules(self):
+ """Some English rule checks. Requires a trained tagger."""
+
+ self._check("A sentence without problems.", None)
+ self._check("This is bigger then blah.", ExpMatch("COMP_THAN", 15, 19))
+ self._check("English/German false friend: my chef", ExpMatch("CHEF", 32, 36))
+ self._check("Whitespace,here it's lacking.", ExpMatch("WHITESPACE", 11, 12))
+
+ self._check("he good good.", ExpMatch("WORD_REPEAT", 7, 12))
+
+ self._check("I ask you because of him.", None)
+ self._check("Of cause not.", ExpMatch("OF_CAUSE", 3, 8))
+ self._check("he is nice.", None)
+
+ self._check("This is a stoopid test.", None)
+ # TODO: error not detected:
+ self._check("The baseball team are established.", None)
+
+ self._check("I definitely think is should be less than four years.",
+ ExpMatch("IS_SHOULD", 19, 21))
+
+ self._check("Peter's car is bigger then mine, and this isa spelling error.",
+ ExpMatch("COMP_THAN", 22, 26))
+
+ self._check("Peter's car is bigger then mine, and and a word repeat.",
+ [ExpMatch("COMP_THAN", 22, 26), ExpMatch("WORD_REPEAT", 34, 38)])
+
+ return
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/languagetool/src/Entities.py b/languagetool/src/Entities.py
new file mode 100644
index 0000000..615bd8b
--- /dev/null
+++ b/languagetool/src/Entities.py
@@ -0,0 +1,68 @@
+# -*- coding: iso-8859-1 -*-
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import re
+import sys
+
+class Entities:
+ """Some(!) BNC SGML entities."""
+
+ def cleanEntities(s):
+ """Replace only the most common BNC entities with their
+ ASCII representation."""
+ entities = { "amp" : "&",
+ "pound": "P", # fixme: use "£"
+ "eacute": "e",
+ "aacute": "a",
+ "bquo": "\"",
+ "equo": "\"",
+ "ecirc": "e",
+ "quot": "'",
+ #"deg": u"°",
+ "dollar": "$",
+ "agrave": "á",
+ "egrave": "é",
+ "percnt": "&",
+ "ndash": "-",
+ "mdash": "--",
+ "hellip": "...",
+ "lsqb": "[",
+ "rsqb": "]",
+ "uuml": "ü", #fixme: use ü
+ "auml": "ä", # see above!
+ "ouml": "ö",
+ "Uuml": "Ü",
+ "Auml": "Ä",
+ "Ouml": "Ö",
+ "szlig": "ß"
+ }
+# print "in entities %s"%s
+ try:
+ for key in entities:
+ #s = re.compile("&%s;?" % key).sub("%s" % entities[key].encode('latin1'), s)
+ s = s.replace("&%s;" % key, entities[key])
+ s = s.replace("&%s" % key, entities[key])
+ except TypeError:
+ # FIXME: what to do here?!
+ print >> sys.stderr, "TypeError: '%s'" % s
+ return s
+
+ cleanEntities = staticmethod(cleanEntities)
+
+if __name__ == "__main__":
+ # simple smoke test when run directly
+ print Entities.cleanEntities("fish &amp; chips &hellip;")
diff --git a/languagetool/src/GermanTest.py b/languagetool/src/GermanTest.py
new file mode 100755
index 0000000..5575b5e
--- /dev/null
+++ b/languagetool/src/GermanTest.py
@@ -0,0 +1,41 @@
+# -*- coding: iso-8859-1 -*-
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import unittest
+import TextChecker
+import LanguageTest
+from LanguageTest import ExpMatch
+
+class GermanTestCase(LanguageTest.LanguageTest):
+
+ def setUp(self):
+ self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \
+ words=None, builtin=None, textlanguage="de", mothertongue="de", \
+ max_sentence_length=20, debug_mode=0)
+ return
+
+ def testSomeRules(self):
+ """Some English rule checks. Requires a trained tagger."""
+
+ self._check(u"Ich gehe daß er sieht", ExpMatch("DASS", 4, 12))
+ self._check(u"Ich gehe.", None)
+ self._check(u"Ich gehst.", ExpMatch("ICH", 0, 9))
+ return
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/languagetool/src/HungarianTest.py b/languagetool/src/HungarianTest.py
new file mode 100755
index 0000000..cb6b0a5
--- /dev/null
+++ b/languagetool/src/HungarianTest.py
@@ -0,0 +1,39 @@
+# -*- coding: iso-8859-1 -*-
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import unittest
+import TextChecker
+import LanguageTest
+from LanguageTest import ExpMatch
+
+class HungarianTestCase(LanguageTest.LanguageTest):
+
+ def setUp(self):
+ self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \
+ words=None, builtin=None, textlanguage="hu", mothertongue="de", \
+ max_sentence_length=20, debug_mode=0)
+ return
+
+ def testSomeRules(self):
+ """Some English rule checks. Requires a trained tagger."""
+ self._check(u"Én mész moziba", ExpMatch("EN", 0, 7))
+ self._check(u"Õk soha nem fogják megtanulni.", None)
+ return
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/languagetool/src/LanguageTest.py b/languagetool/src/LanguageTest.py
new file mode 100644
index 0000000..ee4f2b2
--- /dev/null
+++ b/languagetool/src/LanguageTest.py
@@ -0,0 +1,68 @@
+# -*- coding: iso-8859-1 -*-
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import TextChecker
+
+import unittest
+
+class LanguageTest(unittest.TestCase):
+
+ def _check(self, sentence, expectedErrors):
+ (rule_matches, output, tagged_text) = self.checker.check(sentence)
+ rule_matches.sort()
+ if expectedErrors == None:
+ if len(rule_matches) != 0:
+ print "Expected no errors, found %d" % len(rule_matches)
+ print "Sentence: %s" % sentence
+ self.fail()
+ elif isinstance(expectedErrors, list):
+ if len(rule_matches) != len(expectedErrors):
+ print "Expected %d errors, found %d" % (len(expectedErrors), len(rule_matches))
+ print "Sentence: %s" % sentence
+ self.fail()
+ i = 0
+ for expError in expectedErrors:
+ self._checkError(sentence, rule_matches[i], expError)
+ i = i + 1
+ else:
+ if len(rule_matches) != 1:
+ print "Expected 1 error, found %d" % len(rule_matches)
+ print "Sentence: %s" % sentence
+ self.fail()
+ self._checkError(sentence, rule_matches[0], expectedErrors)
+ return
+
+ def _checkError(self, sentence, rule_match, expectedError):
+ self.assertEqual(rule_match.id, expectedError.error_type)
+ if rule_match.from_pos != expectedError.from_pos or \
+ rule_match.to_pos != expectedError.to_pos:
+ print "Expected error from %d to %d, found error from %d to %d" % \
+ (expectedError.from_pos, expectedError.to_pos, rule_match.from_pos, \
+ rule_match.to_pos)
+ print "Sentence: %s" % sentence
+ self.fail()
+ return
+
+class ExpMatch:
+
+ def __init__(self, error_type, from_pos, to_pos):
+ self.error_type = error_type
+ self.from_pos = from_pos
+ self.to_pos = to_pos
+ return
diff --git a/languagetool/src/Rules.py b/languagetool/src/Rules.py
new file mode 100644
index 0000000..551e519
--- /dev/null
+++ b/languagetool/src/Rules.py
@@ -0,0 +1,632 @@
+# -*- coding: iso-8859-1 -*-
+# Class for Grammar and Style Rules
+#$rcs = ' $Id$ ' ;
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import Tools
+import codecs # tktk
+
+import copy
+import os
+import re
+import string
+import sys
+import xml.dom.minidom
+from string import *
+
+# FIXME:
+grammarFile = 'engrammar.xml'
+wordFile = 'enwords.xml'
+falsefriendsFile = 'enfalse_friends.xml'
+
+class Rule:
+ """Style or grammar rule -- quasi virtual class."""
+
+ def __init__(self, rule_id, message, false_positives, language):
+ self.rule_id = rule_id
+ self.message = message
+ # errors per 100 sentences in the BNC, i.e. mostly false positives:
+ self.false_positives = false_positives
+ self.language = language # two-letter code like "en", or None (= relevant for all languages)
+ return
+
+ # match() is not defined here, but in the subclasses
+
+class Rules:
+ """All known style and grammar error rules (from XML and the built-in ones)."""
+
+ python_rules_dir = "python_rules"
+
+ def __init__(self, max_sentence_length, grammar_rules, word_rules, \
+ builtin_rules, false_friend_rules, textlanguage, mothertongue):
+ """Parse all rules and put them in the self.rules list, together
+ with built-in rules like the SentenceLengthRule."""
+ self.textlanguage = textlanguage
+ if textlanguage == 'en':
+ self.rule_files = [os.path.join(sys.path[0], "rules", grammarFile),
+ os.path.join(sys.path[0], "rules", wordFile),
+ os.path.join(sys.path[0], "rules", falsefriendsFile)]
+ else:
+ self.rule_files = [os.path.join(sys.path[0], "rules", grammarFile)]
+ self.rules = []
+
+ # dynamically load rule files from the "python_rules" dir:
+ sys.path.append(self.python_rules_dir)
+ dyn_files = os.listdir(self.python_rules_dir)
+ for filename in dyn_files:
+ if textlanguage == 'en':
+ if filename[0:2] != 'en' and filename[0:3] != 'all':
+ continue
+ elif textlanguage == 'de':
+ if filename[0:2] != 'de' and filename[0:3] != 'all':
+ continue
+ elif textlanguage == 'hu':
+ if filename[0:2] != 'hu' and filename[0:3] != 'all':
+ continue
+ if not filename.endswith(".py") or filename.endswith("Test.py"):
+ continue
+ filename = filename[:-3] # cut off ".py"
+ exec("import %s" % filename)
+ try:
+ exec("dynamic_rule = %s.%s()" % (filename, filename))
+ except AttributeError:
+ print filename
+ raise InvalidFilename(filename)
+ if not hasattr(dynamic_rule, "match"):
+ raise MissingMethod("match", "%s.py" % filename)
+ if dynamic_rule.rule_id == "SENTENCE_LENGTH" and \
+ max_sentence_length != None:
+ dynamic_rule.setMaxLength(max_sentence_length)
+ # do not use the rule if it wasn't activated
+ # (builtin_rules == None will use all rules):
+ if not builtin_rules or dynamic_rule.rule_id in builtin_rules:
+ self.rules.append(dynamic_rule)
+
+ for filename in self.rule_files:
+ # minidom expects the DTD in the current directory, not in the
+ # document's directory, so we have to chdir to 'rules':
+ dir_temp = os.getcwd()
+ os.chdir(os.path.dirname(filename))
+ doc = xml.dom.minidom.parse(os.path.basename(filename))
+ os.chdir(dir_temp)
+ if filename.endswith(grammarFile):
+ rule_nodes = doc.getElementsByTagName("rule")
+ for rule_node in rule_nodes:
+ rule = PatternRule(rule_node)
+ lang_ok = 0
+ if self.textlanguage == None or self.textlanguage == rule.language:
+ lang_ok = 1
+ if lang_ok and (grammar_rules == None or rule.rule_id in grammar_rules):
+ self.rules.append(rule)
+ elif filename.endswith("words.xml"):
+ rule_nodes = doc.getElementsByTagName("rule")
+ for rule_node in rule_nodes:
+ rule = PatternRule(rule_node)
+ lang_ok = 0
+ if self.textlanguage == None or self.textlanguage == rule.language:
+ lang_ok = 1
+ if lang_ok and (word_rules == None or rule.rule_id in word_rules):
+ self.rules.append(rule)
+ elif filename.endswith("false_friends.xml"):
+ pattern_nodes = doc.getElementsByTagName("pattern")
+ for pattern_node in pattern_nodes:
+ lang = pattern_node.getAttribute("lang")
+ if self.textlanguage == None or lang == self.textlanguage:
+ rule = PatternRule(pattern_node.parentNode, 1, mothertongue, textlanguage)
+ if rule.valid and (false_friend_rules == None or \
+ rule.rule_id in false_friend_rules):
+ self.rules.append(rule)
+ return
+
+class InvalidFilename(Exception):
+
+ def __init__(self, value):
+ self.value = value
+ return
+
+ def __str__(self):
+ s = "Constructor must be named as the file, i.e. '%s'" % self.value
+ return s
+
+class MissingMethod(Exception):
+
+ def __init__(self, value, filename):
+ self.value = value
+ self.filename = filename
+ return
+
+ def __str__(self):
+ s = "The '%s' method needs to be implemented in %s" % (self.value, self.filename)
+ return s
+
+class WhitespaceRule(Rule):
+ """A rule that matches punctuation not followed by a whitespace
+ and whitespace preceding punctuation. This rule does not work
+ on sentence level, it works on complete tagged texts or paragraphs."""
+
+ punct = "[.,?!:;]"
+ punct_regex = re.compile("^%s+$" % punct)
+ whitespace_regex = re.compile("^\s+$")
+ after_punct_regex = re.compile("^[\"]+$")
+ number_regex = re.compile("^\d+$")
+ whitespace_before_punct = re.compile("^\s+%s" % punct)
+
+ def __init__(self):
+ Rule.__init__(self, "WHITESPACE", "Insert a space character before punctuation.", 0, None)
+ return
+
+ def getNextTriple(self, tagged_words, pos):
+ """Get the next triple form the tagged_words list, starting at
+ pos but ignoring all SENT_START and SENT_END tags."""
+ tag = tagged_words[pos][2]
+ while tag == 'SENT_START' or tag == 'SENT_END':
+ pos = pos + 1
+ if pos >= len(tagged_words):
+ return None
+ tag = tagged_words[pos][2]
+ return tagged_words[pos]
+
+ def match(self, tagged_words, chunks=None, position_fix=0, line_fix=0, column_fix=0):
+ """Check if a sentence contains whitespace/token sequences
+ that are against the 'use a space after, but not before, a token'
+ rule."""
+ matches = []
+ text_length = 0
+ line_breaks = 1
+ column = 0
+ i = 0
+ while 1:
+ if i >= len(tagged_words)-1:
+ break
+ org_word = tagged_words[i][0]
+ line_breaks_cur = Tools.Tools.countLinebreaks(org_word)
+ if line_breaks_cur > 0:
+ column = 0
+ line_breaks = line_breaks + line_breaks_cur
+ org_word_next = self.getNextTriple(tagged_words, i+1)
+ if org_word_next:
+ org_word_next = org_word_next[0]
+ text_length = text_length + len(org_word)
+ if tagged_words[i][1] == None:
+ # ignore whitespace
+ if line_breaks_cur == 0:
+ column = column + len(org_word)
+ i = i + 1
+ continue
+ whitespace_length = len(tagged_words[i+1][0])
+ if line_breaks_cur == 0:
+ column = column + len(org_word)
+ if self.punct_regex.match(org_word) and not (org_word.endswith("\n") or org_word.endswith("\r")):
+ word_next = tagged_words[i+1][1]
+ word_next = self.getNextTriple(tagged_words, i+1)
+ if word_next:
+ word_next = word_next[1]
+ if word_next and self.number_regex.match(word_next):
+ # don't complain about "24,000" etc.
+ i = i + 1
+ continue
+ if word_next and (not self.after_punct_regex.match(org_word_next)) and \
+ (not self.whitespace_regex.match(org_word_next)):
+ matches.append(RuleMatch(self.rule_id, text_length, text_length + len(org_word),
+ line_breaks+line_fix,
+ column+column_fix,
+ "Usually a space character is inserted after punctuation."))
+ elif self.whitespace_before_punct.match(org_word):
+ if not self.punct_regex.match(org_word_next):
+ matches.append(RuleMatch(self.rule_id, text_length, text_length + len(org_word),
+ line_breaks+line_fix, column+column_fix,
+ "Usually no space character is inserted before punctuation."))
+ i = i + 1
+ return matches
+
+class PatternRule(Rule):
+ """A rule that can be formalised in the XML configuration file."""
+
+ def __init__(self, node, is_false_friend_node=None, mothertongue=None, textlang=None):
+ """Build an object by parsing an XML rule node."""
+ if node == None:
+ # for the test cases. They use setVars().
+ return
+ if is_false_friend_node:
+ self.parseFalseFriendsRuleNode(node, mothertongue, textlang)
+ else:
+ self.parseRuleNode(node)
+ return
+
+ def parseRuleNode(self, rule_node):
+ self.rule_id = rule_node.getAttribute("id")
+ if not self.rule_id:
+ # FIXME? rule_id is not unique...
+ self.rule_id = rule_node.parentNode.getAttribute("id")
+ self.pattern = rule_node.getElementsByTagName("pattern")[0].childNodes[0].data.strip()
+ token_strings = re.split("\s+", self.pattern)
+ self.tokens = []
+ for token_string in token_strings:
+ token = Token(token_string)
+ self.tokens.append(token)
+ pattern_node = rule_node.getElementsByTagName("pattern")[0]
+ self.language = pattern_node.getAttribute("lang")
+ marker_from_att = pattern_node.getAttribute("mark_from")
+ if marker_from_att:
+ self.marker_from = int(marker_from_att)
+ else:
+ self.marker_from = 0
+ marker_to_att = pattern_node.getAttribute("mark_to")
+ if marker_to_att:
+ self.marker_to = int(marker_to_att)
+ else:
+ self.marker_to = 0
+ self.case_sensitive = 0
+ if rule_node.getElementsByTagName("pattern")[0].getAttribute("case_sensitive") == 'yes':
+ #print "*** %s" % rule_node.getElementsByTagName("pattern")[0].getAttribute("case_sensitive")
+ self.case_sensitive = 1
+ if rule_node.getElementsByTagName("message"):
+ self.message = Tools.Tools.getXML(rule_node.getElementsByTagName("message")[0])
+ else:
+ self.message = Tools.Tools.getXML(rule_node.parentNode.getElementsByTagName("message")[0])
+ example_nodes = rule_node.getElementsByTagName("example")
+ self.example_good = ""
+ self.example_bad = ""
+ for example_node in example_nodes:
+ # TODO?: only one good and one bad example currently supported:
+ if example_node.getAttribute("type") == 'correct':
+ self.example_good = Tools.Tools.getXML(example_node.childNodes[0])
+ else:
+ self.example_bad = Tools.Tools.getXML(example_node.childNodes[0])
+ self.false_positives = None # None = unknown
+ if rule_node.getElementsByTagName("error_rate"):
+ error_rate_node = rule_node.getElementsByTagName("error_rate")[0]
+ warnings = error_rate_node.getAttribute("warnings")
+ sentences = error_rate_node.getAttribute("sentences")
+ try:
+ if int(sentences) != 0:
+ error_rate = float(warnings) / float(sentences) * 100
+ self.false_positives = error_rate
+ except ValueError:
+ pass
+ return
+
+ def parseFalseFriendsRuleNode(self, rule_node, mothertongue, textlang):
+ # This is only called for rule nodes that have a pattern
+ # element with the relevant language.
+ self.rule_id = rule_node.parentNode.getAttribute("id")
+ pattern_node = rule_node.getElementsByTagName("pattern")[0]
+ self.language = rule_node.getAttribute("lang")
+ # Now look for the correct translation:
+ trans_nodes = rule_node.getElementsByTagName("translation")
+ self.valid = 0 # useless object because no translation was found
+ translations = []
+ for trans_node in trans_nodes:
+ trans_lang = trans_node.getAttribute("lang")
+ if trans_lang == mothertongue:
+ self.valid = 1
+ trans_str = trans_node.childNodes[0].data
+ translations.append(trans_str)
+ if self.valid:
+ self.case_sensitive = 0
+ self.pattern = rule_node.getElementsByTagName("pattern")[0].childNodes[0].data.strip()
+ repl_word, repl_trans = self.getOtherMeaning(rule_node.parentNode, mothertongue, textlang)
+ l = []
+ for elem in repl_trans:
+ l.append("<em>%s</em>" % elem)
+ repl_trans_str = str.join(', ', l)
+ self.message = "'%s' means %s. " % (self.pattern, str.join(', ', translations))
+ if repl_word:
+ self.message = self.message + " Did you maybe mean '%s', which is %s?" % \
+ (repl_word, repl_trans_str)
+ #print "#%s" % self.message.encode('latin1')
+ token_strings = re.split("\s+", self.pattern)
+ self.tokens = []
+ for token_string in token_strings:
+ token = Token('"%s"' % token_string) # quotes = it's a word (not a POS tag)
+ self.tokens.append(token)
+ #print "#%s" % token
+ self.marker_from = 0
+ self.marker_to = 0
+ return
+
+ def getOtherMeaning(self, rulegroup_node, mothertongue, textlang):
+ """Get the word (and its correct translations) that the user
+ maybe meant when he used a false friend. Returns a tuple
+ (word, [translations])."""
+ replace_nodes = rulegroup_node.getElementsByTagName("pattern")
+ word = None
+ translations = []
+ for replace_node in replace_nodes:
+ repl_lang = replace_node.getAttribute("lang")
+ if repl_lang == mothertongue:
+ word = replace_node.childNodes[0].data
+ trans_nodes = replace_node.parentNode.getElementsByTagName("translation")
+ for trans_node in trans_nodes:
+ trans_lang = trans_node.getAttribute("lang")
+ #print "#%s, %s" % (trans_lang, textlang)
+ if trans_lang == textlang:
+ self.valid = 1
+ trans_str = trans_node.childNodes[0].data
+ translations.append(trans_str)
+ return (word, translations)
+
+ def setVars(self, rule_id, pattern, message, marker_from, marker_to, \
+ example_good, example_bad, case_sensitive, false_positives, language):
+ """Manually initialize the pattern rule -- for test cases only."""
+ self.rule_id = rule_id
+ self.message = message
+ self.false_positives = false_positives
+ self.language = language
+ self.marker_from = marker_from
+ self.marker_to = marker_to
+ self.example_good = example_good
+ self.example_bad = example_bad
+ self.case_sensitive = case_sensitive
+ self.tokens = []
+ token_strings = re.split("\s+", pattern)
+ for token_string in token_strings:
+ token = Token(token_string)
+ self.tokens.append(token)
+ return
+
+ def match(self, tagged_words, chunks=None, position_fix=0, line_fix=0, column_fix=0):
+ """Check if there are rules that match the tagged_words. Returns a list
+ of RuleMatch objects."""
+ matches = []
+ ct = 0
+ tagged_words_copy = tagged_words # no copy, just a reference
+ last_match = None
+
+ #print self.rule_id
+ #print tagged_words_copy
+ for word_tag_tuple in tagged_words_copy:
+ i = ct
+ p = 0 # matched position in the pattern so far
+ expected_token = None # expected token if the pattern matches
+ found = None
+ match = 1
+ first_match = None
+ chunk_corr = 0
+ chunk_len = 0
+
+ while match:
+ try:
+ if not tagged_words_copy[i][1] and tagged_words_copy[i][2] != 'SENT_START' and tagged_words_copy[i][2] != 'SENT_END':
+ # here's just whitespace or other un-taggable stuff:
+ i = i + 1
+ ct = ct + 1
+ continue
+ elif not first_match:
+ first_match = ct
+ except IndexError: # end of tagged words
+ break
+ try:
+ expected_token = self.tokens[p]
+ except IndexError:
+ # pattern isn't that long
+ break
+ expected_token_str = expected_token.token
+
+ #print "expected_token_str=%s" % expected_token_str
+ if tagged_words_copy[i][2] == 'SENT_START':
+ found = 'SENT_START'
+ elif tagged_words_copy[i][2] == 'SENT_END':
+ found = 'SENT_END'
+ elif expected_token.is_word:
+ # TODO: some cases need to be escaped, e.g. "?", but
+ # this breaks the pipe etc.
+ #expected_token_str = re.escape(expected_token_str)
+ # look at the real word:
+ try:
+ found = tagged_words_copy[i][1].strip()
+ except: # text isn't that long
+ break
+ elif expected_token.is_chunk:
+ #print "chunk %s@%d?" % (expected_token.token, i)
+ found = None
+ for from_pos, to_pos, chunk_name in chunks:
+ if i >= from_pos and i <= to_pos:
+ found = chunk_name
+ #print "CHUNK %d-%d: %s" % (from_pos, to_pos, chunk_name)
+ i = i + (to_pos - from_pos)
+ chunk_corr = chunk_corr + (to_pos - from_pos)
+ chunk_len = chunk_len + 1
+ break
+ else:
+ # look at the word's POS tag:
+ try:
+ found = tagged_words_copy[i][2]
+ except: # text ends here
+ break
+ if not found:
+ #print >> sys.stderr, "*** 'found' undefined (i=%d, %s/%s)" % (i, tagged_words_copy[i][1], tagged_words_copy[i][2])
+ break
+ case_sensitive = re.IGNORECASE
+ if self.case_sensitive:
+ case_sensitive = 0
+ if expected_token.simple_token:
+ # speed up for e.g. simple false friends rules that don't
+ # require regex matching:
+ if case_sensitive:
+ #print "exp:%s" %expected_token
+ match = (expected_token_str.lower() == found.lower())
+ else:
+ match = (expected_token_str == found)
+ else:
+ match = re.compile("%s$" % expected_token_str, case_sensitive).match(found)
+ #print "%s: %s/%s -> %s" % (self.rule_id, found, expected_token_str, match)
+ if expected_token.negation:
+ if not match:
+ match = 1
+ else:
+ match = None
+ #print "F=%s, m=%s, '%s'" % (found, match, re.escape(expected_token.token))
+ i = i + 1
+ p = p + 1
+
+ #print "p=%d, len(self.tokens)=%d" % (p, len(self.tokens))
+ if match and p == len(self.tokens):
+
+ #print "##MATCH "+found+" " +expected_token_str
+ #FIXME: does this always mark the correct position?
+ (first_match, from_pos, to_pos, line, column) = self.listPosToAbsPos(tagged_words_copy, \
+ first_match, 0)
+ to_pos = to_pos + chunk_corr
+
+ # Let \n in a rule refer to the n'th matched word:
+ l = first_match
+ lcount = 1
+ msg = self.message
+ while lcount <= len(self.tokens) and l < len(tagged_words_copy):
+ if not tagged_words_copy[l][1] and tagged_words_copy[l][2] != 'SENT_START' and tagged_words_copy[l][2] != 'SENT_END':
+ pass
+ else:
+ msg = msg.replace("\\%d" % lcount, tagged_words_copy[l][0])
+ lcount = lcount + 1
+ l = l + 1
+
+ first_match_word = tagged_words_copy[first_match][0]
+ match = RuleMatch(self.rule_id, from_pos+position_fix, to_pos+position_fix, \
+ line+line_fix, column+column_fix, msg, first_match_word)
+ matches.append(match)
+
+ ct = ct + 1
+ return matches
+
+ def listPosToAbsPos(self, l, first_match, chunk_corr=0):
+ #print "*%d (%d)" % (first_match, chunk_corr)
+ j = first_match + 1
+ i = 0
+ mark_from_tmp = self.marker_from
+ while mark_from_tmp > 0 and j < len(l):
+ if l[j][1]:
+ mark_from_tmp = mark_from_tmp - 1
+ i = i + 1
+ j = j + 1
+ first_match = first_match + i
+
+ last_match = first_match
+ match_len = len(self.tokens)-self.marker_from+self.marker_to+chunk_corr
+ for el in l[first_match:]:
+ if match_len == 0:
+ break
+ if el[1]:
+ match_len = match_len - 1
+ last_match = last_match + 1
+
+ from_pos = 0
+ line = 0
+ column = 0 # FIXME!
+ for el in l[:first_match]:
+ #print "** '%s' (%d)" % (el[0], first_match)
+ matches = re.findall("[\n\r]", el[0])
+ line = line + len(matches)
+ if len(matches) > 0:
+ column = 0
+ else:
+ column = column + len(el[0])
+ from_pos = from_pos + len(el[0])
+ #print "** L=%s" % line
+ to_pos = 0
+ for el in l[:last_match]:
+ to_pos = to_pos + len(el[0])
+
+ return (first_match, from_pos, to_pos, line, column)
+
+class RuleMatch:
+ """A matching rule, i.e. an error or a warning and from/to positions."""
+
+ def __init__(self, rule_id, from_pos, to_pos, line, column, message, first_match_word=None):
+ self.id = rule_id
+ self.from_pos = from_pos
+ self.to_pos = to_pos
+ self.line = line
+ self.column = column
+ self.message = message
+ # TODO: is it okay to use 'latin1' here?
+ if first_match_word and first_match_word[0] in unicode(string.uppercase, 'latin1'):
+ # Replace the first char in <em>...</em> with its uppercase
+ # variant. Useful for replacements at the beginning of the
+ # sentence
+ self.message = re.compile("<em>(.)").sub(self.upper, self.message)
+ return
+
+ def upper(self, match):
+ return "<em>%s" % match.group(1)[0].upper()
+
+ def __str__(self):
+ """String representation of this object, i.e. human readable output."""
+ msg = self.message
+ msg = re.compile("</?message>").sub("", msg)
+ msg = re.compile("</?em>").sub("'", msg)
+ strng = 'Line %d, Column %d: %s' % (self.line, self.column, msg)
+ return strng
+
+ def toXML(self):
+ """XML representation of this object."""
+ strng = '<error from="%d" to="%d">%s</error>' % (self.from_pos, self.to_pos, self.message)
+ return strng
+
+ def __cmp__(self, b):
+ """Compare by 'from' position."""
+ if self.from_pos > b.from_pos:
+ return 1
+ elif self.from_pos < b.from_pos:
+ return -1
+ else:
+ return 0
+
+class Token:
+ """A word, tag or chunk token, negated or not. Examples:
+ "^(has|will)",
+ "he",
+ (VB|VBP),
+ _NP
+ """
+
+ def __init__(self, token):
+ self.token = token
+ self.negation = 0
+ self.is_word = 0
+ self.is_tag = 0
+ self.is_chunk = 0
+ if self.token.find("|") != -1 or self.token.find("(") != -1 \
+ or self.token.find("[") != -1 or self.token.find(".") != -1:
+ self.simple_token = 0
+ else:
+ self.simple_token = 1 # no regex required
+ if self.token.startswith('^'):
+ self.token = token[1:] # remove '^'
+ self.negation = 1
+ if self.token.startswith('"'):
+ self.is_word = 1
+ if not self.token.endswith('"'):
+ print >> sys.stderr, "*** Warning: token '%s' starts with quote but doesn't end with quote!" % self.token
+ self.token = self.token[1:(len(self.token)-1)] # remove quotes
+ elif self.token.startswith('_'):
+ self.token = token[1:] # remove '_'
+ self.is_chunk = 1
+ else:
+ self.is_tag = 1
+ return
+
+ def __str__(self):
+ """For debugging only"""
+ strng = self.token
+ if self.negation:
+ strng = "^%s" % strng
+ if self.is_word:
+ strng = '"%s"' % strng
+ return strng
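As a quick illustration of the pattern mini-language that the Token class above parses, a small sketch (the rule id, message, and tags are made-up placeholders; RulesTest.py below exercises the same API):

    import Rules

    rule = Rules.PatternRule(None)      # None = empty rule, filled in via setVars()
    rule.setVars("DEMO", '"word" ^(VB|VBP) _NP', "Demo message.", 0, 0, "", "", 0, None, "en")

    for t in rule.tokens:
        print t.token, t.is_word, t.is_tag, t.is_chunk, t.negation
    # word       1 0 0 0   (quoted: match this literal word)
    # (VB|VBP)   0 1 0 1   (leading ^ negates; parentheses/| make it a tag regex)
    # NP         0 0 1 0   (leading _ refers to a chunk found by the Chunker)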
diff --git a/languagetool/src/RulesTest.py b/languagetool/src/RulesTest.py
new file mode 100644
index 0000000..fd54598
--- /dev/null
+++ b/languagetool/src/RulesTest.py
@@ -0,0 +1,257 @@
+#!/usr/bin/python
+# Test cases for Rule.py
+#$rcs = ' $Id$ ' ;
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import unittest
+import Rules
+import os
+import sys
+
+sys.path.append(os.path.join("python_rules"))
+import allSentenceLengthRule
+import enWordRepeatRule
+import enAvsAnRule
+
+class RuleTestCase(unittest.TestCase):
+
+ def setUp(self):
+ self.rule = Rules.PatternRule(None)
+ self.rule.setVars("TEST1", '"word" (VB|TST)', "Test message 1.", 0, 0, \
+ "Good example.", "Bad example.", 0, 5, "en")
+ # negation:
+ self.rule2 = Rules.PatternRule(None)
+ self.rule2.setVars("TEST2", '"word" ^(VB|TST)', "Test message 2.", 0, 0, \
+ "Good example.", "Bad example.", 0, 5, "en")
+ # negation at the beginning:
+ self.rule3 = Rules.PatternRule(None)
+ self.rule3.setVars("TEST3", '^"word" (VB|TST)', "Test message 3.", 0, 0, \
+ "Good example.", "Bad example.", 0, 5, "en")
+ return
+
+ def testConstructor(self):
+ self.assertEqual(self.rule.rule_id, "TEST1")
+ self.assertEqual(len(self.rule.tokens), 2)
+ self.assertEqual(self.rule2.rule_id, "TEST2")
+ self.assertEqual(len(self.rule.tokens), 2)
+ self.assertEqual(self.rule3.rule_id, "TEST3")
+ self.assertEqual(len(self.rule.tokens), 2)
+ return
+
+ def testSentenceLengthRule(self):
+ r = allSentenceLengthRule.allSentenceLengthRule()
+ r.setMaxLength(3)
+
+ # just below the limit:
+ warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T')])
+ self.assertEqual(len(warnings), 0)
+
+ # just on the limit:
+ warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T')])
+ self.assertEqual(len(warnings), 1)
+ assert(warnings[0].toXML().startswith('<error from="3" to="4">'))
+ r.setMaxLength(60)
+ warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T')])
+ self.assertEqual(len(warnings), 0)
+ r.setMaxLength(3)
+
+ # whitespace is okay:
+ warnings = r.match([(' ',None,None),('x','x','T'),('x','x','T'),('x','x','T')])
+ self.assertEqual(len(warnings), 0)
+
+ # much longer than the limit:
+ warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T'),\
+ ('x','x','T'),('x','x','T'),('x','x','T')])
+ self.assertEqual(len(warnings), 1)
+
+ return
+
+ def testAvsAnRule(self):
+ r = enAvsAnRule.enAvsAnRule()
+ # okay:
+ warnings = r.match([('A','A','DET'),(' ',None,None),('test','test','NN')], [])
+ self.assertEqual(len(warnings), 0)
+ warnings = r.match([('a','a','DET'),(' ',None,None),('test','test','NN')], [])
+ self.assertEqual(len(warnings), 0)
+ warnings = r.match([('an','an','DET'),(' ',None,None),('idea','idea','NN')], [])
+ self.assertEqual(len(warnings), 0)
+
+ # okay (exceptions list):
+ warnings = r.match([('a','a','DET'),(' ',None,None),('university','university','NN')], [])
+ self.assertEqual(len(warnings), 0)
+ warnings = r.match([('an','an','DET'),(' ',None,None),('hour','hour','NN')], [])
+ self.assertEqual(len(warnings), 0)
+
+ # wrong:
+ warnings = r.match([('An','An','DET'),(' ',None,None),('test','test','NN')], [])
+ self.assertEqual(len(warnings), 1)
+ warnings = r.match([('an','an','DET'),(' ',None,None),('test','test','NN')], [])
+ self.assertEqual(len(warnings), 1)
+ warnings = r.match([('a','a','DET'),(' ',None,None),('idea','idea','NN')], [])
+ self.assertEqual(len(warnings), 1)
+
+ # wrong (exceptions list):
+ warnings = r.match([('an','an','DET'),(' ',None,None),('university','university','NN')], [])
+ self.assertEqual(len(warnings), 1)
+ warnings = r.match([('a','a','DET'),(' ',None,None),('hour','hour','NN')], [])
+ self.assertEqual(len(warnings), 1)
+
+ return
+
+ def testWhitespaceRule(self):
+ r = Rules.WhitespaceRule()
+
+ # okay:
+ warnings = r.match([('blah','blah','XX'),('?',None,None)])
+ self.assertEqual(len(warnings), 0)
+ warnings = r.match([('3.14','3.14','XX'),('?',None,None)])
+ self.assertEqual(len(warnings), 0)
+
+ # error - whitespace before punctuation:
+ warnings = r.match([('blah','blah','XX'),(' ',None,None),('.',None,None)])
+ self.assertEqual(len(warnings), 1)
+ warnings = r.match([('blah','blah','XX'),(' ',None,None),('?',None,None)])
+ self.assertEqual(len(warnings), 1)
+ warnings = r.match([('blah','blah','XX'),(' ',None,None),('...',None,None)])
+ self.assertEqual(len(warnings), 1)
+ warnings = r.match([('blah','blah','XX'),(' ',None,None),('?!',None,None)])
+ self.assertEqual(len(warnings), 1)
+
+ # both errors
+ warnings = r.match([('blah','blah','XX'),(' ',None,None),(',',None,None),('blah','blah','XX')])
+ self.assertEqual(len(warnings), 2)
+
+ # okay:
+ warnings = r.match([('blah','blah','XX'),('?',None,None),(None,None,'SENT_END')])
+ self.assertEqual(len(warnings), 0)
+
+ # error - no whitespace after punctuation:
+ warnings = r.match([('blah','blah','XX'),('?',None,None),('foo','foo','YY')])
+ self.assertEqual(len(warnings), 1)
+
+ return
+
+ def testWordRepeat(self):
+ r = enWordRepeatRule.enWordRepeatRule()
+
+ warnings = r.match([('blah','blah','XX'),(' ',None,None),('blahbla','blahbla','YY')], [])
+ self.assertEqual(len(warnings), 0)
+
+ warnings = r.match([('blah','blah','XX'),(' ',None,None),('blah','blah','YY')], [])
+ self.assertEqual(len(warnings), 1)
+ warnings = r.match([('blah','blah','XX'),(' ',None,None),('BLAH','BLAH','XX')], [])
+ self.assertEqual(len(warnings), 1)
+
+ return
+
+ def testPatternRuleMatch(self):
+
+ # rule 1:
+
+ res_list = self.rule.match([('', None, 'SENT_START'),
+ ('word', 'word', 'XX'),(' ', None, None),('bla', 'bla', 'VB')], 0)
+ self.assertEqual(len(res_list), 1)
+ self.assertEqual(res_list[0].toXML(), '<error from="0" to="8">Test message 1.</error>')
+
+ res_list = self.rule.match([('no', 'no', 'XX'),('foo', 'foo', 'VB')], 0)
+ self.assertEqual(len(res_list), 0)
+
+ res_list = self.rule.match([], 0)
+ self.assertEqual(len(res_list), 0)
+
+ res_list = self.rule.match([('word', 'word', 'XX')], 0)
+ self.assertEqual(len(res_list), 0)
+
+ # rule 2:
+
+ res_list = self.rule2.match([('word', 'word', 'XX'),('', None, None),('xxx', 'xxx', 'VBX')], 0)
+ self.assertEqual(len(res_list), 1)
+
+ # rule 3:
+
+ res_list = self.rule3.match([('foo', 'foo', 'XX'),(' ', None, None),('xxx', 'xxx', 'VB')], 0)
+ self.assertEqual(len(res_list), 1)
+ return
+
+class RuleMatchTestCase(unittest.TestCase):
+
+ def testCompare(self):
+ r1 = Rules.RuleMatch("ONE", 1, 2, 0, 0, "fake1", 0)
+ r2 = Rules.RuleMatch("ONE", 2, 3, 0, 0, "fake2", 0)
+ assert(r1 < r2)
+ r3 = Rules.RuleMatch("ONE", 1, 3, 0, 0, "fake3", 0)
+ assert(r1 == r3)
+ assert(r2 > r3)
+ return
+
+class TokenTestCase(unittest.TestCase):
+
+ def testToken(self):
+
+ token = Rules.Token('NN')
+ self.assertEqual(token.token, "NN")
+ assert(not token.negation)
+ assert(token.is_tag)
+ assert(not token.is_word)
+ assert(not token.is_chunk)
+ assert(token.simple_token)
+
+ token = Rules.Token('"word"')
+ self.assertEqual(token.token, "word")
+ assert(not token.negation)
+ assert(not token.is_tag)
+ assert(token.is_word)
+ assert(not token.is_chunk)
+ assert(token.simple_token)
+
+ token = Rules.Token("^(NN)")
+ self.assertEqual(token.token, "(NN)")
+ assert(token.negation)
+ assert(token.is_tag)
+ assert(not token.is_word)
+ assert(not token.is_chunk)
+ assert(not token.simple_token) # b/c of the parenthesis
+
+ token = Rules.Token('^"word"')
+ self.assertEqual(token.token, "word")
+ assert(token.negation)
+ assert(not token.is_tag)
+ assert(token.is_word)
+ assert(not token.is_chunk)
+ assert(token.simple_token)
+
+ token = Rules.Token('_NP')
+ self.assertEqual(token.token, "NP")
+ assert(not token.negation)
+ assert(not token.is_tag)
+ assert(not token.is_word)
+ assert(token.is_chunk)
+ assert(token.simple_token)
+
+ token = Rules.Token("(AA|BB|CC)")
+ self.assertEqual(token.token, "(AA|BB|CC)")
+ assert(not token.negation)
+ assert(token.is_tag)
+ assert(not token.is_word)
+ assert(not token.is_chunk)
+ assert(not token.simple_token) # b/c of the parenthesis
+ return
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/languagetool/src/SentenceSplitter.py b/languagetool/src/SentenceSplitter.py
new file mode 100644
index 0000000..35dfb7d
--- /dev/null
+++ b/languagetool/src/SentenceSplitter.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2003 Daniel Naber <daniel.naber@t-online.de>
+# Based on Shlomo Yona's Perl module Lingua::EN::Sentence 0.25
+
+import os
+import string
+import re
+import sys
+
+class SentenceSplitter:
+
+ ABBR_FILE = os.path.join(sys.path[0], "data", "abbr.txt")
+
+ EOS = "\001"
+ #EOS = "<>" # for testing only
+ P = """[\.!?]""" ## PUNCTUATION
+ AP = """(?:'|"|�|\)|\]|\})?""" ## AFTER PUNCTUATION
+ PAP = "%s%s" % (P, AP)
+
+ reFlags = re.DOTALL|re.LOCALE
+
+ def __init__(self):
+ """Init the object by loading the abbreviation list."""
+ self.abbr = self.loadAbbreviations()
+ return
+
+ def loadAbbreviations(self):
+ """Load the abbreviation list and return all words in a list."""
+ abbr = []
+ f = open(self.ABBR_FILE, "r")
+ while 1:
+ l = f.readline()
+ if not l:
+ break
+ l = l.strip()
+ if l:
+ abbr.append(l)
+ f.close()
+ return abbr
+
+ def split(self, text):
+ """Take a text and split it into sentences. Return the list
+ of sentences. Adapted from Perl's Lingua-EN-Sentence-0.25 module."""
+ if text == None:
+ return []
+ #print "text=%s" % text
+ marked_text = self.first_sentence_breaking(text)
+ #print "marked_text=%s" % marked_text
+ fixed_marked_text = self.remove_false_end_of_sentence(marked_text)
+ #print "fixed_marked_text=%s" % fixed_marked_text
+ fixed_marked_text = self.split_unsplit_stuff(fixed_marked_text)
+ #print "fixed_marked_text=%s" % fixed_marked_text
+ sentences = re.split(self.EOS, fixed_marked_text)
+ return sentences
+
+ def first_sentence_breaking(self, text):
+ """Add a special break character at all places with typical sentence
+ delimiters."""
+ # Double new-line means a new sentence:
+ text = re.compile("(\n\s*\n)", self.reFlags).sub("\\1%s" % self.EOS, text)
+ # Punctuation followed by whitespace means a new sentence:
+ text = re.compile("(%s\s)" % self.PAP, self.reFlags).sub("\\1%s" % self.EOS, text)
+ # New (compared to the perl module): Punctuation followed by uppercase followed
+ # by non-uppercase character (except dot) means a new sentence:
+ text = re.compile("(%s)([%s][^%s.])" % (self.PAP, string.uppercase, string.uppercase), \
+ self.reFlags).sub("\\1%s\\2" % self.EOS, text)
+ # Break also when single letter comes before punctuation:
+ text = re.compile("(\s\w%s)" % self.P, self.reFlags).sub("\\1%s" % self.EOS, text)
+ return text
+
+ def remove_false_end_of_sentence(self, text):
+ """Repair some positions that don't require a split, i.e. remove the
+ special break character."""
+
+ # Don't split at e.g. "U. S. A.":
+ text = re.compile("([^-\w]\w%s\s)%s" % (self.PAP, self.EOS), self.reFlags).sub("\\1", text)
+ # Don't split at e.g. "U.S.A.":
+ text = re.compile("([^-\w]\w%s)%s" % (self.P, self.EOS), self.reFlags).sub("\\1", text)
+
+ # Don't split after a white-space followed by a single letter followed
+ # by a dot followed by another whitespace.
+ # e.g. " p. "
+ text = re.compile("(\s\w\.\s+)%s" % self.EOS, self.reFlags).sub("\\1", text)
+
+ # Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?)
+ text = re.compile("(\.\.\. )%s([%s])" % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text)
+ # Don't split [.?!] when they're quoted:
+ text = re.compile("(['\"]%s['\"]\s+)%s" % (self.P, self.EOS)).sub("\\1", text)
+
+ # Don't split at abbreviations:
+ for abbr in self.abbr:
+ # TODO: really ignore case?
+ s = "(\\b%s%s\s)%s" % (abbr, self.PAP, self.EOS)
+ text = re.compile(s, self.reFlags|re.IGNORECASE).sub("\\1", text)
+
+ # Don't break after quote unless there's a capital letter:
+ # e.g.: "That's right!" he said.
+ text = re.compile('(["\']\s*)%s(\s*[%s])' % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text)
+
+ # fixme? not sure where this should occur, leaving it commented out:
+ # don't break: text . . some more text.
+ #text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
+
+ text = re.compile("(\s%s\s)%s" % (self.PAP, self.EOS), self.reFlags).sub("\\1", text)
+
+ # extension by dnaber --commented out, doesn't help:
+ #text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text)
+ return text
+
+ def split_unsplit_stuff(self, text):
+ """Treat some more special cases that make up a sentence boundary. Insert
+ the special break character at these positions."""
+ # Split at e.g. "no. 5 ":
+ text = re.compile("(\D\d+)(%s)(\s+)" % self.P, self.reFlags).sub("\\1\\2%s\\3" % self.EOS, text)
+ # TODO: Not sure about this one, leaving it out for now:
+ #text = re.compile("(%s\s)(\s*\()" % self.PAP, self.reFlags).sub("\\1%s\\2" % self.EOS, text)
+ # Split e.g.: He won't. #Really.
+ text = re.compile("('\w%s)(\s)" % self.P, self.reFlags).sub("\\1%s\\2" % self.EOS, text)
+ # Split e.g.: He won't say no. Not really.
+ text = re.compile("(\sno\.)(\s+)(?!\d)", self.reFlags|re.IGNORECASE).sub("\\1%s\\2" % self.EOS, text)
+ # Split at "a.m." or "p.m." followed by a capital letter.
+ text = re.compile("([ap]\.m\.\s+)([%s])" % string.uppercase, self.reFlags).sub("\\1%s\\2" % self.EOS, text)
+ return text
+
+if __name__ == "__main__":
+ #t = '"Do split me." Will you?'
+ #print t
+ #s = SentenceSplitter()
+ #l = s.split(t)
+ #print l
+ print "Please use ./SentenceSplitterTest.py for testing."
diff --git a/languagetool/src/SentenceSplitterEval.py b/languagetool/src/SentenceSplitterEval.py
new file mode 100644
index 0000000..cdf8745
--- /dev/null
+++ b/languagetool/src/SentenceSplitterEval.py
@@ -0,0 +1,128 @@
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import sys
+import re
+
+import Entities
+import SentenceSplitter
+
+class SentenceSplitterEval:
+
+ def __init__(self):
+ return
+
+ def findSentence(self, real_boundary, bnc_sentences):
+ sent = None
+ sent_disp = None
+ l = 0
+ i = 0
+ for s in bnc_sentences:
+ l = l + len(s)
+ if l == real_boundary:
+ sent = s
+ next_sent_start = ""
+ try:
+ next_sent_start = bnc_sentences[i+1][0:20]
+ except IndexError:
+ pass
+ sent_disp = "%s###%s..." % (s, next_sent_start)
+ break
+ i = i + 1
+ return sent, sent_disp
+
+ def run(self, bnc_string):
+ self.s = SentenceSplitter.SentenceSplitter()
+
+ # manual testing:
+ #bnc_string = "<s n=0000>This a test. Sentence.</s> <s n=1111>Another one.</s>"
+ #bnc_string = "<s n=0000>This a Sentence</s> <s n=1111>Another one.</s>"
+
+ bnc_paras = re.compile("<p>(.*?)</p>", re.DOTALL).findall(bnc_string)
+ bnc_paras_str = str.join(' ', bnc_paras)
+ bnc_sentences = re.compile("<s\s.*?>(.*?)</s>", re.DOTALL).findall(bnc_paras_str)
+ bnc_boundaries = []
+ l = 0
+ i = 0
+ for s in bnc_sentences:
+ s = bnc_sentences[i]
+ s = Entities.Entities.cleanEntities(s)
+ s = re.compile("<.*?>").sub("", s)
+ s = s.strip()
+ if not s.endswith(" "):
+ # TODO: is this fair?
+ s = s + " "
+ bnc_sentences[i] = s
+ l = l + len(s)
+ bnc_boundaries.append(l)
+ i = i + 1
+ ###print bnc_sentences
+ bnc_sentences_str = str.join('', bnc_sentences)
+ #print bnc_sentences_str
+
+ detected_sentences = self.s.split(bnc_sentences_str)
+ ###print detected_sentences
+ detected_boundaries = []
+ l = 0
+ for s in detected_sentences:
+ l = l + len(s)
+ detected_boundaries.append(l)
+
+ sent_count = 0
+ # recall = how many of the sentence boundaries have been detected?
+ recall_count = 0
+ for real_boundary in bnc_boundaries:
+ if real_boundary in detected_boundaries:
+ recall_count = recall_count + 1
+ #print "Found: '%s'" % s
+ else:
+ pass
+ (s, s_disp) = self.findSentence(real_boundary, bnc_sentences)
+ print "Not found: '%s'" % s_disp
+ sent_count = sent_count + 1
+ recall = 0
+ if len(bnc_boundaries) > 0:
+ recall = float(recall_count) / float(len(bnc_boundaries))
+
+ # precision = how many of detected boundaries are real sentence boundaries?
+ precision_count = 0
+ for detected_boundary in detected_boundaries:
+ if detected_boundary in bnc_boundaries:
+ precision_count = precision_count + 1
+ precision = 0
+ if len(detected_boundaries) > 0:
+ precision = float(precision_count) / float(len(detected_boundaries))
+
+ print "Real sentences = %d" % sent_count
+ print "Recall = %.3f" % recall
+ print "Precision = %.3f" % precision
+ return
+
+if __name__ == "__main__":
+ prg = SentenceSplitterEval()
+ if len(sys.argv) <= 1:
+ print "Usage: ./SentenceSplitterEval.py <bnc_sampler_files>"
+ else:
+ for filename in sys.argv[1:]:
+ print filename
+ f = open(filename)
+ bnc_string = f.read()
+ f.close()
+ prg.run(bnc_string)
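The recall/precision bookkeeping in run() is easy to check by hand. A small worked sketch with made-up boundary offsets (cumulative character positions of sentence ends):

    bnc_boundaries = [18, 40, 63, 90]        # gold boundaries from the BNC markup
    detected_boundaries = [18, 40, 75, 90]   # boundaries found by SentenceSplitter
    recall_count = len([b for b in bnc_boundaries if b in detected_boundaries])      # 3
    precision_count = len([b for b in detected_boundaries if b in bnc_boundaries])   # 3
    print "Recall = %.3f" % (float(recall_count) / len(bnc_boundaries))              # 0.750
    print "Precision = %.3f" % (float(precision_count) / len(detected_boundaries))   # 0.750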
diff --git a/languagetool/src/SentenceSplitterTest.py b/languagetool/src/SentenceSplitterTest.py
new file mode 100644
index 0000000..52fe732
--- /dev/null
+++ b/languagetool/src/SentenceSplitterTest.py
@@ -0,0 +1,91 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2003,2004 Daniel Naber <daniel.naber@t-online.de>
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import os
+
+import SentenceSplitter
+import unittest
+
+class SentenceSplitterTestCase(unittest.TestCase):
+
+ def testSplit(self):
+ self.s = SentenceSplitter.SentenceSplitter()
+
+ l = self.s.split(None)
+ self.assertEqual(len(l), 0)
+
+ self._doTest("")
+ self._doTest("This is a sentence.")
+ self._doTest("This is a sentence. #And this is another one.")
+ self._doTest("This is a sentence. #Isn't it? #Yes, it is.")
+ self._doTest("This is e.g. Mr. Smith, who talks slowly... #But this is another sentence.")
+ self._doTest("Chanel no. 5 is groovy.")
+ self._doTest("Mrs. Jones gave Peter $4.5, to buy Chanel No 5. #He never came back.")
+ self._doTest("On p. 6 there's nothing. #Another sentence.")
+ self._doTest("Leave me alone!, he yelled. #Another sentence.")
+ self._doTest("\"Leave me alone!\", he yelled.")
+ self._doTest("'Leave me alone!', he yelled. #Another sentence.")
+ self._doTest("'Leave me alone,' he yelled. #Another sentence.")
+ self._doTest("This works on the phrase level, i.e. not on the word level.")
+ self._doTest("Let's meet at 5 p.m. in the main street.")
+ self._doTest("James comes from the U.K. where he worked as a programmer.")
+ self._doTest("Don't split strings like U.S.A. please.")
+ self._doTest("Don't split strings like U. S. A. either.")
+ self._doTest("Don't split... #Well you know. #Here comes more text.")
+ self._doTest("Don't split... well you know. #Here comes more text.")
+ self._doTest('The "." should not be a delimiter in quotes.')
+ self._doTest('"Here he comes!" she said.')
+ self._doTest('"Here he comes!", she said.')
+ self._doTest('"Here he comes." #But this is another sentence.')
+ self._doTest('"Here he comes!". #That\'s what he said.')
+ self._doTest('The sentence ends here. #(Not me.)')
+ self._doTest("He won't. #Really.")
+ self._doTest("He won't say no. #Not really.")
+ self._doTest("He won't say no. 5 is better. #Not really.")
+ self._doTest("They met at 5 p.m. on Thursday.")
+ self._doTest("They met at 5 p.m. #It was Thursday.")
+ self._doTest("This is it: a test.")
+ # known not to work:
+ #self._doTest("This is it: #A final test.")
+ # two returns -> paragraph -> new sentence:
+ self._doTest("He won't\n\n#Really.")
+ # Some people make two spaces after sentence end:
+ self._doTest("This is a sentence. #And this is another one.")
+ # Missing space after sentence end:
+ self._doTest("James is from the Ireland!#He lives in Spain now.")
+ # From the abbreviation list:
+ self._doTest("Jones Bros. have built a succesful company.")
+ # Doesn't work:
+ #self._doTest("James is from the U.K. #He lives in Spain now.")
+
+ return
+
+ def _doTest(self, s):
+ s_copy = s.replace("#", "")
+ l = self.s.split(s_copy)
+ correct_result = s.split("#")
+ # ignore leading/trailing whitespace differences:
+ i = 0
+ for item in l:
+ l[i] = l[i].strip()
+ i = i + 1
+ i = 0
+ for item in correct_result:
+ correct_result[i] = correct_result[i].strip()
+ i = i + 1
+ self.assertEqual(l, correct_result)
+ return
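For reference, the '#' markers in the test strings above encode the expected sentence boundaries: _doTest() strips them to build the splitter input and splits on them to build the expected result. A standalone sketch of that convention:

    marked = "This is a sentence. #Isn't it? #Yes, it is."
    input_text = marked.replace("#", "")
    expected = [part.strip() for part in marked.split("#")]
    print input_text    # text handed to SentenceSplitter.split()
    print expected      # ['This is a sentence.', "Isn't it?", 'Yes, it is.']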
diff --git a/languagetool/src/TagInfo.py b/languagetool/src/TagInfo.py
new file mode 100644
index 0000000..31aec80
--- /dev/null
+++ b/languagetool/src/TagInfo.py
@@ -0,0 +1,276 @@
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+# Provide user information about BNC tags
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import re
+import sys
+
+class TagInfo:
+
+ TAG_STRING = {}
+ TAG_STRING['en'] = """AJ0 Adjective (general or positive) (e.g. good, old, beautiful)
+ AJC Comparative adjective (e.g. better, older)
+ AJS Superlative adjective (e.g. best, oldest)
+ AT0 Article (e.g. the, a, an, no) [N.B. no is included among articles, which are defined here as determiner words which typically begin a noun phrase, but which cannot occur as the head of a noun phrase.]
+ AV0 General adverb: an adverb not subclassified as AVP or AVQ (see below) (e.g. often, well, longer (adv.), furthest). [Note that adverbs, unlike adjectives, are not tagged as positive, comparative, or superlative. This is because of the relative rarity of comparative and superlative adverbs.]
+ AVP Adverb particle (e.g. up, off, out) [N.B. AVP is used for such "prepositional adverbs", whether or not they are used idiomatically in a phrasal verb: e.g. in 'Come out here' and 'I can't hold out any longer', the same AVP tag is used for out.]
+ AVQ Wh-adverb (e.g. when, where, how, why, wherever) [The same tag is used, whether the word occurs in interrogative or relative use.]
+ CJC Coordinating conjunction (e.g. and, or, but)
+ CJS Subordinating conjunction (e.g. although, when)
+ CJT The subordinating conjunction that [N.B. that is tagged CJT when it introduces not only a nominal clause, but also a relative clause, as in 'the day that follows Christmas'. Some theories treat that here as a relative pronoun, whereas others treat it as a conjunction. We have adopted the latter analysis.]
+ CRD Cardinal number (e.g. one, 3, fifty-five, 3609)
+ DPS Possessive determiner (e.g. your, their, his)
+ DT0 General determiner: i.e. a determiner which is not a DTQ. [Here a determiner is defined as a word which typically occurs either as the first word in a noun phrase, or as the head of a noun phrase. E.g. This is tagged DT0 both in 'This is my house' and in 'This house is mine'.]
+ DTQ Wh-determiner (e.g. which, what, whose, whichever) [The category of determiner here is defined as for DT0 above. These words are tagged as wh-determiners whether they occur in interrogative use or in relative use.]
+ EX0 Existential there, i.e. there occurring in the there is ... or there are ... construction
+ ITJ Interjection or other isolate (e.g. oh, yes, mhm, wow)
+
+ NN0 Common noun, neutral for number (e.g. aircraft, data, committee) [N.B. Singular collective nouns such as committee and team are tagged NN0, on the grounds that they are capable of taking singular or plural agreement with the following verb: e.g. 'The committee disagrees/disagree'.]
+ NN1 Singular common noun (e.g. pencil, goose, time, revelation)
+ NN2 Plural common noun (e.g. pencils, geese, times, revelations)
+ NP0 Proper noun (e.g. London, Michael, Mars, IBM) [N.B. the distinction between singular and plural proper nouns is not indicated in the tagset, plural proper nouns being a comparative rarity.]
+ ORD Ordinal numeral (e.g. first, sixth, 77th, last). [N.B. The ORD tag is used whether these words are used in a nominal or in an adverbial role. Next and last, as "general ordinals", are also assigned to this category.]
+ PNI Indefinite pronoun (e.g. none, everything, one [as pronoun], nobody) [N.B. This tag applies to words which always function as [heads of] noun phrases. Words like some and these, which can also occur before a noun head in an article-like function, are tagged as determiners (see DT0 and AT0 above).]
+ PNP Personal pronoun (e.g. I, you, them, ours) [Note that possessive pronouns like ours and theirs are tagged as personal pronouns.]
+ PNQ Wh-pronoun (e.g. who, whoever, whom) [N.B. These words are tagged as wh-pronouns whether they occur in interrogative or in relative use.]
+ PNX Reflexive pronoun (e.g. myself, yourself, itself, ourselves)
+
+ POS The possessive or genitive marker 's or ' (e.g. for 'Peter's or somebody else's', the sequence of tags is: NP0 POS CJC PNI AV0 POS)
+ PRF The preposition of. Because of its frequency and its almost exclusively postnominal function, of is assigned a special tag of its own.
+ PRP Preposition (except for of) (e.g. about, at, in, on, on behalf of, with)
+ PUL Punctuation: left bracket - i.e. ( or [
+ PUN Punctuation: general separating mark - i.e. . , ! , : ; - or ?
+ PUQ Punctuation: quotation mark - i.e. ' or "
+ PUR Punctuation: right bracket - i.e. ) or ]
+ TO0 Infinitive marker to
+ UNC Unclassified items which are not appropriately classified as items of the English lexicon. [Items tagged UNC include foreign (non-English) words, special typographical symbols, formulae, and (in spoken language) hesitation fillers such as er and erm.]
+
+ VBB The present tense forms of the verb BE, except for is, 's: i.e. am, are, 'm, 're and be [subjunctive or imperative]
+ VBD The past tense forms of the verb BE: was and were
+ VBG The -ing form of the verb BE: being
+ VBI The infinitive form of the verb BE: be
+ VBN The past participle form of the verb BE: been
+ VBZ The -s form of the verb BE: is, 's
+
+ VDB The finite base form of the verb DO: do
+ VDD The past tense form of the verb DO: did
+ VDG The -ing form of the verb DO: doing
+ VDI The infinitive form of the verb DO: do
+ VDN The past participle form of the verb DO: done
+ VDZ The -s form of the verb DO: does, 's
+
+ VHB The finite base form of the verb HAVE: have, 've
+ VHD The past tense form of the verb HAVE: had, 'd
+ VHG The -ing form of the verb HAVE: having
+ VHI The infinitive form of the verb HAVE: have
+ VHN The past participle form of the verb HAVE: had
+ VHZ The -s form of the verb HAVE: has, 's
+
+ VM0 Modal auxiliary verb (e.g. will, would, can, could, 'll, 'd)
+
+ VVB The finite base form of lexical verbs (e.g. forget, send, live, return) [Including the imperative and present subjunctive]
+ VVD The past tense form of lexical verbs (e.g. forgot, sent, lived, returned)
+ VVG The -ing form of lexical verbs (e.g. forgetting, sending, living, returning)
+ VVI The infinitive form of lexical verbs (e.g. forget, send, live, return)
+ VVN The past participle form of lexical verbs (e.g. forgotten, sent, lived, returned)
+ VVZ The -s form of lexical verbs (e.g. forgets, sends, lives, returns)
+
+ XX0 The negative particle not or n't
+ ZZ0 Alphabetical symbols (e.g. A, a, B, b, c, d)"""
+
+ TAG_STRING['de'] = """ADJ Adjective (general) (e.g. gut, alt)
+ ADJE Adjective with e ending (e.g. alte)
+ ADJER Adjective with er ending (e.g. alter)
+ ADJES Adjective with es ending (e.g. altes)
+ ADJEM Adjective with em ending (e.g. altem)
+ ADJEN Adjective with en ending (e.g. alten)
+ *ADV Adverb like abends, morgen
+
+ PRA Preposition with accusative wider, gegen
+ PRD Preposition with dative ab, aus
+ PRD Preposition with accusative or dative in, über
+
+ PP1 Personal pronoun ich, mich, mir
+ PP2 Personal pronoun du
+ PP3 Personal pronoun er, sie, es
+ PP4 Personal pronoun wir
+ PP5 Personal pronoun ihr
+
+ *IND oh, ah, heisa
+ *INT Interrogative word like Wer, wo, etc...
+
+ CNT Number
+ CJC Conjunctive word like und, oder, ...
+
+ V verb, e.g. gehen
+ V11 verb, e.g. gehe
+ V12 verb, e.g. gehst
+ V13 verb, e.g. geht
+ V14 verb, e.g. gehen
+ V15 verb, e.g. gehet
+
+ HV auxiliary verb, e.g. moegen
+ HV11 auxiliary verb, e.g. mag
+ HV12 auxiliary verb, e.g. magst
+ HV13 auxiliary verb, e.g. mag
+ HV14 auxiliary verb, e.g. moegen
+ HV15 auxiliary verb, e.g. moeget
+
+ N Noun
+ NMS Noun male no ending, e.g. Garten
+ NFS Noun female no ending, e.g. Frau
+ NNS Noun neuter no ending
+ NFNS Noun female or neuter no ending
+ NFMS Noun female or male no ending
+ NMNS Noun male or neuter no ending
+ NFMNS Noun male, female or neuter no ending
+ NM Noun male with ending like Gartens
+ NF Noun female with ending like Frauen
+ NN Noun neuter with ending
+ NFN Noun female or neuter with ending
+ NFM Noun female or male with ending
+ NMN Noun male or neuter with ending
+ NFMN Noun male, female or neuter with ending
+
+ UA1 indefinite article ein
+ UAE indefinite article eine
+ UAR indefinite article einer
+ UAN indefinite article einen
+ UAM indefinite article einem
+ UAS indefinite article eines
+ * INT,IND,ADV sometimes mixed up in the word collection - to be corrected"""
+
+ TAG_STRING['hu'] = """ADJS Singular adjective (e.g. szep)
+ ADJP Plural Adjective (e.g. szepek)
+ ADJN Numeric Adjective (e.g. tizedik)
+ ADV Adverb like szepen, jol
+ NS Noun, singular asztalnak
+ NSN Noun, singular, nominative asztal
+ NSR Noun, singular, not nominative asztalt
+ NP Noun, plural asztalokat
+ NPN Noun, plural, nominative asztalok
+ NPR Noun, plural, not nominative asztalokra
+ V1 Verb, Singular, 1st person irok
+ V2 Verb, Singular, 2nd person
+ V3 Verb, Singular, 3rd person
+ V4 Verb, Plural, 1st person
+ V5 Verb, Plural, 2nd person
+ V6 Verb, Plural, 3rd person
+ VINF Verb infinitive
+ IKV1 Prefixed Verb, Singular, 1st person megirok
+ IKV2 Prefixed Verb, Singular, 2nd person
+ IKV3 Prefixed Verb, Singular, 3rd person
+ IKV4 Prefixed Verb, Plural, 1st person
+ IKV5 Prefixed Verb, Plural, 2nd person
+ IKV6 Prefixed Verb, Plural, 3rd person
+ IKVINF Prefixed Verb infinitive
+ SI1 Auxiliary verb, Singular, 1st person akarok
+ SI2 Auxiliary verb, Singular, 2nd person
+ SI3 Auxiliary verb, Singular, 3rd person
+ SI4 Auxiliary verb, Plural, 1st person
+ SI5 Auxiliary verb, Plural, 2nd person
+ SI6 Auxiliary verb, Plural, 3rd person
+ SIINF Auxiliary verb infinitive
+ IKSI1 Prefixed auxiliary verb, Singular, 1st person megvagyok
+ IKSI2 Prefixed auxiliary verb, Singular, 2nd person
+ IKSI3 Prefixed auxiliary verb, Singular, 3rd person
+ IKSI4 Prefixed auxiliary verb, Plural, 1st person
+ IKSI5 Prefixed auxiliary verb, Plural, 2nd person
+ IKSI6 Prefixed auxiliary verb, Plural, 3rd person
+ IKSIINF Prefixed auxiliary verb infinitive
+ NEIK Non detachable verb prefix be, ki, le, fel, etc...
+ PP1 Personal pronoun en
+ PP2 Personal pronoun te
+ PP3 Personal pronoun o
+ PP4 Personal pronoun mi
+ PP5 Personal pronoun ti
+ PP6 Personal pronoun ok
+ RPP1 Possessive personal pronoun enyem
+ RPP2 Possessive personal pronoun tied
+ RPP3 Possessive personal pronoun ove
+ RPP4 Possessive personal pronoun mienk
+ RPP5 Possessive personal pronoun tietek
+ RPP6 Possessive personal pronoun ovek
+ IND uhum
+ INT Interrogative word like nemde etc...
+ CRD Number tizenot
+ INTRN Numerical interrogative mennyi, etc...
+ INTR Interrogative miert, etc...
+ CJC Conjunctive word like es vagy, ...
+ DNV Double role, Noun and verb var
+ DAV Double role, Adj and Verb irt
+ DNA Double role, Noun and ADJ or ADV iro ...
+ RART Conjunction word like de, hogy
+ """
+
+ def __init__(self, lang):
+ if not self.TAG_STRING.has_key(lang):
+ raise KeyError, "no information found for language '%s'" % lang
+ tag_lines = re.split("\n", self.TAG_STRING[lang])
+ self.tags = [] # [(short, explanation)]
+ for tag_line in tag_lines:
+ tag_line = tag_line.strip()
+ parts = re.split("\s+", tag_line)
+ short_tag = parts[0]
+ tag_exp = str.join(' ', parts[1:])
+ self.tags.append((short_tag, tag_exp))
+ return
+
+ def getExp(self, short_tag_search):
+ for (tag_short, tag_exp) in self.tags:
+ if short_tag_search == tag_short:
+ return tag_exp
+ return None
+
+ def getJavascriptCode(self):
+ l = []
+ for (tag_short, tag_exp) in self.tags:
+ tag_exp = tag_exp.replace("\"", "\\\"")
+ l.append('data["%s"] = "%s";' % (tag_short, tag_exp))
+ return str.join('\n', l)
+
+ def getHTMLCode(self):
+ l = []
+ l.append('<table border="0" cellpadding="0" cellspacing="2">')
+ for (tag_short, tag_exp) in self.tags:
+ tag_exp = tag_exp.replace("\"", "\\\"")
+ if tag_short:
+ l.append('<tr bgcolor="#dddddd"><td valign="top"><strong>%s</strong></td><td>%s</td></tr>' % (tag_short, tag_exp))
+ else:
+ l.append('<tr><td>&nbsp;</td></tr>')
+ l.append('</table>')
+ return str.join('\n', l)
+
+ def printAll(self):
+ for (tag_short, tag_exp) in self.tags:
+ if tag_short:
+ print "%s: %s" % (tag_short, tag_exp)
+ else:
+ print
+ return
+
+if __name__ == "__main__":
+ # TODO: take language as parameter
+ if len(sys.argv) < 2:
+ print "Usage: TagInfo.py <language>"
+ print " where <language> is a language code like en, de, ..."
+ sys.exit(1)
+ taginfo = TagInfo(sys.argv[1])
+ taginfo.printAll()
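A quick usage sketch of the lookup helpers above (Python 2; the explanation text comes verbatim from TAG_STRING):

    import TagInfo
    info = TagInfo.TagInfo("en")
    print info.getExp("NN1")   # Singular common noun (e.g. pencil, goose, time, revelation)
    print info.getExp("XYZ")   # None -- unknown tags return None
    html = info.getHTMLCode()  # one <tr> per tag, e.g. for embedding in a help page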
diff --git a/languagetool/src/Tagger.py b/languagetool/src/Tagger.py
new file mode 100644
index 0000000..1243c41
--- /dev/null
+++ b/languagetool/src/Tagger.py
@@ -0,0 +1,1108 @@
+# -*- coding: iso-8859-1 -*-
+# A probabilistic part-of-speech tagger (see the QTag paper) with
+# a rule-based extension.
+#$rcs = ' $Id$ ' ;
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import codecs
+import os
+import re
+import string
+import sys
+import time
+import cPickle
+import htmlentitydefs
+import Wfinder
+
+# FIXME:
+dicFile = 'deutsch.txt'
+affFile = 'deutsch.aff'
+
+class Tagger:
+ """POS-tag any text. The result in XML can be used to re-build the original
+ text by concatenating all contents of the <w> tags. Whitespace characters
+ have term=None and type=None, i.e. they are inside their own <w>
+ elements. Words that could not be tagged have type=unknown."""
+
+ def __init__(self, textlanguage, db_word_name=None, db_seq_name1=None, db_seq_name2=None):
+ """Initialize the tagger, optionally using the given
+ file names that will be used to load and save data later."""
+ self.textlanguage = textlanguage
+ self.wfinder = Wfinder.Wfinder(textlanguage)
+ db_word_name = os.path.join(sys.path[0], "data", dicFile)
+ db_seq_name1 = os.path.join(sys.path[0], "data", "seqs1")
+ db_seq_name2 = os.path.join(sys.path[0], "data", "seqs2")
+ #uncountable_name = os.path.join("data", "uncountable.txt")
+ self.data_table = None
+ self.seqs_table_followed_by = None # tag sequences: table[tag1,tag2] = value
+ self.seqs_table_follows = None # tag sequences: table[tag1,tag2] = value
+ if db_word_name:
+ self.db_word_name = db_word_name
+ if db_seq_name1:
+ self.db_seq_name1 = db_seq_name1
+ if db_seq_name2:
+ self.db_seq_name2 = db_seq_name2
+ #uncountable_nouns = self.loadUncountables()
+ self.word_count = 0
+
+ return
+
+ def loadUncountables(self):
+ """TODO: not used yet."""
+ l = []
+ f = open(self.uncountable_name)
+ while 1:
+ line = f.readline()
+ if not line:
+ break
+ line = line.strip()
+ if not line.startswith("#") and line != '':
+ l.append(line)
+ f.close()
+ return l
+
+ def bindData(self):
+ """Load the word/POS tag and POS tag sequence data from disk."""
+ try:
+ if self.textlanguage != 'en':
+ self.ReadData(self.db_word_name);
+ else:
+ self.data_table = cPickle.load(open(self.db_word_name, 'rb'))
+ except IOError:
+ print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_word_name
+ self.data_table = {}
+ if self.textlanguage == 'en':
+ try:
+ self.seqs_table_followed_by = cPickle.load(open(self.db_seq_name1, 'rb'))
+ except IOError:
+ print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_seq_name1
+ self.seqs_table_followed_by = {}
+ try:
+ self.seqs_table_follows = cPickle.load(open(self.db_seq_name2, 'rb'))
+ except IOError:
+ print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_seq_name2
+ self.seqs_table_follows = {}
+ else:
+ self.seqs_table_followed_by = {}
+ self.seqs_table_follows = {}
+ return
+
+ def commitData(self):
+ """Save the word/POS tag and POS tag sequence data to disk."""
+ print >> sys.stderr, "Words = %d" % self.word_count
+ print >> sys.stderr, "Known words = %d" % len(self.data_table.keys())
+ print >> sys.stderr, "Known sequences = %d" % len(self.seqs_table_followed_by.keys())
+ print >> sys.stderr, "Commiting results..."
+# cPickle.dump(self.data_table, open(self.db_word_name, 'wb'), 1)
+# cPickle.dump(self.seqs_table_followed_by, open(self.db_seq_name1, 'wb'), 1)
+# cPickle.dump(self.seqs_table_follows, open(self.db_seq_name2, 'wb'), 1)
+ return
+
+ def deleteData(self):
+ """Remove the word/POS tag and POS tag sequence data files from disk."""
+# print >> sys.stderr, "Deleting old data files..."
+# try:
+# os.remove(self.db_word_name)
+# except OSError, e:
+# print >> sys.stderr, "Note: Could not delete file: %s" % e
+# try:
+# os.remove(self.db_seq_name1)
+# except OSError, e:
+# print >> sys.stderr, "Note: Could not delete file: %s" % e
+# try:
+# os.remove(self.db_seq_name2)
+# except OSError, e:
+# print >> sys.stderr, "Note: Could not delete file: %s" % e
+ return
+
+ def buildData(self, filenames):
+ """Load BNC files in XML or SGML format and count the word/POS
+ occurrences and the POS tag sequences."""
+ tagged_words = []
+ for filename in filenames:
+ print >> sys.stderr, "Loading %s..." % filename
+ text = PreTaggedText(filename)
+ tagged_words.extend(text.getTaggedWords())
+ self.word_count = self.word_count + len(tagged_words)
+# text.addToData(tagged_words, self.data_table, self.seqs_table_followed_by, self.seqs_table_follows)
+ return
+
+ def buildDataFromString(self, s):
+ """Take a string with format "word1/tag1 word2/tag2 ..." and
+ count the word/POS occurrences and the POS tag sequences.
+ Only useful for the test cases."""
+ pairs = re.compile("\s+").split(s)
+ tagged_words = []
+ split_regex = re.compile("/")
+ for pair in pairs:
+ pair = split_regex.split(pair)
+ if len(pair) != 2:
+ # e.g. punctuation
+ continue
+ word = pair[0]
+ tag = pair[1]
+ tagged_words.append((word, tag))
+ text = TextToTag(self.textlanguage, self.wfinder)
+# text.addToData(tagged_words, self.data_table, self.seqs_table_followed_by, self.seqs_table_follows)
+ return
+
+ def ReadData(self, db_word_name):
+ """Set up empty word tables for the non-English languages; the actual
+ word lookup for these languages is done by Wfinder in tagWord()."""
+ self.data_table = {}
+ self.word_table = {}
+ return
+
+
+ def tagFile(self, filename):
+ """POS-tag the contents of a text file and return XML that contains
+ the original text with each word's POS tag in the "type"
+ attribute."""
+ text = TextToTag(self.textlanguage, self.wfinder)
+ text.setFilename(filename)
+ tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows)
+# print tagged_words # tktk
+ xml = text.toXML(tagged_words)
+ return xml
+
+ def tagText(self, strng): #textchecker check calls
+ """POS-tag a string and return a list of (word, normalized word, tag)
+ triples."""
+ text = TextToTag(self.textlanguage, self.wfinder)
+ text.setText(strng)
+# print strng
+ tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows)
+# print tagged_words # tktk
+ return tagged_words
+
+ def tagTexttoXML(self, strng):
+ """POS-tag a string and return a list of (word, normalized word, tag)
+ triples."""
+ text = TextToTag(self.textlanguage, self.wfinder)
+ text.setText(strng)
+ tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows)
+ xml = text.toXML(tagged_words)
+ return xml
+
+ def tagSeq(self, tup):
+ """Return the probability of a 2-POS-tag sequence."""
+ if len(tup) != 2:
+ #TODO?: throw exception
+ print >> sys.stderr, "Sequence does not consist of 2 tokens: '%s'" % str(seq)
+ return None
+ try:
+ probability = self.seqs_table_followed_by[tup]
+ #probability = self.seqs_table_follows[tup]
+ except KeyError:
+ probability = 0
+ return probability
+
+ def tagSeq2(self, tup):
+ """Return the probability of a 2-POS-tag sequence."""
+ if len(tup) != 2:
+ #TODO?: throw exception
+ print >> sys.stderr, "Sequence does not consist of 2 tokens: '%s'" % str(seq)
+ return None
+ try:
+ #probability = self.seqs_table_followed_by[tup]
+ probability = self.seqs_table_follows[tup]
+ except KeyError:
+ probability = 0
+ return probability
+
+ def tagWord(self, word):
+ """See Text.tagWord()"""
+ text = TextToTag(self.textlanguage, self.wfinder)
+ text.setText("")
+ tag = text.tagWord(word, self.data_table)
+ return tag
+
+ def guessTagTest(self, word):
+ """See Text.guessTags(). For test cases only."""
+ text = TextToTag(self.textlanguage, self.wfinder)
+ text.setText("")
+ tag = text.guessTags(word)
+ return tag
+
+
+class Text:
+
+ DUMMY = None
+ number_regex = re.compile("^(\d|\d+[.,/\-]\d+)+$")
+ time_regex = re.compile("\d(am|pm)$")
+ bnc_regex = re.compile("<(w|c) (.*?)>(.*?)<", re.DOTALL)
+
+ mapping_file = os.path.join(sys.path[0], "data", "c7toc5.txt")
+ manually_tagged_file = os.path.join(sys.path[0], "data", "postags.txt")
+
+ def __init__(self, textlanguage, wfinder):
+ self.textlanguage = textlanguage
+ self.wfinder = wfinder
+ self.count_unambiguous = 0
+ self.count_ambiguous = 0
+ self.count_unknown = 0
+ self.whitespace = re.compile("\s+$")
+ self.nonword = re.compile("([\s,:;]+)")
+ self.nonword_punct = re.compile("([,:;]+)")
+ self.sentence_end = re.compile("([.!?]+)$")
+ self.bnc_word_regexp = re.compile("<W\s+TYPE=\"(.*?)\".*?>(.*?)</W>", \
+ re.DOTALL|re.IGNORECASE)
+ self.mapping = self.loadMapping()
+ self.manually_tagged = self.loadManuallyTagged()
+ return
+
+ def loadMapping(self):
+ f = open(self.mapping_file)
+ line_count = 1
+ mapping = {}
+ while 1:
+ line = f.readline().strip()
+ if not line:
+ break
+ l = re.split("\s+", line)
+ if not len(l) == 2:
+ print >> sys.stderr, "No valid mapping in line %d: '%s'" % (line_count, line)
+ (c7, c5) = l[0], l[1]
+ if mapping.has_key(c7):
+ print >> sys.stderr, "No valid mapping in line %d: '%s', duplicate key '%s'" % (line_count, line, c7)
+ continue
+ mapping[c7] = c5
+ #print "%s -> %s" % (c7, c5)
+ line_count = line_count + 1
+ f.close()
+ return mapping
+
+ def loadManuallyTagged(self):
+ table = {}
+ regex = re.compile("^(.+)\s+(.+?)$")
+ f = open(self.manually_tagged_file)
+ while 1:
+ line = f.readline()
+ if not line:
+ break
+ line = line.strip()
+ if not line.startswith("#") and line != '':
+ regex_match = regex.search(line)
+ if regex_match:
+ word = regex_match.group(1)
+ postag = regex_match.group(2)
+ table[word] = postag
+ f.close()
+ return table
+
+ def expandEntities(self, text):
+ """Take a text and expand a few selected entities. Return the same
+ text with entities expanded. (We cannot simply parse the file with
+ DOM, as we don't have an XML DTD -- the original files were SGML.)"""
+ ### TODO: use Entities module
+ text = re.compile("&amp;", re.IGNORECASE).sub("&", text)
+ # TODO: several entities are missing here:
+ #text = re.compile("&#(x..);", re.IGNORECASE).sub(self.expandHexEntities, text)
+ text = re.compile("&#xA3;", re.IGNORECASE).sub("�", text)
+ return text
+
+ #def expandHexEntities(self, matchobj):
+ # htmlentitydefs.entitydefs[]
+ # s = u'\%s' % matchobj.group(1)
+ # #s = "Y"
+ # return s
+
+ def getBNCTuples(self, text):
+ """Return a list of (tag, word) tuples from text if
+ text is a BNC Sampler text in XML or SGML format. Otherwise
+ return an empty list. The tags are mapped from the C7 tag set
+ to the much smaller C5 tag set."""
+ l = []
+ pos = 0
+ while 1:
+ m = self.bnc_regex.search(text, pos)
+ if not m:
+ break
+ tag = m.group(2)
+ if self.mapping.has_key(tag):
+ tag = self.mapping[tag]
+ else:
+ #print "no mapping: %s" % tag
+ pass
+ if m.group(3):
+ l.append((tag, m.group(3).strip()))
+ #print "- %s/%s" % (tag, m.group(3).strip())
+ pos = m.start()+1
+ return l
+
+ def normalise(self, text):
+ """Take a string and remove XML markup and whitespace at the beginning
+ and the end. Return the modified string."""
+ # sometimes there's <PB...>...</PB> *inside* <W...>...</W>!
+ text = re.compile("<.*?>", re.DOTALL|re.IGNORECASE).sub("", text)
+ text = text.strip()
+ return text
+
+ def splitBNCTag(self, tag):
+ """Take a string with BNC tags like 'NN1-NP0' and return a list,
+ e.g. ['NN1', 'NP0']. For single tags like 'NN0' this will
+ be returned: ['NN0']."""
+ tags = re.split("-", tag)
+ return tags
+
+ def guessTags(self, word):
+ """Take a word and guess which POS tags it might have and return
+ those POS tags. This considers e.g. word prefixes, suffixes and
+ capitalization. If no guess can be made, None is returned."""
+ # TODO: return more than one tag
+
+ # £25 etc:
+ # fixme -- UnicodeDecodeError
+ #if word.startswith(u"£") or word.startswith(u"$"):
+ # return 'NN0'
+
+ # numbers:
+ if self.number_regex.match(word):
+ return 'CRD'
+
+ # e.g. HIV
+ if len(word) >= 2 and word == word.upper():
+ return 'NN0'
+
+ # this >=3 limit also prevents assigning NP0 to 'A' (i.e. a determiner
+ # at sentence start); of course that's only relevant
+ # for the test cases:
+ # English only
+ # TODO: is it okay to use 'latin1' here?
+ if len(word) >= 3 and word[0] in unicode(string.uppercase, 'latin1'): # e.g. "Jefferson"
+ return 'NP0'
+
+ # e.g. freedom, contentment, celebration, assistance, fighter,
+ # violinist, capacity
+ if self.textlanguage == 'en':
+ noun = ['dom', 'ment', 'tion', 'sion', 'ance', 'ence', 'er', 'or',
+ 'ist', 'ness', 'icity']
+ for suffix in noun:
+ if word.endswith(suffix):
+ return 'NN1'
+
+ # e.g. quickly
+ if word.endswith("ly"):
+ return 'AV0'
+
+ # e.g. 8.55am
+ if self.time_regex.search(word):
+ return 'AV0'
+
+ # e.g. extensive, heroic, financial, portable, hairy
+ # mysterious, hopeful, powerless
+ # 'en' was left out, could also be a verb
+ if self.textlanguage == 'en':
+ adj = ['ive', 'ic', 'al', 'able', 'y', 'ous', 'ful', 'less']
+ for suffix in adj:
+ if word.endswith(suffix):
+ return 'AJ0'
+
+ # e.g. publicize, publicise, activate, simplify
+ # 'en' was left out, could also be an adjective
+ verb = ['ize', 'ise', 'ate', 'fy']
+ for suffix in verb:
+ if word.endswith(suffix):
+ # fixme: could also be VVB
+ return 'VVI'
+
+ return None
+
+ def tagWord(self, word, data_table):
+ """Find all possible tags for a word and return a list of tuples:
+ [(orig_word, normalised_word, [(tag, probability), ...])]"""
+ orig_word = word
+ word = self.normalise(word)
+ #word = re.compile("[^\w' ]", re.IGNORECASE).sub("", word)
+
+ #if word and self.nonword_punct.match(word):
+ # # punctuation
+ # return [(orig_word, orig_word, [])]
+ if (not word) or self.whitespace.match(word):
+ # word is just white space
+ return [(orig_word, None, [])]
+
+ if self.manually_tagged.has_key(word):
+ return [(orig_word, orig_word, [(self.manually_tagged[word], 1)])]
+
+ # sanity check:
+ #if word.count("'") > 1:
+ # print >> sys.stderr, "*** What's this, more than one apostroph: '%s'?" % word
+
+ # Special cases: BNC tags "wasn't" like this: "<w VBD>was<w XX0>n't"
+ # Call tagWord() recursively for the parts, but don't recurse indefinitely.
+ if self.textlanguage == 'en':
+ special_cases = ("n't", "'s", "'re", "'ll", "'ve")
+ for special_case in special_cases:
+ special_case_pos = word.find(special_case)
+ if special_case_pos != -1 and special_case_pos != 0:
+ first_part = self.tagWord(word[0:special_case_pos], data_table)[0]
+ second_part = self.tagWord(special_case, data_table)[0]
+ tag_results = []
+ #TODO: return probability?:
+ #print second_part
+ tag_results.append((word[0:special_case_pos], first_part[1], first_part[2]))
+ tag_results.append((special_case, second_part[1], second_part[2]))
+ return tag_results
+
+ # TODO?: ignore upper/lower case?, no -- seems to decrease precision
+ #word = word.lower() #handled by word finder itself
+ #if not data_table.has_key(word) and len(word) >= 1:
+ # word = word.lower()
+ # #if data_table.has_key(word):
+ # # print "lower: %s" % word
+ #if not data_table.has_key(word) and len(word) >= 1:
+ # word = "%s%s" % (word[0].upper(), word[1:])
+ # #if data_table.has_key(word):
+ # # print "upper: %s" % word
+
+ if self.textlanguage != 'en':
+ rc = self.wfinder.test_it(word)
+ if rc[0] != '-':
+ src = rc.split()
+ # print len(src)
+ # last returned word exists in .dic file
+ # that's why this word was found
+ word = src[len(src)-2]
+ return [(orig_word, orig_word, [(src [len(src)-1], 1)])]
+# return [(orig_word, word, [(src [len(src)-1], 1)])]
+ if rc[0] == '-':
+ #if not data_table.has_key(word):
+ # word is unknown
+ #print "unknown: '%s'" % word
+ self.count_unknown = self.count_unknown + 1
+ guess_tag = self.guessTags(word)
+ if guess_tag:
+ return [(orig_word, orig_word, [(guess_tag, 1)])]
+# return [(orig_word, word, [(guess_tag, 1)])]
+ else:
+ return [(orig_word, orig_word, [("unknown", 1)])]
+# return [(orig_word, word, [("unknown", 1)])]
+ else: # English case
+ if not data_table.has_key(word):
+ # word is unknown
+ #print "unknown: '%s'" % word
+ self.count_unknown = self.count_unknown + 1
+ guess_tag = self.guessTags(word)
+ if guess_tag:
+ return [(orig_word, word, [(guess_tag, 1)])]
+ else:
+ return [(orig_word, word, [("unknown", 1)])]
+ else:
+ pos_table = data_table[word].table
+ if len(pos_table) == 1:
+ # word is unambiguous
+ self.count_unambiguous = self.count_unambiguous + 1
+ return [(orig_word, word, [(pos_table.keys()[0], 1)])]
+ else:
+ # word is ambiguous
+ tag_tuples = []
+ for pos_tag in pos_table.keys():
+ #print "pos_tag=%s -> %.2f" % (pos_tag, pos_table[pos_tag])
+ tag_tuples.append((pos_tag, pos_table[pos_tag]))
+ self.count_ambiguous = self.count_ambiguous + 1
+ return [(orig_word, word, tag_tuples)]
+
+# def addToData(self, tagged_words, data_table, seqs_table_followed_by, seqs_table_follows):
+ """Count words and POS tags so they can later be added
+ to the persistent storage."""
+# tag_list = self.addWords(tagged_words, data_table)
+# self.addTagSequences(tag_list, seqs_table_followed_by, seqs_table_follows)
+# return
+
+# def addWords(self, tagged_words, data_table):
+ """For each word, save the tag frequency to data_table so
+ it can later be added to the persistent storage. Return
+ a list of all tags."""
+# all_tags_list = []
+# for (word, tag) in tagged_words:
+ #only for testing if case-insensitivity is better:
+ #word = word.lower()
+# all_tags_list.append(tag)
+# tag_list = self.splitBNCTag(tag)
+# assert(len(tag_list) == 1 or len(tag_list) == 2)
+ #print "word/pos_list: %s/%s" % (word, tag_list)
+# if data_table.has_key(word):
+ # word is already known
+# word_table = data_table[word].table
+# for tag in tag_list:
+# if word_table.has_key(tag):
+# word_table[tag] = word_table[tag] + 1.0/len(tag_list)
+ #print "word_table[%s] += %f" % (tag, 1.0/len(tag_list))
+# else:
+# word_table[tag] = 1.0/len(tag_list)
+ #print "word_table[%s] = %f" % (tag, word_table[tag])
+# else:
+# word_table = {}
+# for tag in tag_list:
+# word_table[tag] = 1.0/len(tag_list)
+ #print "word_table[%s] = %f" % (tag, word_table[tag])
+# data_table[word] = WordData(word, word_table)
+ # Normalize data_table values so they are probabilities (0 to 1):
+# for e in data_table.keys():
+# t = data_table[e].table
+# occ_all = 0
+# for occ in t.values():
+# occ_all = occ_all + occ
+# for key in t.keys():
+# t[key] = t[key] / occ_all
+ # debug:
+ #for e in data_table.keys():
+ # print "%s, %s" % (e, data_table[e])
+# return all_tags_list
+
+ def addTagSequences(self, tag_list, seqs_table_followed_by, seqs_table_follows):
+ """Save information about POS tag tuples to seqs_table."""
+ # TODO: add dummy entries?
+ if len(tag_list) == 0:
+ return
+ i = 0
+
+ ### FIXME: does this work if data is added later? probably not...:
+ count_followed_by = {}
+ count_follows = {}
+
+ while 1:
+ if i >= len(tag_list)-1:
+ break
+ tag0 = tag_list[i]
+ key = ()
+ if self.mapping.has_key(tag0):
+ tag0 = self.mapping[tag0]
+ tag1 = tag_list[i+1]
+ if self.mapping.has_key(tag1):
+ tag1 = self.mapping[tag1]
+ try:
+ seqs_table_followed_by[(tag0,tag1)] = seqs_table_followed_by[(tag0,tag1)] + 1
+ except KeyError:
+ seqs_table_followed_by[(tag0,tag1)] = 1
+ try:
+ count_followed_by[tag0] = count_followed_by[tag0] + 1
+ except KeyError:
+ count_followed_by[tag0] = 1
+
+ #print "%s/%s" % (tag1, tag0)
+ try:
+ seqs_table_follows[(tag1,tag0)] = seqs_table_follows[(tag1,tag0)] + 1
+ except KeyError:
+ seqs_table_follows[(tag1,tag0)] = 1
+ try:
+ count_follows[tag1] = count_follows[tag1] + 1
+ except KeyError:
+ count_follows[tag1] = 1
+ i = i + 1
+
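+ # Worked example with made-up counts: if NN1 is followed by VVZ three
+ # times and by PUN once, count_followed_by['NN1'] is 4, so after the
+ # normalization below seqs_table_followed_by[('NN1','VVZ')] == 0.75
+ # and seqs_table_followed_by[('NN1','PUN')] == 0.25.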
+ # Normalize to 0-1 range:
+ # TODO: do these numbers become too small, as the Qtag paper states?
+ for t in seqs_table_followed_by.keys():
+ #if t[0] == 'NN0':
+ # print "%s=%s -- %d" % (t, seqs_table_followed_by[t], count_followed_by[t[0]])
+ seqs_table_followed_by[t] = float(seqs_table_followed_by[t]) / float(count_followed_by[t[0]])
+ for t in seqs_table_follows.keys():
+ seqs_table_follows[t] = float(seqs_table_follows[t]) / float(count_follows[t[0]])
+
+ #debug:
+ #print "FOLLOWED BY (norm):"
+ #for k in seqs_table_followed_by.keys():
+ # print "%s -> %s" % (k, seqs_table_followed_by[k])
+ #print "FOLLOWS (norm):"
+ #for k in seqs_table_follows.keys():
+ # print "%s -> %s" % (k, seqs_table_follows[k])
+ return
+
+
+class TextToTag(Text):
+ """Any text (also pre-tagged texts from the BNC -- for
+ testing the tagger)."""
+
+ DUMMY = None
+
+ def __init__(self, textlanguage, wfinder):
+ # FIXME: not needed, is it? (done in base class):
+ self.textlanguage = textlanguage
+ self.text = None
+ Text.__init__(self, self.textlanguage, wfinder)
+ return
+
+ def setText(self, text):
+ self.text = text
+ return
+
+ def setFilename(self, filename):
+ f = open(filename)
+ self.text = f.read()
+ f.close()
+ return
+
+ def getBestTagSimple(self, tag_tuples):
+ """Return the most probable tag without taking context into
+ account. Only useful for testing and checking the baseline."""
+ max_prob = 0
+ best_tag = None
+ for tag_tuples_here in tag_tuples:
+ prob = tag_tuples_here[1]
+ if prob >= max_prob:
+ max_prob = prob
+ best_tag = tag_tuples_here[0]
+ return best_tag
+
+ def checkBNCMatch(self, i, tagged_list_bnc, word, best_tag, data_table):
+ """Check for mismatches, i.e. POS tags that differ from the original
+ tag in BNC. Print out a warning for all those differences and return
+ 1, otherwise return 0. Note that the BNC's tags are only correct
+ in 97-98%. If the original tag is 'UNC' and this tagger's tag is
+ not 'unknown', this is still considered a mismatch."""
+ if i >= len(tagged_list_bnc)-1:
+ print >> sys.stderr, "Index out of range..."
+ return 0
+ if not tagged_list_bnc[i]:
+ return 0
+ word_from_bnc, tags_from_bnc = tagged_list_bnc[i]
+ #print "%s ?= %s" % (word_from_bnc, word)
+ if best_tag == 'unknown':
+ # 'UNC' means unclassified in BNC, assume that this corresponds
+ # to our 'unknown':
+ best_tag = 'UNC'
+ guessed = 1
+ if data_table.has_key(word):
+ guessed = 0
+ if not word == word_from_bnc:
+ print >> sys.stderr, "*** word mismatch: '%s'/'%s'" % (word, word_from_bnc)
+ #sys.exit()
+ elif not (best_tag in tags_from_bnc) and \
+ tags_from_bnc[0][0] != 'Y': # ignore punctuation tags
+ print >> sys.stderr, "*** tag mismatch (guessed=%d): got %s/%s, expected %s/%s" % \
+ (guessed, word, best_tag, word_from_bnc, tags_from_bnc)
+ return 1
+ #if word == word_from_bnc and guessed:
+ # print >> sys.stderr, "GOODGUESS"
+ return 0
+
+ def getStats(self, count_wrong_tags, is_bnc):
+ """Get some human-readable statistics about tagging success,
+ e.g. number and percentage of correctly tagged tokens."""
+ sum = self.count_unknown + self.count_unambiguous + self.count_ambiguous
+ res = ""
+ if sum > 0:
+ res = "<!-- Statistics:\n"
+ res = res + "count_unknown = %d (%.2f%%)\n" % (self.count_unknown, float(self.count_unknown)/float(sum)*100)
+ res = res + "count_unambiguous = %d (%.2f%%)\n" % (self.count_unambiguous, float(self.count_unambiguous)/float(sum)*100)
+ res = res + "count_ambiguous = %d (%.2f%%)\n" % (self.count_ambiguous, float(self.count_ambiguous)/float(sum)*100)
+ #res = res + "sum = %d\n" % sum
+ if is_bnc:
+ res = res + "correct tags = %d (%.2f%%)\n" % (sum-count_wrong_tags, float(sum-count_wrong_tags)/float(sum)*100)
+ #res = res + "count_wrong_tags = %d (%.2f%%)\n" % (count_wrong_tags, float(count_wrong_tags)/float(sum)*100)
+ res = res + "-->"
+ return res
+
+ def applyConstraints(self, prev_word, curr_word, next_word, tagged_tuples):
+ """Some hard-coded and manually written rules that prevent mistaggings by
+ the probabilistic tagger. Removes incorrect POS tags from tagged_tuples.
+ Returns nothing, as it works directly on tagged_tuples."""
+ # demo rule just for the test cases:
+ if curr_word and curr_word.lower() == 'demodemo':
+ self.constrain(tagged_tuples, 'AA')
+ # ...
+ return
+
+ def constrain(self, tagged_tuples, pos_tag):
+ """Remove the pos_tag reading from tagged_tuples. Returns nothing,
+ works directly on tagged_tuples."""
+ i = 0
+ for t in tagged_tuples:
+ if t[0] == pos_tag:
+ del tagged_tuples[i]
+ i = i + 1
+ return
+
+ def applyTagRules(self, curr_word, tagged_word, curr_tag):
+ """Some hard-coded and manually written rules that extent the
+ tagging. Returns a (word, normalized_word, tag) triple."""
+ # ...
+ return None
+
+ def tag(self, data_table, seqs_table_followed_by, seqs_table_follows): # z.164 texttag calls
+ """Tag self.text and return list of tuples
+ (word, normalized word, most probable tag)"""
+ self.text = self.expandEntities(self.text)
+ is_bnc = 0
+ word_matches = self.getBNCTuples(self.text)
+ if len(word_matches) > 0:
+ # seems like this is a BNC text used for testing
+ is_bnc = 1
+ print >> sys.stderr, "BNC text detected."
+ else:
+ word_matches = self.nonword.split(self.text)
+ # tktk: after splitting, tokens may look like \xe1, etc...
+ # Put sentence end periods etc into an extra element.
+ # We cannot just split on periods etc. because that would
+ # break inner-sentence tokens like "... No. 5 ...":
+ # fixme: only work on the last element (not counting white space)
+ # FIXME: doesn't work here: "I cannot , she said."
+ if not is_bnc:
+ j = len(word_matches)-1
+ while j >= 0:
+ w = word_matches[j]
+ s_end_match = self.sentence_end.search(w)
+ if s_end_match:
+ word_matches[j] = w[:len(w)-len(s_end_match.group(1))]
+ word_matches.insert(j+1, s_end_match.group(1))
+ break
+ j = j - 1
+
+# print "word_matches=%s" % word_matches
+ i = 0
+ tagged_list = [self.DUMMY, self.DUMMY]
+ tagged_list_bnc = [self.DUMMY, self.DUMMY]
+
+ while i < len(word_matches):
+ next_token = None
+ tags = None
+ if is_bnc:
+ # word_matches[i] is a (tag,word) tuple
+ (tag, word) = word_matches[i]
+ if i+1 < len(word_matches):
+ (next_token, foo) = word_matches[i+1]
+ word = self.normalise(word)
+ tags = self.splitBNCTag(tag)
+ else:
+ word = word_matches[i]
+ if i+1 < len(word_matches):
+ next_token = word_matches[i+1]
+ if self.textlanguage == 'en':
+ if i + 2 < len(word_matches): # english only
+ # BNC special case: "of course" and some others are tagged as one word!
+ tuple_word = "%s %s" % (word, word_matches[i+2]) # +2 = jump over whitespace
+ if data_table.has_key(tuple_word):
+ #print >> sys.stderr, "*** SPECIAL CASE %d '%s' ..." % (i, tuple_word)
+ word = tuple_word
+ i = i + 2
+#
+# The next few lines keep words from being reported as not found
+# just because of trailing punctuation (dots etc.).
+#
+ if len(word) >= 1 and word[-1] in ( '.', ',', '?','!', ':', ';', '\'', '\"', '%', '='):
+ wordend = word[-1];
+ word = word[0:-1]
+ r = Text.tagWord(self, word, data_table)
+ tagged_list.extend(r)
+ word = wordend
+ r = Text.tagWord(self, word, data_table)
+ tagged_list.extend(r)
+
+ if is_bnc:
+ for el in r:
+ # happens e.g. with this (wrong?) markup in BNC:
+ #<W TYPE="CRD" TEIFORM="w">4's</W>
+ # My tagger tags <4> and <'s>, so there's an offset
+ # which makes further comparisons BNC <-> tagger impossible,
+ # so use this pseudo-workaround and just re-use the tags
+ # for the <'s>, too:
+ #print "%s -> %s" % (el[0], tags)
+ tagged_list_bnc.append((el[0], tags))
+ i = i + 1
+
+ tagged_list.append(self.DUMMY)
+ tagged_list.append(self.DUMMY)
+
+ # test only:
+ #result_tuple_list = []
+ #i = 0
+ #count_wrong_tags = 0
+ #for t in tagged_list:
+ # #print "t=%s" % t
+ # if t:
+ # best_tag = self.getBestTagSimple(t[2])
+ # if is_bnc:
+ # wrong_tags = self.checkBNCMatch(i, tagged_list_bnc, t[0], best_tag, data_table)
+ # count_wrong_tags = count_wrong_tags + wrong_tags
+ # result_tuple_list.append((t[0], t[1], best_tag))
+ # i = i + 1
+ #stat = self.getStats(count_wrong_tags)
+ #print >> sys.stderr, stat
+ #print result_tuple_list
+
+ ### Constraint-based part:
+ prev_word = None
+ next_word = None
+ i = 0
+ for tag_tuples in tagged_list:
+ prev_word = self.getPrevWord(i, tagged_list)
+ next_word = self.getNextWord(i, tagged_list)
+ if tag_tuples and tag_tuples[1]:
+ self.applyConstraints(prev_word, tag_tuples[0], next_word, tag_tuples[2])
+ i = i + 1
+
+ result_tuple_list = self.selectTagsByContext(tagged_list, seqs_table_followed_by, \
+ seqs_table_follows, tagged_list_bnc, is_bnc, data_table)
+
+ i = 0
+ for tag_triple in result_tuple_list:
+ triple = self.applyTagRules(tag_triple[0], tag_triple[1], tag_triple[2])
+ if triple:
+ result_tuple_list[i] = triple
+ if self.sentence_end.search(tag_triple[0]):
+ # make sure punctuation doesn't have tags:
+ result_tuple_list[i] = (tag_triple[0], None, None)
+ i = i + 1
+
+ return result_tuple_list
+
+ def selectTagsByContext(self, tagged_list, seqs_table_followed_by, \
+ seqs_table_follows, tagged_list_bnc, is_bnc, data_table):
+
+ count_wrong_tags = 0
+ tag_probs = {}
+ i = 0
+ for tagged_triple in tagged_list:
+ if tagged_triple != None and tagged_triple[1] == None:
+ # ignore whitespace
+ i = i + 1
+ continue
+ try:
+ one = tagged_list[i]
+ two = tagged_list[i+1]
+ whitespace_jump = 0
+ if two and two[1] == None:
+ two = tagged_list[i+2]
+ whitespace_jump = whitespace_jump + 1
+ two_pos = i + 1 + whitespace_jump
+ three = tagged_list[i+2+whitespace_jump]
+ if three and three[1] == None:
+ three = tagged_list[i+3+whitespace_jump]
+ whitespace_jump = whitespace_jump + 1
+ three_pos = i + 2 + whitespace_jump
+ except IndexError:
+ # list end
+ break
+
+ one_tags = [None]
+ if one:
+ one_tags = one[2]
+ two_tags = [None]
+ if two: two_tags = two[2]
+ three_tags = [None]
+ if three: three_tags = three[2]
+
+ for one_tag in one_tags:
+ tag_one_prob = 0
+ if one_tag:
+ tag_one_prob = one_tag[1]
+
+ for two_tag in two_tags:
+ tag_two_prob = 0
+ if two_tag:
+ tag_two_prob = two_tag[1]
+
+ for three_tag in three_tags:
+ tag_three_prob = 0
+ if three_tag:
+ tag_three_prob = three_tag[1]
+
+ #print "** %s/%s/%s" % (one_tag, two_tag, three_tag)
+ one_tag_prob = None
+ if one_tag: one_tag_prob = one_tag[0]
+ two_tag_prob = None
+ if two_tag: two_tag_prob = two_tag[0]
+ three_tag_prob = None
+ if three_tag: three_tag_prob = three_tag[0]
+
+ seq_prob = 0
+ if one:
+ #print one[0],
+ #if two:
+ # print two[0]
+ try:
+ k1 = (one_tag_prob, two_tag_prob)
+ k2 = (two_tag_prob, three_tag_prob)
+ seq_prob = seqs_table_followed_by[k1] * \
+ seqs_table_followed_by[k2]
+ #print "k1=%s, k2=%s" % (str(k1), str(k2))
+ except KeyError:
+ pass
+ prob_combined = seq_prob * tag_one_prob
+ #print "%s, %s, %s: %.7f * %.7f = %.7f" % (one_tag_prob, two_tag_prob, \
+ # three_tag_prob, seq_prob, tag_one_prob, prob_combined)
+ k1 = (i, one_tag[0])
+ #print "%s = %.7f" % (str(k1), prob_combined)
+ try:
+ tag_probs[k1] = tag_probs[k1] + prob_combined
+ except KeyError:
+ tag_probs[k1] = prob_combined
+ if two:
+ try:
+ seq_prob = seqs_table_follows[(two_tag_prob, one_tag_prob)] * \
+ seqs_table_followed_by[(two_tag_prob, three_tag_prob)]
+ except KeyError:
+ pass
+ prob_combined = seq_prob * tag_two_prob
+ k2 = (two_pos, two_tag[0])
+ try:
+ tag_probs[k2] = tag_probs[k2] + prob_combined
+ except KeyError:
+ tag_probs[k2] = prob_combined
+ #print "%s = %.7f" % (str(k2), prob_combined)
+ if three:
+ try:
+ seq_prob = seqs_table_follows[(two_tag_prob, one_tag_prob)] * \
+ seqs_table_follows[(three_tag_prob, two_tag_prob)]
+ except KeyError:
+ pass
+ prob_combined = seq_prob * tag_three_prob
+ k3 = (three_pos, three_tag[0])
+ try:
+ tag_probs[k3] = tag_probs[k3] + prob_combined
+ except KeyError:
+ tag_probs[k3] = prob_combined
+ #print "%s = %.7f" % (str(k3), prob_combined)
+
+ orig_word = None
+ norm_word = None
+ # the word that falls out of the window is assigned its final tag:
+ if one:
+ orig_word = one[0]
+ norm_word = one[1]
+ keys = tag_probs.keys()
+ max_prob = 0
+ best_tag = None
+ for tag_prob in keys:
+ if tag_prob[0] == i and tag_probs[tag_prob] >= max_prob:
+ ###print " K=%s, V=%s" % (tag_prob, tag_probs[tag_prob])
+ max_prob = tag_probs[tag_prob]
+ best_tag = tag_prob[1]
+ tagged_list[i] = (orig_word, norm_word, best_tag)
+ #print "BEST@%d: %s" % (i, best_tag)
+
+ # this avoids inefficiencies, it's necessary because
+ # of the tag_probs.keys() call above (which becomes
+ # too slow otherwise):
+ for tag_prob in keys:
+ if tag_prob[0] <= i:
+ del tag_probs[tag_prob]
+
+ if is_bnc and one:
+ orig_word = one[0]
+ if self.textlanguage == 'en':
+ wrong_tags = self.checkBNCMatch(i, tagged_list_bnc, orig_word, best_tag, data_table)
+ count_wrong_tags = count_wrong_tags + wrong_tags
+
+ i = i + 1
+
+ stat = self.getStats(count_wrong_tags, is_bnc)
+ #print >> sys.stderr, stat
+
+ # remove dummy entries:
+ tagged_list.pop(0)
+ tagged_list.pop(0)
+ tagged_list.pop()
+ tagged_list.pop()
+
+ return tagged_list
+
+ def getPrevWord(self, i, tagged_list):
+ """Find the token previous to the token at position i from tagged_list,
+ ignoring whitespace tokens. Return a tuple (word, tuple_list),
+ whereas tuple_list is a list of (tag, tag_probability) tuples."""
+ j = i-1
+ while j >= 0:
+ (orig_word_tmp, tagged_word_tmp, tag_tuples_tmp) = self.getTuple(tagged_list[j])
+ j = j - 1
+ if not tagged_word_tmp:
+ continue
+ else:
+ prev = tag_tuples_tmp
+ return orig_word_tmp
+ return None
+
+ def getNextWord(self, i, tagged_list):
+ """Find the token next to the token at position i from tagged_list,
+ ignoring whitespace tokens. See self.getPrevToken()"""
+ j = i + 1
+ while j < len(tagged_list):
+ (orig_word_tmp, tagged_word_tmp, tag_tuples_tmp) = self.getTuple(tagged_list[j])
+ j = j + 1
+ if not tagged_word_tmp:
+ continue
+ else:
+ next = tag_tuples_tmp
+ return orig_word_tmp
+ return None
+
+ def getTuple(self, tagged_list_elem):
+ if not tagged_list_elem:
+ orig_word = None
+ tagged_word = None
+ tag_tuples = None
+ else:
+ (orig_word, tagged_word, tag_tuples) = tagged_list_elem
+ return (orig_word, tagged_word, tag_tuples)
+
+
+ def toXML(self, tagged_words):
+ "Show result as XML."
+ xml_list = []
+ for (orig_word, word, tag) in tagged_words:
+ # fast appending:
+ if not word and not tag:
+ xml_list.append(' <w>%s</w>\n' % orig_word)
+ else:
+ xml_list.append(' <w term="%s" type="%s">%s</w>\n' % (word, tag, orig_word))
+ xml = "<taggedWords>\n" + string.join(xml_list, "") + "</taggedWords>\n"
+ return xml
+
+
+class PreTaggedText(Text):
+ "Text from the BNC Sampler in XML format."
+
+ def __init__(self, filename):
+ self.content = None
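+ # FIXME: Text.__init__ now expects (textlanguage, wfinder); the call
+ # below passes neither, so this class needs updating before use.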
+ Text.__init__(self)
+ f = open(filename)
+ self.content = f.read()
+ f.close()
+ return
+
+ def getTaggedWords(self):
+ "Returns list of tuples (word, tag)"
+ text = self.expandEntities(self.content)
+ word_matches = self.getBNCTuples(text)
+ tagged_words = []
+ for (tag, word) in word_matches:
+ tagged_words.append((word, tag))
+ return tagged_words
+
+
+class WordData:
+ "A term and the frequency of its tags."
+
+ def __init__(self, word, affix, table):
+ self.word = word
+ self.affix = affix
+ # table = tag / number of occurrences
+ # deep copy the hash table (TODO: use deep copy functions):
+ self.table = {}
+ for el in table:
+ self.table[el] = table[el]
+ return
+
+ def __str__(self):
+ "Show word data (debugging only!)"
+ s = self.word + ":\n"
+ for el in self.table:
+ s = s + "\t" + el + ": " + str(self.table[el]) + "\n"
+ return s
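A rough driving sketch for the tagger above (Python 2). It assumes the word/sequence data files under data/ are in place; note the FIXME about the hard-coded dictionary file names near the top of the file:

    import Tagger
    tagger = Tagger.Tagger("en")
    tagger.bindData()                  # load the word table and the two sequence tables
    for orig, norm, tag in tagger.tagText("The fat man sleeps."):
        if norm is not None:           # skip whitespace and sentence-end punctuation
            print "%s -> %s" % (orig, tag)
    print tagger.tagTexttoXML("The fat man sleeps.")   # the same result serialized as <w> elements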
diff --git a/languagetool/src/TaggerTest.py b/languagetool/src/TaggerTest.py
new file mode 100644
index 0000000..c94f233
--- /dev/null
+++ b/languagetool/src/TaggerTest.py
@@ -0,0 +1,168 @@
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import unittest
+import Tagger
+
+import os
+import sys
+
+class TaggerTestCase(unittest.TestCase):
+
+ FILENAME_WORDS = os.path.join(sys.path[0], "data", "tag_test_words")
+ FILENAME_SEQ1 = os.path.join(sys.path[0], "data", "tag_test_sequences1")
+ FILENAME_SEQ2 = os.path.join(sys.path[0], "data", "tag_test_sequences2")
+
+ def cleanList(self, l):
+ """Return a copy of the list with 'None' elements (e.g. whitespace)
+ removed. Also, only the first and last elements of each triple are
+ copied."""
+ new_list = []
+ for el in l:
+ if el[1]:
+ new_list.append((el[0], el[2]))
+ return new_list
+
+ def cleanListAll(self, l):
+ """Return a copy of the list with 'None' elements (e.g. whitespace)
+ removed. Also, only the last element of each triple is copied."""
+ new_list = []
+ for el in l:
+ if el[1]:
+ new_list.append(el[2])
+ return new_list
+
+ def tag(self, learn_text, text):
+
+ # build data:
+ tagger = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2)
+ tagger.deleteData()
+ tagger.bindData()
+ tagger.buildDataFromString(learn_text)
+ tagger.commitData()
+ tagger = None
+
+ # tag text:
+ tagger2 = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2)
+ tagger2.bindData()
+ res = tagger2.tagText(text)
+ res = self.cleanList(res)
+ tagger2.deleteData()
+
+ return res
+
+ def testExpandEntities(self):
+ tagger = Tagger.Text("en", None)
+ r = tagger.expandEntities("")
+ self.assertEqual(r, "")
+ r = tagger.expandEntities("bla &amp;&amp;")
+ self.assertEqual(r, "bla &&")
+ #r = tagger.expandEntities("bla &#xA3;")
+ #self.assertEqual(r, u"bla £")
+ return
+
+ def testGuess(self):
+ tagger = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2)
+ tagger.deleteData()
+ tagger.bindData()
+ tagger.buildDataFromString("") # don't learn at all!
+ tagger.commitData()
+
+ tag = tagger.guessTagTest("")
+ self.assertEqual(tag, None)
+
+ # numbers = CRD:
+ self.assertEqual(tagger.guessTagTest("0"), 'CRD')
+ self.assertEqual(tagger.guessTagTest("3123.1312"), 'CRD')
+ self.assertEqual(tagger.guessTagTest("00,99"), 'CRD')
+ self.assertEqual(tagger.guessTagTest("00/99"), 'CRD')
+ self.assertEqual(tagger.guessTagTest("1-99"), 'CRD')
+
+ # BNC Sampler tags "$xx" as NNU, which is mapped to NN0 (same for £):
+ self.assertEqual(tagger.guessTagTest("$31.12"), 'NN0')
+ self.assertEqual(tagger.guessTagTest("HIV"), 'NN0')
+ self.assertEqual(tagger.guessTagTest("8.55pm"), 'AV0')
+ self.assertEqual(tagger.guessTagTest("10.10pm"), 'AV0')
+ self.assertEqual(tagger.guessTagTest(u"Großekathöfer"), 'NP0')
+ self.assertEqual(tagger.guessTagTest("jackerfoodom"), 'NN1')
+ self.assertEqual(tagger.guessTagTest("testious"), 'AJ0')
+ self.assertEqual(tagger.guessTagTest("testize"), 'VVI')
+ self.assertEqual(tagger.guessTagTest("foofooly"), 'AV0')
+ self.assertEqual(tagger.guessTagTest("unguessablexxx"), None)
+ self.assertEqual(tagger.guessTagTest("verboten"), None)
+ return
+
+ def testLearningAndTagging(self):
+
+ print "###########1"
+
+ #FIXME: doesn't work:
+ r = self.tag("The/AT0 fat/AJ0 man/NN1", "The big man")
+ self.assertEqual(r, [('The', 'AT0'), ('big', 'unknown'), ('man', 'NN1')])
+
+ print "###########2"
+ return #FIXME
+
+ r = self.tag("The/AT0 fat/AJ0 man/NN1", "the xxx")
+ # the/unknown because the tagger is case sensitive:
+ self.assertEqual(r, [('the', 'unknown'), ('xxx', 'unknown')])
+
+ r = self.tag("The/AT0 fat/AJ0 man/NN1", "The fat man")
+ self.assertEqual(r, [('The', 'AT0'), ('fat', 'AJ0'), ('man', 'NN1')])
+
+ r = self.tag("A/DET cool/AJ0 large/AJ0 car/NN1", "A cool car")
+ self.assertEqual(r, [('A', 'DET'), ('cool', 'AJ0'), ('car', 'NN1')])
+
+ # fat occurs 2 times as NN1 and 1 time as AJ0, but context decides:
+ r = self.tag("""The/DET fat/NN1 is/VB hot/AJ0
+ The/DET fat/AJ0 guy/NN1
+ A/DET man/NN1 used/VBD fat/NN1""",
+ "A fat man")
+ self.assertEqual(r, [('A', 'DET'), ('fat', 'AJ0'), ('man', 'NN1')])
+
+ # fat occurs 3 times as NN1 and 0 times as AJ0 -> tagged as NN1 of course:
+ r = self.tag("""The/DET fat/NN1 is/VB hot/AJ0
+ A/DET fat/NN1 man/NN1 .
+ He/PP used/VBD fat/NN1""", "A fat man")
+ self.assertEqual(r, [('A', 'DET'), ('fat', 'NN1'), ('man', 'NN1')])
+
+ # fat occurs 1 time as NN1 and 2 times as AJ0 -> tagged as AJ0
+ r = self.tag("""The/DET fat/AJ0 is/VB hot/AJ0
+ A/DET fat/AJ0 man/NN1 .
+ He/PP used/VBD fat/NN1""", "A fat man")
+ self.assertEqual(r, [('A', 'DET'), ('fat', 'AJ0'), ('man', 'NN1')])
+
+ r = self.tag("""The/DET fat/AJ0 man/NN is/VB fat/AJ0 ./PP""",
+ "A fat man he is fat.")
+ self.assertEqual(r, [('A', 'unknown'), ('fat', 'AJ0'), ('man', 'NN'),
+ ('he', 'unknown'), ('is', 'VB'), ('fat', 'AJ0')])
+
+ return
+
+ #FIXME
+ #def testApplyConstraints(self):
+ # r = self.tag("A/X bla/X demodemo/AA demodemo/AA demodemo/BB bla/X bla/X", \
+ # "demodemo")
+ # self.assertEqual(r, [('demodemo', 'BB')])
+ #
+ # return
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/languagetool/src/Tools.py b/languagetool/src/Tools.py
new file mode 100644
index 0000000..5bed1b1
--- /dev/null
+++ b/languagetool/src/Tools.py
@@ -0,0 +1,58 @@
+# -*- coding: iso-8859-1 -*-
+# Tools class
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import sys
+import re
+
+class Tools:
+
+ def __init__(self):
+ return
+
+ def getXML(node, xmlstr=""):
+ """Get the XML content of a node, but only elements and text."""
+ if node and node.nodeType == node.ELEMENT_NODE:
+ l = []
+ for child in node.childNodes:
+ l.append(Tools.getXML(child, xmlstr))
+ xmlstr = "<%s>%s</%s>" % (node.tagName, str.join('', l), node.tagName)
+ elif node and node.nodeType == node.TEXT_NODE:
+ xmlstr = "%s%s" % (xmlstr, node.data)
+ return xmlstr
+
+ getXML = staticmethod(getXML)
+
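+ # Usage sketch (added for clarity; the minidom parsing shown here is only
+ # an assumption for illustration, any DOM node works):
+ #   from xml.dom import minidom
+ #   node = minidom.parseString("<a>x<b>y</b></a>").documentElement
+ #   Tools.getXML(node)   # -> "<a>x<b>y</b></a>"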
+ def countLinebreaks(s):
+ matches = re.findall("[\n\r]", s)
+ #print "#%s -> %s" % (s, len(matches))
+ return len(matches)
+
+ countLinebreaks = staticmethod(countLinebreaks)
+
+ def getLanguageName(shortName):
+ if shortName == 'en':
+ return 'English'
+ elif shortName == 'de':
+ return 'German'
+ elif shortName == 'hu':
+ return 'Hungarian'
+ return None
+
+ getLanguageName = staticmethod(getLanguageName)
diff --git a/languagetool/src/Wfdeu.py b/languagetool/src/Wfdeu.py
new file mode 100755
index 0000000..89b26fc
--- /dev/null
+++ b/languagetool/src/Wfdeu.py
@@ -0,0 +1,70 @@
+# -*- coding: iso-8859-1 -*-
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2004 ....
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+import array
+import codecs
+import os
+from string import *
+import sys
+
+class Wfdeu:
+
+ encoding = "latin1"
+
+ def __init__(self):
+ return
+
+ def getTyp(self,typ, oword, word):
+ if typ != "":
+ if typ == 'V' or typ == 'HV':
+ if oword[-4:] == 'ende' or oword[-5:-1] == 'ende':
+ typ = 'ADJV'
+ if typ == 'V' or typ == 'HV':
+ if oword[-1:] == 'e':
+ typ = typ + '11'
+ elif oword[-2:] == 'st':
+ typ = typ + '12'
+ elif oword[-2:] == 'en':
+ typ = typ + '14'
+ elif oword[-2:] == 'et':
+ typ = typ + '15'
+ elif oword[-1:] == 't':
+ typ = typ + '13'
+ elif typ == 'ADJ':
+ if oword[-2:] == 'er':
+ typ = 'ADJER'
+ elif oword[-2:] == 'en':
+ typ = 'ADJEN'
+ elif oword[-2:] == 'em':
+ typ = 'ADJEM'
+ elif oword[-2:] == 'es':
+ typ = 'ADJES'
+ elif oword[-1:] == 'e':
+ typ = 'ADJE'
+ elif typ == 'NMS':
+ if oword[-2:] == 'in':
+ typ = 'NFS'
+ elif oword[-5:] == 'innen':
+ typ = 'NF'
+ if typ[0] == 'N':
+ if word != oword and typ[-1:] == 'S':
+ typ = typ[0:-1]
+ return typ
+
+
+
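+# Illustrative examples (added for clarity, not part of the original module);
+# getTyp refines a base tag using the inflected surface form, e.g.
+# (hypothetical calls):
+#   Wfdeu().getTyp('ADJ', 'kleiner', 'klein')   # -> 'ADJER'
+#   Wfdeu().getTyp('V', 'sage', 'sagen')        # -> 'V11'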
diff --git a/languagetool/src/Wfhun.py b/languagetool/src/Wfhun.py
new file mode 100755
index 0000000..3514ca1
--- /dev/null
+++ b/languagetool/src/Wfhun.py
@@ -0,0 +1,88 @@
+# -*- coding: iso-8859-1 -*-
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2004 ....
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+import array
+import codecs
+import os
+from string import *
+import sys
+
+class Wfhun:
+
+ encoding = "latin1"
+
+ def __init__(self):
+ return
+
+ def getTyp(self,typ, oword, word):
+ dif = len(oword) - len(word)
+ if (typ[0] == 'V' or typ[0:2] == 'SI') and word != oword:
+ ik = ''
+ telo = 'SI'
+ if typ[0] == 'V':
+ telo = 'V'
+ if oword[0:2] != word[0:2]:
+ ik = 'IK'
+ if oword[-3:] in (u'iük','iuk', 'nak', 'nek','tak', 'tek') or oword[-2:] in (u'ák', u'ék'):
+ typ = ik + telo + '6'
+ elif oword[-3:] in ('tok','tek', u'tök'):
+ typ = ik + telo + '5'
+ elif oword[-3:] in (u'ünk','unk', u'ánk', u'énk') or oword[-2:] in ('uk', u'ük'):
+ typ = ik + telo + '4'
+ elif oword[-2:] in ('sz','od', 'ed', u'öd',u'éd','ad',u'ád'):
+ typ = ik + telo + '2'
+ elif oword[-2:] in ('ok','ek',u'ök','om','em',u'öm', u'ám', u'ém', 'am'):
+ typ = ik + telo + '1'
+ elif oword[-2:] in ('va', 've') or oword[-3:] in (u'ván', u'vén'):
+ typ = 'ADV'
+ elif oword[-2:] == 'ni':
+ typ = 'INF'
+ else:
+ typ = ik + telo + '3'
+ elif typ[0:3] == 'PP4':
+ if oword != 'mi':
+ typ = 'ADV'
+ elif typ[0:3] == 'ADJ':
+ if oword[-2:] in ('ek','ok', 'ak', u'ék', u'ák') and dif > 0 and (dif < 3 or ((word[0:1] != oword[0:1]) and dif < 9)):
+ typ = 'ADJP'
+ elif oword[-1:] in (u'é',u'á') and dif > 0 and (dif < 5 or ((word[0:1] != oword[0:1]) and dif < 12)):
+ typ = 'ADV'
+ elif oword[-2:] in ('an', 'en', 'bb','ul',u'ül') and dif == 2:
+ typ = 'ADV'
+ elif dif != 0:
+ typ = 'ADV'
+ elif typ[0] == 'N':
+ if oword[-1] == 'k' and oword[-2] in ('a',u'á', 'e',u'é','i',u'í','o',u'ó',u'ö',u'õ','u',u'ú',u'ü',u'û') and dif > 0 and dif < 3 :
+ typ = 'NP'
+ elif oword[-1:] == 'i' and dif == 1:
+ typ = 'DNA'
+ elif (oword[-1:] in(u'ú', u'û') and dif == 1) or (oword[-2:] in (u'jú', u'jû') and dif == 2):
+ typ = 'ADJS'
+ elif typ == 'N':
+ if oword[-1] == 'k' and oword == word:
+ typ = 'NP'
+ else:
+ typ = 'NS'
+ elif dif >= 2:
+ typ = 'N'
+ if typ[0] == 'N' and oword == word and word[-1] != 'k':
+ typ = typ+'N'
+ return typ
+
+
+
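+# Illustrative example (added for clarity, not part of the original module);
+# getTyp refines a base tag using the inflected surface form, e.g.
+# (hypothetical call):
+#   Wfhun().getTyp('N', u'házak', u'ház')   # plural noun -> 'NP'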
diff --git a/languagetool/src/Wfinder.py b/languagetool/src/Wfinder.py
new file mode 100644
index 0000000..7ba1935
--- /dev/null
+++ b/languagetool/src/Wfinder.py
@@ -0,0 +1,568 @@
+# -*- coding: iso-8859-1 -*-
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2004 ....
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+# usage python stem.py
+#
+# the file test.txt contains the words to analyse, for example:
+# carried
+# worked
+# play
+#
+# example aff file (dtest.aff)
+# SFX D Y 4
+# SFX D 0 d e # abate->abated
+# SFX D y ied [^aeiou]y # carry -> carried
+# SFX D 0 ed [^ey] # work -> worked
+# SFX D 0 ed [aeiou]y # play -> played
+#
+# example dic file (dtest.dic)
+# 3
+# carry/D
+# work/D
+# play/D
+#
+# reads words from the file test.txt
+#
+# Sped up 9 times by holding different
+# append endings in different arrays. 3 July 2004
+#
+# Speed improvement of 30% by doing the above
+# also for the prefixes, and by holding
+# suffixes and prefixes in different lists. 4 July 2004
+#
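+# Note (added for clarity): with the example affix file above, a rule like
+# "SFX D y ied [^aeiou]y" is applied in reverse when analysing a word:
+# "carried" ends in the append string "ied", so "ied" is stripped and the
+# strip string "y" is put back, giving "carry"; "carry" matches the
+# condition [^aeiou]y and is listed in the dic file with flag D, so
+# "carried" is recognised and reduced to the stem "carry".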
+
+import array
+import codecs
+import os
+import Tagger
+import Wfdeu
+import Wfhun
+from string import *
+import time
+import sys
+
+
+#aff_file = "dtest.aff"
+#dic_file = "dtest.dic"
+#test_file = "test.txt"
+yesno = {}
+comment = "#"
+condlist = []
+condlist1 = []
+alfab_conddic = {}
+palfab_conddic = {}
+alfab_condlist_group = []
+alfab2_condlist_group = []
+alfab2_conddic = {}
+palfab2_conddic = {}
+alfab2_condlist_group = []
+szodic = {}
+typdic = {}
+
+class Wfinder:
+
+ encoding = "latin1"
+ doubleflags = ""
+ doubleflagList=""
+
+ def __init__(self, textlanguage):
+# print time.strftime('%X %x %Z')
+ self.is_initialized = 0
+ self.is_secondflag = 0
+ self.textlanguage = textlanguage
+ self.wfdeu = Wfdeu.Wfdeu()
+ self.wfhun = Wfhun.Wfhun()
+ return
+
+ def aff_read(self):
+ self.aff_file = os.path.join(sys.path[0], "data", Tagger.affFile)
+ condlist = []
+ alfab_condlist_group = []
+ alfab2_condlist_group = []
+ faff = codecs.open(self.aff_file, "r", self.encoding)
+ l = " "
+ for i in range(0,256,1):
+ alfab_conddic[i] = []
+ palfab_conddic[i] = []
+ alfab2_conddic[i] = []
+ palfab2_conddic[i] = []
+ while l != "":
+ l = faff.readline()
+ ll = l.split()
+ if len(ll) <= 1:
+ continue
+ if ll[0][0] in comment:
+ continue
+ if ll[0][1:3] == "FX":
+ arrname = ll[1]
+ prefix = 0
+ if ll[0][0] == 'P':
+ prefix = 1
+ yesno[arrname] = ll[2]
+ for i in range(0, int(ll[3])):
+ l = faff.readline()
+ bb = l.split()
+# print "%s %d" %(bb,len(bb))
+# print "l:%s bb[2]:%s arrname:%s" %(l,bb[2], arrname)
+ strip = bb[2]
+ if bb[2] == '0':
+ strip = '';
+ appnd = bb[3]
+ if bb[3] == '0':
+ appnd = ''
+ appnd_last = '0'
+ else:
+ if prefix == 0:
+ appnd_last = appnd[-1]
+ else:
+ appnd_last = appnd[0]
+ if bb[4] != '.':
+ jj = 0
+ while(jj < len(bb[4])):
+ condarr = array.array('B',range(256))
+ insbit = 1;
+ for iii in range(0,256,1):
+ condarr[iii] = 0
+ if bb[4][jj] == '[':
+ kk = 0;
+ jj = jj + 1
+ if bb[4][jj] == '^':
+ jj = jj+1
+ insbit = 0;
+ for iii in range(0,256,1):
+ condarr[iii] = 1
+ while bb[4][jj] != ']':
+ condarr[ord(bb[4][jj])] = insbit;
+ jj = jj + 1
+ if bb[4][jj] == ']':
+ jj = jj +1
+ else:
+ condarr[ord(bb[4][jj])] = insbit;
+ jj = jj +1
+ condlist.append(condarr)
+ secondflag = ""
+ if len(bb) >= 7:
+ secondflag = bb[6]
+ self.is_secondflag = 1
+ if find(self.doubleflags,arrname) == -1:
+ self.doubleflags = self.doubleflags+arrname
+ for elem in secondflag:
+ if find(self.doubleflagList,elem) == -1:
+ self.doubleflagList = self.doubleflagList+elem
+# print "is_sec:%d" % self.is_secondflag
+ alfab2_condlist_group.append(condlist)
+ alfab2_condlist_group.append(strip)
+ alfab2_condlist_group.append(appnd)
+ alfab2_condlist_group.append(arrname)
+ alfab2_condlist_group.append(secondflag)
+ if prefix == 0:
+ alfab2_conddic[ord(appnd_last)].append(alfab2_condlist_group)
+ else:
+ palfab2_conddic[ord(appnd_last)].append(alfab2_condlist_group)
+ alfab_condlist_group.append(condlist)
+ alfab_condlist_group.append(strip)
+ alfab_condlist_group.append(appnd)
+ alfab_condlist_group.append(arrname)
+ if prefix == 0:
+ alfab_conddic[ord(appnd_last)].append(alfab_condlist_group)
+ else:
+ palfab_conddic[ord(appnd_last)].append(alfab_condlist_group)
+# print "appended %s to %s %d" %(appnd.encode('latin1'), appnd_last.encode('latin1'), ord(appnd_last))
+ condlist = []
+ alfab_condlist_group = []
+ alfab2_condlist_group = []
+ faff.close()
+# print self.doubleflags
+# for i in range (0,255,1):
+# print len(alfab_conddic[i])
+# print alfab_conddic[ord('a')]
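+# Note on the structures built above (comment added for clarity): for every
+# affix rule a group [condition arrays, strip, append, flag] is stored in
+# alfab_conddic (suffixes) or palfab_conddic (prefixes), keyed by the last
+# resp. first character of the append string (ord('0') if it is empty).
+# alfab2_conddic/palfab2_conddic hold the same groups plus the optional
+# second flag field (7th column of a rule line) that allows a further affix
+# to be stripped in a second pass, and yesno[flag] records whether the rule
+# may be combined with an affix of the other kind.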
+
+#
+# Now read the dictionary
+#
+ def dic_read(self):
+ self.dic_file = os.path.join(sys.path[0], "data", Tagger.dicFile)
+ szoszam = 0;
+ fdic = codecs.open(self.dic_file, "r", self.encoding)
+ l = " "
+ szolista = []
+ ujlista = []
+ l = fdic.readline()
+ szoszam = int(l)
+ while l != "":
+ l = fdic.readline()
+ szolista = l.split("/")
+ for szo in szolista:
+ szo = szo.strip('\n \t')
+ ujlista.append(szo)
+ if len(ujlista) > 1:
+ szodic[ujlista[0]] = ujlista[1]
+ else:
+ szodic[ujlista[0]] = ""
+ if len(ujlista) > 2:
+ typdic[ujlista[0]] = ujlista[2]
+ else:
+ typdic[ujlista[0]] = ""
+ ujlista = []
+ fdic.close()
+
+ def do_keytest(self,l):
+ if l == "":
+ return ""
+ if szodic.has_key(l):
+ return "+ %s" %l
+ else:
+ return "- %s" %l
+
+ def suffix2_search(self, l, oarrname, oword):
+ retval = ""
+ found = 0
+ for windex in ord(l[-1]), ord('0'):
+ for elem in alfab2_conddic[windex]:
+ # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname
+# print "s2_s l:%s oarr:%s elem[4]:%s app:%s strip:%s" % (l, oarrname, elem[4],elem[2],elem[1] )
+ if found:
+ return retval
+ if find(elem[4], oarrname) == -1:
+ continue
+ #
+ # first search for suffixes only,
+ # since the prefix is optional
+ #
+ appnd = elem[2]
+ if len(appnd):
+ if l[-len(appnd):] != appnd:
+ continue
+# if len(appnd):
+ restoredWord = l[0:len(l)-len(appnd)]
+ else:
+ restoredWord = l
+ condlist = elem[0]
+ strip = elem[1]
+ if len(strip):
+ restoredWord = restoredWord + strip
+ break_it = 0
+ if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk
+ substr = restoredWord[-len(condlist):]
+ for i in range(0, len(condlist), 1): #tktk
+ if condlist[i][ord(substr[i])] != 1:
+ break_it = 1
+ break
+ if break_it:
+ continue
+
+ if szodic.has_key(restoredWord):
+ flags = szodic[restoredWord]
+# print "s22_s: %s %d %s %s %s %s %s" % (restoredWord,szodic.has_key(restoredWord),elem[3], oarrname, elem[4], oarrname, flags)
+ if flags == "": # tktk
+ continue
+ else:
+ if find(flags, elem[3]) == -1:
+ continue
+ retval = "++ %s %s" %(oword,restoredWord)
+ found = 1
+ return retval
+ return retval
+
+
+ def suffix_search(self, l, oldl, oarrname):
+ retval = ""
+ found = 0
+ for windex in ord(l[-1]), ord('0'):
+ for elem in alfab_conddic[windex]:
+ # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname
+ if found:
+ return retval
+ #
+ # first search for suffixes only,
+ # since the prefix is optional
+ #
+ appnd = elem[2]
+ if len(appnd):
+ if l[-len(appnd):] != appnd:
+ continue
+ restoredWord = l[0:len(l)-len(appnd)]
+ else:
+ restoredWord = l
+ condlist = elem[0]
+ strip = elem[1]
+ if len(strip):
+ restoredWord = restoredWord + strip
+ break_it = 0
+# print "%s %s %s %s" %(restoredWord,appnd,strip, elem[3])
+ if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk
+ substr = restoredWord[-len(condlist):]
+ for i in range(0, len(condlist), 1): #tktk
+ if condlist[i][ord(substr[i])] != 1:
+ break_it = 1
+ break
+ if break_it:
+ continue
+ if szodic.has_key(restoredWord):
+ flags = szodic[restoredWord]
+ if flags == "": # tktk
+ continue
+ else:
+ if find(flags, elem[3]) == -1:
+ continue
+ if oarrname != "" and find(flags, oarrname) == -1:
+ continue
+ if oldl != "":
+ retval = "+++ %s %s %s" %(oldl, l,restoredWord)
+ else:
+ retval = "++ %s %s" %(l,restoredWord)
+ found = 1
+ return retval
+ # print windex
+ return retval
+
+ def suffix22_search(self, l, oldl, oarrname):
+ retval = ""
+ found = 0
+ for windex in ord(l[-1]), ord('0'):
+ for elem in alfab_conddic[windex]:
+ # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname
+# print "s.d:%s e3:%s app:%s str:%s" % (self.doubleflags, elem[3], elem[2],elem[1])
+ if find(self.doubleflagList, elem[3]) == -1:
+ continue
+ if found:
+ return retval
+ #
+ # first search for suffixes only,
+ # since the prefix is optional
+ #
+# print "s22x l:%s oldl:%s oarrname:%s appnd:%s strip:%s" % (l, oldl, oarrname, elem[2], elem[1])
+ appnd = elem[2]
+ if len(appnd):
+ if l[-len(appnd):] != appnd:
+ continue
+ restoredWord = l[0:len(l)-len(appnd)]
+ else:
+ restoredWord = l
+ condlist = elem[0]
+ strip = elem[1]
+ if len(strip):
+ restoredWord = restoredWord + strip
+ break_it = 0
+# print "s22: %s %s %s %s" %(restoredWord,appnd,strip, elem[3])
+ if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk
+ substr = restoredWord[-len(condlist):]
+ for i in range(0, len(condlist), 1): #tktk
+ if condlist[i][ord(substr[i])] != 1:
+ break_it = 1
+ break
+ if break_it:
+ continue
+# print "s->s2, rw:%s e3:%s" % (restoredWord, elem[3])
+ rval = self.suffix2_search(restoredWord, elem[3], l)
+ if rval != "":
+ found = 1
+ retval = rval
+ return rval
+ # print windex
+ return retval
+
+ def prefix_search(self, l):
+ found = 0
+ retval = ""
+ for windex in ord(l[0]), ord('0'):
+ for elem in palfab_conddic[windex]:
+ if found:
+ return retval
+ appnd = elem[2]
+ if appnd == l[:len(appnd)]: # cut the matching prefix
+ l1 = l[len(appnd):]
+ else:
+ continue
+ condlist = elem[0]
+ strip = elem[1]
+ if len(strip):
+ l1 = strip + l1
+ break_it = 0
+ if len(condlist) > 0 and len(l1) >= len(condlist): #tktk
+ substr = l1[0:len(condlist)]
+ for i in range(0, len(condlist), 1): #tktk
+ if condlist[i][ord(substr[i])] != 1:
+ break_it = 1
+ break
+ if break_it:
+ continue
+ #
+ # prefix without suffix
+ #
+ arrname = elem[3]
+ if szodic.has_key(l1):
+ flags1 = szodic[l1]
+ if flags1 != "":
+ if find(flags1, arrname) == -1:
+ continue
+ retval = "++ %s %s" %(l,l1)
+ found = 1
+ return retval
+
+ if lower(yesno[arrname]) == 'n':
+ continue
+#
+# check if this unprefixed word
+# is a valid suffixed one
+#
+ retval = self.suffix_search(l1, l, arrname)
+ if retval != "":
+ found = 1
+ return retval
+ return retval
+
+ def prefix22_search(self, l):
+ found = 0
+ retval = ""
+ for windex in ord(l[0]), ord('0'):
+ for elem in palfab_conddic[windex]:
+ if found:
+ return retval
+# print "str:%s app:%s e3:%s dfl:%s df:%s" % (elem[1],elem[2], elem[3],self.doubleflagList,self.doubleflags)
+ if find(self.doubleflagList, elem[3]) == -1 and find(self.doubleflags, elem[3]) == -1:
+ continue
+ appnd = elem[2]
+ if appnd == l[:len(appnd)]: # cut the matching prefix
+ l1 = l[len(appnd):]
+ else:
+ continue
+ condlist = elem[0]
+ strip = elem[1]
+ if len(strip):
+ l1 = strip + l1
+ break_it = 0
+ if len(condlist) > 0 and len(l1) >= len(condlist): #tktk
+ substr = l1[0:len(condlist)]
+ for i in range(0, len(condlist), 1): #tktk
+ if condlist[i][ord(substr[i])] != 1:
+ break_it = 1
+ break
+ if break_it:
+ continue
+ #
+ # prefix without suffix
+ #
+ arrname = elem[3]
+# print "p22->s2 l1:%s e3:%s l:%s" %(l1,elem[3],l)
+ rval = self.suffix2_search(l1, elem[3],l)
+ if rval != "":
+ found = 1
+ retval = rval
+ return rval
+
+ if lower(yesno[arrname]) == 'n':
+ continue
+#
+# check if this unprefixed word
+# is a valid suffixed one
+#
+# print "ps l1:%s l:%s arrn:%s" % (l1, l, arrname)
+ retval = self.suffix22_search(l1, "", "")
+ if retval != "":
+ found = 1
+ return retval
+ return retval
+
+
+ def do_test(self,l):
+ if l == "":
+ return ""
+ else:
+ oldword = l
+ found = 0
+# print "ss l:%s" %l
+ retval = self.suffix_search(l, "", "")
+ if retval != "":
+ found = 1
+ return retval
+#
+# all suffixes have been searched and no match was found;
+# now try to combine all prefixes with all suffixes
+# that allow such combinations
+#
+# print "sp l:%s" %l
+ retval = self.prefix_search(l)
+ if retval != "":
+ found = 1
+ return retval
+
+ if self.is_secondflag:
+# print "s22 l:%s" %l
+ retval = self.suffix22_search(l, "", "")
+ if retval != "":
+ found = 1
+ return retval
+# print "p22 l:%s" %l
+ retval = self.prefix22_search(l)
+ if retval != "":
+ found = 1
+ return retval
+
+ return "- %s" % oldword
+
+ def test_it(self,l):
+ if self.is_initialized == 0:
+ self.aff_read()
+ self.dic_read()
+ self.is_initialized = 1
+ lcasetest = 0
+ result = self.do_keytest(l)
+ if result[0] == '-':
+ lu = l[0]
+ if lu != lu.lower():
+ l1 = lu[0].lower()+l[1:]
+ if l1 != l:
+ lcasetest = 1;
+ result = self.do_keytest(l1)
+ #
+ # in languages other than German it is more likely to find
+ # a lower-case word than an upper-case one
+ #
+ if result[0] == '-' and self.textlanguage != 'de':
+ tmp = l1
+ l1 = l
+ l = tmp
+ if result[0] == '-':
+ result = self.do_test(l)
+ if result[0] == '-' and lcasetest == 1:
+ result = self.do_test(l1)
+ typ = ''
+ if result[0] != '-':
+ src = result.split()
+ word = src[len(src) - 1]
+ oword = src[1]
+ typ = typdic[word]
+# print typ + " " + oword[-1:] + " " +oword[-2:]
+#
+# Here are the language-specific rules for each supported language
+#
+ if self.textlanguage == 'de':
+ typ = self.wfdeu.getTyp(typ, oword, word)
+ elif self.textlanguage == 'hu':
+# print word+" "+oword+" "+typ
+ typ = self.wfhun.getTyp(typ, oword, word)
+#
+# end of language specific rules for new languages
+#
+# print typ
+ result = result + " " + typ
+# print result
+ return result
+
+
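+# Illustrative note (added for clarity): test_it returns a plain string whose
+# prefix encodes the result: "- word" if nothing was found, "+ word" for a
+# direct dictionary hit, "++ word stem" when one affix rule was reversed and
+# "+++ word intermediate stem" when both a prefix and a suffix were removed;
+# for recognised words the language-specific type code is appended, e.g.
+# (hypothetically, for German) "++ kleiner klein ADJER".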
diff --git a/languagetool/src/client.py b/languagetool/src/client.py
new file mode 100644
index 0000000..c3826ba
--- /dev/null
+++ b/languagetool/src/client.py
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+# daniel.naber@t-online.de, 2003-05-02
+# This is just a test to show how a TextChecker server can be called
+
+import socket
+
+sentence = "A sentence bigger then a short one."
+
+server_name = "127.0.0.1"
+server_port = 50100
+
+print "Test client for socket_server.py"
+print "Connecting %s, port %d..." % (server_name, server_port)
+s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+s.connect((server_name, server_port))
+print "Connected."
+cfg = '<config textlanguage="en" mothertongue="de" grammar="COMP_THAN" />\n'
+s.sendall("%s<text>%s</text>" % (cfg, sentence))
+print "Data sent, waiting for reply..."
+data = ""
+while 1:
+ received = s.recv(1024)
+ data = "%s%s" % (data, received)
+ if not received:
+ break
+s.close()
+print "Received reply:"
+print data
diff --git a/languagetool/src/query.py b/languagetool/src/query.py
new file mode 100644
index 0000000..b34a1ff
--- /dev/null
+++ b/languagetool/src/query.py
@@ -0,0 +1,249 @@
+#!/usr/bin/python
+# Query BNC data files in XML format
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+# for debugging only
+import cgitb
+cgitb.enable()
+
+#import profile
+
+import cPickle
+import cgi
+import os
+import re
+import sys
+import time
+
+os.chdir(sys.path[0])
+sys.path.append(sys.path[0])
+import TagInfo
+
+data_dir = "/data/bnc/xml_data"
+context = 4
+limit = 30
+tags_str = "AJ0,AJC,AJS,AT0,AV0,AVP,AVQ,CJC,CJS,CJT,\
+CRD,DPS,DT0,DTQ,EX0,ITJ,NN0,NN1,NN2,NP0,ORD,PNI,PNP,\
+PNQ,PNX,POS,PRF,PRP,PUL,PUN,PUQ,PUR,TO0,UNC,VBB,VBD,\
+VBG,VBI,VBN,VBZ,VDB,VDD,VDG,VDI,VDN,VDZ,VHB,VHD,VHG,\
+VHI,VHN,VHZ,VM0,VVB,VVD,VVG,VVI,VVN,VVZ,XX0,ZZ0"
+
+tags = re.split(",", tags_str)
+sentence_count = 0
+word_count = 0
+matches = 0
+regex = re.compile("(<S.*?</S>)", re.DOTALL)
+words_regex = re.compile("(<[WC].*?</[WC]>)", re.DOTALL)
+type_regex = re.compile("TYPE=\"(.*?)\"")
+word_regex = re.compile(">(.*?)</[WC]>")
+
+def query(search_tokens, filename):
+ global sentence_count
+ global word_count
+ global limit
+ global matches
+ global tags
+ t1 = time.time()
+ tokens = buildList(filename)
+ #print "T=%.2f<br>" % (time.time()-t1)
+ t1 = time.time()
+ #print tokens
+ match_pos = 0
+ pos = 0
+ for word,tag in tokens:
+ if tag == 'S_BEGIN':
+ sentence_count = sentence_count + 1
+ word_count = word_count + 1
+ if tags.count(search_tokens[match_pos]) > 0:
+ compare = tag
+ else:
+ compare = word
+ if compare == search_tokens[match_pos] or search_tokens[match_pos] == '_':
+ match_pos = match_pos + 1
+ else:
+ match_pos = 0
+ #print match_pos
+ if match_pos == len(search_tokens):
+ if matches+1 > limit:
+ return None
+ print "%d." % (matches+1)
+ print niceFormat(tokens[pos-context:pos+context], \
+ context-len(search_tokens)+1, len(search_tokens))
+ sys.stdout.flush()
+ matches = matches + 1
+ match_pos = 0
+ pos = pos + 1
+ #print "T2=%.2f<br>" % (time.time()-t1)
+ return 1
+
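+# Illustrative note (added for clarity): each search token that is a known
+# BNC tag is matched against the tag column, anything else against the word
+# itself, and '_' matches any token. A hypothetical query such as
+#   query(["AJ0", "than"], "file.xml")
+# would therefore print sentences in which an adjective is directly followed
+# by the word "than".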
+def niceFormat(tokens, rel_pos, match_len):
+ l = []
+ count = 0
+ for word,tag in tokens:
+ if count >= rel_pos and count < rel_pos+match_len:
+ l.append('<b>%s<span class="tag">/%s</span></b>' % (word,tag))
+ elif tag == 'PUN':
+ l.append(word)
+ else:
+ l.append('%s<span class="tag">/%s</span>' % (word,tag))
+ count = count + 1
+ return str.join(' ', l) + "<br>"
+
+def buildList(filename):
+ # Speed up:
+ pickle_filename = "%s.pickle" % filename
+ if os.path.exists(pickle_filename):
+ #print "Loading pickled data from %s<br>" % pickle_filename
+ t1 = time.time()
+ tokens = cPickle.load(open(pickle_filename))
+ #print "Tpickle=%.2f<br>" % (time.time()-t1)
+ return tokens
+
+ f = open(filename)
+ content = f.read()
+ f.close()
+ global regex
+ global words_regex
+ global type_regex
+ global word_regex
+
+ sentences = regex.findall(content)
+ tokens = []
+ for s in sentences:
+ #print "X"
+ words = words_regex.findall(s)
+ tokens.append(('', 'S_BEGIN'))
+ for w in words:
+ w = w.replace("\n", " ")
+ #print w
+ type_match = type_regex.search(w)
+ if not type_match:
+ print "*** no type_match!?"
+ continue
+ type_str = type_match.group(1)
+ word_match = word_regex.search(w)
+ word = word_match.group(1).strip()
+ #print "%s/%s" % (word, type_str)
+ tokens.append((word, type_str))
+ tokens.append(('', 'S_END'))
+ # Prepare speed up for next search:
+ cPickle.dump(tokens, open(pickle_filename, 'w'), 1)
+ return tokens
+
+def queryFiles(tokens, dir_name):
+ os.chdir(dir_name)
+ dir_contents = os.listdir(".")
+ dir_contents.sort()
+ c = 0
+ for filename in dir_contents:
+ if filename.endswith(".xml"):
+ c = c + 1
+ print "Found %d *.xml files in %s<br>" % (c, dir_name)
+ w = 0
+ s = 0
+ m = 0
+ f_count = 1
+ for name in dir_contents:
+ if os.path.isdir(name):
+ queryFiles(tokens, name)
+ elif name.endswith(".xml"):
+ print "<strong>%.3d. %s</strong>, so far %d words, %d sentences<br>" % (f_count, name, word_count, sentence_count)
+ res = query(tokens, name)
+ if not res:
+ return
+ #global_file_count = global_file_count + 1
+ #print "<hr />"
+ sys.stdout.flush()
+ f_count = f_count + 1
+ # for profiling
+ #if word_count > 200000:
+ # return
+ os.chdir("..")
+ return
+
+def displayForm():
+ taginfo = TagInfo.TagInfo()
+ print "Content-Type: text/html\n\n"
+ print """
+ <html><head>
+ <title>BNC Query</title></head>
+ <body>
+ <h1>BNC Query</h1>
+
+ <form action="query.py" method="get">
+ <table border="0" cellspacing="0" cellpadding="0">
+ <tr>
+ <td>Word/tag sequence:</td>
+ <td>Context:</td>
+ <td>Max. results:</td>
+ </tr>
+ <tr>
+ <td><input type="text" name="tokens"></td>
+ <td><select name="context">
+ <option value="4">4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</option>
+ <option>6</option>
+ <option>8</option>
+ <option>10</option>
+ </select></td>
+ <td><input type="text" name="limit" value="30" size="6" /></input>
+ <td>&nbsp;</td>
+ <td><input type="submit" value="Query" /></td>
+ </tr>
+ </table>
+ </form>
+ <br />
+ _ (underline) matches any word
+ %s
+ </body>
+ </html>""" % taginfo.getHTMLCode()
+ return
+
+def main():
+ global limit
+ global context
+ form = cgi.FieldStorage()
+ if not form.getvalue("tokens"):
+ displayForm()
+ return
+ if form.getvalue("context"):
+ context = int(form.getvalue("context"))
+ if form.getvalue("limit"):
+ limit = int(form.getvalue("limit"))
+ print "Content-Type: text/html\n\n"
+ token_display = cgi.escape(form.getvalue("tokens"), 1)
+ print """<html><head>
+ <title>BNC query result for '%s'</title>
+ <style rel="stylesheet">
+ <!--
+ .tag { color:#999999; }
+ -->
+ </style></head>
+ <body>
+ <h1>BNC query result for '%s'</h1>""" % (token_display, token_display)
+ tokens = re.split("\s+", form.getvalue("tokens"))
+ queryFiles(tokens, data_dir)
+ print '<p>Queried %d words in %d sentences.' % (word_count, \
+ sentence_count)
+ print '</body></html>'
+ #print '<pre>' # profiling
+ return
+
+main()
+#profile.run('main()')
diff --git a/languagetool/src/socket_server.py b/languagetool/src/socket_server.py
new file mode 100644
index 0000000..81cac5b
--- /dev/null
+++ b/languagetool/src/socket_server.py
@@ -0,0 +1,218 @@
+#!/usr/bin/python
+# A server that uses TextChecker.py to check text for style
+# and grammar errors
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import TextChecker
+
+import ConfigParser
+import os
+import re
+import socket
+import sys
+import time
+
+sys.path.append(os.path.join(sys.path[0], "snakespell-1.01"))
+from scriptfoundry.snakespell import iSpell
+
+server_name = "127.0.0.1"
+server_port = 50100
+configfile = os.path.join(os.getenv('HOME'), ".kde/share/config/languagetool")
+
+def makeChecker(grammar_cfg=None, falsefriends_cfg=None, words_cfg=None, \
+ builtin_cfg=None, textlanguage=None, mothertongue=None, \
+ max_sentence_length=None):
+ """Create a new TextChecker object and return it."""
+ checker = TextChecker.TextChecker(grammar_cfg, falsefriends_cfg, words_cfg, \
+ builtin_cfg, textlanguage, mothertongue, max_sentence_length)
+ return checker
+
+def loadOptionList(config, enable_name, option_name):
+ val = None
+ if config.has_option("General", enable_name) and \
+ config.getboolean("General", enable_name):
+ if config.has_option("General", option_name):
+ val = re.split(',', config.get("General", option_name))
+ else:
+ val = ["NONE"]
+ return val
+
+def loadOptionBoolean(config, option_name):
+ if config.has_option("General", option_name) and config.getboolean("General", option_name):
+ return 1
+ return None
+
+def loadOptionString(config, option_name, default):
+ val = default
+ if config.has_option("General", option_name):
+ val = config.get("General", option_name)
+ return val
+
+def readConfig():
+ """Read the checker config from a KDE config file (INI style).
+ Return a checker which uses that config."""
+ config = ConfigParser.ConfigParser()
+ try:
+ config.readfp(open(configfile))
+ except IOError:
+ print "Couldn't load config file '%s', using defaults..." % configfile
+ grammar = loadOptionList(config, "EnableGrammar", "GrammarRules")
+ falsefriends = loadOptionList(config, "EnableFalseFriends", "FalseFriendsRules")
+ words = loadOptionList(config, "EnableWords", "WordsRules")
+ builtin = []
+ if loadOptionBoolean(config, "EnableWhitespaceCheck"):
+ builtin.append("WHITESPACE")
+ if len(builtin) == 0:
+ builtin = None
+ textlanguage = loadOptionString(config, "TextLanguage", "en")
+ mothertongue = loadOptionString(config, "MotherTongue", "en")
+ sentence_length = 0
+ if loadOptionBoolean(config, "EnableSentenceLength"):
+ if config.has_option("General", "MaxSentenceLength"):
+ sentence_length = config.getint("General", "MaxSentenceLength")
+ checker = makeChecker(grammar, falsefriends, words, builtin, \
+ textlanguage, mothertongue, sentence_length)
+ return checker
+
+def getConfig(data):
+ """Get a new config in pseudo XML format from the client.
+ It needs to be at the beginning of the string that comes
+ from the client and must be of form <config ... />.
+ Returns a tuple with a checker based on this config and
+ the 'data' string with the config section removed."""
+ print "Receiving new config..."
+ line_end_pos = data.find("/>")
+ cfg_str = data[:line_end_pos]
+ data = data[line_end_pos+3:]
+ grammar = getConfigValue(cfg_str, "grammar")
+ falsefriends = getConfigValue(cfg_str, "falsefriends")
+ words = getConfigValue(cfg_str, "words")
+ builtin = getConfigValue(cfg_str, "builtin")
+ textlanguage = getConfigValue(cfg_str, "textlanguage")
+ if textlanguage:
+ textlanguage = textlanguage[0]
+ mothertongue = getConfigValue(cfg_str, "mothertongue")
+ if mothertongue:
+ mothertongue = mothertongue[0]
+ sentence_length = getConfigValue(cfg_str, "max-sentence-length")
+ if not sentence_length:
+ sentence_length = 0
+ else:
+ sentence_length = int(sentence_length[0])
+ checker = makeChecker(grammar, falsefriends, words, builtin, \
+ textlanguage, mothertongue, sentence_length)
+ return (checker, data)
+
+def getConfigValue(cfg_str, val):
+ m = re.compile('%s="(.*?)"' % val).search(cfg_str)
+ if not m:
+ return None
+ s = m.group(1)
+ l = re.split(',', s)
+ return l
+
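+# Illustrative note (added for clarity): with a config string as sent by
+# client.py, e.g. '<config textlanguage="en" grammar="COMP_THAN,WHITESPACE" />',
+# getConfigValue(cfg_str, "grammar") returns ['COMP_THAN', 'WHITESPACE'], while
+# getConfigValue(cfg_str, "words") returns None because that attribute is absent.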
+def main():
+ print "Binding to '%s:%d'..." % (server_name, server_port)
+ s.bind((server_name, server_port))
+ print "Listening..."
+ s.listen(1)
+ print "Setting up Checker..."
+ checker = readConfig()
+ print "Ready..."
+ while 1:
+ conn, addr = s.accept()
+ if addr[0] != "127.0.0.1": # security
+ print "Connection by '%s' refused" % addr[0]
+ conn.close()
+ continue
+ else:
+ print "Connected by '%s'" % addr[0]
+
+ l = []
+ limit = 1024
+ while 1:
+ data = conn.recv(limit)
+ l.append(data)
+ #FIXME: need to look for separator, not just < limit!
+ if not data or len(data) < limit:
+ break
+ data = str.join('', l)
+
+ print "Received '%s'" % data
+ if data.find("<config") != -1:
+ del checker
+ (checker, data) = getConfig(data)
+ print "New config activated"
+ t1 = time.time()
+ check_result = checkWords(checker, data)
+ t2 = time.time()-t1
+ print "Replying (%.2fs) '%s'" % (t2, check_result.encode('utf8'))
+ #print "Replying (%.2fs)" % t2
+ conn.send(check_result.encode('utf8'))
+
+ conn.close()
+ s.close()
+ return
+
+def checkWordsTEST(words):
+ """Just for testing. Marks 'working' as incorrect."""
+ words = re.split("\s+", words)
+ s = '<result>'
+ for w in words:
+ if w == "working":
+ s = s + '\t<error word="working" pos="5" corrections="Bohlen,Didda"/>'
+ s = s + '</result>'
+ return s
+
+def checkWords(checker, words):
+ result = u'<result>'
+
+ ### Spelling:
+ ispell = iSpell()
+ words = words.replace("\n", " ") # iSpell works line by line
+ r = ispell.check(words)
+ if r > 0:
+ # fixme: escape word
+ for mistake in ispell.getMistakes():
+ # TODO: make faster
+ pos = []
+ for p in mistake.getPositions():
+ result = u'%s<error from="%d" to="%d" word="%s" corrections="%s"/>' % \
+ (result, p, p+len(mistake.getWord()), \
+ unicode(mistake.getWord(), 'latin1'), \
+ unicode(str.join(',', mistake.corrections), ('latin1')))
+
+ ### Grammar + Style:
+ (rule_matches, res, tags) = checker.check(words)
+ # FIXME: only if there's no overlap?!
+ result = result + res
+
+ result = result + '</result>\n'
+ return result
+
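+# Illustrative note (added for clarity): checkWords returns a small XML
+# fragment, roughly of the form (the word and positions here are made up)
+#   <result><error from="2" to="9" word="houpe" corrections="hope,house"/></result>
+# where spelling errors reported by iSpell come first and the grammar/style
+# matches from TextChecker.check() are appended to the same fragment.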
+try:
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ os.chdir(sys.path[0])
+ main()
+except KeyboardInterrupt:
+ # TODO: close explicitly, unfortunately we still get an
+ # 'Address already in use' error if we restart immediately:
+ s.shutdown(2)
+ s.close()
+ print "Stopped."
diff --git a/languagetool/src/tag.py b/languagetool/src/tag.py
new file mode 100644
index 0000000..7ab713b
--- /dev/null
+++ b/languagetool/src/tag.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+# A frontend to a probabilistic part-of-speech tagger (see the QTag paper)
+#
+# LanguageTool -- A Rule-Based Style and Grammar Checker
+# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# Usage examples:
+# 1) ./tag.py -b /data/bnc_sampler/train/*
+# 2) ./tag.py -t /data/bnc_sampler/test/fcf
+
+import re
+import sys
+import string
+import getopt
+import profile
+
+import Tagger
+import Entities
+
+class Controller:
+ "Main program."
+
+ TAG = 0
+ BUILD = 1
+ TAGWORD = 2
+ TAGSEQ = 3
+
+ def __init__(self):
+ return
+
+ def usage(self):
+ print >> sys.stderr, "Usage: ./tagger.py <--build|--tag|--tagword> <filename...>"
+ print >> sys.stderr, " -h, --help this help information"
+ print >> sys.stderr, " -t, --tag tag any text files"
+ print >> sys.stderr, " -b, --build train the tagger using BNC XML files"
+ print >> sys.stderr, " -w, --wordtag tag any word"
+ print >> sys.stderr, " -s, --seqtag probability for any 2-tag-sequence"
+ # TODO: better help (e.g. 'build' adds to existing index (?))
+ return
+
+ def sanityCheck(self, filename, xml):
+ """Sanity check: all <w>...</w> together == original file?"""
+ words = re.compile("<w.*?>(.*?)</w>", re.DOTALL).findall(xml)
+ words_string = string.join(words, "")
+ # Load original file:
+ f = open(filename)
+ orig_contents = f.read()
+ f.close()
+ if orig_contents != words_string:
+ print >> sys.stderr, "*** Warning: joined output doesn't match original file!"
+ print >> sys.stderr, "*** (can be ignored if the file is a BNC file)"
+ return
+
+ def run(self):
+ try:
+ (options, rest) = getopt.getopt(sys.argv[1:], 'htbws',
+ ['help', 'build', 'tag', 'wordtag', 'seqtag'])
+ except getopt.GetoptError, e:
+ print >> sys.stderr, "Error: %s" % e
+ self.usage()
+ sys.exit(1)
+ mode = self.TAG
+ for o, a in options:
+ if o in ("-h", "--help"):
+ self.usage()
+ sys.exit(0)
+ elif o in ("-t", "--tag"):
+ mode = self.TAG
+ elif o in ("-b", "--build"):
+ mode = self.BUILD
+ elif o in ("-w", "--wordtag"):
+ mode = self.TAGWORD
+ elif o in ("-s", "--seqtag"):
+ mode = self.TAGSEQ
+ if not rest:
+ self.usage()
+ sys.exit(1)
+
+ if mode == self.BUILD:
+ tagger = Tagger.Tagger()
+ tagger.bindData()
+ tagger.buildData(rest)
+ tagger.commitData()
+ elif mode == self.TAG:
+ tagger = Tagger.Tagger()
+ tagger.bindData()
+ for filename in rest:
+ f = open(filename)
+ content = f.read()
+ f.close()
+ content = Entities.Entities.cleanEntities(content)
+ xml = tagger.tagTexttoXML(content)
+ self.sanityCheck(filename, xml)
+ print xml
+ print >> sys.stderr, "Done."
+ elif mode == self.TAGWORD:
+ tagger = Tagger.Tagger()
+ tagger.bindData()
+ for word in rest:
+ r = tagger.tagWord(word)
+ print r
+ elif mode == self.TAGSEQ:
+ tagger = Tagger.Tagger()
+ tagger.bindData()
+ if len(rest) > 1 and rest[1] != '*':
+ key = (rest[0], rest[1])
+ prob = tagger.tagSeq(key)
+ print prob
+ else:
+ # TODO: don't duplicate code from query.py:
+ tags_str = "AJ0,AJC,AJS,AT0,AV0,AVP,AVQ,CJC,CJS,CJT,"
+ tags_str = tags_str + "CRD,DPS,DT0,DTQ,EX0,ITJ,NN0,NN1,NN2,NP0,ORD,PNI,PNP,"
+ tags_str = tags_str + "PNQ,PNX,POS,PRF,PRP,PUL,PUN,PUQ,PUR,TO0,UNC,VBB,VBD,"
+ tags_str = tags_str + "VBG,VBI,VBN,VBZ,VDB,VDD,VDG,VDI,VDN,VDZ,VHB,VHD,VHG,"
+ tags_str = tags_str + "VHI,VHN,VHZ,VM0,VVB,VVD,VVG,VVI,VVN,VVZ,XX0,ZZ0,"
+ # these are not in query.py:
+ tags_str = tags_str + "YBL,YBR,YCOL,YCOM,YDSH,YEX,YLIP,YQUE,YQUO,YSCOL,YSTP"
+ tags = re.split(",", tags_str)
+ sum = 0
+ items = 0
+ for tag in tags:
+ key = (rest[0], tag)
+ prob = tagger.tagSeq(key)
+ prob2 = tagger.tagSeq2(key)
+ if prob > 0 or prob2 > 0:
+ sum = sum + prob
+ print "%s followed by %s -> %.10f" % (key[0], key[1], prob)
+ print "%s follows %s -> %.10f" % (key[0], key[1], prob2)
+ items = items + 1
+ print "items=%d, sum=%.5f" % (items, sum)
+ return
+
+### Main program
+
+prg = Controller()
+prg.run()
+#profile.run('prg.run()', 'fooprof')