| author | Arno Teigseth <arno@teigseth.no> | 2011-01-31 05:34:56 +0000 |
|---|---|---|
| committer | Arno Teigseth <arno@teigseth.no> | 2011-01-31 05:34:56 +0000 |
| commit | 1afa96100bcb613c86533698f8a9d1115e63391e | |
| tree | 07c754e874bcbc95eeaa21abc35d4bc84158f4fb | languagetool/src |
| parent | 635a3c7c275c00748c56736b4eb593b651223edd | |
Added very basic pre-beta version of LanguageTool. Builds, though :)
Diffstat (limited to 'languagetool/src')
24 files changed, 4707 insertions, 0 deletions
diff --git a/languagetool/src/.cvsignore b/languagetool/src/.cvsignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/languagetool/src/.cvsignore @@ -0,0 +1 @@ +*.pyc diff --git a/languagetool/src/Chunker.py b/languagetool/src/Chunker.py new file mode 100644 index 0000000..fc0cfd3 --- /dev/null +++ b/languagetool/src/Chunker.py @@ -0,0 +1,127 @@ +# -*- coding: iso-8859-1 -*- +# Assign chunks to a tagged text +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import os +import re +import sys + +class Chunker: + """Assign chunks (like "noun phrase") to a tagged text.""" + + def __init__(self): + return + + def setRules(self, rules): + """Use the rules from this Rules object for the chunk() method.""" + self.rules = rules + return + + def chunk(self, tagged_text): + """Take a POS tagged text and find all its chunks. Returns + a list of (from, to, chunk_name) tuples where the from/to positions + refer to the list position. Only parts of the list may be + covered by chunks. There are no overlappings.""" + l = [] + + tagged_text_pos = 0 + while 1: + if tagged_text_pos >= len(tagged_text): + break + word, norm_word, tag = tagged_text[tagged_text_pos] + + for rule in self.rules.rules: + #print "### %s" % rule.name + match_start = None + match_end = None + pattern_pos = 0 + pos_corr = 0 + + rule_match = 1 + cont = 1 + + while 1: + #print " %d,%d,%d" % (tagged_text_pos,pattern_pos,pos_corr) + try: + tag = tagged_text[tagged_text_pos+pattern_pos+pos_corr][2] + except IndexError: + #print "index error" + break + #print "%s ?= %s (pp=%d, ttp=%d)" % (tag, rule.pattern[pattern_pos], pattern_pos, tagged_text_pos) + if pattern_pos == 0 and tag == None: + cont = 0 + break + if tag == None: + # ignore whitespace + pos_corr = pos_corr + 1 + continue + if tag != rule.pattern[pattern_pos]: + rule_match = 0 + break + if match_start == None: + match_start = tagged_text_pos + + pattern_pos = pattern_pos + 1 + if pattern_pos == len(rule.pattern): + #print "match (%s)! tagged_text_pos=%d" % (rule.name, tagged_text_pos) + match_end = match_start + pattern_pos + pos_corr - 1 + l.append((match_start, match_end, rule.name)) + tagged_text_pos = tagged_text_pos + (match_end - match_start) + cont = 0 + break + if not rule_match: + continue # next rule + if not cont: + break # next word + tagged_text_pos = tagged_text_pos + 1 + + #print l + return l + +class Rules: + """A container for chunking rules.""" + + chunk_rules = os.path.join(sys.path[0], "data", "chunks.txt") + + def __init__(self): + """Read the chunking rules from data/chunks.txt. 
The rules + can then be access via Rules.rules.""" + self.rules = [] + f = open(self.chunk_rules) + lines = f.readlines() + f.close() + for line in lines: + if line.startswith("#"): # ignore comments + continue + rule = Rule(line.strip()) + self.rules.append(rule) + return + +class Rule: + """A chunking rule, consisting of a name and a pattern. The + pattern is a list of POS tags.""" + + def __init__(self, line): + """Parse a chunk rule in this format: + name: tag1 tag2...""" + parts = re.split("\s+", line.strip()) + name = parts[0] + self.name = name[0:len(name)-1] # cut off colon + self.pattern = parts[1:] + return diff --git a/languagetool/src/ChunkerTest.py b/languagetool/src/ChunkerTest.py new file mode 100644 index 0000000..eb8889e --- /dev/null +++ b/languagetool/src/ChunkerTest.py @@ -0,0 +1,78 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import re +import unittest + +import Chunker + +class LocalRules: + + def __init__(self, rule_list): + self.rules = rule_list + return + +class ChunkerTestCase(unittest.TestCase): + + def testChunking(self): + c = Chunker.Chunker() + r1 = Chunker.Rule("NP1: AT0 NN1 NN1") + r2 = Chunker.Rule("NP2: AT0 NN1") + rules = LocalRules([r1, r2]) + c.setRules(rules) + + tagged_text = self._makeList("Blah/XX the/AT0 house/NN1 foo/YY") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, [(2, 4, 'NP2')]) + + tagged_text = self._makeList("Blah/XX house/NN1 foo/YY") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, []) + + tagged_text = self._makeList("the/AT0 summer/NN1 house/NN1 foo/YY2") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, [(0, 4, 'NP1')]) + + # more than one chunk: + + tagged_text = self._makeList("the/AT0 summer/NN1 is/VB a/AT0 hit/NN1") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, [(0, 2, 'NP2'), (6, 8, 'NP2')]) + + tagged_text = self._makeList("the/AT0 summer/NN1 a/AT0 hit/NN1") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, [(0, 2, 'NP2'), (4, 6, 'NP2')]) + + return + + def _makeList(self, s): + parts = re.split("(\s+)", s) + l = [] + for part in parts: + word = None + word_norm = None + tag = None + pair = re.split("/", part) + if len(pair) == 2: + word, tag = pair + word_norm = word + else: + word = pair[0] + l.append((word, word_norm, tag)) + return l diff --git a/languagetool/src/EnglishTest.py b/languagetool/src/EnglishTest.py new file mode 100644 index 0000000..358d26c --- /dev/null +++ b/languagetool/src/EnglishTest.py @@ -0,0 +1,62 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can 
redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker +import LanguageTest +from LanguageTest import ExpMatch + +class EnglishTestCase(LanguageTest.LanguageTest): + + def setUp(self): + self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \ + words=None, builtin=None, textlanguage="en", mothertongue="de", \ + max_sentence_length=20, debug_mode=0) + return + + def testSomeRules(self): + """Some English rule checks. Requires a trained tagger.""" + + self._check("A sentence without problems.", None) + self._check("This is bigger then blah.", ExpMatch("COMP_THAN", 15, 19)) + self._check("English/German false friend: my chef", ExpMatch("CHEF", 32, 36)) + self._check("Whitespace,here it's lacking.", ExpMatch("WHITESPACE", 11, 12)) + + self._check("he good good.", ExpMatch("WORD_REPEAT", 7, 12)) + + self._check("I ask you because of him.", None) + self._check("Of cause not.", ExpMatch("OF_CAUSE", 3, 8)) + self._check("he is nice.", None) + + self._check("This is a stoopid test.", None) + # TODO: error not detected: + self._check("The baseball team are established.", None) + + self._check("I definitely think is should be less than four years.", + ExpMatch("IS_SHOULD", 19, 21)) + + self._check("Peter's car is bigger then mine, and this isa spelling error.", + ExpMatch("COMP_THAN", 22, 26)) + + self._check("Peter's car is bigger then mine, and and a word repeat.", + [ExpMatch("COMP_THAN", 22, 26), ExpMatch("WORD_REPEAT", 34, 38)]) + + return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/Entities.py b/languagetool/src/Entities.py new file mode 100644 index 0000000..615bd8b --- /dev/null +++ b/languagetool/src/Entities.py @@ -0,0 +1,68 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import re + +class Entities: + """Some(!) 
BNC SGML entities.""" + + def cleanEntities(s): + """Replace only the most common BNC entities with their + ASCII respresentation.""" + entities = { "amp" : "&", + "pound": "P", # fixme: use "£" + "eacute": "e", + "aacute": "a", + "bquo": "\"", + "equo": "\"", + "ecirc": "e", + "quot": "'", + #"deg": u"°", + "dollar": "$", + "agrave": "á", + "egrave": "é", + "percnt": "&", + "ndash": "-", + "mdash": "--", + "hellip": "...", + "lsqb": "[", + "rsqb": "]", + "uuml": "ü", #fixme: use ü + "auml": "ä", # see above! + "ouml": "ö", + "Uuml": "Ü", + "Auml": "Ä", + "Ouml": "Ö", + "szlig": "ß" + } +# print "in entities %s"%s + try: + for key in entities: + #s = re.compile("&%s;?" % key).sub("%s" % entities[key].encode('latin1'), s) + s = s.replace("&%s;" % key, entities[key]) + s = s.replace("&%s" % key, entities[key]) + except TypeError: + # FIXME: what to do here?! + print >> sys.stderr, "TypeError: '%s'" % s + return s + + cleanEntities = staticmethod(cleanEntities) + +if __name__ == "__main__": + main() diff --git a/languagetool/src/GermanTest.py b/languagetool/src/GermanTest.py new file mode 100755 index 0000000..5575b5e --- /dev/null +++ b/languagetool/src/GermanTest.py @@ -0,0 +1,41 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker +import LanguageTest +from LanguageTest import ExpMatch + +class GermanTestCase(LanguageTest.LanguageTest): + + def setUp(self): + self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \ + words=None, builtin=None, textlanguage="de", mothertongue="de", \ + max_sentence_length=20, debug_mode=0) + return + + def testSomeRules(self): + """Some English rule checks. Requires a trained tagger.""" + + self._check(u"Ich gehe daß er sieht", ExpMatch("DASS", 4, 12)) + self._check(u"Ich gehe.", None) + self._check(u"Ich gehst.", ExpMatch("ICH", 0, 9)) + return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/HungarianTest.py b/languagetool/src/HungarianTest.py new file mode 100755 index 0000000..cb6b0a5 --- /dev/null +++ b/languagetool/src/HungarianTest.py @@ -0,0 +1,39 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker +import LanguageTest +from LanguageTest import ExpMatch + +class HungarianTestCase(LanguageTest.LanguageTest): + + def setUp(self): + self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \ + words=None, builtin=None, textlanguage="hu", mothertongue="de", \ + max_sentence_length=20, debug_mode=0) + return + + def testSomeRules(self): + """Some English rule checks. Requires a trained tagger.""" + self._check(u"Én mész moziba", ExpMatch("EN", 0, 7)) + self._check(u"Õk soha nem fogják megtanulni.", None) + return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/LanguageTest.py b/languagetool/src/LanguageTest.py new file mode 100644 index 0000000..ee4f2b2 --- /dev/null +++ b/languagetool/src/LanguageTest.py @@ -0,0 +1,68 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker + +import unittest + +class LanguageTest(unittest.TestCase): + + def _check(self, sentence, expectedErrors): + (rule_matches, output, tagged_text) = self.checker.check(sentence) + rule_matches.sort() + if expectedErrors == None: + if len(rule_matches) != 0: + print "Expected no errors, found %d" % len(rule_matches) + print "Sentence: %s" % sentence + self.fail() + elif isinstance(expectedErrors, list): + if len(rule_matches) != len(expectedErrors): + print "Expected %d errors, found %d" % (len(expectedErrors), len(rule_matches)) + print "Sentence: %s" % sentence + self.fail() + i = 0 + for expError in expectedErrors: + self._checkError(sentence, rule_matches[i], expError) + i = i + 1 + else: + if len(rule_matches) != 1: + print "Expected 1 error, found %d" % len(rule_matches) + print "Sentence: %s" % sentence + self.fail() + self._checkError(sentence, rule_matches[0], expectedErrors) + return + + def _checkError(self, sentence, rule_match, expectedError): + self.assertEqual(rule_match.id, expectedError.error_type) + if rule_match.from_pos != expectedError.from_pos or \ + rule_match.to_pos != expectedError.to_pos: + print "Expected error from %d to %d, found error from %d to %d" % \ + (expectedError.from_pos, expectedError.to_pos, rule_match.from_pos, \ + rule_match.to_pos) + print "Sentence: %s" % sentence + self.fail() + return + +class ExpMatch: + + def __init__(self, error_type, from_pos, to_pos): + self.error_type = error_type + self.from_pos = from_pos + self.to_pos = to_pos + return diff --git a/languagetool/src/Rules.py b/languagetool/src/Rules.py new file mode 100644 index 0000000..551e519 --- /dev/null +++ b/languagetool/src/Rules.py @@ -0,0 +1,632 @@ +# -*- coding: iso-8859-1 -*- +# Class for Grammar and Style Rules +#$rcs = ' $Id$ ' ; +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import Tools +import codecs # tktk + +import copy +import os +import re +import string +import sys +import xml.dom.minidom +from string import * + +# FIXME: +grammarFile = 'engrammar.xml' +wordFile = 'enwords.xml' +falsefriendsFile = 'enfalse_friends.xml' + +class Rule: + """Style or grammar rule -- quasi virtual class.""" + + def __init__(self, rule_id, message, false_positives, language): + self.rule_id = rule_id + self.message = message + # errors per 100 sentences in the BNC, i.e. 
mostly false positives: + self.false_positives = false_positives + self.language = language # two letter code like "en" or None (= relevant for alle languages) + return + + # match() is not defined here, but in the sub classes + +class Rules: + """All known style and grammar error rules (from XML and the built-in ones).""" + + python_rules_dir = "python_rules" + + def __init__(self, max_sentence_length, grammar_rules, word_rules, \ + builtin_rules, false_friend_rules, textlanguage, mothertongue): + """Parse all rules and put them in the self.rules list, together + with built-in rules like the SentenceLengthRule.""" + self.textlanguage = textlanguage + if textlanguage == 'en': + self.rule_files = [os.path.join(sys.path[0], "rules", grammarFile), + os.path.join(sys.path[0], "rules", wordFile), + os.path.join(sys.path[0], "rules", falsefriendsFile)] + else: + self.rule_files = [os.path.join(sys.path[0], "rules", grammarFile)] + self.rules = [] + + # dynamically load rule files from the "python_rules" dir: + sys.path.append(self.python_rules_dir) + dyn_files = os.listdir(self.python_rules_dir) + for filename in dyn_files: + if textlanguage == 'en': + if filename[0:2] != 'en' and filename[0:3] != 'all': + continue + elif textlanguage == 'de': + if filename[0:2] != 'de' and filename[0:3] != 'all': + continue + elif textlanguage == 'hu': + if filename[0:2] != 'hu' and filename[0:3] != 'all': + continue + if not filename.endswith(".py") or filename.endswith("Test.py"): + continue + filename = filename[:-3] # cut off ".py" + exec("import %s" % filename) + try: + exec("dynamic_rule = %s.%s()" % (filename, filename)) + except AttributeError: + print filename + raise InvalidFilename(filename) + if not hasattr(dynamic_rule, "match"): + raise MissingMethod("match", "%s.py" % filename) + if dynamic_rule.rule_id == "SENTENCE_LENGTH" and \ + max_sentence_length != None: + dynamic_rule.setMaxLength(max_sentence_length) + # do not use the rule if it wasn't activated + # (builtin_rules == None will use all rules): + if not builtin_rules or dynamic_rule.rule_id in builtin_rules: + self.rules.append(dynamic_rule) + + for filename in self.rule_files: + # minidom expects the DTD in the current directory, not in the + # documents directory, so we have to chdir to 'rules': + dir_temp = os.getcwd() + os.chdir(os.path.dirname(filename)) + doc = xml.dom.minidom.parse(os.path.basename(filename)) + os.chdir(dir_temp) + if filename.endswith(grammarFile): + rule_nodes = doc.getElementsByTagName("rule") + for rule_node in rule_nodes: + rule = PatternRule(rule_node) + lang_ok = 0 + if self.textlanguage == None or self.textlanguage == rule.language: + lang_ok = 1 + if lang_ok and (grammar_rules == None or rule.rule_id in grammar_rules): + self.rules.append(rule) + elif filename.endswith("words.xml"): + rule_nodes = doc.getElementsByTagName("rule") + for rule_node in rule_nodes: + rule = PatternRule(rule_node) + lang_ok = 0 + if self.textlanguage == None or self.textlanguage == rule.language: + lang_ok = 1 + if lang_ok and (word_rules == None or rule.rule_id in word_rules): + self.rules.append(rule) + elif filename.endswith("false_friends.xml"): + pattern_nodes = doc.getElementsByTagName("pattern") + for pattern_node in pattern_nodes: + lang = pattern_node.getAttribute("lang") + if self.textlanguage == None or lang == self.textlanguage: + rule = PatternRule(pattern_node.parentNode, 1, mothertongue, textlanguage) + if rule.valid and (false_friend_rules == None or \ + rule.rule_id in false_friend_rules): + 
self.rules.append(rule) + return + +class InvalidFilename(Exception): + + def __init__(self, value): + self.value = value + return + + def __str__(self): + s = "Constructor must be named as the file, i.e. '%s'" % self.value + return s + +class MissingMethod(Exception): + + def __init__(self, value, filename): + self.value = value + self.filename = filename + return + + def __str__(self): + s = "The '%s' method needs to be implemented in %s" % (self.value, self.filename) + return s + +class WhitespaceRule(Rule): + """A rule that matches punctuation not followed by a whitespace + and whitespace preceding punctuation. This rule does not work + on sentence level, it works on complete tagged texts or paragraphs.""" + + punct = "[.,?!:;]" + punct_regex = re.compile("^%s+$" % punct) + whitespace_regex = re.compile("^\s+$") + after_punct_regex = re.compile("^[\"]+$") + number_regex = re.compile("^\d+$") + whitespace_before_punct = re.compile("^\s+%s" % punct) + + def __init__(self): + Rule.__init__(self, "WHITESPACE", "Insert a space character before punctuation.", 0, None) + return + + def getNextTriple(self, tagged_words, pos): + """Get the next triple form the tagged_words list, starting at + pos but ignoring all SENT_START and SENT_END tags.""" + tag = tagged_words[pos][2] + while tag == 'SENT_START' or tag == 'SENT_END': + pos = pos + 1 + if pos >= len(tagged_words): + return None + tag = tagged_words[pos][2] + return tagged_words[pos] + + def match(self, tagged_words, chunks=None, position_fix=0, line_fix=0, column_fix=0): + """Check if a sentence contains whitespace/token sequences + that are against the 'use a space after, but not before, a token' + rule.""" + matches = [] + text_length = 0 + line_breaks = 1 + column = 0 + i = 0 + while 1: + if i >= len(tagged_words)-1: + break + org_word = tagged_words[i][0] + line_breaks_cur = Tools.Tools.countLinebreaks(org_word) + if line_breaks_cur > 0: + column = 0 + line_breaks = line_breaks + line_breaks_cur + org_word_next = self.getNextTriple(tagged_words, i+1) + if org_word_next: + org_word_next = org_word_next[0] + text_length = text_length + len(org_word) + if tagged_words[i][1] == None: + # ignore whitespace + if line_breaks_cur == 0: + column = column + len(org_word) + i = i + 1 + continue + whitespace_length = len(tagged_words[i+1][0]) + if line_breaks_cur == 0: + column = column + len(org_word) + if self.punct_regex.match(org_word) and not (org_word.endswith("\n") or org_word.endswith("\r")): + word_next = tagged_words[i+1][1] + word_next = self.getNextTriple(tagged_words, i+1) + if word_next: + word_next = word_next[1] + if word_next and self.number_regex.match(word_next): + # don't complain about "24,000" etc. 
+ i = i + 1 + continue + if word_next and (not self.after_punct_regex.match(org_word_next)) and \ + (not self.whitespace_regex.match(org_word_next)): + matches.append(RuleMatch(self.rule_id, text_length, text_length + len(org_word), + line_breaks+line_fix, + column+column_fix, + "Usually a space character is inserted after punctuation.")) + elif self.whitespace_before_punct.match(org_word): + if not self.punct_regex.match(org_word_next): + matches.append(RuleMatch(self.rule_id, text_length, text_length + len(org_word), + line_breaks+line_fix, column+column_fix, + "Usually no space character is inserted before punctuation.")) + i = i + 1 + return matches + +class PatternRule(Rule): + """A rule that can be formalised in the XML configuration file.""" + + def __init__(self, node, is_false_friend_node=None, mothertongue=None, textlang=None): + """Build an object by parsing an XML rule node.""" + if node == None: + # for the test cases. They use setVars(). + return + if is_false_friend_node: + self.parseFalseFriendsRuleNode(node, mothertongue, textlang) + else: + self.parseRuleNode(node) + return + + def parseRuleNode(self, rule_node): + self.rule_id = rule_node.getAttribute("id") + if not self.rule_id: + # FIXME? rule_id is not unique... + self.rule_id = rule_node.parentNode.getAttribute("id") + self.pattern = rule_node.getElementsByTagName("pattern")[0].childNodes[0].data.strip() + token_strings = re.split("\s+", self.pattern) + self.tokens = [] + for token_string in token_strings: + token = Token(token_string) + self.tokens.append(token) + pattern_node = rule_node.getElementsByTagName("pattern")[0] + self.language = pattern_node.getAttribute("lang") + marker_from_att = pattern_node.getAttribute("mark_from") + if marker_from_att: + self.marker_from = int(marker_from_att) + else: + self.marker_from = 0 + marker_to_att = pattern_node.getAttribute("mark_to") + if marker_to_att: + self.marker_to = int(marker_to_att) + else: + self.marker_to = 0 + self.case_sensitive = 0 + if rule_node.getElementsByTagName("pattern")[0].getAttribute("case_sensitive") == 'yes': + #print "*** %s" % rule_node.getElementsByTagName("pattern")[0].getAttribute("case_sensitive") + self.case_sensitive = 1 + if rule_node.getElementsByTagName("message"): + self.message = Tools.Tools.getXML(rule_node.getElementsByTagName("message")[0]) + else: + self.message = Tools.Tools.getXML(rule_node.parentNode.getElementsByTagName("message")[0]) + example_nodes = rule_node.getElementsByTagName("example") + self.example_good = "" + self.example_bad = "" + for example_node in example_nodes: + # TODO?: only one good and one bad example currently supported: + if example_node.getAttribute("type") == 'correct': + self.example_good = Tools.Tools.getXML(example_node.childNodes[0]) + else: + self.example_bad = Tools.Tools.getXML(example_node.childNodes[0]) + self.false_positives = None # None = unknown + if rule_node.getElementsByTagName("error_rate"): + error_rate_node = rule_node.getElementsByTagName("error_rate")[0] + warnings = error_rate_node.getAttribute("warnings") + sentences = error_rate_node.getAttribute("sentences") + try: + if int(sentences) != 0: + error_rate = float(warnings) / float(sentences) * 100 + self.false_positives = error_rate + except ValueError: + pass + return + + def parseFalseFriendsRuleNode(self, rule_node, mothertongue, textlang): + # This is only called for rule nodes that have a pattern + # element with the relevant language. 
+ self.rule_id = rule_node.parentNode.getAttribute("id") + pattern_node = rule_node.getElementsByTagName("pattern")[0] + self.language = rule_node.getAttribute("lang") + # Now look for the correct translation: + trans_nodes = rule_node.getElementsByTagName("translation") + self.valid = 0 # useless object because no translation was found + translations = [] + for trans_node in trans_nodes: + trans_lang = trans_node.getAttribute("lang") + if trans_lang == mothertongue: + self.valid = 1 + trans_str = trans_node.childNodes[0].data + translations.append(trans_str) + if self.valid: + self.case_sensitive = 0 + self.pattern = rule_node.getElementsByTagName("pattern")[0].childNodes[0].data.strip() + repl_word, repl_trans = self.getOtherMeaning(rule_node.parentNode, mothertongue, textlang) + l = [] + for elem in repl_trans: + l.append("<em>%s</em>" % elem) + repl_trans_str = str.join(', ', l) + self.message = "'%s' means %s. " % (self.pattern, str.join(', ', translations)) + if repl_word: + self.message = self.message + " Did you maybe mean '%s', which is %s?" % \ + (repl_word, repl_trans_str) + #print "#%s" % self.message.encode('latin1') + token_strings = re.split("\s+", self.pattern) + self.tokens = [] + for token_string in token_strings: + token = Token('"%s"' % token_string) # quotes = it's a word (not a POS tag) + self.tokens.append(token) + #print "#%s" % token + self.marker_from = 0 + self.marker_to = 0 + return + + def getOtherMeaning(self, rulegroup_node, mothertongue, textlang): + """Get the word (and its correct translations) that the user + maybe meant when he used a false friend. Returns a tuple + (word, [translations]).""" + replace_nodes = rulegroup_node.getElementsByTagName("pattern") + word = None + translations = [] + for replace_node in replace_nodes: + repl_lang = replace_node.getAttribute("lang") + if repl_lang == mothertongue: + word = replace_node.childNodes[0].data + trans_nodes = replace_node.parentNode.getElementsByTagName("translation") + for trans_node in trans_nodes: + trans_lang = trans_node.getAttribute("lang") + #print "#%s, %s" % (trans_lang, textlang) + if trans_lang == textlang: + self.valid = 1 + trans_str = trans_node.childNodes[0].data + translations.append(trans_str) + return (word, translations) + + def setVars(self, rule_id, pattern, message, marker_from, marker_to, \ + example_good, example_bad, case_sensitive, false_positives, language): + """Manually initialize the pattern rule -- for test cases only.""" + self.rule_id = rule_id + self.message = message + self.false_positives = false_positives + self.language = language + self.marker_from = marker_from + self.marker_to = marker_to + self.example_good = example_good + self.example_bad = example_bad + self.case_sensitive = case_sensitive + self.tokens = [] + token_strings = re.split("\s+", pattern) + for token_string in token_strings: + token = Token(token_string) + self.tokens.append(token) + return + + def match(self, tagged_words, chunks=None, position_fix=0, line_fix=0, column_fix=0): + """Check if there are rules that match the tagged_words. 
Returns a list + of RuleMatch objects.""" + matches = [] + ct = 0 + tagged_words_copy = tagged_words # no copy, just a refernce + last_match = None + + #print self.rule_id + #print tagged_words_copy + for word_tag_tuple in tagged_words_copy: + i = ct + p = 0 # matched position in the pattern so far + expected_token = None # expected token if the pattern matches + found = None + match = 1 + first_match = None + chunk_corr = 0 + chunk_len = 0 + + while match: + try: + if not tagged_words_copy[i][1] and tagged_words_copy[i][2] != 'SENT_START' and tagged_words_copy[i][2] != 'SENT_END': + # here's just whitespace or other un-taggable stuff: + i = i + 1 + ct = ct + 1 + continue + elif not first_match: + first_match = ct + except IndexError: # end of tagged words + break + try: + expected_token = self.tokens[p] + except IndexError: + # pattern isn't that long + break + expected_token_str = expected_token.token + + #print "expected_token_str=%s" % expected_token_str + if tagged_words_copy[i][2] == 'SENT_START': + found = 'SENT_START' + elif tagged_words_copy[i][2] == 'SENT_END': + found = 'SENT_END' + elif expected_token.is_word: + # TODO: some cases need to be escaped, e.g. "?", but + # this breaks the pipe etc. + #expected_token_str = re.escape(expected_token_str) + # look at the real word: + try: + found = tagged_words_copy[i][1].strip() + except: # text isn't that long + break + elif expected_token.is_chunk: + #print "chunk %s@%d?" % (expected_token.token, i) + found = None + for from_pos, to_pos, chunk_name in chunks: + if i >= from_pos and i <= to_pos: + found = chunk_name + #print "CHUNK %d-%d: %s" % (from_pos, to_pos, chunk_name) + i = i + (to_pos - from_pos) + chunk_corr = chunk_corr + (to_pos - from_pos) + chunk_len = chunk_len + 1 + break + else: + # look at the word's POS tag: + try: + found = tagged_words_copy[i][2] + except: # text ends here + break + if not found: + #print >> sys.stderr, "*** 'found' undefined (i=%d, %s/%s)" % (i, tagged_words_copy[i][1], tagged_words_copy[i][2]) + break + case_sensitive = re.IGNORECASE + if self.case_sensitive: + case_sensitive = 0 + if expected_token.simple_token: + # speed up for e.g. simple false friends rules that don't + # require regex matching: + if case_sensitive: + #print "exp:%s" %expected_token + match = (expected_token_str.lower() == found.lower()) + else: + match = (expected_token_str == found) + else: + match = re.compile("%s$" % expected_token_str, case_sensitive).match(found) + #print "%s: %s/%s -> %s" % (self.rule_id, found, expected_token_str, match) + if expected_token.negation: + if not match: + match = 1 + else: + match = None + #print "F=%s, m=%s, '%s'" % (found, match, re.escape(expected_token.token)) + i = i + 1 + p = p + 1 + + #print "p=%d, len(self.tokens)=%d" % (p, len(self.tokens)) + if match and p == len(self.tokens): + + #print "##MATCH "+found+" " +expected_token_str + #FIXME: does this always mark the correct position? 
+ (first_match, from_pos, to_pos, line, column) = self.listPosToAbsPos(tagged_words_copy, \ + first_match, 0) + to_pos = to_pos + chunk_corr + + # Let \n in a rule refer to the n'th matched word: + l = first_match + lcount = 1 + msg = self.message + while lcount <= len(self.tokens) and l < len(tagged_words_copy): + if not tagged_words_copy[l][1] and tagged_words_copy[l][2] != 'SENT_START' and tagged_words_copy[l][2] != 'SENT_END': + pass + else: + msg = msg.replace("\\%d" % lcount, tagged_words_copy[l][0]) + lcount = lcount + 1 + l = l + 1 + + first_match_word = tagged_words_copy[first_match][0] + match = RuleMatch(self.rule_id, from_pos+position_fix, to_pos+position_fix, \ + line+line_fix, column+column_fix, msg, first_match_word) + matches.append(match) + + ct = ct + 1 + return matches + + def listPosToAbsPos(self, l, first_match, chunk_corr=0): + #print "*%d (%d)" % (first_match, chunk_corr) + j = first_match + 1 + i = 0 + mark_from_tmp = self.marker_from + while mark_from_tmp > 0 and j < len(l): + if l[j][1]: + mark_from_tmp = mark_from_tmp - 1 + i = i + 1 + j = j + 1 + first_match = first_match + i + + last_match = first_match + match_len = len(self.tokens)-self.marker_from+self.marker_to+chunk_corr + for el in l[first_match:]: + if match_len == 0: + break + if el[1]: + match_len = match_len - 1 + last_match = last_match + 1 + + from_pos = 0 + line = 0 + column = 0 # FIXME! + for el in l[:first_match]: + #print "** '%s' (%d)" % (el[0], first_match) + matches = re.findall("[\n\r]", el[0]) + line = line + len(matches) + if len(matches) > 0: + column = 0 + else: + column = column + len(el[0]) + from_pos = from_pos + len(el[0]) + #print "** L=%s" % line + to_pos = 0 + for el in l[:last_match]: + to_pos = to_pos + len(el[0]) + + return (first_match, from_pos, to_pos, line, column) + +class RuleMatch: + """A matching rule, i.e. an error or a warning and from/to positions.""" + + def __init__(self, rule_id, from_pos, to_pos, line, column, message, first_match_word=None): + self.id = rule_id + self.from_pos = from_pos + self.to_pos = to_pos + self.line = line + self.column = column + self.message = message + # TOOD: is it okay to use 'latin1' here?: + if first_match_word and first_match_word[0] in unicode(string.uppercase, 'latin1'): + # Replace the first char in <em>...</em> with its uppercase + # variant. Useful for replacements at the beginning of the + # sentence + self.message = re.compile("<em>(.)").sub(self.upper, self.message) + return + + def upper(self, match): + return "<em>%s" % match.group(1)[0].upper() + + def __str__(self): + """String representation of this object, i.e. human readable output.""" + msg = self.message + msg = re.compile("</?message>").sub("", msg) + msg = re.compile("</?em>").sub("'", msg) + strng = 'Line %d, Column %d: %s' % (self.line, self.column, msg) + return strng + + def toXML(self): + """XML representation of this object.""" + strng = '<error from="%d" to="%d">%s</error>' % (self.from_pos, self.to_pos, self.message) + return strng + + def __cmp__(self, b): + """Compare by 'from' position.""" + if self.from_pos > b.from_pos: + return 1 + elif self.from_pos < b.from_pos: + return -1 + else: + return 0 + +class Token: + """A word, tag or chunk token, negated or not. 
Examples: + "^(has|will)", + "he", + (VB|VBP), + _NP + """ + + def __init__(self, token): + self.token = token + self.negation = 0 + self.is_word = 0 + self.is_tag = 0 + self.is_chunk = 0 + if self.token.find("|") != -1 or self.token.find("(") != -1 \ + or self.token.find("[") != -1 or self.token.find(".") != -1: + self.simple_token = 0 + else: + self.simple_token = 1 # no regex required + if self.token.startswith('^'): + self.token = token[1:] # remove '^' + self.negation = 1 + if self.token.startswith('"'): + self.is_word = 1 + if not self.token.endswith('"'): + print >> sys.stderr, "*** Warning: token '%s' starts with quote but doesn't end with quote!" % self.token + self.token = self.token[1:(len(self.token)-1)] # remove quotes + elif self.token.startswith('_'): + self.token = token[1:] # remove '_' + self.is_chunk = 1 + else: + self.is_tag = 1 + return + + def __str__(self): + """For debugging only""" + strng = self.token + if self.negation: + strng = "^%s" % strng + if self.is_word: + strng = '"%s"' % strng + return strng diff --git a/languagetool/src/RulesTest.py b/languagetool/src/RulesTest.py new file mode 100644 index 0000000..fd54598 --- /dev/null +++ b/languagetool/src/RulesTest.py @@ -0,0 +1,257 @@ +#!/usr/bin/python +# Test cases for Rule.py +#$rcs = ' $Id$ ' ; +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import unittest +import Rules +import os +import sys + +sys.path.append(os.path.join("python_rules")) +import allSentenceLengthRule +import enWordRepeatRule +import enAvsAnRule + +class RuleTestCase(unittest.TestCase): + + def setUp(self): + self.rule = Rules.PatternRule(None) + self.rule.setVars("TEST1", '"word" (VB|TST)', "Test message 1.", 0, 0, \ + "Good example.", "Bad example.", 0, 5, "en") + # negation: + self.rule2 = Rules.PatternRule(None) + self.rule2.setVars("TEST2", '"word" ^(VB|TST)', "Test message 2.", 0, 0, \ + "Good example.", "Bad example.", 0, 5, "en") + # negation at the beginning: + self.rule3 = Rules.PatternRule(None) + self.rule3.setVars("TEST3", '^"word" (VB|TST)', "Test message 3.", 0, 0, \ + "Good example.", "Bad example.", 0, 5, "en") + return + + def testConstructor(self): + self.assertEqual(self.rule.rule_id, "TEST1") + self.assertEqual(len(self.rule.tokens), 2) + self.assertEqual(self.rule2.rule_id, "TEST2") + self.assertEqual(len(self.rule.tokens), 2) + self.assertEqual(self.rule3.rule_id, "TEST3") + self.assertEqual(len(self.rule.tokens), 2) + return + + def testSentenceLengthRule(self): + r = allSentenceLengthRule.allSentenceLengthRule() + r.setMaxLength(3) + + # just below the limit: + warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 0) + + # just on the limit: + warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 1) + assert(warnings[0].toXML().startswith('<error from="3" to="4">')) + r.setMaxLength(60) + warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 0) + r.setMaxLength(3) + + # whitespace is okay: + warnings = r.match([(' ',None,None),('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 0) + + # much longer than the limit: + warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T'),\ + ('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 1) + + return + + def testAvsAnRule(self): + r = enAvsAnRule.enAvsAnRule() + # okay: + warnings = r.match([('A','A','DET'),(' ',None,None),('test','test','NN')], []) + self.assertEqual(len(warnings), 0) + warnings = r.match([('a','a','DET'),(' ',None,None),('test','test','NN')], []) + self.assertEqual(len(warnings), 0) + warnings = r.match([('an','an','DET'),(' ',None,None),('idea','idea','NN')], []) + self.assertEqual(len(warnings), 0) + + # okay (exceptions list): + warnings = r.match([('a','a','DET'),(' ',None,None),('university','university','NN')], []) + self.assertEqual(len(warnings), 0) + warnings = r.match([('an','an','DET'),(' ',None,None),('hour','hour','NN')], []) + self.assertEqual(len(warnings), 0) + + # wrong: + warnings = r.match([('An','An','DET'),(' ',None,None),('test','test','NN')], []) + self.assertEqual(len(warnings), 1) + warnings = r.match([('an','an','DET'),(' ',None,None),('test','test','NN')], []) + self.assertEqual(len(warnings), 1) + warnings = r.match([('a','a','DET'),(' ',None,None),('idea','idea','NN')], []) + self.assertEqual(len(warnings), 1) + + # wrong (exceptions list): + warnings = r.match([('an','an','DET'),(' ',None,None),('university','university','NN')], []) + self.assertEqual(len(warnings), 1) + warnings = 
r.match([('a','a','DET'),(' ',None,None),('hour','hour','NN')], []) + self.assertEqual(len(warnings), 1) + + return + + def testWhitespaceRule(self): + r = Rules.WhitespaceRule() + + # okay: + warnings = r.match([('blah','blah','XX'),('?',None,None)]) + self.assertEqual(len(warnings), 0) + warnings = r.match([('3.14','3.14','XX'),('?',None,None)]) + self.assertEqual(len(warnings), 0) + + # error - whitespace before punctuation: + warnings = r.match([('blah','blah','XX'),(' ',None,None),('.',None,None)]) + self.assertEqual(len(warnings), 1) + warnings = r.match([('blah','blah','XX'),(' ',None,None),('?',None,None)]) + self.assertEqual(len(warnings), 1) + warnings = r.match([('blah','blah','XX'),(' ',None,None),('...',None,None)]) + self.assertEqual(len(warnings), 1) + warnings = r.match([('blah','blah','XX'),(' ',None,None),('?!',None,None)]) + self.assertEqual(len(warnings), 1) + + # both errors + warnings = r.match([('blah','blah','XX'),(' ',None,None),(',',None,None),('blah','blah','XX')]) + self.assertEqual(len(warnings), 2) + + # okay: + warnings = r.match([('blah','blah','XX'),('?',None,None),(None,None,'SENT_END')]) + self.assertEqual(len(warnings), 0) + + # error - no whitespace after punctuation: + warnings = r.match([('blah','blah','XX'),('?',None,None),('foo','foo','YY')]) + self.assertEqual(len(warnings), 1) + + return + + def testWordRepeat(self): + r = enWordRepeatRule.enWordRepeatRule() + + warnings = r.match([('blah','blah','XX'),(' ',None,None),('blahbla','blahbla','YY')], []) + self.assertEqual(len(warnings), 0) + + warnings = r.match([('blah','blah','XX'),(' ',None,None),('blah','blah','YY')], []) + self.assertEqual(len(warnings), 1) + warnings = r.match([('blah','blah','XX'),(' ',None,None),('BLAH','BLAH','XX')], []) + self.assertEqual(len(warnings), 1) + + return + + def testPatternRuleMatch(self): + + # rule 1: + + res_list = self.rule.match([('', None, 'SENT_START'), + ('word', 'word', 'XX'),(' ', None, None),('bla', 'bla', 'VB')], 0) + self.assertEqual(len(res_list), 1) + self.assertEqual(res_list[0].toXML(), '<error from="0" to="8">Test message 1.</error>') + + res_list = self.rule.match([('no', 'no', 'XX'),('foo', 'foo', 'VB')], 0) + self.assertEqual(len(res_list), 0) + + res_list = self.rule.match([], 0) + self.assertEqual(len(res_list), 0) + + res_list = self.rule.match([('word', 'word', 'XX')], 0) + self.assertEqual(len(res_list), 0) + + # rule 2: + + res_list = self.rule2.match([('word', 'word', 'XX'),('', None, None),('xxx', 'xxx', 'VBX')], 0) + self.assertEqual(len(res_list), 1) + + # rule 3: + + res_list = self.rule3.match([('foo', 'foo', 'XX'),(' ', None, None),('xxx', 'xxx', 'VB')], 0) + self.assertEqual(len(res_list), 1) + return + +class RuleMatchTestCase(unittest.TestCase): + + def testCompare(self): + r1 = Rules.RuleMatch("ONE", 1, 2, 0, 0, "fake1", 0) + r2 = Rules.RuleMatch("ONE", 2, 3, 0, 0, "fake2", 0) + assert(r1 < r2) + r3 = Rules.RuleMatch("ONE", 1, 3, 0, 0, "fake3", 0) + assert(r1 == r3) + assert(r2 > r3) + return + +class TokenTestCase(unittest.TestCase): + + def testToken(self): + + token = Rules.Token('NN') + self.assertEqual(token.token, "NN") + assert(not token.negation) + assert(token.is_tag) + assert(not token.is_word) + assert(not token.is_chunk) + assert(token.simple_token) + + token = Rules.Token('"word"') + self.assertEqual(token.token, "word") + assert(not token.negation) + assert(not token.is_tag) + assert(token.is_word) + assert(not token.is_chunk) + assert(token.simple_token) + + token = Rules.Token("^(NN)") + 
self.assertEqual(token.token, "(NN)") + assert(token.negation) + assert(token.is_tag) + assert(not token.is_word) + assert(not token.is_chunk) + assert(not token.simple_token) # b/c of the parenthesis + + token = Rules.Token('^"word"') + self.assertEqual(token.token, "word") + assert(token.negation) + assert(not token.is_tag) + assert(token.is_word) + assert(not token.is_chunk) + assert(token.simple_token) + + token = Rules.Token('_NP') + self.assertEqual(token.token, "NP") + assert(not token.negation) + assert(not token.is_tag) + assert(not token.is_word) + assert(token.is_chunk) + assert(token.simple_token) + + token = Rules.Token("(AA|BB|CC)") + self.assertEqual(token.token, "(AA|BB|CC)") + assert(not token.negation) + assert(token.is_tag) + assert(not token.is_word) + assert(not token.is_chunk) + assert(not token.simple_token) # b/c of the parenthesis + return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/SentenceSplitter.py b/languagetool/src/SentenceSplitter.py new file mode 100644 index 0000000..35dfb7d --- /dev/null +++ b/languagetool/src/SentenceSplitter.py @@ -0,0 +1,132 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2003 Daniel Naber <daniel.naber@t-online.de> +# Based on Shlomo Yona's Perl module Lingua::EN::Sentence 0.25 + +import os +import string +import re +import sys + +class SentenceSplitter: + + ABBR_FILE = os.path.join(sys.path[0], "data", "abbr.txt") + + EOS = "\001" + #EOS = "<>" # for testing only + P = """[\.!?]""" ## PUNCTUATION + AP = """(?:'|"|�|\)|\]|\})?""" ## AFTER PUNCTUATION + PAP = "%s%s" % (P, AP) + + reFlags = re.DOTALL|re.LOCALE + + def __init__(self): + """Init the object by loading the abbreviation list.""" + self.abbr = self.loadAbbreviations() + return + + def loadAbbreviations(self): + """Load the abbreviation list and return all words in a list.""" + abbr = [] + f = open(self.ABBR_FILE, "r") + while 1: + l = f.readline() + if not l: + break + l = l.strip() + if l: + abbr.append(l) + f.close() + return abbr + + def split(self, text): + """Take a text and split it into sentences. Return the list + of sentences. 
Adapted from Perl's Lingua-EN-Sentence-0.25 module.""" + if text == None: + return [] + #print "text=%s" % text + marked_text = self.first_sentence_breaking(text) + #print "marked_text=%s" % marked_text + fixed_marked_text = self.remove_false_end_of_sentence(marked_text) + #print "fixed_marked_text=%s" % fixed_marked_text + fixed_marked_text = self.split_unsplit_stuff(fixed_marked_text) + #print "fixed_marked_text=%s" % fixed_marked_text + sentences = re.split(self.EOS, fixed_marked_text) + return sentences + + def first_sentence_breaking(self, text): + """Add a special break character at all places with typical sentence + delimiters.""" + # Double new-line means a new sentence: + text = re.compile("(\n\s*\n)", self.reFlags).sub("\\1%s" % self.EOS, text) + # Punctuation followed by whitespace means a new sentence: + text = re.compile("(%s\s)" % self.PAP, self.reFlags).sub("\\1%s" % self.EOS, text) + # New (compared to the perl module): Punctuation followed by uppercase followed + # by non-uppercase character (except dot) means a new sentence: + text = re.compile("(%s)([%s][^%s.])" % (self.PAP, string.uppercase, string.uppercase), \ + self.reFlags).sub("\\1%s\\2" % self.EOS, text) + # Break also when single letter comes before punctuation: + text = re.compile("(\s\w%s)" % self.P, self.reFlags).sub("\\1%s" % self.EOS, text) + return text + + def remove_false_end_of_sentence(self, text): + """Repair some positions that don't require a split, i.e. remove the + special break character.""" + + # Don't split at e.g. "U. S. A.": + text = re.compile("([^-\w]\w%s\s)%s" % (self.PAP, self.EOS), self.reFlags).sub("\\1", text) + # Don't split at e.g. "U.S.A.": + text = re.compile("([^-\w]\w%s)%s" % (self.P, self.EOS), self.reFlags).sub("\\1", text) + + # Don't split after a white-space followed by a single letter followed + # by a dot followed by another whitespace. + # e.g. " p. " + text = re.compile("(\s\w\.\s+)%s" % self.EOS, self.reFlags).sub("\\1", text) + + # Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?) + text = re.compile("(\.\.\. )%s([%s])" % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text) + # Don't split [.?!] when the're quoted: + text = re.compile("(['\"]%s['\"]\s+)%s" % (self.P, self.EOS)).sub("\\1", text) + + # Don't split at abbreviations: + for abbr in self.abbr: + # TODO: really ignore case? + s = "(\\b%s%s\s)%s" % (abbr, self.PAP, self.EOS) + text = re.compile(s, self.reFlags|re.IGNORECASE).sub("\\1", text) + + # Don't break after quote unless there's a capital letter: + # e.g.: "That's right!" he said. + text = re.compile('(["\']\s*)%s(\s*[%s])' % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text) + + # fixme? not sure where this should occur, leaving it commented out: + # don't break: text . . some more text. + #text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg; + + text = re.compile("(\s%s\s)%s" % (self.PAP, self.EOS), self.reFlags).sub("\\1", text) + + # extension by dnaber --commented out, doesn't help: + #text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text) + return text + + def split_unsplit_stuff(self, text): + """Treat some more special cases that make up a sentence boundary. Insert + the special break character at these positions.""" + # Split at e.g. "no. 
5 ": + text = re.compile("(\D\d+)(%s)(\s+)" % self.P, self.reFlags).sub("\\1\\2%s\\3" % self.EOS, text) + # TODO: Not sure about this one, leaving out foir now: + #text = re.compile("(%s\s)(\s*\()" % self.PAP, self.reFlags).sub("\\1%s\\2" % self.EOS, text) + # Split e.g.: He won't. #Really. + text = re.compile("('\w%s)(\s)" % self.P, self.reFlags).sub("\\1%s\\2" % self.EOS, text) + # Split e.g.: He won't say no. Not really. + text = re.compile("(\sno\.)(\s+)(?!\d)", self.reFlags|re.IGNORECASE).sub("\\1%s\\2" % self.EOS, text) + # Split at "a.m." or "p.m." followed by a capital letter. + text = re.compile("([ap]\.m\.\s+)([%s])" % string.uppercase, self.reFlags).sub("\\1%s\\2" % self.EOS, text) + return text + +if __name__ == "__main__": + #t = '"Do split me." Will you?' + #print t + #s = SentenceSplitter() + #l = s.split(t) + #print l + print "Please use ./SentenceSplitterTest.py for testing." diff --git a/languagetool/src/SentenceSplitterEval.py b/languagetool/src/SentenceSplitterEval.py new file mode 100644 index 0000000..cdf8745 --- /dev/null +++ b/languagetool/src/SentenceSplitterEval.py @@ -0,0 +1,128 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import sys +import re + +import Entities +import SentenceSplitter + +class SentenceSplitterEval: + + def __init__(self): + return + + def findSentence(self, real_boundary, bnc_sentences): + sent = None + sent_disp = None + l = 0 + i = 0 + for s in bnc_sentences: + l = l + len(s) + if l == real_boundary: + sent = s + next_sent_start = "" + try: + next_sent_start = bnc_sentences[i+1][0:20] + except IndexError: + pass + sent_disp = "%s###%s..." % (s, next_sent_start) + break + i = i + 1 + return sent, sent_disp + + def run(self, bnc_string): + self.s = SentenceSplitter.SentenceSplitter() + + # manual testing: + #bnc_string = "<s n=0000>This a test. Sentence.</s> <s n=1111>Another one.</s>" + #bnc_string = "<s n=0000>This a Sentence</s> <s n=1111>Another one.</s>" + + bnc_paras = re.compile("<p>(.*?)</p>", re.DOTALL).findall(bnc_string) + bnc_paras_str = str.join(' ', bnc_paras) + bnc_sentences = re.compile("<s\s.*?>(.*?)</s>", re.DOTALL).findall(bnc_paras_str) + bnc_boundaries = [] + l = 0 + i = 0 + for s in bnc_sentences: + s = bnc_sentences[i] + s = Entities.Entities.cleanEntities(s) + s = re.compile("<.*?>").sub("", s) + s = s.strip() + if not s.endswith(" "): + # TODO: is this fair? 
+ s = s + " " + bnc_sentences[i] = s + l = l + len(s) + bnc_boundaries.append(l) + i = i + 1 + ###print bnc_sentences + bnc_sentences_str = str.join('', bnc_sentences) + #print bnc_sentences_str + + detected_sentences = self.s.split(bnc_sentences_str) + ###print detected_sentences + detected_boundaries = [] + l = 0 + for s in detected_sentences: + l = l + len(s) + detected_boundaries.append(l) + + sent_count = 0 + # recall = how many of the sentence boundaries have been detected? + recall_count = 0 + for real_boundary in bnc_boundaries: + if real_boundary in detected_boundaries: + recall_count = recall_count + 1 + #print "Found: '%s'" % s + else: + pass + (s, s_disp) = self.findSentence(real_boundary, bnc_sentences) + print "Not found: '%s'" % s_disp + sent_count = sent_count + 1 + recall = 0 + if len(bnc_boundaries) > 0: + recall = float(recall_count) / float(len(bnc_boundaries)) + + # precision = how many of detected boundaries are real sentence boundaries? + precision_count = 0 + for detected_boundary in detected_boundaries: + if detected_boundary in bnc_boundaries: + precision_count = precision_count + 1 + precision = 0 + if len(detected_boundaries) > 0: + precision = float(precision_count) / float(len(detected_boundaries)) + + print "Real sentences = %d" % sent_count + print "Recall = %.3f" % recall + print "Precision = %.3f" % precision + return + +if __name__ == "__main__": + prg = SentenceSplitterEval() + if len(sys.argv) <= 1: + print "Usage: ./SentenceSplitterEval.py <bnc_sampler_files>" + else: + for filename in sys.argv[1:]: + print filename + f = open(filename) + bnc_string = f.read() + f.close() + prg.run(bnc_string) diff --git a/languagetool/src/SentenceSplitterTest.py b/languagetool/src/SentenceSplitterTest.py new file mode 100644 index 0000000..52fe732 --- /dev/null +++ b/languagetool/src/SentenceSplitterTest.py @@ -0,0 +1,91 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2003,2004 Daniel Naber <daniel.naber@t-online.de> + +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import os + +import SentenceSplitter +import unittest + +class SentenceSplitterTestCase(unittest.TestCase): + + def testSplit(self): + self.s = SentenceSplitter.SentenceSplitter() + + l = self.s.split(None) + self.assertEqual(len(l), 0) + + self._doTest("") + self._doTest("This is a sentence.") + self._doTest("This is a sentence. #And this is another one.") + self._doTest("This is a sentence. #Isn't it? #Yes, it is.") + self._doTest("This is e.g. Mr. Smith, who talks slowly... #But this is another sentence.") + self._doTest("Chanel no. 5 is groovy.") + self._doTest("Mrs. Jones gave Peter $4.5, to buy Chanel No 5. #He never came back.") + self._doTest("On p. 6 there's nothing. #Another sentence.") + self._doTest("Leave me alone!, he yelled. 
#Another sentence.") + self._doTest("\"Leave me alone!\", he yelled.") + self._doTest("'Leave me alone!', he yelled. #Another sentence.") + self._doTest("'Leave me alone,' he yelled. #Another sentence.") + self._doTest("This works on the phrase level, i.e. not on the word level.") + self._doTest("Let's meet at 5 p.m. in the main street.") + self._doTest("James comes from the U.K. where he worked as a programmer.") + self._doTest("Don't split strings like U.S.A. please.") + self._doTest("Don't split strings like U. S. A. either.") + self._doTest("Don't split... #Well you know. #Here comes more text.") + self._doTest("Don't split... well you know. #Here comes more text.") + self._doTest('The "." should not be a delimiter in quotes.') + self._doTest('"Here he comes!" she said.') + self._doTest('"Here he comes!", she said.') + self._doTest('"Here he comes." #But this is another sentence.') + self._doTest('"Here he comes!". #That\'s what he said.') + self._doTest('The sentence ends here. #(Not me.)') + self._doTest("He won't. #Really.") + self._doTest("He won't say no. #Not really.") + self._doTest("He won't say no. 5 is better. #Not really.") + self._doTest("They met at 5 p.m. on Thursday.") + self._doTest("They met at 5 p.m. #It was Thursday.") + self._doTest("This is it: a test.") + # known not to work: + #self._doTest("This is it: #A final test.") + # two returns -> paragraph -> new sentence: + self._doTest("He won't\n\n#Really.") + # Some people make two spaces after sentence end: + self._doTest("This is a sentence. #And this is another one.") + # Missing space after sentence end: + self._doTest("James is from the Ireland!#He lives in Spain now.") + # From the abbreviation list: + self._doTest("Jones Bros. have built a succesful company.") + # Doesn't work: + #self._doTest("James is from the U.K. #He lives in Spain now.") + + return + + def _doTest(self, s): + s_copy = s.replace("#", "") + l = self.s.split(s_copy) + correct_result = s.split("#") + # ignore leading/trailing whitespace differences: + i = 0 + for item in l: + l[i] = l[i].strip() + i = i + 1 + i = 0 + for item in correct_result: + correct_result[i] = correct_result[i].strip() + i = i + 1 + self.assertEqual(l, correct_result) + return diff --git a/languagetool/src/TagInfo.py b/languagetool/src/TagInfo.py new file mode 100644 index 0000000..31aec80 --- /dev/null +++ b/languagetool/src/TagInfo.py @@ -0,0 +1,276 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# Provide user information about BNC tags +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import re +import sys + +class TagInfo: + + TAG_STRING = {} + TAG_STRING['en'] = """AJ0 Adjective (general or positive) (e.g. 
good, old, beautiful) + AJC Comparative adjective (e.g. better, older) + AJS Superlative adjective (e.g. best, oldest) + AT0 Article (e.g. the, a, an, no) [N.B. no is included among articles, which are defined here as determiner words which typically begin a noun phrase, but which cannot occur as the head of a noun phrase.] + AV0 General adverb: an adverb not subclassified as AVP or AVQ (see below) (e.g. often, well, longer (adv.), furthest. [Note that adverbs, unlike adjectives, are not tagged as positive, comparative, or superlative. This is because of the relative rarity of comparative and superlative adverbs.] + AVP Adverb particle (e.g. up, off, out) [N.B. AVP is used for such "prepositional adverbs", whether or not they are used idiomatically in a phrasal verb: e.g. in 'Come out here' and 'I can't hold out any longer', the same AVP tag is used for out. + AVQ Wh-adverb (e.g. when, where, how, why, wherever) [The same tag is used, whether the word occurs in interrogative or relative use.] + CJC Coordinating conjunction (e.g. and, or, but) + CJS Subordinating conjunction (e.g. although, when) + CJT The subordinating conjunction that [N.B. that is tagged CJT when it introduces not only a nominal clause, but also a relative clause, as in 'the day that follows Christmas'. Some theories treat that here as a relative pronoun, whereas others treat it as a conjunction.We have adopted the latter analysis.] + CRD Cardinal number (e.g. one, 3, fifty-five, 3609) + DPS Possessive determiner (e.g. your, their, his) + DT0 General determiner: i.e. a determiner which is not a DTQ. [Here a determiner is defined as a word which typically occurs either as the first word in a noun phrase, or as the head of a noun phrase. E.g. This is tagged DT0 both in 'This is my house' and in 'This house is mine'.] + DTQ Wh-determiner (e.g. which, what, whose, whichever) [The category of determiner here is defined as for DT0 above. These words are tagged as wh-determiners whether they occur in interrogative use or in relative use.] + EX0 Existential there, i.e. there occurring in the there is ... or there are ... construction + ITJ Interjection or other isolate (e.g. oh, yes, mhm, wow) + + NN0 Common noun, neutral for number (e.g. aircraft, data, committee) [N.B. Singular collective nouns such as committee and team are tagged NN0, on the grounds that they are capable of taking singular or plural agreement with the following verb: e.g. 'The committee disagrees/disagree'.] + NN1 Singular common noun (e.g. pencil, goose, time, revelation) + NN2 Plural common noun (e.g. pencils, geese, times, revelations) + NP0 Proper noun (e.g. London, Michael, Mars, IBM) [N.B. the distinction between singular and plural proper nouns is not indicated in the tagset, plural proper nouns being a comparative rarity.] + ORD Ordinal numeral (e.g. first, sixth, 77th, last) . [N.B. The ORD tag is used whether these words are used in a nominal or in an adverbial role. Next and last, as "general ordinals", are also assigned to this category.] + PNI Indefinite pronoun (e.g. none, everything, one [as pronoun], nobody) [N.B. This tag applies to words which always function as [heads of] noun phrases. Words like some and these, which can also occur before a noun head in an article-like function, are tagged as determiners (see DT0 and AT0 above).] + PNP Personal pronoun (e.g. I, you, them, ours) [Note that possessive pronouns like ours and theirs are tagged as personal pronouns.] + PNQ Wh-pronoun (e.g. who, whoever, whom) [N.B. 
These words are tagged as wh-pronouns whether they occur in interrogative or in relative use.] + PNX Reflexive pronoun (e.g. myself, yourself, itself, ourselves) + + POS The possessive or genitive marker 's or ' (e.g. for 'Peter's or somebody else's', the sequence of tags is: NP0 POS CJC PNI AV0 POS) + PRF The preposition of. Because of its frequency and its almost exclusively postnominal function, of is assigned a special tag of its own. + PRP Preposition (except for of) (e.g. about, at, in, on, on behalf of, with) + PUL Punctuation: left bracket - i.e. ( or [ + PUN Punctuation: general separating mark - i.e. . , ! , : ; - or ? + PUQ Punctuation: quotation mark - i.e. ' or " + PUR Punctuation: right bracket - i.e. ) or ] + TO0 Infinitive marker to + UNC Unclassified items which are not appropriately classified as items of the English lexicon. [Items tagged UNC include foreign (non-English) words, special typographical symbols, formulae, and (in spoken language) hesitation fillers such as er and erm.] + + VBB The present tense forms of the verb BE, except for is, 's: i.e. am, are, 'm, 're and be [subjunctive or imperative] + VBD The past tense forms of the verb BE: was and were + VBG The -ing form of the verb BE: being + VBI The infinitive form of the verb BE: be + VBN The past participle form of the verb BE: been + VBZ The -s form of the verb BE: is, 's + + VDB The finite base form of the verb DO: do + VDD The past tense form of the verb DO: did + VDG The -ing form of the verb DO: doing + VDI The infinitive form of the verb DO: do + VDN The past participle form of the verb DO: done + VDZ The -s form of the verb DO: does, 's + + VHB The finite base form of the verb HAVE: have, 've + VHD The past tense form of the verb HAVE: had, 'd + VHG The -ing form of the verb HAVE: having + VHI The infinitive form of the verb HAVE: have + VHN The past participle form of the verb HAVE: had + VHZ The -s form of the verb HAVE: has, 's + + VM0 Modal auxiliary verb (e.g. will, would, can, could, 'll, 'd) + + VVB The finite base form of lexical verbs (e.g. forget, send, live, return) [Including the imperative and present subjunctive] + VVD The past tense form of lexical verbs (e.g. forgot, sent, lived, returned) + VVG The -ing form of lexical verbs (e.g. forgetting, sending, living, returning) + VVI The infinitive form of lexical verbs (e.g. forget, send, live, return) + VVN The past participle form of lexical verbs (e.g. forgotten, sent, lived, returned) + VVZ The -s form of lexical verbs (e.g. forgets, sends, lives, returns) + + XX0 The negative particle not or n't + ZZ0 Alphabetical symbols (e.g. A, a, B, b, c, d)""" + + TAG_STRING['de'] = """ADJ Adjective (general) (e.g. gut, alt) + ADJE Comparative adjective (e.g. alte) + ADJER adjective with er Ending (e.g. alter) + ADJES adjective with es Ending (e.g. altes) + ADJEM adjective with em Ending (e.g. altem) + ADJEN adjective with en Ending (e.g. alten) + *ADV Adverb like abends, morgen + + PRA Pronoun with accusativ wider, gegen + PRD Pronoun with dativ ab, aus + PRD Pronoun with accusativ or dativ in, über + + PP1 Personal pronoun ich, mich, mir + PP2 Personal pronoun du + PP3 Personal pronoun er, sie, es + PP4 Personal pronoun wir + PP5 Personal pronoun ihr + + *IND oh, ah, heisa + *INT Interrogating word like Wer, wo, etc... + + CNT Number + CJC Conjunctive word like und, oder, ... + + V verb, e.g. gehen + V11 verb, e.g. gehe + V12 verb, e.g. gehst + V13 verb, e.g. geht + V14 verb, e.g. gehen + V15 verb, e.g. gehet + + HV auxiliary verb, e.g. 
moegen + HV11 auxiliary verb, e.g. mag + HV12 auxiliary verb, e.g. magst + HV13 auxiliary verb, e.g. mag + HV14 auxiliary verb, e.g. moegen + HV15 auxiliary verb, e.g. moeget + + N Noun + NMS Noun male no ending, e.g. Garten + NFS Noun female no ending, e.g. Frau + NNS Noun neutrum no ending + NFNS Noun female or neutrum no ending + NFMS Noun female or male no ending + NMNS Noun male or neutrum no ending + NFMNS Noun male female or neutrum no ending + NM Noun male with ending like Gartens + NF Noun female with ending like Frauen + NN Noun neutrum with ending + NFN Noun female or neutrum with ending + NFM Noun female or male with ending + NMN Noun male or neutrum with ending + NFMN Noun male female or neutrum with ending + + UA1 indefinite article ein + UAE indefinite article eine + UAR indefinite article einer + UAN indefinite article einen + UAM indefinite article einem + UAS indefinite article eines + * INT,IND,ADV sometimes mixed up in the word collection - to be corrected""" + + TAG_STRING['hu'] = """ADJS Singular adjective (e.g. szep) + ADJP Plural Adjective (e.g. szepek) + ADJN Numeric Adjective (e.g. tizedik) + ADV Adverb like szepen, jol + NS Noun, singular asztalnak + NSN Noun, singular, nominativ asztal + NSR Noun, singular, not nominativ asztalt + NP Noun, plural asztalokat + NPN Noun, plural, nominativ asztalok + NPR Noun, plural, not nominativ asztalokra + V1 Verb, Singular, 1-st person irok + V2 Verb, Singular, 2-nd person + V3 Verb, Singular, 3-rd person + V4 Verb, Plural, 1-st person + V5 Verb, Plural, 2-nd person + V6 Verb, Plural, 3-rd person + VINF Verb infinitiv + IKV1 Prefixed Verb, Singular, 1-st person megirok + IKV2 Prefixed Verb, Singular, 2-nd person + IKV3 Prefixed Verb, Singular, 3-rd person + IKV4 Prefixed Verb, Plural, 1-st person + IKV5 Prefixed Verb, Plural, 2-nd person + IKV6 Prefixed Verb, Plural, 3-rd person + VINF Prefixed Verb infinitiv + SI1 Help Verb, Singular, 1-st person akarok + SI2 Help Verb, Singular, 2-nd person + SI3 Help Verb, Singular, 3-rd person + SI4 Help Verb, Plural, 1-st person + SI5 Help Verb, Plural, 2-nd person + SI6 Help Verb, Plural, 3-rd person + SIINF Help Verb infinitiv + IKSI1 Prefixed Help Verb, Singular, 1-st person megvagyok + IKSI2 Prefixed Help Verb, Singular, 2-nd person + IKSI3 Prefixed Help Verb, Singular, 3-rd person + IKSI4 Prefixed Help Verb, Plural, 1-st person + IKSI5 Prefixed Help Verb, Plural, 2-nd person + IKSI6 Prefixed Help Verb, Plural, 3-rd person + IKSIINF Prefixed Help Verb infinitiv + NEIK Non detachable verb prefix be, ki, le, fel, etc... + PP1 Personal pronom en + PP2 Personal pronom te + PP3 Personal pronom o + PP4 Personal pronom mi + PP5 Personal pronom ti + PP6 Personal pronom ok + RPP1 Owning Personal Pronom enyem + RPP2 Owning Personal Pronom tied + RPP3 Owning Personal Pronom ove + RPP4 Owning Personal Pronom mienk + RPP5 Owning Personal Pronom tietek + RPP6 Owning Personal Pronom ovek + IND uhum + INT Interrogating word like nemde etc... + CRD Number tizenot + INTRN Numerical interrogation mennyi, etc... + INTR Interrogation miert, etc... + CJC Conjunctive word like es vagy, ... + DNV Double role, Noun and verb var + DAV Double role, Adj and Verb irt + DNA Double role, Noun and ADJ or ADV iro ... 
+ RART Conjunction word like de, hogy + """ + + def __init__(self, lang): + if not self.TAG_STRING.has_key(lang): + raise KeyError, "no information found for language '%s'" % lang + tag_lines = re.split("\n", self.TAG_STRING[lang]) + self.tags = [] # [(short, explanation)] + for tag_line in tag_lines: + tag_line = tag_line.strip() + parts = re.split("\s+", tag_line) + short_tag = parts[0] + tag_exp = str.join(' ', parts[1:]) + self.tags.append((short_tag, tag_exp)) + return + + def getExp(self, short_tag_search): + for (tag_short, tag_exp) in self.tags: + if short_tag_search == tag_short: + return tag_exp + return None + + def getJavascriptCode(self): + l = [] + for (tag_short, tag_exp) in self.tags: + tag_exp = tag_exp.replace("\"", "\\\"") + l.append('data["%s"] = "%s";' % (tag_short, tag_exp)) + return str.join('\n', l) + + def getHTMLCode(self): + l = [] + l.append('<table border="0" cellpadding="0" cellspacing="2">') + for (tag_short, tag_exp) in self.tags: + tag_exp = tag_exp.replace("\"", "\\\"") + if tag_short: + l.append('<tr bgcolor="#dddddd"><td valign="top"><strong>%s</strong></td><td>%s</td></tr>' % (tag_short, tag_exp)) + else: + l.append('<tr><td> </td></tr>') + l.append('</table>') + return str.join('\n', l) + + def printAll(self): + for (tag_short, tag_exp) in self.tags: + if tag_short: + print "%s: %s" % (tag_short, tag_exp) + else: + print + return + +if __name__ == "__main__": + # TODO: take language as parameter + if len(sys.argv) < 2: + print "Usage: TagInfo.py <language>" + print " where <language> is a language code like en, de, ..." + sys.exit(1) + taginfo = TagInfo(sys.argv[1]) + taginfo.printAll() diff --git a/languagetool/src/Tagger.py b/languagetool/src/Tagger.py new file mode 100644 index 0000000..1243c41 --- /dev/null +++ b/languagetool/src/Tagger.py @@ -0,0 +1,1108 @@ +# -*- coding: iso-8859-1 -*- +# A probabilistic part-of-speech tagger (see the QTag paper) with +# a rule-based extension. +#$rcs = ' $Id$ ' ; +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import codecs +import os +import re +import string +import sys +import time +import cPickle +import htmlentitydefs +import Wfinder + +# FIXME: +dicFile = 'deutsch.txt' +affFile = 'deutsch.aff' + +class Tagger: + """POS-tag any text. The result in XML can be used to re-build the original + text by concatenating all contents of the <w> tags. Whitespace characters + have term=None and type=None, i.e. they are inside their own <w> + elements. 
Words that could not be tagged have type=unknown.""" + + def __init__(self, textlanguage, db_word_name=None, db_seq_name1=None, db_seq_name2=None): + """Initialize the tagger, optionally using the given + file names that will be used to load and save data later.""" + self.textlanguage = textlanguage + self.wfinder = Wfinder.Wfinder(textlanguage) + db_word_name = os.path.join(sys.path[0], "data", dicFile) + db_seq_name1 = os.path.join(sys.path[0], "data", "seqs1") + db_seq_name2 = os.path.join(sys.path[0], "data", "seqs2") + #uncountable_name = os.path.join("data", "uncountable.txt") + self.data_table = None + self.seqs_table_followed_by = None # tag sequences: table[tag1,tag2] = value + self.seqs_table_follows = None # tag sequences: table[tag1,tag2] = value + if db_word_name: + self.db_word_name = db_word_name + if db_seq_name1: + self.db_seq_name1 = db_seq_name1 + if db_seq_name2: + self.db_seq_name2 = db_seq_name2 + #uncountable_nouns = self.loadUncountables() + self.word_count = 0 + + return + + def loadUncountables(self): + """TODO: not used yet.""" + l = [] + f = open(self.uncountable_name) + while 1: + line = f.readline() + if not line: + break + line = line.strip() + if not line.startswith("#") and line != '': + l.append(line) + f.close() + return l + + def bindData(self): + """Load the word/POS tag and POS tag sequence data from disk.""" + try: + if self.textlanguage != 'en': + self.ReadData(self.db_word_name); + else: + self.data_table = cPickle.load(open(self.db_word_name, 'rb')) + except IOError: + print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_word_name + self.data_table = {} + if self.textlanguage == 'en': + try: + self.seqs_table_followed_by = cPickle.load(open(self.db_seq_name1, 'rb')) + except IOError: + print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_seq_name1 + self.seqs_table_followed_by = {} + try: + self.seqs_table_follows = cPickle.load(open(self.db_seq_name2, 'rb')) + except IOError: + print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_seq_name2 + self.seqs_table_follows = {} + else: + self.seqs_table_followed_by = {} + self.seqs_table_follows = {} + return + + def commitData(self): + """Save the word/POS tag and POS tag sequence data to disk.""" + print >> sys.stderr, "Words = %d" % self.word_count + print >> sys.stderr, "Known words = %d" % len(self.data_table.keys()) + print >> sys.stderr, "Known sequences = %d" % len(self.seqs_table_followed_by.keys()) + print >> sys.stderr, "Commiting results..." +# cPickle.dump(self.data_table, open(self.db_word_name, 'wb'), 1) +# cPickle.dump(self.seqs_table_followed_by, open(self.db_seq_name1, 'wb'), 1) +# cPickle.dump(self.seqs_table_follows, open(self.db_seq_name2, 'wb'), 1) + return + + def deleteData(self): + """Remove the word/POS tag and POS tag sequence data files from disk.""" +# print >> sys.stderr, "Deleting old data files..." 
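# Illustrative sketch (assumes the cPickle persistence above is re-enabled):
# on the English data path the on-disk format is simply three pickled
# dictionaries, written and read roughly like
#   cPickle.dump(self.data_table, open(self.db_word_name, 'wb'), 1)
#   self.data_table = cPickle.load(open(self.db_word_name, 'rb'))
# deleteData() below would then clear exactly those three files
# (db_word_name, db_seq_name1, db_seq_name2) before the data is rebuilt,
# as the test cases do via deleteData()/bindData()/buildDataFromString().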
+# try: +# os.remove(self.db_word_name) +# except OSError, e: +# print >> sys.stderr, "Note: Could not delete file: %s" % e +# try: +# os.remove(self.db_seq_name1) +# except OSError, e: +# print >> sys.stderr, "Note: Could not delete file: %s" % e +# try: +# os.remove(self.db_seq_name2) +# except OSError, e: +# print >> sys.stderr, "Note: Could not delete file: %s" % e + return + + def buildData(self, filenames): + """Load BNC files in XML or SGML format and count the word/POS + occurences and the POS tag sequences.""" + tagged_words = [] + for filename in filenames: + print >> sys.stderr, "Loading %s..." % filename + text = PreTaggedText(filename) + tagged_words.extend(text.getTaggedWords()) + self.word_count = self.word_count + len(tagged_words) +# text.addToData(tagged_words, self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) + return + + def buildDataFromString(self, s): + """Take a string with format "word1/tag1 word2/tag2 ..." and + count the word/POS occurences and the POS tag sequences. + Only useful for the test cases.""" + pairs = re.compile("\s+").split(s) + tagged_words = [] + split_regex = re.compile("/") + for pair in pairs: + pair = split_regex.split(pair) + if len(pair) != 2: + # e.g. punctuation + continue + word = pair[0] + tag = pair[1] + tagged_words.append((word, tag)) + text = TextToTag(self.textlanguage, self.wfinder) +# text.addToData(tagged_words, self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) + return + + def ReadData(self, db_word_name): + self.data_table = {} + self.word_table = {} + table = {} + return + + + def tagFile(self, filename): + """POS-tag the contents of a text file and return XML that contains + the original text with each word's POS tag in the "type" + attribute.""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setFilename(filename) + tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) +# print tagged_words # tktk + xml = text.toXML(tagged_words) + return xml + + def tagText(self, strng): #textchecker check calls + """POS-tag a string and return a list of (word, normalized word, tag) + triples.""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setText(strng) +# print strng + tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) +# print tagged_words # tktk + return tagged_words + + def tagTexttoXML(self, strng): + """POS-tag a string and return a list of (word, normalized word, tag) + triples.""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setText(strng) + tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) + xml = text.toXML(tagged_words) + return xml + + def tagSeq(self, tup): + """Return the probability of a 2-POS-tag sequence.""" + if len(tup) != 2: + #TODO?: throw exception + print >> sys.stderr, "Sequence does not consist of 2 tokens: '%s'" % str(seq) + return None + try: + probability = self.seqs_table_followed_by[tup] + #probability = self.seqs_table_follows[tup] + except KeyError: + probability = 0 + return probability + + def tagSeq2(self, tup): + """Return the probability of a 2-POS-tag sequence.""" + if len(tup) != 2: + #TODO?: throw exception + print >> sys.stderr, "Sequence does not consist of 2 tokens: '%s'" % str(seq) + return None + try: + #probability = self.seqs_table_followed_by[tup] + probability = self.seqs_table_follows[tup] + except KeyError: + probability = 0 + return probability + + def tagWord(self, word): + """See 
Text.tagWord()""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setText("") + tag = text.tagWord(word, self.data_table) + return tag + + def guessTagTest(self, word): + """See Text.guessTags(). For test cases only.""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setText("") + tag = text.guessTags(word) + return tag + + +class Text: + + DUMMY = None + number_regex = re.compile("^(\d|\d+[.,/\-]\d+)+$") + time_regex = re.compile("\d(am|pm)$") + bnc_regex = re.compile("<(w|c) (.*?)>(.*?)<", re.DOTALL) + + mapping_file = os.path.join(sys.path[0], "data", "c7toc5.txt") + manually_tagged_file = os.path.join(sys.path[0], "data", "postags.txt") + + def __init__(self, textlanguage, wfinder): + self.textlanguage = textlanguage + self.wfinder = wfinder + self.count_unambiguous = 0 + self.count_ambiguous = 0 + self.count_unknown = 0 + self.whitespace = re.compile("\s+$") + self.nonword = re.compile("([\s,:;]+)") + self.nonword_punct = re.compile("([,:;]+)") + self.sentence_end = re.compile("([.!?]+)$") + self.bnc_word_regexp = re.compile("<W\s+TYPE=\"(.*?)\".*?>(.*?)</W>", \ + re.DOTALL|re.IGNORECASE) + self.mapping = self.loadMapping() + self.manually_tagged = self.loadManuallyTagged() + return + + def loadMapping(self): + f = open(self.mapping_file) + line_count = 1 + mapping = {} + while 1: + line = f.readline().strip() + if not line: + break + l = re.split("\s+", line) + if not len(l) == 2: + print >> sys.stderr, "No valid mapping in line %d: '%s'" % (line_count, line) + (c7, c5) = l[0], l[1] + if mapping.has_key(c7): + print >> sys.stderr, "No valid mapping in line %d: '%s', duplicate key '%s'" % (line_count, line, c7) + continue + mapping[c7] = c5 + #print "%s -> %s" % (c7, c5) + line_count = line_count + 1 + f.close() + return mapping + + def loadManuallyTagged(self): + table = {} + regex = re.compile("^(.+)\s+(.+?)$") + f = open(self.manually_tagged_file) + while 1: + line = f.readline() + if not line: + break + line = line.strip() + if not line.startswith("#") and line != '': + regex_match = regex.search(line) + if regex_match: + word = regex_match.group(1) + postag = regex_match.group(2) + table[word] = postag + f.close() + return table + + def expandEntities(self, text): + """Take a text and expand a few selected entities. Return the same + text with entities expanded. (We cannot simply parse the file with + DOM, as we don't have an XML DTD -- the original files were SGML.)""" + ### TODO: use Entities module + text = re.compile("&", re.IGNORECASE).sub("&", text) + # TODO: several entities are missing here: + #text = re.compile("&#(x..);", re.IGNORECASE).sub(self.expandHexEntities, text) + text = re.compile("£", re.IGNORECASE).sub("�", text) + return text + + #def expandHexEntities(self, matchobj): + # htmlentitydefs.entitydefs[] + # s = u'\%s' % matchobj.group(1) + # #s = "Y" + # return s + + def getBNCTuples(self, text): + """Return a list of (tag, word) tuples from text if + text is a BNC Sampler text in XML or SGML format. Otherwise + return an empty list. 
The tags are mapped from the C7 tag set + to the much smaller C5 tag set.""" + l = [] + pos = 0 + while 1: + m = self.bnc_regex.search(text, pos) + if not m: + break + tag = m.group(2) + if self.mapping.has_key(tag): + tag = self.mapping[tag] + else: + #print "no mapping: %s" % tag + pass + if m.group(3): + l.append((tag, m.group(3).strip())) + #print "- %s/%s" % (tag, m.group(3).strip()) + pos = m.start()+1 + return l + + def normalise(self, text): + """Take a string and remove XML markup and whitespace at the beginning + and the end. Return the modified string.""" + # sometimes there's <PB...>...</PB> *inside* <W...>...</W>! + text = re.compile("<.*?>", re.DOTALL|re.IGNORECASE).sub("", text) + text = text.strip() + return text + + def splitBNCTag(self, tag): + """Take a string with BNC tags like 'NN1-NP0' and return a list, + e.g. ['NN1', 'NP0']. For single tags like 'NN0' this will + be returned: ['NN0'].""" + tags = re.split("-", tag) + return tags + + def guessTags(self, word): + """Take a word and guess which POS tags it might have and return + those POS tags. This considers e.g. word prefixes, suffixes and + capitalization. If no guess can be made, None is returned.""" + # TODO: return more than one tag + + # �25 etc: + # fixme -- UnicodeDecodeError + #if word.startswith(u"�") or word.startswith(u"$"): + # return 'NN0' + + # numbers: + if self.number_regex.match(word): + return 'CRD' + + # e.g. HIV + if len(word) >= 2 and word == word.upper(): + return 'NN0' + + # this >=3 limit also prevents to assign 'A' (i.e. determiner + # at sentence start) NP0, of course that's only relevant + # for the test cases: + # English only + # TODO: is it okay to use 'latin1' here? + if len(word) >= 3 and word[0] in unicode(string.uppercase, 'latin1'): # e.g. "Jefferson" + return 'NP0' + + # e.g. freedom, contentment, celebration, assistance, fighter, + # violinist, capacity + if self.textlanguage == 'en': + noun = ['dom', 'ment', 'tion', 'sion', 'ance', 'ence', 'er', 'or', + 'ist', 'ness', 'icity'] + for suffix in noun: + if word.endswith(suffix): + return 'NN1' + + # e.g. quickly + if word.endswith("ly"): + return 'AV0' + + # e.g. 8.55am + if self.time_regex.search(word): + return 'AV0' + + # e.g. extensive, heroic, financial, portable, hairy + # mysterious, hopeful, powerless + # 'en' was left out, could also be a verb + if self.textlanguage == 'en': + adj = ['ive', 'ic', 'al', 'able', 'y', 'ous', 'ful', 'less'] + for suffix in adj: + if word.endswith(suffix): + return 'AJ0' + + # e.g. publicize, publicise, activate, simplify + # 'en' was left out, could also be a adjective + verb = ['ize', 'ise', 'ate', 'fy'] + for suffix in verb: + if word.endswith(suffix): + # fixme: could also be VVB + return 'VVI' + + return None + + def tagWord(self, word, data_table): + """Find all possible tags for a word and return a list of tuples: + [(orig_word, normalised_word, [(tag, probability])]""" + orig_word = word + word = self.normalise(word) + #word = re.compile("[^\w' ]", re.IGNORECASE).sub("", word) + + #if word and self.nonword_punct.match(word): + # # punctuation + # return [(orig_word, orig_word, [])] + if (not word) or self.whitespace.match(word): + # word is just white space + return [(orig_word, None, [])] + + if self.manually_tagged.has_key(word): + return [(orig_word, orig_word, [(self.manually_tagged[word], 1)])] + + # sanity check: + #if word.count("'") > 1: + # print >> sys.stderr, "*** What's this, more than one apostroph: '%s'?" 
% word + + # Special cases: BNC tags "wasn't" like this: "<w VBD>was<w XX0>n't" + # Call yourself, but don't indefinitely recurse. + if self.textlanguage == 'en': + special_cases = ("n't", "'s", "'re", "'ll", "'ve") + for special_case in special_cases: + special_case_pos = word.find(special_case) + if special_case_pos != -1 and special_case_pos != 0: + first_part = self.tagWord(word[0:special_case_pos], data_table)[0] + second_part = self.tagWord(special_case, data_table)[0] + tag_results = [] + #TODO: return probability?: + #print second_part + tag_results.append((word[0:special_case_pos], first_part[1], first_part[2])) + tag_results.append((special_case, second_part[1], second_part[2])) + return tag_results + + # TODO?: ignore upper/lower case?, no -- seems to decrease precision + #word = word.lower() #handled by word finder itself + #if not data_table.has_key(word) and len(word) >= 1: + # word = word.lower() + # #if data_table.has_key(word): + # # print "lower: %s" % word + #if not data_table.has_key(word) and len(word) >= 1: + # word = "%s%s" % (word[0].upper(), word[1:]) + # #if data_table.has_key(word): + # # print "upper: %s" % word + + if self.textlanguage != 'en': + rc = self.wfinder.test_it(word) + if rc[0] != '-': + src = rc.split() + # print len(src) + # last returned word exists in .dic file + # that's why this word was found + word = src[len(src)-2] + return [(orig_word, orig_word, [(src [len(src)-1], 1)])] +# return [(orig_word, word, [(src [len(src)-1], 1)])] + if rc[0] == '-': + #if not data_table.has_key(word): + # word is unknown + #print "unknown: '%s'" % word + self.count_unknown = self.count_unknown + 1 + guess_tag = self.guessTags(word) + if guess_tag: + return [(orig_word, orig_word, [(guess_tag, 1)])] +# return [(orig_word, word, [(guess_tag, 1)])] + else: + return [(orig_word, orig_word, [("unknown", 1)])] +# return [(orig_word, word, [("unknown", 1)])] + else: # English case + if not data_table.has_key(word): + # word is unknown + #print "unknown: '%s'" % word + self.count_unknown = self.count_unknown + 1 + guess_tag = self.guessTags(word) + if guess_tag: + return [(orig_word, word, [(guess_tag, 1)])] + else: + return [(orig_word, word, [("unknown", 1)])] + else: + pos_table = data_table[word].table + if len(pos_table) == 1: + # word is unambiguous + self.count_unambiguous = self.count_unambiguous + 1 + return [(orig_word, word, [(pos_table.keys()[0], 1)])] + else: + # word is ambiguous + tag_tuples = [] + for pos_tag in pos_table.keys(): + #print "pos_tag=%s -> %.2f" % (pos_tag, pos_table[pos_tag]) + tag_tuples.append((pos_tag, pos_table[pos_tag])) + self.count_ambiguous = self.count_ambiguous + 1 + return [(orig_word, word, tag_tuples)] + +# def addToData(self, tagged_words, data_table, seqs_table_followed_by, seqs_table_follows): + """Count words and POS tags so they can later be added + to the persistent storage.""" +# tag_list = self.addWords(tagged_words, data_table) +# self.addTagSequences(tag_list, seqs_table_followed_by, seqs_table_follows) +# return + +# def addWords(self, tagged_words, data_table): + """For each word, save the tag frequency to data_table so + it can later be added to the persistent storage. 
Return + a list of all tags.""" +# all_tags_list = [] +# for (word, tag) in tagged_words: + #only for testing if case-insensitivity is better: + #word = word.lower() +# all_tags_list.append(tag) +# tag_list = self.splitBNCTag(tag) +# assert(len(tag_list) == 1 or len(tag_list) == 2) + #print "word/pos_list: %s/%s" % (word, tag_list) +# if data_table.has_key(word): + # word is already known +# word_table = data_table[word].table +# for tag in tag_list: +# if word_table.has_key(tag): +# word_table[tag] = word_table[tag] + 1.0/len(tag_list) + #print "word_table[%s] += %f" % (tag, 1.0/len(tag_list)) +# else: +# word_table[tag] = 1.0/len(tag_list) + #print "word_table[%s] = %f" % (tag, word_table[tag]) +# else: +# word_table = {} +# for tag in tag_list: +# word_table[tag] = 1.0/len(tag_list) + #print "word_table[%s] = %f" % (tag, word_table[tag]) +# data_table[word] = WordData(word, word_table) + # Normalize data_table values so they are probabilities (0 to 1): +# for e in data_table.keys(): +# t = data_table[e].table +# occ_all = 0 +# for occ in t.values(): +# occ_all = occ_all + occ +# for key in t.keys(): +# t[key] = t[key] / occ_all + # debug: + #for e in data_table.keys(): + # print "%s, %s" % (e, data_table[e]) +# return all_tags_list + + def addTagSequences(self, tag_list, seqs_table_followed_by, seqs_table_follows): + """Save information about POS tag tuples to seqs_table.""" + # TODO: add dummy entries? + if len(tag_list) == 0: + return + i = 0 + + ### FIXME: does this work if data is added later? probably not...: + count_followed_by = {} + count_follows = {} + + while 1: + if i >= len(tag_list)-1: + break + tag0 = tag_list[i] + key = () + if self.mapping.has_key(tag0): + tag0 = self.mapping[tag0] + tag1 = tag_list[i+1] + if self.mapping.has_key(tag1): + tag1 = self.mapping[tag1] + try: + seqs_table_followed_by[(tag0,tag1)] = seqs_table_followed_by[(tag0,tag1)] + 1 + except KeyError: + seqs_table_followed_by[(tag0,tag1)] = 1 + try: + count_followed_by[tag0] = count_followed_by[tag0] + 1 + except KeyError: + count_followed_by[tag0] = 1 + + #print "%s/%s" % (tag1, tag0) + try: + seqs_table_follows[(tag1,tag0)] = seqs_table_follows[(tag1,tag0)] + 1 + except KeyError: + seqs_table_follows[(tag1,tag0)] = 1 + try: + count_follows[tag1] = count_follows[tag1] + 1 + except KeyError: + count_follows[tag1] = 1 + i = i + 1 + + # Normalize to 0-1 range: + # TODO: do these numbers become too small, as the Qtag paper states? + for t in seqs_table_followed_by.keys(): + #if t[0] == 'NN0': + # print "%s=%s -- %d" % (t, seqs_table_followed_by[t], count_followed_by[t[0]]) + seqs_table_followed_by[t] = float(seqs_table_followed_by[t]) / float(count_followed_by[t[0]]) + for t in seqs_table_follows.keys(): + seqs_table_follows[t] = float(seqs_table_follows[t]) / float(count_follows[t[0]]) + + #debug: + #print "FOLLOWED BY (norm):" + #for k in seqs_table_followed_by.keys(): + # print "%s -> %s" % (k, seqs_table_followed_by[k]) + #print "FOLLOWS (norm):" + #for k in seqs_table_follows.keys(): + # print "%s -> %s" % (k, seqs_table_follows[k]) + return + + +class TextToTag(Text): + """Any text (also pre-tagged texts from the BNC -- for + testing the tagger).""" + + DUMMY = None + + def __init__(self, textlanguage, wfinder): + # FIXME: not needed, is it? 
(done in base class): + self.textlanguage = textlanguage + self.text = None + Text.__init__(self, self.textlanguage, wfinder) + return + + def setText(self, text): + self.text = text + return + + def setFilename(self, filename): + f = open(filename) + self.text = f.read() + f.close() + return + + def getBestTagSimple(self, tag_tuples): + """Return the most probable tag without taking context into + account. Only useful for testing and checking the baseline.""" + max_prob = 0 + best_tag = None + for tag_tuples_here in tag_tuples: + prob = tag_tuples_here[1] + if prob >= max_prob: + max_prob = prob + best_tag = tag_tuples_here[0] + return best_tag + + def checkBNCMatch(self, i, tagged_list_bnc, word, best_tag, data_table): + """Check for mismatches, i.e. POS tags that differ from the original + tag in BNC. Print out a warning for all those differences and return + 1, otherwise return 0. Note that the BNC's tags are only correct + in 97-98%. If the original tag is 'UNC' and this tagger's tag is + not 'unknown', this is still considered a mismatch.""" + if i >= len(tagged_list_bnc)-1: + print >> sys.stderr, "Index out of range..." + return 0 + if not tagged_list_bnc[i]: + return 0 + word_from_bnc, tags_from_bnc = tagged_list_bnc[i] + #print "%s ?= %s" % (word_from_bnc, word) + if best_tag == 'unknown': + # 'UNC' means unclassified in BNC, assume that this corresponds + # to out 'unknown': + best_tag = 'UNC' + guessed = 1 + if data_table.has_key(word): + guessed = 0 + if not word == word_from_bnc: + print >> sys.stderr, "*** word mismatch: '%s'/'%s'" % (word, word_from_bnc) + #sys.exit() + elif not (best_tag in tags_from_bnc) and \ + tags_from_bnc[0][0] != 'Y': # ignore punctuation tags + print >> sys.stderr, "*** tag mismatch (guessed=%d): got %s/%s, expected %s/%s" % \ + (guessed, word, best_tag, word_from_bnc, tags_from_bnc) + return 1 + #if word == word_from_bnc and guessed: + # print >> sys.stderr, "GOODGUESS" + return 0 + + def getStats(self, count_wrong_tags, is_bnc): + """Get some human-readable statistics about tagging success, + e.g. number and percentage of correctly tagged tokens.""" + sum = self.count_unknown + self.count_unambiguous + self.count_ambiguous + res = "" + if sum > 0: + res = "<!-- Statistics:\n" + res = res + "count_unknown = %d (%.2f%%)\n" % (self.count_unknown, float(self.count_unknown)/float(sum)*100) + res = res + "count_unambiguous = %d (%.2f%%)\n" % (self.count_unambiguous, float(self.count_unambiguous)/float(sum)*100) + res = res + "count_ambiguous = %d (%.2f%%)\n" % (self.count_ambiguous, float(self.count_ambiguous)/float(sum)*100) + #res = res + "sum = %d\n" % sum + if is_bnc: + res = res + "correct tags = %d (%.2f%%)\n" % (sum-count_wrong_tags, float(sum-count_wrong_tags)/float(sum)*100) + #res = res + "count_wrong_tags = %d (%.2f%%)\n" % (count_wrong_tags, float(count_wrong_tags)/float(sum)*100) + res = res + "-->" + return res + + def applyConstraints(self, prev_word, curr_word, next_word, tagged_tuples): + """Some hard-coded and manually written rules that prevent mistaggings by + the probabilistic tagger. Removes incorrect POS tags from tagged_tuples. + Returns nothing, as it works directly on tagged_tuples.""" + # demo rule just for the test cases: + if curr_word and curr_word.lower() == 'demodemo': + self.constrain(tagged_tuples, 'AA') + # ... + return + + def constrain(self, tagged_tuples, pos_tag): + """Remove the pos_tag reading from tagged_tuples. 
Returns nothing, + works directly on tagged_tuples.""" + i = 0 + for t in tagged_tuples: + if t[0] == pos_tag: + del tagged_tuples[i] + i = i + 1 + return + + def applyTagRules(self, curr_word, tagged_word, curr_tag): + """Some hard-coded and manually written rules that extent the + tagging. Returns a (word, normalized_word, tag) triple.""" + # ... + return None + + def tag(self, data_table, seqs_table_followed_by, seqs_table_follows): # z.164 texttag calls + """Tag self.text and return list of tuples + (word, normalized word, most probable tag)""" + self.text = self.expandEntities(self.text) + is_bnc = 0 + word_matches = self.getBNCTuples(self.text) + if len(word_matches) > 0: + # seems like this is a BNC text used for testing + is_bnc = 1 + print >> sys.stderr, "BNC text detected." + else: + word_matches = self.nonword.split(self.text) + # tktk splitted looks \xe1, etc... + # Put sentence end periods etc into an extra element. + # We cannot just split on periods etc. because that would + # break inner-sentence tokens like "... No. 5 ...": + # fixme: only work on the last element (not counting white space) + # FIXME: doesn't work here: "I cannot , she said." + if not is_bnc: + j = len(word_matches)-1 + while j >= 0: + w = word_matches[j] + s_end_match = self.sentence_end.search(w) + if s_end_match: + word_matches[j] = w[:len(w)-len(s_end_match.group(1))] + word_matches.insert(j+1, s_end_match.group(1)) + break + j = j - 1 + +# print "word_matches=%s" % word_matches + i = 0 + tagged_list = [self.DUMMY, self.DUMMY] + tagged_list_bnc = [self.DUMMY, self.DUMMY] + + while i < len(word_matches): + next_token = None + tags = None + if is_bnc: + # word_matches[i] is a (tag,word) tuple + (tag, word) = word_matches[i] + if i+1 < len(word_matches): + (next_token, foo) = word_matches[i+1] + word = self.normalise(word) + tags = self.splitBNCTag(tag) + else: + word = word_matches[i] + if i+1 < len(word_matches): + next_token = word_matches[i+1] + if self.textlanguage == 'en': + if i + 2 < len(word_matches): # english only + # BNC special case: "of course" and some others are tagged as one word! + tuple_word = "%s %s" % (word, word_matches[i+2]) # +2 = jump over whitespace + if data_table.has_key(tuple_word): + #print >> sys.stderr, "*** SPECIAL CASE %d '%s' ..." % (i, tuple_word) + word = tuple_word + i = i + 2 +# +# The next several (6-7) lines avoid not found words +# because of trailing dots. +# + if len(word) >= 1 and word[-1] in ( '.', ',', '?','!', ':', ';', '\'', '\"', '%', '='): + wordend = word[-1]; + word = word[0:-1] + r = Text.tagWord(self, word, data_table) + tagged_list.extend(r) + word = wordend + r = Text.tagWord(self, word, data_table) + tagged_list.extend(r) + + if is_bnc: + for el in r: + # happens e.g. with this (wrong?) 
markup in BNC: + #<W TYPE="CRD" TEIFORM="w">4's</W> + # My tagger tags <4> and <'s>, so there's an offset + # which makes futher comparisons BNC <-> tagger impossible, + # so use this pseudo-workaround and just re-use the tags + # for the <'s>, too: + #print "%s -> %s" % (el[0], tags) + tagged_list_bnc.append((el[0], tags)) + i = i + 1 + + tagged_list.append(self.DUMMY) + tagged_list.append(self.DUMMY) + + # test only: + #result_tuple_list = [] + #i = 0 + #count_wrong_tags = 0 + #for t in tagged_list: + # #print "t=%s" % t + # if t: + # best_tag = self.getBestTagSimple(t[2]) + # if is_bnc: + # wrong_tags = self.checkBNCMatch(i, tagged_list_bnc, t[0], best_tag, data_table) + # count_wrong_tags = count_wrong_tags + wrong_tags + # result_tuple_list.append((t[0], t[1], best_tag)) + # i = i + 1 + #stat = self.getStats(count_wrong_tags) + #print >> sys.stderr, stat + #print result_tuple_list + + ### Constraint-based part: + prev_word = None + next_word = None + i = 0 + for tag_tuples in tagged_list: + prev_word = self.getPrevWord(i, tagged_list) + next_word = self.getNextWord(i, tagged_list) + if tag_tuples and tag_tuples[1]: + self.applyConstraints(prev_word, tag_tuples[0], next_word, tag_tuples[2]) + i = i + 1 + + result_tuple_list = self.selectTagsByContext(tagged_list, seqs_table_followed_by, \ + seqs_table_follows, tagged_list_bnc, is_bnc, data_table) + + i = 0 + for tag_triple in result_tuple_list: + triple = self.applyTagRules(tag_triple[0], tag_triple[1], tag_triple[2]) + if triple: + result_tuple_list[i] = triple + if self.sentence_end.search(tag_triple[0]): + # make sure punctuation doesn't have tags: + result_tuple_list[i] = (tag_triple[0], None, None) + i = i + 1 + + return result_tuple_list + + def selectTagsByContext(self, tagged_list, seqs_table_followed_by, \ + seqs_table_follows, tagged_list_bnc, is_bnc, data_table): + + count_wrong_tags = 0 + tag_probs = {} + i = 0 + for tagged_triple in tagged_list: + if tagged_triple != None and tagged_triple[1] == None: + # ignore whitespace + i = i + 1 + continue + try: + one = tagged_list[i] + two = tagged_list[i+1] + whitespace_jump = 0 + if two and two[1] == None: + two = tagged_list[i+2] + whitespace_jump = whitespace_jump + 1 + two_pos = i + 1 + whitespace_jump + three = tagged_list[i+2+whitespace_jump] + if three and three[1] == None: + three = tagged_list[i+3+whitespace_jump] + whitespace_jump = whitespace_jump + 1 + three_pos = i + 2 + whitespace_jump + except IndexError: + # list end + break + + one_tags = [None] + if one: + one_tags = one[2] + two_tags = [None] + if two: two_tags = two[2] + three_tags = [None] + if three: three_tags = three[2] + + for one_tag in one_tags: + tag_one_prob = 0 + if one_tag: + tag_one_prob = one_tag[1] + + for two_tag in two_tags: + tag_two_prob = 0 + if two_tag: + tag_two_prob = two_tag[1] + + for three_tag in three_tags: + tag_three_prob = 0 + if three_tag: + tag_three_prob = three_tag[1] + + #print "** %s/%s/%s" % (one_tag, two_tag, three_tag) + one_tag_prob = None + if one_tag: one_tag_prob = one_tag[0] + two_tag_prob = None + if two_tag: two_tag_prob = two_tag[0] + three_tag_prob = None + if three_tag: three_tag_prob = three_tag[0] + + seq_prob = 0 + if one: + #print one[0], + #if two: + # print two[0] + try: + k1 = (one_tag_prob, two_tag_prob) + k2 = (two_tag_prob, three_tag_prob) + seq_prob = seqs_table_followed_by[k1] * \ + seqs_table_followed_by[k2] + #print "k1=%s, k2=%s" % (str(k1), str(k2)) + except KeyError: + pass + prob_combined = seq_prob * tag_one_prob + #print "%s, %s, %s: %.7f * 
%.7f = %.7f" % (one_tag_prob, two_tag_prob, \ + # three_tag_prob, seq_prob, tag_one_prob, prob_combined) + k1 = (i, one_tag[0]) + #print "%s = %.7f" % (str(k1), prob_combined) + try: + tag_probs[k1] = tag_probs[k1] + prob_combined + except KeyError: + tag_probs[k1] = prob_combined + if two: + try: + seq_prob = seqs_table_follows[(two_tag_prob, one_tag_prob)] * \ + seqs_table_followed_by[(two_tag_prob, three_tag_prob)] + except KeyError: + pass + prob_combined = seq_prob * tag_two_prob + k2 = (two_pos, two_tag[0]) + try: + tag_probs[k2] = tag_probs[k2] + prob_combined + except KeyError: + tag_probs[k2] = prob_combined + #print "%s = %.7f" % (str(k2), prob_combined) + if three: + try: + seq_prob = seqs_table_follows[(two_tag_prob, one_tag_prob)] * \ + seqs_table_follows[(three_tag_prob, two_tag_prob)] + except KeyError: + pass + prob_combined = seq_prob * tag_three_prob + k3 = (three_pos, three_tag[0]) + try: + tag_probs[k3] = tag_probs[k3] + prob_combined + except KeyError: + tag_probs[k3] = prob_combined + #print "%s = %.7f" % (str(k3), prob_combined) + + orig_word = None + norm_word = None + # the word that falls out of the window is assigned its final tag: + if one: + orig_word = one[0] + norm_word = one[1] + keys = tag_probs.keys() + max_prob = 0 + best_tag = None + for tag_prob in keys: + if tag_prob[0] == i and tag_probs[tag_prob] >= max_prob: + ###print " K=%s, V=%s" % (tag_prob, tag_probs[tag_prob]) + max_prob = tag_probs[tag_prob] + best_tag = tag_prob[1] + tagged_list[i] = (orig_word, norm_word, best_tag) + #print "BEST@%d: %s" % (i, best_tag) + + # this avoids inefficiencies, it's necessary because + # of the tag_probs.keys() call above (which becomes + # too slow otherwise): + for tag_prob in keys: + if tag_prob[0] <= i: + del tag_probs[tag_prob] + + if is_bnc and one: + orig_word = one[0] + if self.textlanguage == 'en': + wrong_tags = self.checkBNCMatch(i, tagged_list_bnc, orig_word, best_tag, data_table) + count_wrong_tags = count_wrong_tags + wrong_tags + + i = i + 1 + + stat = self.getStats(count_wrong_tags, is_bnc) + #print >> sys.stderr, stat + + # remove dummy entries: + tagged_list.pop(0) + tagged_list.pop(0) + tagged_list.pop() + tagged_list.pop() + + return tagged_list + + def getPrevWord(self, i, tagged_list): + """Find the token previous to the token at position i from tagged_list, + ignoring whitespace tokens. Return a tuple (word, tuple_list), + whereas tuple_list is a list of (tag, tag_probability) tuples.""" + j = i-1 + while j >= 0: + (orig_word_tmp, tagged_word_tmp, tag_tuples_tmp) = self.getTuple(tagged_list[j]) + j = j - 1 + if not tagged_word_tmp: + continue + else: + prev = tag_tuples_tmp + return orig_word_tmp + return None + + def getNextWord(self, i, tagged_list): + """Find the token next to the token at position i from tagged_list, + ignoring whitespace tokens. See self.getPrevToken()""" + j = i + 1 + while j < len(tagged_list): + (orig_word_tmp, tagged_word_tmp, tag_tuples_tmp) = self.getTuple(tagged_list[j]) + j = j + 1 + if not tagged_word_tmp: + continue + else: + next = tag_tuples_tmp + return orig_word_tmp + return None + + def getTuple(self, tagged_list_elem): + if not tagged_list_elem: + orig_word = None + tagged_word = None + tag_tuples = None + else: + (orig_word, tagged_word, tag_tuples) = tagged_list_elem + return (orig_word, tagged_word, tag_tuples) + + + def toXML(self, tagged_words): + "Show result as XML." 
+ xml_list = [] + for (orig_word, word, tag) in tagged_words: + # fast appending: + if not word and not tag: + xml_list.append(' <w>%s</w>\n' % orig_word) + else: + xml_list.append(' <w term="%s" type="%s">%s</w>\n' % (word, tag, orig_word)) + xml = "<taggedWords>\n" + string.join(xml_list, "") + "</taggedWords>\n" + return xml + + +class PreTaggedText(Text): + "Text from the BNC Sampler in XML format." + + def __init__(self, filename): + self.content = None + Text.__init__(self) + f = open(filename) + self.content = f.read() + f.close() + return + + def getTaggedWords(self): + "Returns list of tuples (word, tag)" + text = self.expandEntities(self.content) + word_matches = self.getBNCTuples(text) + tagged_words = [] + for (tag, word) in word_matches: + tagged_words.append((word, tag)) + return tagged_words + + +class WordData: + "A term and the frequency of its tags." + + def __init__(self, word, affix, table): + self.word = word + self.affix = affix + # table = tag / number of occurences + # deep copy the hash table (TODO: use deep copy functions): + self.table = {} + for el in table: + self.table[el] = table[el] + return + + def __str__(self): + "Show word data (debugging only!)" + string = self.word + ":\n" + for el in self.table: + string = string + "\t" + el + ": " + str(self.table[el]) + "\n" + return string diff --git a/languagetool/src/TaggerTest.py b/languagetool/src/TaggerTest.py new file mode 100644 index 0000000..c94f233 --- /dev/null +++ b/languagetool/src/TaggerTest.py @@ -0,0 +1,168 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import unittest +import Tagger + +import os +import sys + +class TaggerTestCase(unittest.TestCase): + + FILENAME_WORDS = os.path.join(sys.path[0], "data", "tag_test_words") + FILENAME_SEQ1 = os.path.join(sys.path[0], "data", "tag_test_sequences1") + FILENAME_SEQ2 = os.path.join(sys.path[0], "data", "tag_test_sequences2") + + def cleanList(self, l): + """Return a copy of the list with 'None' elements (e.g. whitespace) + removed. Also, only the first and last element of each triple is + copied.""" + new_list = [] + for el in l: + if el[1]: + new_list.append((el[0], el[2])) + return new_list + + def cleanListAll(self, l): + """Return a copy of the list with 'None' elements (e.g. whitespace) + removed. 
Also, only the last element of each triple is copied.""" + new_list = [] + for el in l: + if el[1]: + new_list.append(el[2]) + return new_list + + def tag(self, learn_text, text): + + # build data: + tagger = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2) + tagger.deleteData() + tagger.bindData() + tagger.buildDataFromString(learn_text) + tagger.commitData() + tagger = None + + # tag text: + tagger2 = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2) + tagger2.bindData() + res = tagger2.tagText(text) + res = self.cleanList(res) + tagger2.deleteData() + + return res + + def testExpandEntities(self): + tagger = Tagger.Text("en", None) + r = tagger.expandEntities("") + self.assertEqual(r, "") + r = tagger.expandEntities("bla &&") + self.assertEqual(r, "bla &&") + #r = tagger.expandEntities("bla £") + #self.assertEqual(r, u"bla £") + return + + def testGuess(self): + tagger = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2) + tagger.deleteData() + tagger.bindData() + tagger.buildDataFromString("") # don't learn at all! + tagger.commitData() + + tag = tagger.guessTagTest("") + self.assertEqual(tag, None) + + # numbers = CRD: + self.assertEqual(tagger.guessTagTest("0"), 'CRD') + self.assertEqual(tagger.guessTagTest("3123.1312"), 'CRD') + self.assertEqual(tagger.guessTagTest("00,99"), 'CRD') + self.assertEqual(tagger.guessTagTest("00/99"), 'CRD') + self.assertEqual(tagger.guessTagTest("1-99"), 'CRD') + + # BNC Sampler tags "$xx" as NNU, which is mapped to NN0 (same for £): + self.assertEqual(tagger.guessTagTest("$31.12"), 'NN0') + self.assertEqual(tagger.guessTagTest("HIV"), 'NN0') + self.assertEqual(tagger.guessTagTest("8.55pm"), 'AV0') + self.assertEqual(tagger.guessTagTest("10.10pm"), 'AV0') + self.assertEqual(tagger.guessTagTest(u"Großekathöfer"), 'NP0') + self.assertEqual(tagger.guessTagTest("jackerfoodom"), 'NN1') + self.assertEqual(tagger.guessTagTest("testious"), 'AJ0') + self.assertEqual(tagger.guessTagTest("testize"), 'VVI') + self.assertEqual(tagger.guessTagTest("foofooly"), 'AV0') + self.assertEqual(tagger.guessTagTest("unguessablexxx"), None) + self.assertEqual(tagger.guessTagTest("verboten"), None) + return + + def testLearningAndTagging(self): + + print "###########1" + + #FIXME: doesn't work: + r = self.tag("The/AT0 fat/AJ0 man/NN1", "The big man") + self.assertEqual(r, [('The', 'AT0'), ('big', 'unknown'), ('man', 'NN1')]) + + print "###########2" + return #FIXME + + r = self.tag("The/AT0 fat/AJ0 man/NN1", "the xxx") + # the/unknown because the tagger is case sensitive: + self.assertEqual(r, [('the', 'unknown'), ('xxx', 'unknown')]) + + r = self.tag("The/AT0 fat/AJ0 man/NN1", "The fat man") + self.assertEqual(r, [('The', 'AT0'), ('fat', 'AJ0'), ('man', 'NN1')]) + + r = self.tag("A/DET cool/AJ0 large/AJ0 car/NN1", "A cool car") + self.assertEqual(r, [('A', 'DET'), ('cool', 'AJ0'), ('car', 'NN1')]) + + # fat occurs 2 times as NN1 and 1 time as AJ0, but context decides: + r = self.tag("""The/DET fat/NN1 is/VB hot/AJ0 + The/DET fat/AJ0 guy/NN1 + A/DET man/NN1 used/VBD fat/NN1""", + "A fat man") + self.assertEqual(r, [('A', 'DET'), ('fat', 'AJ0'), ('man', 'NN1')]) + + # fat occurs 3 times as NN1 and 0 times as AJ0 -> tagged as NN1 of course: + r = self.tag("""The/DET fat/NN1 is/VB hot/AJ0 + A/DET fat/NN1 man/NN1 . 
+ He/PP used/VBD fat/NN1""", "A fat man") + self.assertEqual(r, [('A', 'DET'), ('fat', 'NN1'), ('man', 'NN1')]) + + # fat occurs 1 times as NN1 and 2 times as AJ0 -> tagged as AJ0 + r = self.tag("""The/DET fat/AJ0 is/VB hot/AJ0 + A/DET fat/AJ0 man/NN1 . + He/PP used/VBD fat/NN1""", "A fat man") + self.assertEqual(r, [('A', 'DET'), ('fat', 'AJ0'), ('man', 'NN1')]) + + r = self.tag("""The/DET fat/AJ0 man/NN is/VB fat/AJ0 ./PP""", + "A fat man he is fat.") + self.assertEqual(r, [('A', 'unknown'), ('fat', 'AJ0'), ('man', 'NN'), + ('he', 'unknown'), ('is', 'VB'), ('fat', 'AJ0')]) + + return + + #FIXME + #def testApplyConstraints(self): + # r = self.tag("A/X bla/X demodemo/AA demodemo/AA demodemo/BB bla/X bla/X", \ + # "demodemo") + # self.assertEqual(r, [('demodemo', 'BB')]) + # + # return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/Tools.py b/languagetool/src/Tools.py new file mode 100644 index 0000000..5bed1b1 --- /dev/null +++ b/languagetool/src/Tools.py @@ -0,0 +1,58 @@ +# -*- coding: iso-8859-1 -*- +# Tools class +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import sys +import re + +class Tools: + + def __init__(self): + return + + def getXML(node, xmlstr=""): + """Get the XML content of a node, but only elements and text.""" + if node and node.nodeType == node.ELEMENT_NODE: + l = [] + for child in node.childNodes: + l.append(Tools.getXML(child, xmlstr)) + xmlstr = "<%s>%s</%s>" % (node.tagName, str.join('', l), node.tagName) + elif node and node.nodeType == node.TEXT_NODE: + xmlstr = "%s%s" % (xmlstr, node.data) + return xmlstr + + getXML = staticmethod(getXML) + + def countLinebreaks(s): + matches = re.findall("[\n\r]", s) + #print "#%s -> %s" % (s, len(matches)) + return len(matches) + + countLinebreaks = staticmethod(countLinebreaks) + + def getLanguageName(shortName): + if shortName == 'en': + return 'English' + elif shortName == 'de': + return 'German' + elif shortName == 'hu': + return 'Hungarian' + return None + + getLanguageName = staticmethod(getLanguageName) diff --git a/languagetool/src/Wfdeu.py b/languagetool/src/Wfdeu.py new file mode 100755 index 0000000..89b26fc --- /dev/null +++ b/languagetool/src/Wfdeu.py @@ -0,0 +1,70 @@ +# -*- coding: iso-8859-1 -*- +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2004 .... +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +import array +import codecs +import os +from string import * +import sys + +class Wfdeu: + + encoding = "latin1" + + def __init__(self): + return + + def getTyp(self,typ, oword, word): + if typ != "": + if typ == 'V' or typ == 'HV': + if oword[-4:] == 'ende' or oword[-5:-1] == 'ende': + typ = 'ADJV' + if typ == 'V' or typ == 'HV': + if oword[-1:] == 'e': + typ = typ + '11' + elif oword[-2:] == 'st': + typ = typ + '12' + elif oword[-2:] == 'en': + typ = typ + '14' + elif oword[-2:] == 'et': + typ = typ + '15' + elif oword[-1:] == 't': + typ = typ + '13' + elif typ == 'ADJ': + if oword[-2:] == 'er': + typ = 'ADJER' + elif oword[-2:] == 'en': + typ = 'ADJEN' + elif oword[-2:] == 'em': + typ = 'ADJEM' + elif oword[-2:] == 'es': + typ = 'ADJES' + elif oword[-1:] == 'e': + typ = 'ADJE' + elif typ == 'NMS': + if oword[-2:] == 'in': + typ = 'NFS' + elif oword[-5:] == 'innen': + typ = 'NF' + if typ[0] == 'N': + if word != oword and typ[-1:] == 'S': + typ = typ[0:-1] + return typ + + + diff --git a/languagetool/src/Wfhun.py b/languagetool/src/Wfhun.py new file mode 100755 index 0000000..3514ca1 --- /dev/null +++ b/languagetool/src/Wfhun.py @@ -0,0 +1,88 @@ +# -*- coding: iso-8859-1 -*- +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2004 .... +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +import array +import codecs +import os +from string import * +import sys + +class Wfhun: + + encoding = "latin1" + + def __init__(self): + return + + def getTyp(self,typ, oword, word): + dif = len(oword) - len(word) + if (typ[0] == 'V' or typ[0:2] == 'SI') and word != oword: + ik = '' + telo = 'SI' + if typ[0] == 'V': + telo = 'V' + if oword[0:2] != word[0:2]: + ik = 'IK' + if oword[-3:] in (u'iük','iuk', 'nak', 'nek','tak', 'tek') or oword[-2:] in (u'ák', u'ék'): + typ = ik + telo + '6' + elif oword[-3:] in ('tok','tek', u'tök'): + typ = ik + telo + '5' + elif oword[-3:] in (u'ünk','unk', u'ánk', u'énk') or oword[-2:] in ('uk', u'ük'): + typ = ik + telo + '4' + elif oword[-2:] in ('sz','od', 'ed', u'öd',u'éd','ad',u'ád'): + typ = ik + telo + '2' + elif oword[-2:] in ('ok','ek',u'ök','om','em',u'öm', u'ám', u'ém', 'am'): + typ = ik + telo + '1' + elif oword[-2:] in ('va', 've') or oword[-3:] in (u'ván', u'vén'): + typ = 'ADV' + elif oword[-2:] == 'ni': + typ = 'INF' + else: + typ = ik + telo + '3' + elif typ[0:3] == 'PP4': + if oword != 'mi': + typ = 'ADV' + elif typ[0:3] == 'ADJ': + if oword[-2:] in ('ek','ok', 'ak', u'ék', u'ák') and dif > 0 and (dif < 3 or ((word[0:1] != oword[0:1]) and dif < 9)): + typ = 'ADJP' + elif oword[-1:] in (u'é',u'á') and dif > 0 and (dif < 5 or ((word[0:1] != oword[0:1]) and dif < 12)): + typ = 'ADV' + elif oword[-2:] in ('an', 'en', 'bb','ul',u'ül') and dif == 2: + typ = 'ADV' + elif dif != 0: + typ = 'ADV' + elif typ[0] == 'N': + if oword[-1] == 'k' and oword[-2] in ('a',u'á', 'e',u'é','i',u'í','o',u'ó',u'ö',u'õ','u',u'ú',u'ü',u'û') and dif > 0 and dif < 3 : + typ = 'NP' + elif oword[-1:] == 'i' and dif == 1: + typ = 'DNA' + elif (oword[-1:] in(u'ú', u'û') and dif == 1) or (oword[-2:] in (u'jú', u'jû') and dif == 2): + typ = 'ADJS' + elif typ == 'N': + if oword[-1] == 'k' and oword == word: + typ = 'NP' + else: + typ = 'NS' + elif dif >= 2: + typ = 'N' + if typ[0] == 'N' and oword == word and word[-1] != 'k': + typ = typ+'N' + return typ + + + diff --git a/languagetool/src/Wfinder.py b/languagetool/src/Wfinder.py new file mode 100644 index 0000000..7ba1935 --- /dev/null +++ b/languagetool/src/Wfinder.py @@ -0,0 +1,568 @@ +# -*- coding: iso-8859-1 -*- +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2004 .... +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# usage python stem.py +# +# file test.txt contains are for example: +# carried +# worked +# play +# +# example aff file (dtest.aff) +# SFX D Y 4 +# SFX D 0 e d # abate->abated +# SFX D y ied [^aeiou]y # carry -> carried +# SFX D 0 ed [^ey] # work -> worked +# SFX D 0 ed [aeiuu]y # play -> played +# +# example dic file (dtest.dic) +# 3 +# carry/D +# work/D +# play/D +# +# reads words from the file test.txt +# +# Speed up 9 times by helding different +# append endings in different arrays 3.July, 2004 +# +# Speed improvement by 30% by doing the above +# also with the prefixes, and by helding +# affixes and prefixes in different lists. 4. July, 2004 +# + +import array +import codecs +import os +import Tagger +import Wfdeu +import Wfhun +from string import * +import time +import sys + + +#aff_file = "dtest.aff" +#dic_file = "dtest.dic" +#test_file = "test.txt" +yesno = {} +comment = "#" +condlist = [] +condlist1 = [] +alfab_conddic = {} +palfab_conddic = {} +alfab_condlist_group = [] +alfab2_condlist_group = [] +alfab2_conddic = {} +palfab2_conddic = {} +alfab2_condlist_group = [] +szodic = {} +typdic = {} + +class Wfinder: + + encoding = "latin1" + doubleflags = "" + doubleflagList="" + + def __init__(self, textlanguage): +# print time.strftime('%X %x %Z') + self.is_initialized = 0 + self.is_secondflag = 0 + self.textlanguage = textlanguage + self.wfdeu = Wfdeu.Wfdeu() + self.wfhun = Wfhun.Wfhun() + return + + def aff_read(self): + self.aff_file = os.path.join(sys.path[0], "data", Tagger.affFile) + condlist = [] + alfab_condlist_group = [] + alfab2_condlist_group = [] + faff = codecs.open(self.aff_file, "r", self.encoding) + l = " " + for i in range(0,256,1): + alfab_conddic[i] = [] + palfab_conddic[i] = [] + alfab2_conddic[i] = [] + palfab2_conddic[i] = [] + while l != "": + l = faff.readline() + ll = l.split() + if len(ll) <= 1: + continue + if ll[0][0] in comment: + continue + if ll[0][1:3] == "FX": + arrname = ll[1] + prefix = 0 + if ll[0][0] == 'P': + prefix = 1 + yesno[arrname] = ll[2] + for i in range(0, int(ll[3])): + l = faff.readline() + bb = l.split() +# print "%s %d" %(bb,len(bb)) +# print "l:%s bb[2]:%s arrname:%s" %(l,bb[2], arrname) + strip = bb[2] + if bb[2] == '0': + strip = ''; + appnd = bb[3] + if bb[3] == '0': + appnd = '' + appnd_last = '0' + else: + if prefix == 0: + appnd_last = appnd[-1] + else: + appnd_last = appnd[0] + if bb[4] != '.': + jj = 0 + while(jj < len(bb[4])): + condarr = array.array('B',range(256)) + insbit = 1; + for iii in range(0,256,1): + condarr[iii] = 0 + if bb[4][jj] == '[': + kk = 0; + jj = jj + 1 + if bb[4][jj] == '^': + jj = jj+1 + insbit = 0; + for iii in range(0,256,1): + condarr[iii] = 1 + while bb[4][jj] != ']': + condarr[ord(bb[4][jj])] = insbit; + jj = jj + 1 + if bb[4][jj] == ']': + jj = jj +1 + else: + condarr[ord(bb[4][jj])] = insbit; + jj = jj +1 + condlist.append(condarr) + secondflag = "" + if len(bb) >= 7: + secondflag = bb[6] + self.is_secondflag = 1 + if find(self.doubleflags,arrname) == -1: + self.doubleflags = self.doubleflags+arrname + for elem in secondflag: + if find(self.doubleflagList,elem) == -1: + self.doubleflagList = self.doubleflagList+elem +# print "is_sec:%d" % self.is_secondflag + alfab2_condlist_group.append(condlist) + alfab2_condlist_group.append(strip) + alfab2_condlist_group.append(appnd) + 
alfab2_condlist_group.append(arrname) + alfab2_condlist_group.append(secondflag) + if prefix == 0: + alfab2_conddic[ord(appnd_last)].append(alfab2_condlist_group) + else: + palfab2_conddic[ord(appnd_last)].append(alfab2_condlist_group) + alfab_condlist_group.append(condlist) + alfab_condlist_group.append(strip) + alfab_condlist_group.append(appnd) + alfab_condlist_group.append(arrname) + if prefix == 0: + alfab_conddic[ord(appnd_last)].append(alfab_condlist_group) + else: + palfab_conddic[ord(appnd_last)].append(alfab_condlist_group) +# print "appended %s to %s %d" %(appnd.encode('latin1'), appnd_last.encode('latin1'), ord(appnd_last)) + condlist = [] + alfab_condlist_group = [] + alfab2_condlist_group = [] + faff.close() +# print self.doubleflags +# for i in range (0,255,1): +# print len(alfab_conddic[i]) +# print alfab_conddic[ord('a')] + +# +# Now read the dictionary +# + def dic_read(self): + self.dic_file = os.path.join(sys.path[0], "data", Tagger.dicFile) + szoszam = 0; + fdic = codecs.open(self.dic_file, "r", self.encoding) + l = " " + szolista = [] + ujlista = [] + l = fdic.readline() + szoszam = int(l) + while l != "": + l = fdic.readline() + szolista = l.split("/") + for szo in szolista: + szo = szo.strip('\n \t') + ujlista.append(szo) + if len(ujlista) > 1: + szodic[ujlista[0]] = ujlista[1] + else: + szodic[ujlista[0]] = "" + if len(ujlista) > 2: + typdic[ujlista[0]] = ujlista[2] + else: + typdic[ujlista[0]] = "" + ujlista = [] + fdic.close() + + def do_keytest(self,l): + if l == "": + return "" + if szodic.has_key(l): + return "+ %s" %l + else: + return "- %s" %l + + def suffix2_search(self, l, oarrname, oword): + retval = "" + found = 0 + for windex in ord(l[-1]), ord('0'): + for elem in alfab2_conddic[windex]: + # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname +# print "s2_s l:%s oarr:%s elem[4]:%s app:%s strip:%s" % (l, oarrname, elem[4],elem[2],elem[1] ) + if found: + return retval + if find(elem[4], oarrname) == -1: + continue + # + # search first only suffixes + # since prefix is optional + # + appnd = elem[2] + if len(appnd): + if l[-len(appnd):] != appnd: + continue +# if len(appnd): + restoredWord = l[0:len(l)-len(appnd)] + else: + restoredWord = l + condlist = elem[0] + strip = elem[1] + if len(strip): + restoredWord = restoredWord + strip + break_it = 0 + if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk + substr = restoredWord[-len(condlist):] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue + + if szodic.has_key(restoredWord): + flags = szodic[restoredWord] +# print "s22_s: %s %d %s %s %s %s %s" % (restoredWord,szodic.has_key(restoredWord),elem[3], oarrname, elem[4], oarrname, flags) + if flags == "": # tktk + continue + else: + if find(flags, elem[3]) == -1: + continue + retval = "++ %s %s" %(oword,restoredWord) + found = 1 + return retval + return retval + + + def suffix_search(self, l, oldl, oarrname): + retval = "" + found = 0 + for windex in ord(l[-1]), ord('0'): + for elem in alfab_conddic[windex]: + # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname + if found: + return retval + # + # search first only suffixes + # since prefix is optional + # + appnd = elem[2] + if len(appnd): + if l[-len(appnd):] != appnd: + continue + restoredWord = l[0:len(l)-len(appnd)] + else: + restoredWord = l + condlist = elem[0] + strip = elem[1] + if len(strip): + restoredWord = restoredWord + strip + break_it = 0 +# print "%s %s %s %s" 
%(restoredWord,appnd,strip, elem[3]) + if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk + substr = restoredWord[-len(condlist):] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue + if szodic.has_key(restoredWord): + flags = szodic[restoredWord] + if flags == "": # tktk + continue + else: + if find(flags, elem[3]) == -1: + continue + if oarrname != "" and find(flags, oarrname) == -1: + continue + if oldl != "": + retval = "+++ %s %s %s" %(oldl, l,restoredWord) + else: + retval = "++ %s %s" %(l,restoredWord) + found = 1 + return retval + # print windex + return retval + + def suffix22_search(self, l, oldl, oarrname): + retval = "" + found = 0 + for windex in ord(l[-1]), ord('0'): + for elem in alfab_conddic[windex]: + # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname +# print "s.d:%s e3:%s app:%s str:%s" % (self.doubleflags, elem[3], elem[2],elem[1]) + if find(self.doubleflagList, elem[3]) == -1: + continue + if found: + return retval + # + # search first only suffixes + # since prefix is optional + # +# print "s22x l:%s oldl:%s oarrname:%s appnd:%s strip:%s" % (l, oldl, oarrname, elem[2], elem[1]) + appnd = elem[2] + if len(appnd): + if l[-len(appnd):] != appnd: + continue + restoredWord = l[0:len(l)-len(appnd)] + else: + restoredWord = l + condlist = elem[0] + strip = elem[1] + if len(strip): + restoredWord = restoredWord + strip + break_it = 0 +# print "s22: %s %s %s %s" %(restoredWord,appnd,strip, elem[3]) + if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk + substr = restoredWord[-len(condlist):] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue +# print "s->s2, rw:%s e3:%s" % (restoredWord, elem[3]) + rval = self.suffix2_search(restoredWord, elem[3], l) + if rval != "": + found = 1 + retval = rval + return rval + # print windex + return retval + + def prefix_search(self, l): + found = 0 + retval = "" + for windex in ord(l[0]), ord('0'): + for elem in palfab_conddic[windex]: + if found: + return retval + appnd = elem[2] + if appnd == l[:len(appnd)]: # cut the matching prefix + l1 = l[len(appnd):] + else: + continue + condlist = elem[0] + strip = elem[1] + if len(strip): + l1 = strip + l1 + break_it = 0 + if len(condlist) > 0 and len(l1) >= len(condlist): #tktk + substr = l1[0:len(condlist)] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue + # + # prefix without suffix + # + arrname = elem[3] + if szodic.has_key(l1): + flags1 = szodic[l1] + if flags1 != "": + if find(flags1, arrname) == -1: + continue + retval = "++ %s %s" %(l,l1) + found = 1 + return retval + + if lower(yesno[arrname]) == 'n': + continue +# +# check if this unprefixed word +# is a valid suffixed one +# + retval = self.suffix_search(l1, l, arrname) + if retval != "": + found = 1 + return retval + return retval + + def prefix22_search(self, l): + found = 0 + retval = "" + for windex in ord(l[0]), ord('0'): + for elem in palfab_conddic[windex]: + if found: + return retval +# print "str:%s app:%s e3:%s dfl:%s df:%s" % (elem[1],elem[2], elem[3],self.doubleflagList,self.doubleflags) + if find(self.doubleflagList, elem[3]) == -1 and find(self.doubleflags, elem[3]) == -1: + continue + appnd = elem[2] + if appnd == l[:len(appnd)]: # cut the matching prefix + l1 = l[len(appnd):] + else: + continue + condlist = elem[0] + strip = elem[1] 
+ if len(strip): + l1 = strip + l1 + break_it = 0 + if len(condlist) > 0 and len(l1) >= len(condlist): #tktk + substr = l1[0:len(condlist)] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue + # + # prefix without suffix + # + arrname = elem[3] +# print "p22->s2 l1:%s e3:%s l:%s" %(l1,elem[3],l) + rval = self.suffix2_search(l1, elem[3],l) + if rval != "": + found = 1 + retval = rval + return rval + + if lower(yesno[arrname]) == 'n': + continue +# +# check if this unprefixed word +# is a valid suffixed one +# +# print "ps l1:%s l:%s arrn:%s" % (l1, l, arrname) + retval = self.suffix22_search(l1, "", "") + if retval != "": + found = 1 + return retval + return retval + + + def do_test(self,l): + if l == "": + return "" + else: + oldword = l + found = 0 +# print "ss l:%s" %l + retval = self.suffix_search(l, "", "") + if retval != "": + found = 1 + return retval +# +# searched all suffixes and not found +# now try to combine all prefixes with all suffixes +# that allow combinations +# +# print "sp l:%s" %l + retval = self.prefix_search(l) + if retval != "": + found = 1 + return retval + + if self.is_secondflag: +# print "s22 l:%s" %l + retval = self.suffix22_search(l, "", "") + if retval != "": + found = 1 + return retval +# print "p22 l:%s" %l + retval = self.prefix22_search(l) + if retval != "": + found = 1 + return retval + + return "- %s" % oldword + + def test_it(self,l): + if self.is_initialized == 0: + self.aff_read() + self.dic_read() + self.is_initialized = 1 + lcasetest = 0 + result = self.do_keytest(l) + if result[0] == '-': + lu = l[0] + if lu != lu.lower(): + l1 = lu[0].lower()+l[1:] + if l1 != l: + lcasetest = 1; + result = self.do_keytest(l1) + # + # in languages not German more likely to find + # a lower case word than an uppercase + # + if result[0] == '-' and self.textlanguage != 'de': + tmp = l1 + l1 = l + l = tmp + if result[0] == '-': + result = self.do_test(l) + if result[0] == '-' and lcasetest == 1: + result = self.do_test(l1) + typ = '' + if result[0] != '-': + src = result.split() + word = src[len(src) - 1] + oword = src[1] + typ = typdic[word] +# print typ + " " + oword[-1:] + " " +oword[-2:] +# +# Here are the language specific rules of each language +# + if self.textlanguage == 'de': + typ = self.wfdeu.getTyp(typ, oword, word) + elif self.textlanguage == 'hu': +# print word+" "+oword+" "+typ + typ = self.wfhun.getTyp(typ, oword, word) +# +# end of language specific rules for new languages +# +# print typ + result = result + " " + typ +# print result + return result + + diff --git a/languagetool/src/client.py b/languagetool/src/client.py new file mode 100644 index 0000000..c3826ba --- /dev/null +++ b/languagetool/src/client.py @@ -0,0 +1,28 @@ +#!/usr/bin/python +# daniel.naber@t-online.de, 2003-05-02 +# This is just a test to show how a TextChecker server can be called + +import socket + +sentence = "A sentence bigger then a short one." + +server_name = "127.0.0.1" +server_port = 50100 + +print "Test client for socket_server.py" +print "Connecting %s, port %d..." % (server_name, server_port) +s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +s.connect(("127.0.0.1", 50100)) +print "Connected." +cfg = '<config textlanguage="en" mothertongue="de" grammar="COMP_THAN" />\n' +s.sendall("%s<text>%s</text>" % (cfg, sentence)) +print "Data sent, waiting for reply..." 
+data = "" +while 1: + received = s.recv(1024) + data = "%s%s" % (data, received) + if not received: + break +s.close() +print "Received reply:" +print data diff --git a/languagetool/src/query.py b/languagetool/src/query.py new file mode 100644 index 0000000..b34a1ff --- /dev/null +++ b/languagetool/src/query.py @@ -0,0 +1,249 @@ +#!/usr/bin/python +# Query BNC data files in XML format +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# for debugging only +import cgitb +cgitb.enable() + +#import profile + +import cPickle +import cgi +import os +import re +import re +import sys +import time + +os.chdir(sys.path[0]) +sys.path.append(sys.path[0]) +import TagInfo + +data_dir = "/data/bnc/xml_data" +context = 4 +limit = 30 +tags_str = "AJ0,AJC,AJS,AT0,AV0,AVP,AVQ,CJC,CJS,CJT,\ +CRD,DPS,DT0,DTQ,EX0,ITJ,NN0,NN1,NN2,NP0,ORD,PNI,PNP,\ +PNQ,PNX,POS,PRF,PRP,PUL,PUN,PUQ,PUR,TO0,UNC,VBB,VBD,\ +VBG,VBI,VBN,VBZ,VDB,VDD,VDG,VDI,VDN,VDZ,VHB,VHD,VHG,\ +VHI,VHN,VHZ,VM0,VVB,VVD,VVG,VVI,VVN,VVZ,XX0,ZZ0" + +tags = re.split(",", tags_str) +sentence_count = 0 +word_count = 0 +matches = 0 +regex = re.compile("(<S.*?</S>)", re.DOTALL) +words_regex = re.compile("(<[WC].*?</[WC]>)", re.DOTALL) +type_regex = re.compile("TYPE=\"(.*?)\"") +word_regex = re.compile(">(.*?)</[WC]>") + +def query(search_tokens, filename): + global sentence_count + global word_count + global limit + global matches + global tags + t1 = time.time() + tokens = buildList(filename) + #print "T=%.2f<br>" % (time.time()-t1) + t1 = time.time() + #print tokens + match_pos = 0 + pos = 0 + for word,tag in tokens: + if tag == 'S_BEGIN': + sentence_count = sentence_count + 1 + word_count = word_count + 1 + if tags.count(search_tokens[match_pos]) > 0: + compare = tag + else: + compare = word + if compare == search_tokens[match_pos] or search_tokens[match_pos] == '_': + match_pos = match_pos + 1 + else: + match_pos = 0 + #print match_pos + if match_pos == len(search_tokens): + if matches+1 > limit: + return None + print "%d." 
% (matches+1) + print niceFormat(tokens[pos-context:pos+context], \ + context-len(search_tokens)+1, len(search_tokens)) + sys.stdout.flush() + matches = matches + 1 + match_pos = 0 + pos = pos + 1 + #print "T2=%.2f<br>" % (time.time()-t1) + return 1 + +def niceFormat(tokens, rel_pos, match_len): + l = [] + count = 0 + for word,tag in tokens: + if count >= rel_pos and count < rel_pos+match_len: + l.append('<b>%s<span class="tag">/%s</span></b>' % (word,tag)) + elif tag == 'PUN': + l.append(word) + else: + l.append('%s<span class="tag">/%s</span>' % (word,tag)) + count = count + 1 + return str.join(' ', l) + "<br>" + +def buildList(filename): + # Speed up: + pickle_filename = "%s.pickle" % filename + if os.path.exists(pickle_filename): + #print "Loading pickled data from %s<br>" % pickle_filename + t1 = time.time() + tokens = cPickle.load(open(pickle_filename)) + #print "Tpickle=%.2f<br>" % (time.time()-t1) + return tokens + + f = open(filename) + content = f.read() + f.close() + global regex + global words_regex + global type_regex + global word_regex + + sentences = regex.findall(content) + tokens = [] + for s in sentences: + #print "X" + words = words_regex.findall(s) + tokens.append(('', 'S_BEGIN')) + for w in words: + w = w.replace("\n", " ") + #print w + type_match = type_regex.search(w) + if not type_match: + print "*** no type_match!?" + continue + type_str = type_match.group(1) + word_match = word_regex.search(w) + word = word_match.group(1).strip() + #print "%s/%s" % (word, type_str) + tokens.append((word, type_str)) + tokens.append(('', 'S_END')) + # Prepare speed up for next search: + cPickle.dump(tokens, open(pickle_filename, 'w'), 1) + return tokens + +def queryFiles(tokens, dir_name): + os.chdir(dir_name) + dir_contents = os.listdir(".") + dir_contents.sort() + c = 0 + for filename in dir_contents: + if filename.endswith(".xml"): + c = c + 1 + print "Found %d *.xml files in %s<br>" % (c, dir_name) + w = 0 + s = 0 + m = 0 + f_count = 1 + for name in dir_contents: + if os.path.isdir(name): + queryFiles(tokens, name) + elif name.endswith(".xml"): + print "<strong>%.3d. %s</strong>, so far %d words, %d sentences<br>" % (f_count, name, word_count, sentence_count) + res = query(tokens, name) + if not res: + return + #global_file_count = global_file_count + 1 + #print "<hr />" + sys.stdout.flush() + f_count = f_count + 1 + # for profiling + #if word_count > 200000: + # return + os.chdir("..") + return + +def displayForm(): + taginfo = TagInfo.TagInfo() + print "Content-Type: text/html\n\n" + print """ + <html><head> + <title>BNC Query</title></head> + <body> + <h1>BNC Query</h1> + + <form action="query.py" method="get"> + <table border="0" cellspacing="0" cellpadding="0"> + <tr> + <td>Word/tag sequence:</td> + <td>Context:</td> + <td>Max. 
results:</td> + </tr> + <tr> + <td><input type="text" name="tokens"></td> + <td><select name="context"> + <option value="4">4 </option> + <option>6</option> + <option>8</option> + <option>10</option> + </select></td> + <td><input type="text" name="limit" value="30" size="6" /></input> + <td> </td> + <td><input type="submit" value="Query" /></td> + </tr> + </table> + </form> + <br /> + _ (underline) matches any word + %s + </body> + </html>""" % taginfo.getHTMLCode() + return + +def main(): + global limit + global context + form = cgi.FieldStorage() + if not form.getvalue("tokens"): + displayForm() + return + if form.getvalue("context"): + context = int(form.getvalue("context")) + if form.getvalue("limit"): + limit = int(form.getvalue("limit")) + print "Content-Type: text/html\n\n" + token_display = cgi.escape(form.getvalue("tokens"), 1) + print """<html><head> + <title>BNC query result for '%s'</title> + <style rel="stylesheet"> + <!-- + .tag { color:#999999; } + --> + </style></head> + <body> + <h1>BNC query result for '%s'</h1>""" % (token_display, token_display) + tokens = re.split("\s+", form.getvalue("tokens")) + queryFiles(tokens, data_dir) + print '<p>Queried %d words in %d sentences.' % (word_count, \ + sentence_count) + print '</body></html>' + #print '<pre>' # profiling + return + +main() +#profile.run('main()') diff --git a/languagetool/src/socket_server.py b/languagetool/src/socket_server.py new file mode 100644 index 0000000..81cac5b --- /dev/null +++ b/languagetool/src/socket_server.py @@ -0,0 +1,218 @@ +#!/usr/bin/python +# A server that uses TextChecker.py to check text for style +# and grammar errors +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker + +import ConfigParser +import os +import re +import socket +import sys +import time + +sys.path.append(os.path.join(sys.path[0], "snakespell-1.01")) +from scriptfoundry.snakespell import iSpell + +server_name = "127.0.0.1" +server_port = 50100 +configfile = os.path.join(os.getenv('HOME'), ".kde/share/config/languagetool") + +def makeChecker(grammar_cfg=None, falsefriends_cfg=None, words_cfg=None, \ + builtin_cfg=None, textlanguage=None, mothertongue=None, \ + max_sentence_length=None): + """Create a new TextChecker object and return it.""" + checker = TextChecker.TextChecker(grammar_cfg, falsefriends_cfg, words_cfg, \ + builtin_cfg, textlanguage, mothertongue, max_sentence_length) + return checker + +def loadOptionList(config, enable_name, option_name): + val = None + if config.has_option("General", enable_name) and \ + config.getboolean("General", enable_name): + if config.has_option("General", option_name): + val = re.split(',', config.get("General", option_name)) + else: + val = ["NONE"] + return val + +def loadOptionBoolean(config, option_name): + if config.has_option("General", option_name) and config.getboolean("General", option_name): + return 1 + return None + +def loadOptionString(config, option_name, default): + val = default + if config.has_option("General", option_name): + val = config.get("General", option_name) + return val + +def readConfig(): + """Read the checker config from a KDE config file (INI style). + Return a checker which uses that config.""" + config = ConfigParser.ConfigParser() + try: + config.readfp(open(configfile)) + except IOError: + print "Couldn't load config file '%s', using defaults..." % configfile + grammar = loadOptionList(config, "EnableGrammar", "GrammarRules") + falsefriends = loadOptionList(config, "EnableFalseFriends", "FalseFriendsRules") + words = loadOptionList(config, "EnableWords", "WordsRules") + builtin = [] + if loadOptionBoolean(config, "EnableWhitespaceCheck"): + builtin.append("WHITESPACE") + if len(builtin) == 0: + builtin = None + textlanguage = loadOptionString(config, "TextLanguage", "en") + mothertongue = loadOptionString(config, "MotherTongue", "en") + sentence_length = 0 + if loadOptionBoolean(config, "EnableSentenceLength"): + if config.has_option("General", "MaxSentenceLength"): + sentence_length = config.getint("General", "MaxSentenceLength") + checker = makeChecker(grammar, falsefriends, words, builtin, \ + textlanguage, mothertongue, sentence_length) + return checker + +def getConfig(data): + """Get a new config in pseudo XML format from the client. + It needs to be at the beginning of the string that comes + from the client and must be of form <config ... />. + Returns a tuple with the a checker based on this config and + the 'data' string with the config section removed.""" + print "Receiving new config..." 
+ line_end_pos = data.find("/>") + cfg_str = data[:line_end_pos] + data = data[line_end_pos+3:] + grammar = getConfigValue(cfg_str, "grammar") + falsefriends = getConfigValue(cfg_str, "falsefriends") + words = getConfigValue(cfg_str, "words") + builtin = getConfigValue(cfg_str, "builtin") + textlanguage = getConfigValue(cfg_str, "textlanguage") + if textlanguage: + textlanguage = textlanguage[0] + mothertongue = getConfigValue(cfg_str, "mothertongue") + if mothertongue: + mothertongue = mothertongue[0] + sentence_length = getConfigValue(cfg_str, "max-sentence-length") + if not sentence_length: + sentence_length = 0 + else: + sentence_length = int(sentence_length[0]) + checker = makeChecker(grammar, falsefriends, words, builtin, \ + textlanguage, mothertongue, sentence_length) + return (checker, data) + +def getConfigValue(cfg_str, val): + m = re.compile('%s="(.*?)"' % val).search(cfg_str) + if not m: + return None + s = m.group(1) + l = re.split(',', s) + return l + +def main(): + print "Binding to '%s:%d'..." % (server_name, server_port) + s.bind((server_name, server_port)) + print "Listening..." + s.listen(1) + print "Setting up Checker..." + checker = readConfig() + print "Ready..." + while 1: + conn, addr = s.accept() + if addr[0] != "127.0.0.1": # security + print "Connection by '%s' refused" % addr[0] + conn.close() + continue + else: + print "Connected by '%s'" % addr[0] + + l = [] + limit = 1024 + while 1: + data = conn.recv(limit) + l.append(data) + #FIXME: need to look for separator, not just < limit! + if not data or len(data) < limit: + break + data = str.join('', l) + + print "Received '%s'" % data + if data.find("<config") != -1: + del checker + (checker, data) = getConfig(data) + print "New config activated" + t1 = time.time() + check_result = checkWords(checker, data) + t2 = time.time()-t1 + print "Replying (%.2fs) '%s'" % (t2, check_result.encode('utf8')) + #print "Replying (%.2fs)" % t2 + conn.send(check_result.encode('utf8')) + + conn.close() + s.close() + return + +def checkWordsTEST(words): + """Just for testing. Marks 'working' as incorrect.""" + words = re.split("\s+", words) + s = '<result>' + for w in words: + if w == "working": + s = s + '\t<error word="working" pos="5" corrections="Bohlen,Didda"/>' + s = s + '</result>' + return s + +def checkWords(checker, words): + result = u'<result>' + + ### Spelling: + ispell = iSpell() + words = words.replace("\n", " ") # iSpell works line by line + r = ispell.check(words) + if r > 0: + # fixme: escape word + for mistake in ispell.getMistakes(): + # TODO: make faster + pos = [] + for p in mistake.getPositions(): + result = u'%s<error from="%d" to="%d" word="%s" corrections="%s"/>' % \ + (result, p, p+len(mistake.getWord()), \ + unicode(mistake.getWord(), 'latin1'), \ + unicode(str.join(',', mistake.corrections), ('latin1'))) + + ### Grammar + Style: + (rule_matches, res, tags) = checker.check(words) + # FIXME: only if there's no overlap?! + result = result + res + + result = result + '</result>\n' + return result + +try: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + os.chdir(sys.path[0]) + main() +except KeyboardInterrupt: + # TODO: close explicitely, unfortunately we still get an + # 'Address already in use' error if we restart immediately: + s.shutdown(2) + s.close() + print "Stopped." 
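The reply sent back by socket_server.py is the &lt;result&gt; document assembled in checkWords() above. A minimal client-side sketch — not part of this commit; the attribute names (from, to, word, corrections) are taken from the spelling errors written by checkWords(), and grammar matches appended by TextChecker may carry different attributes — of how the reply collected in client.py could be unpacked:

import xml.dom.minidom

def parseReply(reply):
    # Parse the "<result>...</result>" string returned by the server and
    # collect the <error/> elements, e.g. parseReply(data) with the 'data'
    # string accumulated in client.py above.
    errors = []
    doc = xml.dom.minidom.parseString(reply.strip())
    for node in doc.getElementsByTagName("error"):
        errors.append((node.getAttribute("from"),
            node.getAttribute("to"),
            node.getAttribute("word"),
            node.getAttribute("corrections").split(",")))
    return errors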
diff --git a/languagetool/src/tag.py b/languagetool/src/tag.py new file mode 100644 index 0000000..7ab713b --- /dev/null +++ b/languagetool/src/tag.py @@ -0,0 +1,152 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# A frontend to a probabilistc part-of-speech tagger (see the QTag paper) +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Usage examples: +# 1) ./tag.py -b /data/bnc_sampler/train/* +# 2) ./tag.py -t /data/bnc_sampler/test/fcf + +import re +import sys +import string +import getopt +import profile + +import Tagger +import Entities + +class Controller: + "Main program." + + TAG = 0 + BUILD = 1 + TAGWORD = 2 + TAGSEQ = 3 + + def __init__(self): + return + + def usage(self): + print >> sys.stderr, "Usage: ./tagger.py <--build|--tag|--tagword> <filename...>" + print >> sys.stderr, " -h, --help this help information" + print >> sys.stderr, " -t, --tag tag any text files" + print >> sys.stderr, " -b, --build train the tagger using BNC XML files" + print >> sys.stderr, " -w, --wordtag tag any word" + print >> sys.stderr, " -s, --seqtag probability for any 2-tag-sequence" + # TODO: better help (e.g. 'build' adds to existing index (?)) + return + + def sanityCheck(self, filename, xml): + """Sanity check: all <w>...</w> together == original file?""" + words = re.compile("<w.*?>(.*?)</w>", re.DOTALL).findall(xml) + words_string = string.join(words, "") + # Load original file: + f = open(filename) + orig_contents = f.read() + f.close() + if orig_contents != words_string: + print >> sys.stderr, "*** Warning: joined output doesn't match original file!" + print >> sys.stderr, "*** (can be ignored if the file is a BNC file)" + return + + def run(self): + try: + (options, rest) = getopt.getopt(sys.argv[1:], 'htbws', + ['help', 'build', 'tag', 'wordtag', 'seqtag']) + except getopt.GetoptError, e: + print >> sys.stderr, "Error: %s" % e + self.usage() + sys.exit(1) + mode = self.TAG + for o, a in options: + if o in ("-h", "--help"): + self.usage() + sys.exit(0) + elif o in ("-t", "--tag"): + mode = self.TAG + elif o in ("-b", "--build"): + mode = self.BUILD + elif o in ("-w", "--wordtag"): + mode = self.TAGWORD + elif o in ("-s", "--seqtag"): + mode = self.TAGSEQ + if not rest: + self.usage() + sys.exit(1) + + if mode == self.BUILD: + tagger = Tagger.Tagger() + tagger.bindData() + tagger.buildData(rest) + tagger.commitData() + elif mode == self.TAG: + tagger = Tagger.Tagger() + tagger.bindData() + for filename in rest: + f = open(filename) + content = f.read() + f.close() + content = Entities.Entities.cleanEntities(content) + xml = tagger.tagTexttoXML(content) + self.sanityCheck(filename, xml) + print xml + print >> sys.stderr, "Done." 
+ elif mode == self.TAGWORD: + tagger = Tagger.Tagger() + tagger.bindData() + for word in rest: + r = tagger.tagWord(word) + print r + elif mode == self.TAGSEQ: + tagger = Tagger.Tagger() + tagger.bindData() + if len(rest) > 1 and rest[1] != '*': + key = (rest[0], rest[1]) + prob = tagger.tagSeq(key) + print prob + else: + # TODO: don't duplicate code from query.py: + tags_str = "AJ0,AJC,AJS,AT0,AV0,AVP,AVQ,CJC,CJS,CJT," + tags_str = tags_str + "CRD,DPS,DT0,DTQ,EX0,ITJ,NN0,NN1,NN2,NP0,ORD,PNI,PNP," + tags_str = tags_str + "PNQ,PNX,POS,PRF,PRP,PUL,PUN,PUQ,PUR,TO0,UNC,VBB,VBD," + tags_str = tags_str + "VBG,VBI,VBN,VBZ,VDB,VDD,VDG,VDI,VDN,VDZ,VHB,VHD,VHG," + tags_str = tags_str + "VHI,VHN,VHZ,VM0,VVB,VVD,VVG,VVI,VVN,VVZ,XX0,ZZ0," + # these are not in query.py: + tags_str = tags_str + "YBL,YBR,YCOL,YCOM,YDSH,YEX,YLIP,YQUE,YQUO,YSCOL,YSTP" + tags = re.split(",", tags_str) + sum = 0 + items = 0 + for tag in tags: + key = (rest[0], tag) + prob = tagger.tagSeq(key) + prob2 = tagger.tagSeq2(key) + if prob > 0 or prob2 > 0: + sum = sum + prob + print "%s followed by %s -> %.10f" % (key[0], key[1], prob) + print "%s follows %s -> %.10f" % (key[0], key[1], prob2) + items = items + 1 + print "items=%d, sum=%.5f" % (items, sum) + return + +### Main program + +prg = Controller() +prg.run() +#profile.run('prg.run()', 'fooprof') |
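Wfinder's affix search is only exercised indirectly through Tagger in this commit. A small stand-alone sketch — an assumption-laden illustration, not part of the commit, presuming the .aff/.dic files referenced through Tagger.affFile and Tagger.dicFile exist under data/ — of calling it directly:

import Wfinder

# "de" and "hu" additionally apply the Wfdeu/Wfhun type rules; "en" does not.
finder = Wfinder.Wfinder("en")
for word in ("carried", "worked", "play"):
    # test_it() reads the aff/dic data on first use and returns a string
    # such as "+ <word> <type>", "++ <word> <stem> <type>" or "- <word>"
    # when neither the dictionary nor any affix rule matches.
    print finder.test_it(word)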
