| author | Arno Teigseth <arno@teigseth.no> | 2011-01-31 05:34:56 +0000 |
|---|---|---|
| committer | Arno Teigseth <arno@teigseth.no> | 2011-01-31 05:34:56 +0000 |
| commit | 1afa96100bcb613c86533698f8a9d1115e63391e | |
| tree | 07c754e874bcbc95eeaa21abc35d4bc84158f4fb | languagetool/src |
| parent | 635a3c7c275c00748c56736b4eb593b651223edd | |
Added very basic pre-beta version of LanguageTool. Builds, though :)
Diffstat (limited to 'languagetool/src')
24 files changed, 4707 insertions, 0 deletions
diff --git a/languagetool/src/.cvsignore b/languagetool/src/.cvsignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/languagetool/src/.cvsignore @@ -0,0 +1 @@ +*.pyc diff --git a/languagetool/src/Chunker.py b/languagetool/src/Chunker.py new file mode 100644 index 0000000..fc0cfd3 --- /dev/null +++ b/languagetool/src/Chunker.py @@ -0,0 +1,127 @@ +# -*- coding: iso-8859-1 -*- +# Assign chunks to a tagged text +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import os +import re +import sys + +class Chunker: + """Assign chunks (like "noun phrase") to a tagged text.""" + + def __init__(self): + return + + def setRules(self, rules): + """Use the rules from this Rules object for the chunk() method.""" + self.rules = rules + return + + def chunk(self, tagged_text): + """Take a POS tagged text and find all its chunks. Returns + a list of (from, to, chunk_name) tuples where the from/to positions + refer to the list position. Only parts of the list may be + covered by chunks. There are no overlappings.""" + l = [] + + tagged_text_pos = 0 + while 1: + if tagged_text_pos >= len(tagged_text): + break + word, norm_word, tag = tagged_text[tagged_text_pos] + + for rule in self.rules.rules: + #print "### %s" % rule.name + match_start = None + match_end = None + pattern_pos = 0 + pos_corr = 0 + + rule_match = 1 + cont = 1 + + while 1: + #print " %d,%d,%d" % (tagged_text_pos,pattern_pos,pos_corr) + try: + tag = tagged_text[tagged_text_pos+pattern_pos+pos_corr][2] + except IndexError: + #print "index error" + break + #print "%s ?= %s (pp=%d, ttp=%d)" % (tag, rule.pattern[pattern_pos], pattern_pos, tagged_text_pos) + if pattern_pos == 0 and tag == None: + cont = 0 + break + if tag == None: + # ignore whitespace + pos_corr = pos_corr + 1 + continue + if tag != rule.pattern[pattern_pos]: + rule_match = 0 + break + if match_start == None: + match_start = tagged_text_pos + + pattern_pos = pattern_pos + 1 + if pattern_pos == len(rule.pattern): + #print "match (%s)! tagged_text_pos=%d" % (rule.name, tagged_text_pos) + match_end = match_start + pattern_pos + pos_corr - 1 + l.append((match_start, match_end, rule.name)) + tagged_text_pos = tagged_text_pos + (match_end - match_start) + cont = 0 + break + if not rule_match: + continue # next rule + if not cont: + break # next word + tagged_text_pos = tagged_text_pos + 1 + + #print l + return l + +class Rules: + """A container for chunking rules.""" + + chunk_rules = os.path.join(sys.path[0], "data", "chunks.txt") + + def __init__(self): + """Read the chunking rules from data/chunks.txt. 
The rules + can then be access via Rules.rules.""" + self.rules = [] + f = open(self.chunk_rules) + lines = f.readlines() + f.close() + for line in lines: + if line.startswith("#"): # ignore comments + continue + rule = Rule(line.strip()) + self.rules.append(rule) + return + +class Rule: + """A chunking rule, consisting of a name and a pattern. The + pattern is a list of POS tags.""" + + def __init__(self, line): + """Parse a chunk rule in this format: + name: tag1 tag2...""" + parts = re.split("\s+", line.strip()) + name = parts[0] + self.name = name[0:len(name)-1] # cut off colon + self.pattern = parts[1:] + return diff --git a/languagetool/src/ChunkerTest.py b/languagetool/src/ChunkerTest.py new file mode 100644 index 0000000..eb8889e --- /dev/null +++ b/languagetool/src/ChunkerTest.py @@ -0,0 +1,78 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import re +import unittest + +import Chunker + +class LocalRules: + + def __init__(self, rule_list): + self.rules = rule_list + return + +class ChunkerTestCase(unittest.TestCase): + + def testChunking(self): + c = Chunker.Chunker() + r1 = Chunker.Rule("NP1: AT0 NN1 NN1") + r2 = Chunker.Rule("NP2: AT0 NN1") + rules = LocalRules([r1, r2]) + c.setRules(rules) + + tagged_text = self._makeList("Blah/XX the/AT0 house/NN1 foo/YY") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, [(2, 4, 'NP2')]) + + tagged_text = self._makeList("Blah/XX house/NN1 foo/YY") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, []) + + tagged_text = self._makeList("the/AT0 summer/NN1 house/NN1 foo/YY2") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, [(0, 4, 'NP1')]) + + # more than one chunk: + + tagged_text = self._makeList("the/AT0 summer/NN1 is/VB a/AT0 hit/NN1") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, [(0, 2, 'NP2'), (6, 8, 'NP2')]) + + tagged_text = self._makeList("the/AT0 summer/NN1 a/AT0 hit/NN1") + chunks = c.chunk(tagged_text) + self.assertEqual(chunks, [(0, 2, 'NP2'), (4, 6, 'NP2')]) + + return + + def _makeList(self, s): + parts = re.split("(\s+)", s) + l = [] + for part in parts: + word = None + word_norm = None + tag = None + pair = re.split("/", part) + if len(pair) == 2: + word, tag = pair + word_norm = word + else: + word = pair[0] + l.append((word, word_norm, tag)) + return l diff --git a/languagetool/src/EnglishTest.py b/languagetool/src/EnglishTest.py new file mode 100644 index 0000000..358d26c --- /dev/null +++ b/languagetool/src/EnglishTest.py @@ -0,0 +1,62 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can 
redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker +import LanguageTest +from LanguageTest import ExpMatch + +class EnglishTestCase(LanguageTest.LanguageTest): + + def setUp(self): + self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \ + words=None, builtin=None, textlanguage="en", mothertongue="de", \ + max_sentence_length=20, debug_mode=0) + return + + def testSomeRules(self): + """Some English rule checks. Requires a trained tagger.""" + + self._check("A sentence without problems.", None) + self._check("This is bigger then blah.", ExpMatch("COMP_THAN", 15, 19)) + self._check("English/German false friend: my chef", ExpMatch("CHEF", 32, 36)) + self._check("Whitespace,here it's lacking.", ExpMatch("WHITESPACE", 11, 12)) + + self._check("he good good.", ExpMatch("WORD_REPEAT", 7, 12)) + + self._check("I ask you because of him.", None) + self._check("Of cause not.", ExpMatch("OF_CAUSE", 3, 8)) + self._check("he is nice.", None) + + self._check("This is a stoopid test.", None) + # TODO: error not detected: + self._check("The baseball team are established.", None) + + self._check("I definitely think is should be less than four years.", + ExpMatch("IS_SHOULD", 19, 21)) + + self._check("Peter's car is bigger then mine, and this isa spelling error.", + ExpMatch("COMP_THAN", 22, 26)) + + self._check("Peter's car is bigger then mine, and and a word repeat.", + [ExpMatch("COMP_THAN", 22, 26), ExpMatch("WORD_REPEAT", 34, 38)]) + + return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/Entities.py b/languagetool/src/Entities.py new file mode 100644 index 0000000..615bd8b --- /dev/null +++ b/languagetool/src/Entities.py @@ -0,0 +1,68 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import re + +class Entities: + """Some(!) 
BNC SGML entities.""" + + def cleanEntities(s): + """Replace only the most common BNC entities with their + ASCII respresentation.""" + entities = { "amp" : "&", + "pound": "P", # fixme: use "£" + "eacute": "e", + "aacute": "a", + "bquo": "\"", + "equo": "\"", + "ecirc": "e", + "quot": "'", + #"deg": u"°", + "dollar": "$", + "agrave": "á", + "egrave": "é", + "percnt": "&", + "ndash": "-", + "mdash": "--", + "hellip": "...", + "lsqb": "[", + "rsqb": "]", + "uuml": "ü", #fixme: use ü + "auml": "ä", # see above! + "ouml": "ö", + "Uuml": "Ü", + "Auml": "Ä", + "Ouml": "Ö", + "szlig": "ß" + } +# print "in entities %s"%s + try: + for key in entities: + #s = re.compile("&%s;?" % key).sub("%s" % entities[key].encode('latin1'), s) + s = s.replace("&%s;" % key, entities[key]) + s = s.replace("&%s" % key, entities[key]) + except TypeError: + # FIXME: what to do here?! + print >> sys.stderr, "TypeError: '%s'" % s + return s + + cleanEntities = staticmethod(cleanEntities) + +if __name__ == "__main__": + main() diff --git a/languagetool/src/GermanTest.py b/languagetool/src/GermanTest.py new file mode 100755 index 0000000..5575b5e --- /dev/null +++ b/languagetool/src/GermanTest.py @@ -0,0 +1,41 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker +import LanguageTest +from LanguageTest import ExpMatch + +class GermanTestCase(LanguageTest.LanguageTest): + + def setUp(self): + self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \ + words=None, builtin=None, textlanguage="de", mothertongue="de", \ + max_sentence_length=20, debug_mode=0) + return + + def testSomeRules(self): + """Some English rule checks. Requires a trained tagger.""" + + self._check(u"Ich gehe daß er sieht", ExpMatch("DASS", 4, 12)) + self._check(u"Ich gehe.", None) + self._check(u"Ich gehst.", ExpMatch("ICH", 0, 9)) + return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/HungarianTest.py b/languagetool/src/HungarianTest.py new file mode 100755 index 0000000..cb6b0a5 --- /dev/null +++ b/languagetool/src/HungarianTest.py @@ -0,0 +1,39 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker +import LanguageTest +from LanguageTest import ExpMatch + +class HungarianTestCase(LanguageTest.LanguageTest): + + def setUp(self): + self.checker = TextChecker.TextChecker(grammar=None, falsefriends=None, \ + words=None, builtin=None, textlanguage="hu", mothertongue="de", \ + max_sentence_length=20, debug_mode=0) + return + + def testSomeRules(self): + """Some English rule checks. Requires a trained tagger.""" + self._check(u"Én mész moziba", ExpMatch("EN", 0, 7)) + self._check(u"Õk soha nem fogják megtanulni.", None) + return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/LanguageTest.py b/languagetool/src/LanguageTest.py new file mode 100644 index 0000000..ee4f2b2 --- /dev/null +++ b/languagetool/src/LanguageTest.py @@ -0,0 +1,68 @@ +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker + +import unittest + +class LanguageTest(unittest.TestCase): + + def _check(self, sentence, expectedErrors): + (rule_matches, output, tagged_text) = self.checker.check(sentence) + rule_matches.sort() + if expectedErrors == None: + if len(rule_matches) != 0: + print "Expected no errors, found %d" % len(rule_matches) + print "Sentence: %s" % sentence + self.fail() + elif isinstance(expectedErrors, list): + if len(rule_matches) != len(expectedErrors): + print "Expected %d errors, found %d" % (len(expectedErrors), len(rule_matches)) + print "Sentence: %s" % sentence + self.fail() + i = 0 + for expError in expectedErrors: + self._checkError(sentence, rule_matches[i], expError) + i = i + 1 + else: + if len(rule_matches) != 1: + print "Expected 1 error, found %d" % len(rule_matches) + print "Sentence: %s" % sentence + self.fail() + self._checkError(sentence, rule_matches[0], expectedErrors) + return + + def _checkError(self, sentence, rule_match, expectedError): + self.assertEqual(rule_match.id, expectedError.error_type) + if rule_match.from_pos != expectedError.from_pos or \ + rule_match.to_pos != expectedError.to_pos: + print "Expected error from %d to %d, found error from %d to %d" % \ + (expectedError.from_pos, expectedError.to_pos, rule_match.from_pos, \ + rule_match.to_pos) + print "Sentence: %s" % sentence + self.fail() + return + +class ExpMatch: + + def __init__(self, error_type, from_pos, to_pos): + self.error_type = error_type + self.from_pos = from_pos + self.to_pos = to_pos + return diff --git a/languagetool/src/Rules.py b/languagetool/src/Rules.py new file mode 100644 index 0000000..551e519 --- /dev/null +++ b/languagetool/src/Rules.py @@ -0,0 +1,632 @@ +# -*- coding: iso-8859-1 -*- +# Class for Grammar and Style Rules +#$rcs = ' $Id$ ' ; +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import Tools +import codecs # tktk + +import copy +import os +import re +import string +import sys +import xml.dom.minidom +from string import * + +# FIXME: +grammarFile = 'engrammar.xml' +wordFile = 'enwords.xml' +falsefriendsFile = 'enfalse_friends.xml' + +class Rule: + """Style or grammar rule -- quasi virtual class.""" + + def __init__(self, rule_id, message, false_positives, language): + self.rule_id = rule_id + self.message = message + # errors per 100 sentences in the BNC, i.e. 
mostly false positives: + self.false_positives = false_positives + self.language = language # two letter code like "en" or None (= relevant for alle languages) + return + + # match() is not defined here, but in the sub classes + +class Rules: + """All known style and grammar error rules (from XML and the built-in ones).""" + + python_rules_dir = "python_rules" + + def __init__(self, max_sentence_length, grammar_rules, word_rules, \ + builtin_rules, false_friend_rules, textlanguage, mothertongue): + """Parse all rules and put them in the self.rules list, together + with built-in rules like the SentenceLengthRule.""" + self.textlanguage = textlanguage + if textlanguage == 'en': + self.rule_files = [os.path.join(sys.path[0], "rules", grammarFile), + os.path.join(sys.path[0], "rules", wordFile), + os.path.join(sys.path[0], "rules", falsefriendsFile)] + else: + self.rule_files = [os.path.join(sys.path[0], "rules", grammarFile)] + self.rules = [] + + # dynamically load rule files from the "python_rules" dir: + sys.path.append(self.python_rules_dir) + dyn_files = os.listdir(self.python_rules_dir) + for filename in dyn_files: + if textlanguage == 'en': + if filename[0:2] != 'en' and filename[0:3] != 'all': + continue + elif textlanguage == 'de': + if filename[0:2] != 'de' and filename[0:3] != 'all': + continue + elif textlanguage == 'hu': + if filename[0:2] != 'hu' and filename[0:3] != 'all': + continue + if not filename.endswith(".py") or filename.endswith("Test.py"): + continue + filename = filename[:-3] # cut off ".py" + exec("import %s" % filename) + try: + exec("dynamic_rule = %s.%s()" % (filename, filename)) + except AttributeError: + print filename + raise InvalidFilename(filename) + if not hasattr(dynamic_rule, "match"): + raise MissingMethod("match", "%s.py" % filename) + if dynamic_rule.rule_id == "SENTENCE_LENGTH" and \ + max_sentence_length != None: + dynamic_rule.setMaxLength(max_sentence_length) + # do not use the rule if it wasn't activated + # (builtin_rules == None will use all rules): + if not builtin_rules or dynamic_rule.rule_id in builtin_rules: + self.rules.append(dynamic_rule) + + for filename in self.rule_files: + # minidom expects the DTD in the current directory, not in the + # documents directory, so we have to chdir to 'rules': + dir_temp = os.getcwd() + os.chdir(os.path.dirname(filename)) + doc = xml.dom.minidom.parse(os.path.basename(filename)) + os.chdir(dir_temp) + if filename.endswith(grammarFile): + rule_nodes = doc.getElementsByTagName("rule") + for rule_node in rule_nodes: + rule = PatternRule(rule_node) + lang_ok = 0 + if self.textlanguage == None or self.textlanguage == rule.language: + lang_ok = 1 + if lang_ok and (grammar_rules == None or rule.rule_id in grammar_rules): + self.rules.append(rule) + elif filename.endswith("words.xml"): + rule_nodes = doc.getElementsByTagName("rule") + for rule_node in rule_nodes: + rule = PatternRule(rule_node) + lang_ok = 0 + if self.textlanguage == None or self.textlanguage == rule.language: + lang_ok = 1 + if lang_ok and (word_rules == None or rule.rule_id in word_rules): + self.rules.append(rule) + elif filename.endswith("false_friends.xml"): + pattern_nodes = doc.getElementsByTagName("pattern") + for pattern_node in pattern_nodes: + lang = pattern_node.getAttribute("lang") + if self.textlanguage == None or lang == self.textlanguage: + rule = PatternRule(pattern_node.parentNode, 1, mothertongue, textlanguage) + if rule.valid and (false_friend_rules == None or \ + rule.rule_id in false_friend_rules): + 
self.rules.append(rule) + return + +class InvalidFilename(Exception): + + def __init__(self, value): + self.value = value + return + + def __str__(self): + s = "Constructor must be named as the file, i.e. '%s'" % self.value + return s + +class MissingMethod(Exception): + + def __init__(self, value, filename): + self.value = value + self.filename = filename + return + + def __str__(self): + s = "The '%s' method needs to be implemented in %s" % (self.value, self.filename) + return s + +class WhitespaceRule(Rule): + """A rule that matches punctuation not followed by a whitespace + and whitespace preceding punctuation. This rule does not work + on sentence level, it works on complete tagged texts or paragraphs.""" + + punct = "[.,?!:;]" + punct_regex = re.compile("^%s+$" % punct) + whitespace_regex = re.compile("^\s+$") + after_punct_regex = re.compile("^[\"]+$") + number_regex = re.compile("^\d+$") + whitespace_before_punct = re.compile("^\s+%s" % punct) + + def __init__(self): + Rule.__init__(self, "WHITESPACE", "Insert a space character before punctuation.", 0, None) + return + + def getNextTriple(self, tagged_words, pos): + """Get the next triple form the tagged_words list, starting at + pos but ignoring all SENT_START and SENT_END tags.""" + tag = tagged_words[pos][2] + while tag == 'SENT_START' or tag == 'SENT_END': + pos = pos + 1 + if pos >= len(tagged_words): + return None + tag = tagged_words[pos][2] + return tagged_words[pos] + + def match(self, tagged_words, chunks=None, position_fix=0, line_fix=0, column_fix=0): + """Check if a sentence contains whitespace/token sequences + that are against the 'use a space after, but not before, a token' + rule.""" + matches = [] + text_length = 0 + line_breaks = 1 + column = 0 + i = 0 + while 1: + if i >= len(tagged_words)-1: + break + org_word = tagged_words[i][0] + line_breaks_cur = Tools.Tools.countLinebreaks(org_word) + if line_breaks_cur > 0: + column = 0 + line_breaks = line_breaks + line_breaks_cur + org_word_next = self.getNextTriple(tagged_words, i+1) + if org_word_next: + org_word_next = org_word_next[0] + text_length = text_length + len(org_word) + if tagged_words[i][1] == None: + # ignore whitespace + if line_breaks_cur == 0: + column = column + len(org_word) + i = i + 1 + continue + whitespace_length = len(tagged_words[i+1][0]) + if line_breaks_cur == 0: + column = column + len(org_word) + if self.punct_regex.match(org_word) and not (org_word.endswith("\n") or org_word.endswith("\r")): + word_next = tagged_words[i+1][1] + word_next = self.getNextTriple(tagged_words, i+1) + if word_next: + word_next = word_next[1] + if word_next and self.number_regex.match(word_next): + # don't complain about "24,000" etc. 
+ i = i + 1 + continue + if word_next and (not self.after_punct_regex.match(org_word_next)) and \ + (not self.whitespace_regex.match(org_word_next)): + matches.append(RuleMatch(self.rule_id, text_length, text_length + len(org_word), + line_breaks+line_fix, + column+column_fix, + "Usually a space character is inserted after punctuation.")) + elif self.whitespace_before_punct.match(org_word): + if not self.punct_regex.match(org_word_next): + matches.append(RuleMatch(self.rule_id, text_length, text_length + len(org_word), + line_breaks+line_fix, column+column_fix, + "Usually no space character is inserted before punctuation.")) + i = i + 1 + return matches + +class PatternRule(Rule): + """A rule that can be formalised in the XML configuration file.""" + + def __init__(self, node, is_false_friend_node=None, mothertongue=None, textlang=None): + """Build an object by parsing an XML rule node.""" + if node == None: + # for the test cases. They use setVars(). + return + if is_false_friend_node: + self.parseFalseFriendsRuleNode(node, mothertongue, textlang) + else: + self.parseRuleNode(node) + return + + def parseRuleNode(self, rule_node): + self.rule_id = rule_node.getAttribute("id") + if not self.rule_id: + # FIXME? rule_id is not unique... + self.rule_id = rule_node.parentNode.getAttribute("id") + self.pattern = rule_node.getElementsByTagName("pattern")[0].childNodes[0].data.strip() + token_strings = re.split("\s+", self.pattern) + self.tokens = [] + for token_string in token_strings: + token = Token(token_string) + self.tokens.append(token) + pattern_node = rule_node.getElementsByTagName("pattern")[0] + self.language = pattern_node.getAttribute("lang") + marker_from_att = pattern_node.getAttribute("mark_from") + if marker_from_att: + self.marker_from = int(marker_from_att) + else: + self.marker_from = 0 + marker_to_att = pattern_node.getAttribute("mark_to") + if marker_to_att: + self.marker_to = int(marker_to_att) + else: + self.marker_to = 0 + self.case_sensitive = 0 + if rule_node.getElementsByTagName("pattern")[0].getAttribute("case_sensitive") == 'yes': + #print "*** %s" % rule_node.getElementsByTagName("pattern")[0].getAttribute("case_sensitive") + self.case_sensitive = 1 + if rule_node.getElementsByTagName("message"): + self.message = Tools.Tools.getXML(rule_node.getElementsByTagName("message")[0]) + else: + self.message = Tools.Tools.getXML(rule_node.parentNode.getElementsByTagName("message")[0]) + example_nodes = rule_node.getElementsByTagName("example") + self.example_good = "" + self.example_bad = "" + for example_node in example_nodes: + # TODO?: only one good and one bad example currently supported: + if example_node.getAttribute("type") == 'correct': + self.example_good = Tools.Tools.getXML(example_node.childNodes[0]) + else: + self.example_bad = Tools.Tools.getXML(example_node.childNodes[0]) + self.false_positives = None # None = unknown + if rule_node.getElementsByTagName("error_rate"): + error_rate_node = rule_node.getElementsByTagName("error_rate")[0] + warnings = error_rate_node.getAttribute("warnings") + sentences = error_rate_node.getAttribute("sentences") + try: + if int(sentences) != 0: + error_rate = float(warnings) / float(sentences) * 100 + self.false_positives = error_rate + except ValueError: + pass + return + + def parseFalseFriendsRuleNode(self, rule_node, mothertongue, textlang): + # This is only called for rule nodes that have a pattern + # element with the relevant language. 
+ self.rule_id = rule_node.parentNode.getAttribute("id") + pattern_node = rule_node.getElementsByTagName("pattern")[0] + self.language = rule_node.getAttribute("lang") + # Now look for the correct translation: + trans_nodes = rule_node.getElementsByTagName("translation") + self.valid = 0 # useless object because no translation was found + translations = [] + for trans_node in trans_nodes: + trans_lang = trans_node.getAttribute("lang") + if trans_lang == mothertongue: + self.valid = 1 + trans_str = trans_node.childNodes[0].data + translations.append(trans_str) + if self.valid: + self.case_sensitive = 0 + self.pattern = rule_node.getElementsByTagName("pattern")[0].childNodes[0].data.strip() + repl_word, repl_trans = self.getOtherMeaning(rule_node.parentNode, mothertongue, textlang) + l = [] + for elem in repl_trans: + l.append("<em>%s</em>" % elem) + repl_trans_str = str.join(', ', l) + self.message = "'%s' means %s. " % (self.pattern, str.join(', ', translations)) + if repl_word: + self.message = self.message + " Did you maybe mean '%s', which is %s?" % \ + (repl_word, repl_trans_str) + #print "#%s" % self.message.encode('latin1') + token_strings = re.split("\s+", self.pattern) + self.tokens = [] + for token_string in token_strings: + token = Token('"%s"' % token_string) # quotes = it's a word (not a POS tag) + self.tokens.append(token) + #print "#%s" % token + self.marker_from = 0 + self.marker_to = 0 + return + + def getOtherMeaning(self, rulegroup_node, mothertongue, textlang): + """Get the word (and its correct translations) that the user + maybe meant when he used a false friend. Returns a tuple + (word, [translations]).""" + replace_nodes = rulegroup_node.getElementsByTagName("pattern") + word = None + translations = [] + for replace_node in replace_nodes: + repl_lang = replace_node.getAttribute("lang") + if repl_lang == mothertongue: + word = replace_node.childNodes[0].data + trans_nodes = replace_node.parentNode.getElementsByTagName("translation") + for trans_node in trans_nodes: + trans_lang = trans_node.getAttribute("lang") + #print "#%s, %s" % (trans_lang, textlang) + if trans_lang == textlang: + self.valid = 1 + trans_str = trans_node.childNodes[0].data + translations.append(trans_str) + return (word, translations) + + def setVars(self, rule_id, pattern, message, marker_from, marker_to, \ + example_good, example_bad, case_sensitive, false_positives, language): + """Manually initialize the pattern rule -- for test cases only.""" + self.rule_id = rule_id + self.message = message + self.false_positives = false_positives + self.language = language + self.marker_from = marker_from + self.marker_to = marker_to + self.example_good = example_good + self.example_bad = example_bad + self.case_sensitive = case_sensitive + self.tokens = [] + token_strings = re.split("\s+", pattern) + for token_string in token_strings: + token = Token(token_string) + self.tokens.append(token) + return + + def match(self, tagged_words, chunks=None, position_fix=0, line_fix=0, column_fix=0): + """Check if there are rules that match the tagged_words. 
Returns a list + of RuleMatch objects.""" + matches = [] + ct = 0 + tagged_words_copy = tagged_words # no copy, just a refernce + last_match = None + + #print self.rule_id + #print tagged_words_copy + for word_tag_tuple in tagged_words_copy: + i = ct + p = 0 # matched position in the pattern so far + expected_token = None # expected token if the pattern matches + found = None + match = 1 + first_match = None + chunk_corr = 0 + chunk_len = 0 + + while match: + try: + if not tagged_words_copy[i][1] and tagged_words_copy[i][2] != 'SENT_START' and tagged_words_copy[i][2] != 'SENT_END': + # here's just whitespace or other un-taggable stuff: + i = i + 1 + ct = ct + 1 + continue + elif not first_match: + first_match = ct + except IndexError: # end of tagged words + break + try: + expected_token = self.tokens[p] + except IndexError: + # pattern isn't that long + break + expected_token_str = expected_token.token + + #print "expected_token_str=%s" % expected_token_str + if tagged_words_copy[i][2] == 'SENT_START': + found = 'SENT_START' + elif tagged_words_copy[i][2] == 'SENT_END': + found = 'SENT_END' + elif expected_token.is_word: + # TODO: some cases need to be escaped, e.g. "?", but + # this breaks the pipe etc. + #expected_token_str = re.escape(expected_token_str) + # look at the real word: + try: + found = tagged_words_copy[i][1].strip() + except: # text isn't that long + break + elif expected_token.is_chunk: + #print "chunk %s@%d?" % (expected_token.token, i) + found = None + for from_pos, to_pos, chunk_name in chunks: + if i >= from_pos and i <= to_pos: + found = chunk_name + #print "CHUNK %d-%d: %s" % (from_pos, to_pos, chunk_name) + i = i + (to_pos - from_pos) + chunk_corr = chunk_corr + (to_pos - from_pos) + chunk_len = chunk_len + 1 + break + else: + # look at the word's POS tag: + try: + found = tagged_words_copy[i][2] + except: # text ends here + break + if not found: + #print >> sys.stderr, "*** 'found' undefined (i=%d, %s/%s)" % (i, tagged_words_copy[i][1], tagged_words_copy[i][2]) + break + case_sensitive = re.IGNORECASE + if self.case_sensitive: + case_sensitive = 0 + if expected_token.simple_token: + # speed up for e.g. simple false friends rules that don't + # require regex matching: + if case_sensitive: + #print "exp:%s" %expected_token + match = (expected_token_str.lower() == found.lower()) + else: + match = (expected_token_str == found) + else: + match = re.compile("%s$" % expected_token_str, case_sensitive).match(found) + #print "%s: %s/%s -> %s" % (self.rule_id, found, expected_token_str, match) + if expected_token.negation: + if not match: + match = 1 + else: + match = None + #print "F=%s, m=%s, '%s'" % (found, match, re.escape(expected_token.token)) + i = i + 1 + p = p + 1 + + #print "p=%d, len(self.tokens)=%d" % (p, len(self.tokens)) + if match and p == len(self.tokens): + + #print "##MATCH "+found+" " +expected_token_str + #FIXME: does this always mark the correct position? 
+ (first_match, from_pos, to_pos, line, column) = self.listPosToAbsPos(tagged_words_copy, \ + first_match, 0) + to_pos = to_pos + chunk_corr + + # Let \n in a rule refer to the n'th matched word: + l = first_match + lcount = 1 + msg = self.message + while lcount <= len(self.tokens) and l < len(tagged_words_copy): + if not tagged_words_copy[l][1] and tagged_words_copy[l][2] != 'SENT_START' and tagged_words_copy[l][2] != 'SENT_END': + pass + else: + msg = msg.replace("\\%d" % lcount, tagged_words_copy[l][0]) + lcount = lcount + 1 + l = l + 1 + + first_match_word = tagged_words_copy[first_match][0] + match = RuleMatch(self.rule_id, from_pos+position_fix, to_pos+position_fix, \ + line+line_fix, column+column_fix, msg, first_match_word) + matches.append(match) + + ct = ct + 1 + return matches + + def listPosToAbsPos(self, l, first_match, chunk_corr=0): + #print "*%d (%d)" % (first_match, chunk_corr) + j = first_match + 1 + i = 0 + mark_from_tmp = self.marker_from + while mark_from_tmp > 0 and j < len(l): + if l[j][1]: + mark_from_tmp = mark_from_tmp - 1 + i = i + 1 + j = j + 1 + first_match = first_match + i + + last_match = first_match + match_len = len(self.tokens)-self.marker_from+self.marker_to+chunk_corr + for el in l[first_match:]: + if match_len == 0: + break + if el[1]: + match_len = match_len - 1 + last_match = last_match + 1 + + from_pos = 0 + line = 0 + column = 0 # FIXME! + for el in l[:first_match]: + #print "** '%s' (%d)" % (el[0], first_match) + matches = re.findall("[\n\r]", el[0]) + line = line + len(matches) + if len(matches) > 0: + column = 0 + else: + column = column + len(el[0]) + from_pos = from_pos + len(el[0]) + #print "** L=%s" % line + to_pos = 0 + for el in l[:last_match]: + to_pos = to_pos + len(el[0]) + + return (first_match, from_pos, to_pos, line, column) + +class RuleMatch: + """A matching rule, i.e. an error or a warning and from/to positions.""" + + def __init__(self, rule_id, from_pos, to_pos, line, column, message, first_match_word=None): + self.id = rule_id + self.from_pos = from_pos + self.to_pos = to_pos + self.line = line + self.column = column + self.message = message + # TOOD: is it okay to use 'latin1' here?: + if first_match_word and first_match_word[0] in unicode(string.uppercase, 'latin1'): + # Replace the first char in <em>...</em> with its uppercase + # variant. Useful for replacements at the beginning of the + # sentence + self.message = re.compile("<em>(.)").sub(self.upper, self.message) + return + + def upper(self, match): + return "<em>%s" % match.group(1)[0].upper() + + def __str__(self): + """String representation of this object, i.e. human readable output.""" + msg = self.message + msg = re.compile("</?message>").sub("", msg) + msg = re.compile("</?em>").sub("'", msg) + strng = 'Line %d, Column %d: %s' % (self.line, self.column, msg) + return strng + + def toXML(self): + """XML representation of this object.""" + strng = '<error from="%d" to="%d">%s</error>' % (self.from_pos, self.to_pos, self.message) + return strng + + def __cmp__(self, b): + """Compare by 'from' position.""" + if self.from_pos > b.from_pos: + return 1 + elif self.from_pos < b.from_pos: + return -1 + else: + return 0 + +class Token: + """A word, tag or chunk token, negated or not. 
Examples: + "^(has|will)", + "he", + (VB|VBP), + _NP + """ + + def __init__(self, token): + self.token = token + self.negation = 0 + self.is_word = 0 + self.is_tag = 0 + self.is_chunk = 0 + if self.token.find("|") != -1 or self.token.find("(") != -1 \ + or self.token.find("[") != -1 or self.token.find(".") != -1: + self.simple_token = 0 + else: + self.simple_token = 1 # no regex required + if self.token.startswith('^'): + self.token = token[1:] # remove '^' + self.negation = 1 + if self.token.startswith('"'): + self.is_word = 1 + if not self.token.endswith('"'): + print >> sys.stderr, "*** Warning: token '%s' starts with quote but doesn't end with quote!" % self.token + self.token = self.token[1:(len(self.token)-1)] # remove quotes + elif self.token.startswith('_'): + self.token = token[1:] # remove '_' + self.is_chunk = 1 + else: + self.is_tag = 1 + return + + def __str__(self): + """For debugging only""" + strng = self.token + if self.negation: + strng = "^%s" % strng + if self.is_word: + strng = '"%s"' % strng + return strng diff --git a/languagetool/src/RulesTest.py b/languagetool/src/RulesTest.py new file mode 100644 index 0000000..fd54598 --- /dev/null +++ b/languagetool/src/RulesTest.py @@ -0,0 +1,257 @@ +#!/usr/bin/python +# Test cases for Rule.py +#$rcs = ' $Id$ ' ; +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import unittest +import Rules +import os +import sys + +sys.path.append(os.path.join("python_rules")) +import allSentenceLengthRule +import enWordRepeatRule +import enAvsAnRule + +class RuleTestCase(unittest.TestCase): + + def setUp(self): + self.rule = Rules.PatternRule(None) + self.rule.setVars("TEST1", '"word" (VB|TST)', "Test message 1.", 0, 0, \ + "Good example.", "Bad example.", 0, 5, "en") + # negation: + self.rule2 = Rules.PatternRule(None) + self.rule2.setVars("TEST2", '"word" ^(VB|TST)', "Test message 2.", 0, 0, \ + "Good example.", "Bad example.", 0, 5, "en") + # negation at the beginning: + self.rule3 = Rules.PatternRule(None) + self.rule3.setVars("TEST3", '^"word" (VB|TST)', "Test message 3.", 0, 0, \ + "Good example.", "Bad example.", 0, 5, "en") + return + + def testConstructor(self): + self.assertEqual(self.rule.rule_id, "TEST1") + self.assertEqual(len(self.rule.tokens), 2) + self.assertEqual(self.rule2.rule_id, "TEST2") + self.assertEqual(len(self.rule.tokens), 2) + self.assertEqual(self.rule3.rule_id, "TEST3") + self.assertEqual(len(self.rule.tokens), 2) + return + + def testSentenceLengthRule(self): + r = allSentenceLengthRule.allSentenceLengthRule() + r.setMaxLength(3) + + # just below the limit: + warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 0) + + # just on the limit: + warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 1) + assert(warnings[0].toXML().startswith('<error from="3" to="4">')) + r.setMaxLength(60) + warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 0) + r.setMaxLength(3) + + # whitespace is okay: + warnings = r.match([(' ',None,None),('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 0) + + # much longer than the limit: + warnings = r.match([('x','x','T'),('x','x','T'),('x','x','T'),('x','x','T'),\ + ('x','x','T'),('x','x','T'),('x','x','T')]) + self.assertEqual(len(warnings), 1) + + return + + def testAvsAnRule(self): + r = enAvsAnRule.enAvsAnRule() + # okay: + warnings = r.match([('A','A','DET'),(' ',None,None),('test','test','NN')], []) + self.assertEqual(len(warnings), 0) + warnings = r.match([('a','a','DET'),(' ',None,None),('test','test','NN')], []) + self.assertEqual(len(warnings), 0) + warnings = r.match([('an','an','DET'),(' ',None,None),('idea','idea','NN')], []) + self.assertEqual(len(warnings), 0) + + # okay (exceptions list): + warnings = r.match([('a','a','DET'),(' ',None,None),('university','university','NN')], []) + self.assertEqual(len(warnings), 0) + warnings = r.match([('an','an','DET'),(' ',None,None),('hour','hour','NN')], []) + self.assertEqual(len(warnings), 0) + + # wrong: + warnings = r.match([('An','An','DET'),(' ',None,None),('test','test','NN')], []) + self.assertEqual(len(warnings), 1) + warnings = r.match([('an','an','DET'),(' ',None,None),('test','test','NN')], []) + self.assertEqual(len(warnings), 1) + warnings = r.match([('a','a','DET'),(' ',None,None),('idea','idea','NN')], []) + self.assertEqual(len(warnings), 1) + + # wrong (exceptions list): + warnings = r.match([('an','an','DET'),(' ',None,None),('university','university','NN')], []) + self.assertEqual(len(warnings), 1) + warnings = 
r.match([('a','a','DET'),(' ',None,None),('hour','hour','NN')], []) + self.assertEqual(len(warnings), 1) + + return + + def testWhitespaceRule(self): + r = Rules.WhitespaceRule() + + # okay: + warnings = r.match([('blah','blah','XX'),('?',None,None)]) + self.assertEqual(len(warnings), 0) + warnings = r.match([('3.14','3.14','XX'),('?',None,None)]) + self.assertEqual(len(warnings), 0) + + # error - whitespace before punctuation: + warnings = r.match([('blah','blah','XX'),(' ',None,None),('.',None,None)]) + self.assertEqual(len(warnings), 1) + warnings = r.match([('blah','blah','XX'),(' ',None,None),('?',None,None)]) + self.assertEqual(len(warnings), 1) + warnings = r.match([('blah','blah','XX'),(' ',None,None),('...',None,None)]) + self.assertEqual(len(warnings), 1) + warnings = r.match([('blah','blah','XX'),(' ',None,None),('?!',None,None)]) + self.assertEqual(len(warnings), 1) + + # both errors + warnings = r.match([('blah','blah','XX'),(' ',None,None),(',',None,None),('blah','blah','XX')]) + self.assertEqual(len(warnings), 2) + + # okay: + warnings = r.match([('blah','blah','XX'),('?',None,None),(None,None,'SENT_END')]) + self.assertEqual(len(warnings), 0) + + # error - no whitespace after punctuation: + warnings = r.match([('blah','blah','XX'),('?',None,None),('foo','foo','YY')]) + self.assertEqual(len(warnings), 1) + + return + + def testWordRepeat(self): + r = enWordRepeatRule.enWordRepeatRule() + + warnings = r.match([('blah','blah','XX'),(' ',None,None),('blahbla','blahbla','YY')], []) + self.assertEqual(len(warnings), 0) + + warnings = r.match([('blah','blah','XX'),(' ',None,None),('blah','blah','YY')], []) + self.assertEqual(len(warnings), 1) + warnings = r.match([('blah','blah','XX'),(' ',None,None),('BLAH','BLAH','XX')], []) + self.assertEqual(len(warnings), 1) + + return + + def testPatternRuleMatch(self): + + # rule 1: + + res_list = self.rule.match([('', None, 'SENT_START'), + ('word', 'word', 'XX'),(' ', None, None),('bla', 'bla', 'VB')], 0) + self.assertEqual(len(res_list), 1) + self.assertEqual(res_list[0].toXML(), '<error from="0" to="8">Test message 1.</error>') + + res_list = self.rule.match([('no', 'no', 'XX'),('foo', 'foo', 'VB')], 0) + self.assertEqual(len(res_list), 0) + + res_list = self.rule.match([], 0) + self.assertEqual(len(res_list), 0) + + res_list = self.rule.match([('word', 'word', 'XX')], 0) + self.assertEqual(len(res_list), 0) + + # rule 2: + + res_list = self.rule2.match([('word', 'word', 'XX'),('', None, None),('xxx', 'xxx', 'VBX')], 0) + self.assertEqual(len(res_list), 1) + + # rule 3: + + res_list = self.rule3.match([('foo', 'foo', 'XX'),(' ', None, None),('xxx', 'xxx', 'VB')], 0) + self.assertEqual(len(res_list), 1) + return + +class RuleMatchTestCase(unittest.TestCase): + + def testCompare(self): + r1 = Rules.RuleMatch("ONE", 1, 2, 0, 0, "fake1", 0) + r2 = Rules.RuleMatch("ONE", 2, 3, 0, 0, "fake2", 0) + assert(r1 < r2) + r3 = Rules.RuleMatch("ONE", 1, 3, 0, 0, "fake3", 0) + assert(r1 == r3) + assert(r2 > r3) + return + +class TokenTestCase(unittest.TestCase): + + def testToken(self): + + token = Rules.Token('NN') + self.assertEqual(token.token, "NN") + assert(not token.negation) + assert(token.is_tag) + assert(not token.is_word) + assert(not token.is_chunk) + assert(token.simple_token) + + token = Rules.Token('"word"') + self.assertEqual(token.token, "word") + assert(not token.negation) + assert(not token.is_tag) + assert(token.is_word) + assert(not token.is_chunk) + assert(token.simple_token) + + token = Rules.Token("^(NN)") + 
self.assertEqual(token.token, "(NN)") + assert(token.negation) + assert(token.is_tag) + assert(not token.is_word) + assert(not token.is_chunk) + assert(not token.simple_token) # b/c of the parenthesis + + token = Rules.Token('^"word"') + self.assertEqual(token.token, "word") + assert(token.negation) + assert(not token.is_tag) + assert(token.is_word) + assert(not token.is_chunk) + assert(token.simple_token) + + token = Rules.Token('_NP') + self.assertEqual(token.token, "NP") + assert(not token.negation) + assert(not token.is_tag) + assert(not token.is_word) + assert(token.is_chunk) + assert(token.simple_token) + + token = Rules.Token("(AA|BB|CC)") + self.assertEqual(token.token, "(AA|BB|CC)") + assert(not token.negation) + assert(token.is_tag) + assert(not token.is_word) + assert(not token.is_chunk) + assert(not token.simple_token) # b/c of the parenthesis + return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/SentenceSplitter.py b/languagetool/src/SentenceSplitter.py new file mode 100644 index 0000000..35dfb7d --- /dev/null +++ b/languagetool/src/SentenceSplitter.py @@ -0,0 +1,132 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2003 Daniel Naber <daniel.naber@t-online.de> +# Based on Shlomo Yona's Perl module Lingua::EN::Sentence 0.25 + +import os +import string +import re +import sys + +class SentenceSplitter: + + ABBR_FILE = os.path.join(sys.path[0], "data", "abbr.txt") + + EOS = "\001" + #EOS = "<>" # for testing only + P = """[\.!?]""" ## PUNCTUATION + AP = """(?:'|"|�|\)|\]|\})?""" ## AFTER PUNCTUATION + PAP = "%s%s" % (P, AP) + + reFlags = re.DOTALL|re.LOCALE + + def __init__(self): + """Init the object by loading the abbreviation list.""" + self.abbr = self.loadAbbreviations() + return + + def loadAbbreviations(self): + """Load the abbreviation list and return all words in a list.""" + abbr = [] + f = open(self.ABBR_FILE, "r") + while 1: + l = f.readline() + if not l: + break + l = l.strip() + if l: + abbr.append(l) + f.close() + return abbr + + def split(self, text): + """Take a text and split it into sentences. Return the list + of sentences. 
Adapted from Perl's Lingua-EN-Sentence-0.25 module.""" + if text == None: + return [] + #print "text=%s" % text + marked_text = self.first_sentence_breaking(text) + #print "marked_text=%s" % marked_text + fixed_marked_text = self.remove_false_end_of_sentence(marked_text) + #print "fixed_marked_text=%s" % fixed_marked_text + fixed_marked_text = self.split_unsplit_stuff(fixed_marked_text) + #print "fixed_marked_text=%s" % fixed_marked_text + sentences = re.split(self.EOS, fixed_marked_text) + return sentences + + def first_sentence_breaking(self, text): + """Add a special break character at all places with typical sentence + delimiters.""" + # Double new-line means a new sentence: + text = re.compile("(\n\s*\n)", self.reFlags).sub("\\1%s" % self.EOS, text) + # Punctuation followed by whitespace means a new sentence: + text = re.compile("(%s\s)" % self.PAP, self.reFlags).sub("\\1%s" % self.EOS, text) + # New (compared to the perl module): Punctuation followed by uppercase followed + # by non-uppercase character (except dot) means a new sentence: + text = re.compile("(%s)([%s][^%s.])" % (self.PAP, string.uppercase, string.uppercase), \ + self.reFlags).sub("\\1%s\\2" % self.EOS, text) + # Break also when single letter comes before punctuation: + text = re.compile("(\s\w%s)" % self.P, self.reFlags).sub("\\1%s" % self.EOS, text) + return text + + def remove_false_end_of_sentence(self, text): + """Repair some positions that don't require a split, i.e. remove the + special break character.""" + + # Don't split at e.g. "U. S. A.": + text = re.compile("([^-\w]\w%s\s)%s" % (self.PAP, self.EOS), self.reFlags).sub("\\1", text) + # Don't split at e.g. "U.S.A.": + text = re.compile("([^-\w]\w%s)%s" % (self.P, self.EOS), self.reFlags).sub("\\1", text) + + # Don't split after a white-space followed by a single letter followed + # by a dot followed by another whitespace. + # e.g. " p. " + text = re.compile("(\s\w\.\s+)%s" % self.EOS, self.reFlags).sub("\\1", text) + + # Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?) + text = re.compile("(\.\.\. )%s([%s])" % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text) + # Don't split [.?!] when the're quoted: + text = re.compile("(['\"]%s['\"]\s+)%s" % (self.P, self.EOS)).sub("\\1", text) + + # Don't split at abbreviations: + for abbr in self.abbr: + # TODO: really ignore case? + s = "(\\b%s%s\s)%s" % (abbr, self.PAP, self.EOS) + text = re.compile(s, self.reFlags|re.IGNORECASE).sub("\\1", text) + + # Don't break after quote unless there's a capital letter: + # e.g.: "That's right!" he said. + text = re.compile('(["\']\s*)%s(\s*[%s])' % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text) + + # fixme? not sure where this should occur, leaving it commented out: + # don't break: text . . some more text. + #text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg; + + text = re.compile("(\s%s\s)%s" % (self.PAP, self.EOS), self.reFlags).sub("\\1", text) + + # extension by dnaber --commented out, doesn't help: + #text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase), self.reFlags).sub("\\1\\2", text) + return text + + def split_unsplit_stuff(self, text): + """Treat some more special cases that make up a sentence boundary. Insert + the special break character at these positions.""" + # Split at e.g. "no. 
5 ": + text = re.compile("(\D\d+)(%s)(\s+)" % self.P, self.reFlags).sub("\\1\\2%s\\3" % self.EOS, text) + # TODO: Not sure about this one, leaving out foir now: + #text = re.compile("(%s\s)(\s*\()" % self.PAP, self.reFlags).sub("\\1%s\\2" % self.EOS, text) + # Split e.g.: He won't. #Really. + text = re.compile("('\w%s)(\s)" % self.P, self.reFlags).sub("\\1%s\\2" % self.EOS, text) + # Split e.g.: He won't say no. Not really. + text = re.compile("(\sno\.)(\s+)(?!\d)", self.reFlags|re.IGNORECASE).sub("\\1%s\\2" % self.EOS, text) + # Split at "a.m." or "p.m." followed by a capital letter. + text = re.compile("([ap]\.m\.\s+)([%s])" % string.uppercase, self.reFlags).sub("\\1%s\\2" % self.EOS, text) + return text + +if __name__ == "__main__": + #t = '"Do split me." Will you?' + #print t + #s = SentenceSplitter() + #l = s.split(t) + #print l + print "Please use ./SentenceSplitterTest.py for testing." diff --git a/languagetool/src/SentenceSplitterEval.py b/languagetool/src/SentenceSplitterEval.py new file mode 100644 index 0000000..cdf8745 --- /dev/null +++ b/languagetool/src/SentenceSplitterEval.py @@ -0,0 +1,128 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import sys +import re + +import Entities +import SentenceSplitter + +class SentenceSplitterEval: + + def __init__(self): + return + + def findSentence(self, real_boundary, bnc_sentences): + sent = None + sent_disp = None + l = 0 + i = 0 + for s in bnc_sentences: + l = l + len(s) + if l == real_boundary: + sent = s + next_sent_start = "" + try: + next_sent_start = bnc_sentences[i+1][0:20] + except IndexError: + pass + sent_disp = "%s###%s..." % (s, next_sent_start) + break + i = i + 1 + return sent, sent_disp + + def run(self, bnc_string): + self.s = SentenceSplitter.SentenceSplitter() + + # manual testing: + #bnc_string = "<s n=0000>This a test. Sentence.</s> <s n=1111>Another one.</s>" + #bnc_string = "<s n=0000>This a Sentence</s> <s n=1111>Another one.</s>" + + bnc_paras = re.compile("<p>(.*?)</p>", re.DOTALL).findall(bnc_string) + bnc_paras_str = str.join(' ', bnc_paras) + bnc_sentences = re.compile("<s\s.*?>(.*?)</s>", re.DOTALL).findall(bnc_paras_str) + bnc_boundaries = [] + l = 0 + i = 0 + for s in bnc_sentences: + s = bnc_sentences[i] + s = Entities.Entities.cleanEntities(s) + s = re.compile("<.*?>").sub("", s) + s = s.strip() + if not s.endswith(" "): + # TODO: is this fair? 
+ s = s + " " + bnc_sentences[i] = s + l = l + len(s) + bnc_boundaries.append(l) + i = i + 1 + ###print bnc_sentences + bnc_sentences_str = str.join('', bnc_sentences) + #print bnc_sentences_str + + detected_sentences = self.s.split(bnc_sentences_str) + ###print detected_sentences + detected_boundaries = [] + l = 0 + for s in detected_sentences: + l = l + len(s) + detected_boundaries.append(l) + + sent_count = 0 + # recall = how many of the sentence boundaries have been detected? + recall_count = 0 + for real_boundary in bnc_boundaries: + if real_boundary in detected_boundaries: + recall_count = recall_count + 1 + #print "Found: '%s'" % s + else: + pass + (s, s_disp) = self.findSentence(real_boundary, bnc_sentences) + print "Not found: '%s'" % s_disp + sent_count = sent_count + 1 + recall = 0 + if len(bnc_boundaries) > 0: + recall = float(recall_count) / float(len(bnc_boundaries)) + + # precision = how many of detected boundaries are real sentence boundaries? + precision_count = 0 + for detected_boundary in detected_boundaries: + if detected_boundary in bnc_boundaries: + precision_count = precision_count + 1 + precision = 0 + if len(detected_boundaries) > 0: + precision = float(precision_count) / float(len(detected_boundaries)) + + print "Real sentences = %d" % sent_count + print "Recall = %.3f" % recall + print "Precision = %.3f" % precision + return + +if __name__ == "__main__": + prg = SentenceSplitterEval() + if len(sys.argv) <= 1: + print "Usage: ./SentenceSplitterEval.py <bnc_sampler_files>" + else: + for filename in sys.argv[1:]: + print filename + f = open(filename) + bnc_string = f.read() + f.close() + prg.run(bnc_string) diff --git a/languagetool/src/SentenceSplitterTest.py b/languagetool/src/SentenceSplitterTest.py new file mode 100644 index 0000000..52fe732 --- /dev/null +++ b/languagetool/src/SentenceSplitterTest.py @@ -0,0 +1,91 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2003,2004 Daniel Naber <daniel.naber@t-online.de> + +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import os + +import SentenceSplitter +import unittest + +class SentenceSplitterTestCase(unittest.TestCase): + + def testSplit(self): + self.s = SentenceSplitter.SentenceSplitter() + + l = self.s.split(None) + self.assertEqual(len(l), 0) + + self._doTest("") + self._doTest("This is a sentence.") + self._doTest("This is a sentence. #And this is another one.") + self._doTest("This is a sentence. #Isn't it? #Yes, it is.") + self._doTest("This is e.g. Mr. Smith, who talks slowly... #But this is another sentence.") + self._doTest("Chanel no. 5 is groovy.") + self._doTest("Mrs. Jones gave Peter $4.5, to buy Chanel No 5. #He never came back.") + self._doTest("On p. 6 there's nothing. #Another sentence.") + self._doTest("Leave me alone!, he yelled. 
#Another sentence.") + self._doTest("\"Leave me alone!\", he yelled.") + self._doTest("'Leave me alone!', he yelled. #Another sentence.") + self._doTest("'Leave me alone,' he yelled. #Another sentence.") + self._doTest("This works on the phrase level, i.e. not on the word level.") + self._doTest("Let's meet at 5 p.m. in the main street.") + self._doTest("James comes from the U.K. where he worked as a programmer.") + self._doTest("Don't split strings like U.S.A. please.") + self._doTest("Don't split strings like U. S. A. either.") + self._doTest("Don't split... #Well you know. #Here comes more text.") + self._doTest("Don't split... well you know. #Here comes more text.") + self._doTest('The "." should not be a delimiter in quotes.') + self._doTest('"Here he comes!" she said.') + self._doTest('"Here he comes!", she said.') + self._doTest('"Here he comes." #But this is another sentence.') + self._doTest('"Here he comes!". #That\'s what he said.') + self._doTest('The sentence ends here. #(Not me.)') + self._doTest("He won't. #Really.") + self._doTest("He won't say no. #Not really.") + self._doTest("He won't say no. 5 is better. #Not really.") + self._doTest("They met at 5 p.m. on Thursday.") + self._doTest("They met at 5 p.m. #It was Thursday.") + self._doTest("This is it: a test.") + # known not to work: + #self._doTest("This is it: #A final test.") + # two returns -> paragraph -> new sentence: + self._doTest("He won't\n\n#Really.") + # Some people make two spaces after sentence end: + self._doTest("This is a sentence. #And this is another one.") + # Missing space after sentence end: + self._doTest("James is from the Ireland!#He lives in Spain now.") + # From the abbreviation list: + self._doTest("Jones Bros. have built a succesful company.") + # Doesn't work: + #self._doTest("James is from the U.K. #He lives in Spain now.") + + return + + def _doTest(self, s): + s_copy = s.replace("#", "") + l = self.s.split(s_copy) + correct_result = s.split("#") + # ignore leading/trailing whitespace differences: + i = 0 + for item in l: + l[i] = l[i].strip() + i = i + 1 + i = 0 + for item in correct_result: + correct_result[i] = correct_result[i].strip() + i = i + 1 + self.assertEqual(l, correct_result) + return diff --git a/languagetool/src/TagInfo.py b/languagetool/src/TagInfo.py new file mode 100644 index 0000000..31aec80 --- /dev/null +++ b/languagetool/src/TagInfo.py @@ -0,0 +1,276 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# Provide user information about BNC tags +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import re +import sys + +class TagInfo: + + TAG_STRING = {} + TAG_STRING['en'] = """AJ0 Adjective (general or positive) (e.g. 
good, old, beautiful) + AJC Comparative adjective (e.g. better, older) + AJS Superlative adjective (e.g. best, oldest) + AT0 Article (e.g. the, a, an, no) [N.B. no is included among articles, which are defined here as determiner words which typically begin a noun phrase, but which cannot occur as the head of a noun phrase.] + AV0 General adverb: an adverb not subclassified as AVP or AVQ (see below) (e.g. often, well, longer (adv.), furthest. [Note that adverbs, unlike adjectives, are not tagged as positive, comparative, or superlative. This is because of the relative rarity of comparative and superlative adverbs.] + AVP Adverb particle (e.g. up, off, out) [N.B. AVP is used for such "prepositional adverbs", whether or not they are used idiomatically in a phrasal verb: e.g. in 'Come out here' and 'I can't hold out any longer', the same AVP tag is used for out. + AVQ Wh-adverb (e.g. when, where, how, why, wherever) [The same tag is used, whether the word occurs in interrogative or relative use.] + CJC Coordinating conjunction (e.g. and, or, but) + CJS Subordinating conjunction (e.g. although, when) + CJT The subordinating conjunction that [N.B. that is tagged CJT when it introduces not only a nominal clause, but also a relative clause, as in 'the day that follows Christmas'. Some theories treat that here as a relative pronoun, whereas others treat it as a conjunction.We have adopted the latter analysis.] + CRD Cardinal number (e.g. one, 3, fifty-five, 3609) + DPS Possessive determiner (e.g. your, their, his) + DT0 General determiner: i.e. a determiner which is not a DTQ. [Here a determiner is defined as a word which typically occurs either as the first word in a noun phrase, or as the head of a noun phrase. E.g. This is tagged DT0 both in 'This is my house' and in 'This house is mine'.] + DTQ Wh-determiner (e.g. which, what, whose, whichever) [The category of determiner here is defined as for DT0 above. These words are tagged as wh-determiners whether they occur in interrogative use or in relative use.] + EX0 Existential there, i.e. there occurring in the there is ... or there are ... construction + ITJ Interjection or other isolate (e.g. oh, yes, mhm, wow) + + NN0 Common noun, neutral for number (e.g. aircraft, data, committee) [N.B. Singular collective nouns such as committee and team are tagged NN0, on the grounds that they are capable of taking singular or plural agreement with the following verb: e.g. 'The committee disagrees/disagree'.] + NN1 Singular common noun (e.g. pencil, goose, time, revelation) + NN2 Plural common noun (e.g. pencils, geese, times, revelations) + NP0 Proper noun (e.g. London, Michael, Mars, IBM) [N.B. the distinction between singular and plural proper nouns is not indicated in the tagset, plural proper nouns being a comparative rarity.] + ORD Ordinal numeral (e.g. first, sixth, 77th, last) . [N.B. The ORD tag is used whether these words are used in a nominal or in an adverbial role. Next and last, as "general ordinals", are also assigned to this category.] + PNI Indefinite pronoun (e.g. none, everything, one [as pronoun], nobody) [N.B. This tag applies to words which always function as [heads of] noun phrases. Words like some and these, which can also occur before a noun head in an article-like function, are tagged as determiners (see DT0 and AT0 above).] + PNP Personal pronoun (e.g. I, you, them, ours) [Note that possessive pronouns like ours and theirs are tagged as personal pronouns.] + PNQ Wh-pronoun (e.g. who, whoever, whom) [N.B. 
These words are tagged as wh-pronouns whether they occur in interrogative or in relative use.] + PNX Reflexive pronoun (e.g. myself, yourself, itself, ourselves) + + POS The possessive or genitive marker 's or ' (e.g. for 'Peter's or somebody else's', the sequence of tags is: NP0 POS CJC PNI AV0 POS) + PRF The preposition of. Because of its frequency and its almost exclusively postnominal function, of is assigned a special tag of its own. + PRP Preposition (except for of) (e.g. about, at, in, on, on behalf of, with) + PUL Punctuation: left bracket - i.e. ( or [ + PUN Punctuation: general separating mark - i.e. . , ! , : ; - or ? + PUQ Punctuation: quotation mark - i.e. ' or " + PUR Punctuation: right bracket - i.e. ) or ] + TO0 Infinitive marker to + UNC Unclassified items which are not appropriately classified as items of the English lexicon. [Items tagged UNC include foreign (non-English) words, special typographical symbols, formulae, and (in spoken language) hesitation fillers such as er and erm.] + + VBB The present tense forms of the verb BE, except for is, 's: i.e. am, are, 'm, 're and be [subjunctive or imperative] + VBD The past tense forms of the verb BE: was and were + VBG The -ing form of the verb BE: being + VBI The infinitive form of the verb BE: be + VBN The past participle form of the verb BE: been + VBZ The -s form of the verb BE: is, 's + + VDB The finite base form of the verb DO: do + VDD The past tense form of the verb DO: did + VDG The -ing form of the verb DO: doing + VDI The infinitive form of the verb DO: do + VDN The past participle form of the verb DO: done + VDZ The -s form of the verb DO: does, 's + + VHB The finite base form of the verb HAVE: have, 've + VHD The past tense form of the verb HAVE: had, 'd + VHG The -ing form of the verb HAVE: having + VHI The infinitive form of the verb HAVE: have + VHN The past participle form of the verb HAVE: had + VHZ The -s form of the verb HAVE: has, 's + + VM0 Modal auxiliary verb (e.g. will, would, can, could, 'll, 'd) + + VVB The finite base form of lexical verbs (e.g. forget, send, live, return) [Including the imperative and present subjunctive] + VVD The past tense form of lexical verbs (e.g. forgot, sent, lived, returned) + VVG The -ing form of lexical verbs (e.g. forgetting, sending, living, returning) + VVI The infinitive form of lexical verbs (e.g. forget, send, live, return) + VVN The past participle form of lexical verbs (e.g. forgotten, sent, lived, returned) + VVZ The -s form of lexical verbs (e.g. forgets, sends, lives, returns) + + XX0 The negative particle not or n't + ZZ0 Alphabetical symbols (e.g. A, a, B, b, c, d)""" + + TAG_STRING['de'] = """ADJ Adjective (general) (e.g. gut, alt) + ADJE Comparative adjective (e.g. alte) + ADJER adjective with er Ending (e.g. alter) + ADJES adjective with es Ending (e.g. altes) + ADJEM adjective with em Ending (e.g. altem) + ADJEN adjective with en Ending (e.g. alten) + *ADV Adverb like abends, morgen + + PRA Pronoun with accusativ wider, gegen + PRD Pronoun with dativ ab, aus + PRD Pronoun with accusativ or dativ in, über + + PP1 Personal pronoun ich, mich, mir + PP2 Personal pronoun du + PP3 Personal pronoun er, sie, es + PP4 Personal pronoun wir + PP5 Personal pronoun ihr + + *IND oh, ah, heisa + *INT Interrogating word like Wer, wo, etc... + + CNT Number + CJC Conjunctive word like und, oder, ... + + V verb, e.g. gehen + V11 verb, e.g. gehe + V12 verb, e.g. gehst + V13 verb, e.g. geht + V14 verb, e.g. gehen + V15 verb, e.g. gehet + + HV auxiliary verb, e.g. 
moegen + HV11 auxiliary verb, e.g. mag + HV12 auxiliary verb, e.g. magst + HV13 auxiliary verb, e.g. mag + HV14 auxiliary verb, e.g. moegen + HV15 auxiliary verb, e.g. moeget + + N Noun + NMS Noun male no ending, e.g. Garten + NFS Noun female no ending, e.g. Frau + NNS Noun neutrum no ending + NFNS Noun female or neutrum no ending + NFMS Noun female or male no ending + NMNS Noun male or neutrum no ending + NFMNS Noun male female or neutrum no ending + NM Noun male with ending like Gartens + NF Noun female with ending like Frauen + NN Noun neutrum with ending + NFN Noun female or neutrum with ending + NFM Noun female or male with ending + NMN Noun male or neutrum with ending + NFMN Noun male female or neutrum with ending + + UA1 indefinite article ein + UAE indefinite article eine + UAR indefinite article einer + UAN indefinite article einen + UAM indefinite article einem + UAS indefinite article eines + * INT,IND,ADV sometimes mixed up in the word collection - to be corrected""" + + TAG_STRING['hu'] = """ADJS Singular adjective (e.g. szep) + ADJP Plural Adjective (e.g. szepek) + ADJN Numeric Adjective (e.g. tizedik) + ADV Adverb like szepen, jol + NS Noun, singular asztalnak + NSN Noun, singular, nominativ asztal + NSR Noun, singular, not nominativ asztalt + NP Noun, plural asztalokat + NPN Noun, plural, nominativ asztalok + NPR Noun, plural, not nominativ asztalokra + V1 Verb, Singular, 1-st person irok + V2 Verb, Singular, 2-nd person + V3 Verb, Singular, 3-rd person + V4 Verb, Plural, 1-st person + V5 Verb, Plural, 2-nd person + V6 Verb, Plural, 3-rd person + VINF Verb infinitiv + IKV1 Prefixed Verb, Singular, 1-st person megirok + IKV2 Prefixed Verb, Singular, 2-nd person + IKV3 Prefixed Verb, Singular, 3-rd person + IKV4 Prefixed Verb, Plural, 1-st person + IKV5 Prefixed Verb, Plural, 2-nd person + IKV6 Prefixed Verb, Plural, 3-rd person + VINF Prefixed Verb infinitiv + SI1 Help Verb, Singular, 1-st person akarok + SI2 Help Verb, Singular, 2-nd person + SI3 Help Verb, Singular, 3-rd person + SI4 Help Verb, Plural, 1-st person + SI5 Help Verb, Plural, 2-nd person + SI6 Help Verb, Plural, 3-rd person + SIINF Help Verb infinitiv + IKSI1 Prefixed Help Verb, Singular, 1-st person megvagyok + IKSI2 Prefixed Help Verb, Singular, 2-nd person + IKSI3 Prefixed Help Verb, Singular, 3-rd person + IKSI4 Prefixed Help Verb, Plural, 1-st person + IKSI5 Prefixed Help Verb, Plural, 2-nd person + IKSI6 Prefixed Help Verb, Plural, 3-rd person + IKSIINF Prefixed Help Verb infinitiv + NEIK Non detachable verb prefix be, ki, le, fel, etc... + PP1 Personal pronom en + PP2 Personal pronom te + PP3 Personal pronom o + PP4 Personal pronom mi + PP5 Personal pronom ti + PP6 Personal pronom ok + RPP1 Owning Personal Pronom enyem + RPP2 Owning Personal Pronom tied + RPP3 Owning Personal Pronom ove + RPP4 Owning Personal Pronom mienk + RPP5 Owning Personal Pronom tietek + RPP6 Owning Personal Pronom ovek + IND uhum + INT Interrogating word like nemde etc... + CRD Number tizenot + INTRN Numerical interrogation mennyi, etc... + INTR Interrogation miert, etc... + CJC Conjunctive word like es vagy, ... + DNV Double role, Noun and verb var + DAV Double role, Adj and Verb irt + DNA Double role, Noun and ADJ or ADV iro ... 
+ RART Conjunction word like de, hogy + """ + + def __init__(self, lang): + if not self.TAG_STRING.has_key(lang): + raise KeyError, "no information found for language '%s'" % lang + tag_lines = re.split("\n", self.TAG_STRING[lang]) + self.tags = [] # [(short, explanation)] + for tag_line in tag_lines: + tag_line = tag_line.strip() + parts = re.split("\s+", tag_line) + short_tag = parts[0] + tag_exp = str.join(' ', parts[1:]) + self.tags.append((short_tag, tag_exp)) + return + + def getExp(self, short_tag_search): + for (tag_short, tag_exp) in self.tags: + if short_tag_search == tag_short: + return tag_exp + return None + + def getJavascriptCode(self): + l = [] + for (tag_short, tag_exp) in self.tags: + tag_exp = tag_exp.replace("\"", "\\\"") + l.append('data["%s"] = "%s";' % (tag_short, tag_exp)) + return str.join('\n', l) + + def getHTMLCode(self): + l = [] + l.append('<table border="0" cellpadding="0" cellspacing="2">') + for (tag_short, tag_exp) in self.tags: + tag_exp = tag_exp.replace("\"", "\\\"") + if tag_short: + l.append('<tr bgcolor="#dddddd"><td valign="top"><strong>%s</strong></td><td>%s</td></tr>' % (tag_short, tag_exp)) + else: + l.append('<tr><td> </td></tr>') + l.append('</table>') + return str.join('\n', l) + + def printAll(self): + for (tag_short, tag_exp) in self.tags: + if tag_short: + print "%s: %s" % (tag_short, tag_exp) + else: + print + return + +if __name__ == "__main__": + # TODO: take language as parameter + if len(sys.argv) < 2: + print "Usage: TagInfo.py <language>" + print " where <language> is a language code like en, de, ..." + sys.exit(1) + taginfo = TagInfo(sys.argv[1]) + taginfo.printAll() diff --git a/languagetool/src/Tagger.py b/languagetool/src/Tagger.py new file mode 100644 index 0000000..1243c41 --- /dev/null +++ b/languagetool/src/Tagger.py @@ -0,0 +1,1108 @@ +# -*- coding: iso-8859-1 -*- +# A probabilistic part-of-speech tagger (see the QTag paper) with +# a rule-based extension. +#$rcs = ' $Id$ ' ; +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import codecs +import os +import re +import string +import sys +import time +import cPickle +import htmlentitydefs +import Wfinder + +# FIXME: +dicFile = 'deutsch.txt' +affFile = 'deutsch.aff' + +class Tagger: + """POS-tag any text. The result in XML can be used to re-build the original + text by concatenating all contents of the <w> tags. Whitespace characters + have term=None and type=None, i.e. they are inside their own <w> + elements. 
Words that could not be tagged have type=unknown.""" + + def __init__(self, textlanguage, db_word_name=None, db_seq_name1=None, db_seq_name2=None): + """Initialize the tagger, optionally using the given + file names that will be used to load and save data later.""" + self.textlanguage = textlanguage + self.wfinder = Wfinder.Wfinder(textlanguage) + db_word_name = os.path.join(sys.path[0], "data", dicFile) + db_seq_name1 = os.path.join(sys.path[0], "data", "seqs1") + db_seq_name2 = os.path.join(sys.path[0], "data", "seqs2") + #uncountable_name = os.path.join("data", "uncountable.txt") + self.data_table = None + self.seqs_table_followed_by = None # tag sequences: table[tag1,tag2] = value + self.seqs_table_follows = None # tag sequences: table[tag1,tag2] = value + if db_word_name: + self.db_word_name = db_word_name + if db_seq_name1: + self.db_seq_name1 = db_seq_name1 + if db_seq_name2: + self.db_seq_name2 = db_seq_name2 + #uncountable_nouns = self.loadUncountables() + self.word_count = 0 + + return + + def loadUncountables(self): + """TODO: not used yet.""" + l = [] + f = open(self.uncountable_name) + while 1: + line = f.readline() + if not line: + break + line = line.strip() + if not line.startswith("#") and line != '': + l.append(line) + f.close() + return l + + def bindData(self): + """Load the word/POS tag and POS tag sequence data from disk.""" + try: + if self.textlanguage != 'en': + self.ReadData(self.db_word_name); + else: + self.data_table = cPickle.load(open(self.db_word_name, 'rb')) + except IOError: + print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_word_name + self.data_table = {} + if self.textlanguage == 'en': + try: + self.seqs_table_followed_by = cPickle.load(open(self.db_seq_name1, 'rb')) + except IOError: + print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_seq_name1 + self.seqs_table_followed_by = {} + try: + self.seqs_table_follows = cPickle.load(open(self.db_seq_name2, 'rb')) + except IOError: + print >> sys.stderr, "No data file '%s' yet, starting with empty table." % self.db_seq_name2 + self.seqs_table_follows = {} + else: + self.seqs_table_followed_by = {} + self.seqs_table_follows = {} + return + + def commitData(self): + """Save the word/POS tag and POS tag sequence data to disk.""" + print >> sys.stderr, "Words = %d" % self.word_count + print >> sys.stderr, "Known words = %d" % len(self.data_table.keys()) + print >> sys.stderr, "Known sequences = %d" % len(self.seqs_table_followed_by.keys()) + print >> sys.stderr, "Commiting results..." +# cPickle.dump(self.data_table, open(self.db_word_name, 'wb'), 1) +# cPickle.dump(self.seqs_table_followed_by, open(self.db_seq_name1, 'wb'), 1) +# cPickle.dump(self.seqs_table_follows, open(self.db_seq_name2, 'wb'), 1) + return + + def deleteData(self): + """Remove the word/POS tag and POS tag sequence data files from disk.""" +# print >> sys.stderr, "Deleting old data files..." 
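# Illustrative sketch (assumes the cPickle persistence above is re-enabled):
# on the English data path the on-disk format is simply three pickled
# dictionaries, written and read roughly like
#   cPickle.dump(self.data_table, open(self.db_word_name, 'wb'), 1)
#   self.data_table = cPickle.load(open(self.db_word_name, 'rb'))
# deleteData() below would then clear exactly those three files
# (db_word_name, db_seq_name1, db_seq_name2) before the data is rebuilt,
# as the test cases do via deleteData()/bindData()/buildDataFromString().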
+# try: +# os.remove(self.db_word_name) +# except OSError, e: +# print >> sys.stderr, "Note: Could not delete file: %s" % e +# try: +# os.remove(self.db_seq_name1) +# except OSError, e: +# print >> sys.stderr, "Note: Could not delete file: %s" % e +# try: +# os.remove(self.db_seq_name2) +# except OSError, e: +# print >> sys.stderr, "Note: Could not delete file: %s" % e + return + + def buildData(self, filenames): + """Load BNC files in XML or SGML format and count the word/POS + occurences and the POS tag sequences.""" + tagged_words = [] + for filename in filenames: + print >> sys.stderr, "Loading %s..." % filename + text = PreTaggedText(filename) + tagged_words.extend(text.getTaggedWords()) + self.word_count = self.word_count + len(tagged_words) +# text.addToData(tagged_words, self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) + return + + def buildDataFromString(self, s): + """Take a string with format "word1/tag1 word2/tag2 ..." and + count the word/POS occurences and the POS tag sequences. + Only useful for the test cases.""" + pairs = re.compile("\s+").split(s) + tagged_words = [] + split_regex = re.compile("/") + for pair in pairs: + pair = split_regex.split(pair) + if len(pair) != 2: + # e.g. punctuation + continue + word = pair[0] + tag = pair[1] + tagged_words.append((word, tag)) + text = TextToTag(self.textlanguage, self.wfinder) +# text.addToData(tagged_words, self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) + return + + def ReadData(self, db_word_name): + self.data_table = {} + self.word_table = {} + table = {} + return + + + def tagFile(self, filename): + """POS-tag the contents of a text file and return XML that contains + the original text with each word's POS tag in the "type" + attribute.""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setFilename(filename) + tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) +# print tagged_words # tktk + xml = text.toXML(tagged_words) + return xml + + def tagText(self, strng): #textchecker check calls + """POS-tag a string and return a list of (word, normalized word, tag) + triples.""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setText(strng) +# print strng + tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) +# print tagged_words # tktk + return tagged_words + + def tagTexttoXML(self, strng): + """POS-tag a string and return a list of (word, normalized word, tag) + triples.""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setText(strng) + tagged_words = text.tag(self.data_table, self.seqs_table_followed_by, self.seqs_table_follows) + xml = text.toXML(tagged_words) + return xml + + def tagSeq(self, tup): + """Return the probability of a 2-POS-tag sequence.""" + if len(tup) != 2: + #TODO?: throw exception + print >> sys.stderr, "Sequence does not consist of 2 tokens: '%s'" % str(seq) + return None + try: + probability = self.seqs_table_followed_by[tup] + #probability = self.seqs_table_follows[tup] + except KeyError: + probability = 0 + return probability + + def tagSeq2(self, tup): + """Return the probability of a 2-POS-tag sequence.""" + if len(tup) != 2: + #TODO?: throw exception + print >> sys.stderr, "Sequence does not consist of 2 tokens: '%s'" % str(seq) + return None + try: + #probability = self.seqs_table_followed_by[tup] + probability = self.seqs_table_follows[tup] + except KeyError: + probability = 0 + return probability + + def tagWord(self, word): + """See 
Text.tagWord()""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setText("") + tag = text.tagWord(word, self.data_table) + return tag + + def guessTagTest(self, word): + """See Text.guessTags(). For test cases only.""" + text = TextToTag(self.textlanguage, self.wfinder) + text.setText("") + tag = text.guessTags(word) + return tag + + +class Text: + + DUMMY = None + number_regex = re.compile("^(\d|\d+[.,/\-]\d+)+$") + time_regex = re.compile("\d(am|pm)$") + bnc_regex = re.compile("<(w|c) (.*?)>(.*?)<", re.DOTALL) + + mapping_file = os.path.join(sys.path[0], "data", "c7toc5.txt") + manually_tagged_file = os.path.join(sys.path[0], "data", "postags.txt") + + def __init__(self, textlanguage, wfinder): + self.textlanguage = textlanguage + self.wfinder = wfinder + self.count_unambiguous = 0 + self.count_ambiguous = 0 + self.count_unknown = 0 + self.whitespace = re.compile("\s+$") + self.nonword = re.compile("([\s,:;]+)") + self.nonword_punct = re.compile("([,:;]+)") + self.sentence_end = re.compile("([.!?]+)$") + self.bnc_word_regexp = re.compile("<W\s+TYPE=\"(.*?)\".*?>(.*?)</W>", \ + re.DOTALL|re.IGNORECASE) + self.mapping = self.loadMapping() + self.manually_tagged = self.loadManuallyTagged() + return + + def loadMapping(self): + f = open(self.mapping_file) + line_count = 1 + mapping = {} + while 1: + line = f.readline().strip() + if not line: + break + l = re.split("\s+", line) + if not len(l) == 2: + print >> sys.stderr, "No valid mapping in line %d: '%s'" % (line_count, line) + (c7, c5) = l[0], l[1] + if mapping.has_key(c7): + print >> sys.stderr, "No valid mapping in line %d: '%s', duplicate key '%s'" % (line_count, line, c7) + continue + mapping[c7] = c5 + #print "%s -> %s" % (c7, c5) + line_count = line_count + 1 + f.close() + return mapping + + def loadManuallyTagged(self): + table = {} + regex = re.compile("^(.+)\s+(.+?)$") + f = open(self.manually_tagged_file) + while 1: + line = f.readline() + if not line: + break + line = line.strip() + if not line.startswith("#") and line != '': + regex_match = regex.search(line) + if regex_match: + word = regex_match.group(1) + postag = regex_match.group(2) + table[word] = postag + f.close() + return table + + def expandEntities(self, text): + """Take a text and expand a few selected entities. Return the same + text with entities expanded. (We cannot simply parse the file with + DOM, as we don't have an XML DTD -- the original files were SGML.)""" + ### TODO: use Entities module + text = re.compile("&", re.IGNORECASE).sub("&", text) + # TODO: several entities are missing here: + #text = re.compile("&#(x..);", re.IGNORECASE).sub(self.expandHexEntities, text) + text = re.compile("£", re.IGNORECASE).sub("�", text) + return text + + #def expandHexEntities(self, matchobj): + # htmlentitydefs.entitydefs[] + # s = u'\%s' % matchobj.group(1) + # #s = "Y" + # return s + + def getBNCTuples(self, text): + """Return a list of (tag, word) tuples from text if + text is a BNC Sampler text in XML or SGML format. Otherwise + return an empty list. 
The tags are mapped from the C7 tag set + to the much smaller C5 tag set.""" + l = [] + pos = 0 + while 1: + m = self.bnc_regex.search(text, pos) + if not m: + break + tag = m.group(2) + if self.mapping.has_key(tag): + tag = self.mapping[tag] + else: + #print "no mapping: %s" % tag + pass + if m.group(3): + l.append((tag, m.group(3).strip())) + #print "- %s/%s" % (tag, m.group(3).strip()) + pos = m.start()+1 + return l + + def normalise(self, text): + """Take a string and remove XML markup and whitespace at the beginning + and the end. Return the modified string.""" + # sometimes there's <PB...>...</PB> *inside* <W...>...</W>! + text = re.compile("<.*?>", re.DOTALL|re.IGNORECASE).sub("", text) + text = text.strip() + return text + + def splitBNCTag(self, tag): + """Take a string with BNC tags like 'NN1-NP0' and return a list, + e.g. ['NN1', 'NP0']. For single tags like 'NN0' this will + be returned: ['NN0'].""" + tags = re.split("-", tag) + return tags + + def guessTags(self, word): + """Take a word and guess which POS tags it might have and return + those POS tags. This considers e.g. word prefixes, suffixes and + capitalization. If no guess can be made, None is returned.""" + # TODO: return more than one tag + + # �25 etc: + # fixme -- UnicodeDecodeError + #if word.startswith(u"�") or word.startswith(u"$"): + # return 'NN0' + + # numbers: + if self.number_regex.match(word): + return 'CRD' + + # e.g. HIV + if len(word) >= 2 and word == word.upper(): + return 'NN0' + + # this >=3 limit also prevents to assign 'A' (i.e. determiner + # at sentence start) NP0, of course that's only relevant + # for the test cases: + # English only + # TODO: is it okay to use 'latin1' here? + if len(word) >= 3 and word[0] in unicode(string.uppercase, 'latin1'): # e.g. "Jefferson" + return 'NP0' + + # e.g. freedom, contentment, celebration, assistance, fighter, + # violinist, capacity + if self.textlanguage == 'en': + noun = ['dom', 'ment', 'tion', 'sion', 'ance', 'ence', 'er', 'or', + 'ist', 'ness', 'icity'] + for suffix in noun: + if word.endswith(suffix): + return 'NN1' + + # e.g. quickly + if word.endswith("ly"): + return 'AV0' + + # e.g. 8.55am + if self.time_regex.search(word): + return 'AV0' + + # e.g. extensive, heroic, financial, portable, hairy + # mysterious, hopeful, powerless + # 'en' was left out, could also be a verb + if self.textlanguage == 'en': + adj = ['ive', 'ic', 'al', 'able', 'y', 'ous', 'ful', 'less'] + for suffix in adj: + if word.endswith(suffix): + return 'AJ0' + + # e.g. publicize, publicise, activate, simplify + # 'en' was left out, could also be a adjective + verb = ['ize', 'ise', 'ate', 'fy'] + for suffix in verb: + if word.endswith(suffix): + # fixme: could also be VVB + return 'VVI' + + return None + + def tagWord(self, word, data_table): + """Find all possible tags for a word and return a list of tuples: + [(orig_word, normalised_word, [(tag, probability])]""" + orig_word = word + word = self.normalise(word) + #word = re.compile("[^\w' ]", re.IGNORECASE).sub("", word) + + #if word and self.nonword_punct.match(word): + # # punctuation + # return [(orig_word, orig_word, [])] + if (not word) or self.whitespace.match(word): + # word is just white space + return [(orig_word, None, [])] + + if self.manually_tagged.has_key(word): + return [(orig_word, orig_word, [(self.manually_tagged[word], 1)])] + + # sanity check: + #if word.count("'") > 1: + # print >> sys.stderr, "*** What's this, more than one apostroph: '%s'?" 
% word + + # Special cases: BNC tags "wasn't" like this: "<w VBD>was<w XX0>n't" + # Call yourself, but don't indefinitely recurse. + if self.textlanguage == 'en': + special_cases = ("n't", "'s", "'re", "'ll", "'ve") + for special_case in special_cases: + special_case_pos = word.find(special_case) + if special_case_pos != -1 and special_case_pos != 0: + first_part = self.tagWord(word[0:special_case_pos], data_table)[0] + second_part = self.tagWord(special_case, data_table)[0] + tag_results = [] + #TODO: return probability?: + #print second_part + tag_results.append((word[0:special_case_pos], first_part[1], first_part[2])) + tag_results.append((special_case, second_part[1], second_part[2])) + return tag_results + + # TODO?: ignore upper/lower case?, no -- seems to decrease precision + #word = word.lower() #handled by word finder itself + #if not data_table.has_key(word) and len(word) >= 1: + # word = word.lower() + # #if data_table.has_key(word): + # # print "lower: %s" % word + #if not data_table.has_key(word) and len(word) >= 1: + # word = "%s%s" % (word[0].upper(), word[1:]) + # #if data_table.has_key(word): + # # print "upper: %s" % word + + if self.textlanguage != 'en': + rc = self.wfinder.test_it(word) + if rc[0] != '-': + src = rc.split() + # print len(src) + # last returned word exists in .dic file + # that's why this word was found + word = src[len(src)-2] + return [(orig_word, orig_word, [(src [len(src)-1], 1)])] +# return [(orig_word, word, [(src [len(src)-1], 1)])] + if rc[0] == '-': + #if not data_table.has_key(word): + # word is unknown + #print "unknown: '%s'" % word + self.count_unknown = self.count_unknown + 1 + guess_tag = self.guessTags(word) + if guess_tag: + return [(orig_word, orig_word, [(guess_tag, 1)])] +# return [(orig_word, word, [(guess_tag, 1)])] + else: + return [(orig_word, orig_word, [("unknown", 1)])] +# return [(orig_word, word, [("unknown", 1)])] + else: # English case + if not data_table.has_key(word): + # word is unknown + #print "unknown: '%s'" % word + self.count_unknown = self.count_unknown + 1 + guess_tag = self.guessTags(word) + if guess_tag: + return [(orig_word, word, [(guess_tag, 1)])] + else: + return [(orig_word, word, [("unknown", 1)])] + else: + pos_table = data_table[word].table + if len(pos_table) == 1: + # word is unambiguous + self.count_unambiguous = self.count_unambiguous + 1 + return [(orig_word, word, [(pos_table.keys()[0], 1)])] + else: + # word is ambiguous + tag_tuples = [] + for pos_tag in pos_table.keys(): + #print "pos_tag=%s -> %.2f" % (pos_tag, pos_table[pos_tag]) + tag_tuples.append((pos_tag, pos_table[pos_tag])) + self.count_ambiguous = self.count_ambiguous + 1 + return [(orig_word, word, tag_tuples)] + +# def addToData(self, tagged_words, data_table, seqs_table_followed_by, seqs_table_follows): + """Count words and POS tags so they can later be added + to the persistent storage.""" +# tag_list = self.addWords(tagged_words, data_table) +# self.addTagSequences(tag_list, seqs_table_followed_by, seqs_table_follows) +# return + +# def addWords(self, tagged_words, data_table): + """For each word, save the tag frequency to data_table so + it can later be added to the persistent storage. 
Return + a list of all tags.""" +# all_tags_list = [] +# for (word, tag) in tagged_words: + #only for testing if case-insensitivity is better: + #word = word.lower() +# all_tags_list.append(tag) +# tag_list = self.splitBNCTag(tag) +# assert(len(tag_list) == 1 or len(tag_list) == 2) + #print "word/pos_list: %s/%s" % (word, tag_list) +# if data_table.has_key(word): + # word is already known +# word_table = data_table[word].table +# for tag in tag_list: +# if word_table.has_key(tag): +# word_table[tag] = word_table[tag] + 1.0/len(tag_list) + #print "word_table[%s] += %f" % (tag, 1.0/len(tag_list)) +# else: +# word_table[tag] = 1.0/len(tag_list) + #print "word_table[%s] = %f" % (tag, word_table[tag]) +# else: +# word_table = {} +# for tag in tag_list: +# word_table[tag] = 1.0/len(tag_list) + #print "word_table[%s] = %f" % (tag, word_table[tag]) +# data_table[word] = WordData(word, word_table) + # Normalize data_table values so they are probabilities (0 to 1): +# for e in data_table.keys(): +# t = data_table[e].table +# occ_all = 0 +# for occ in t.values(): +# occ_all = occ_all + occ +# for key in t.keys(): +# t[key] = t[key] / occ_all + # debug: + #for e in data_table.keys(): + # print "%s, %s" % (e, data_table[e]) +# return all_tags_list + + def addTagSequences(self, tag_list, seqs_table_followed_by, seqs_table_follows): + """Save information about POS tag tuples to seqs_table.""" + # TODO: add dummy entries? + if len(tag_list) == 0: + return + i = 0 + + ### FIXME: does this work if data is added later? probably not...: + count_followed_by = {} + count_follows = {} + + while 1: + if i >= len(tag_list)-1: + break + tag0 = tag_list[i] + key = () + if self.mapping.has_key(tag0): + tag0 = self.mapping[tag0] + tag1 = tag_list[i+1] + if self.mapping.has_key(tag1): + tag1 = self.mapping[tag1] + try: + seqs_table_followed_by[(tag0,tag1)] = seqs_table_followed_by[(tag0,tag1)] + 1 + except KeyError: + seqs_table_followed_by[(tag0,tag1)] = 1 + try: + count_followed_by[tag0] = count_followed_by[tag0] + 1 + except KeyError: + count_followed_by[tag0] = 1 + + #print "%s/%s" % (tag1, tag0) + try: + seqs_table_follows[(tag1,tag0)] = seqs_table_follows[(tag1,tag0)] + 1 + except KeyError: + seqs_table_follows[(tag1,tag0)] = 1 + try: + count_follows[tag1] = count_follows[tag1] + 1 + except KeyError: + count_follows[tag1] = 1 + i = i + 1 + + # Normalize to 0-1 range: + # TODO: do these numbers become too small, as the Qtag paper states? + for t in seqs_table_followed_by.keys(): + #if t[0] == 'NN0': + # print "%s=%s -- %d" % (t, seqs_table_followed_by[t], count_followed_by[t[0]]) + seqs_table_followed_by[t] = float(seqs_table_followed_by[t]) / float(count_followed_by[t[0]]) + for t in seqs_table_follows.keys(): + seqs_table_follows[t] = float(seqs_table_follows[t]) / float(count_follows[t[0]]) + + #debug: + #print "FOLLOWED BY (norm):" + #for k in seqs_table_followed_by.keys(): + # print "%s -> %s" % (k, seqs_table_followed_by[k]) + #print "FOLLOWS (norm):" + #for k in seqs_table_follows.keys(): + # print "%s -> %s" % (k, seqs_table_follows[k]) + return + + +class TextToTag(Text): + """Any text (also pre-tagged texts from the BNC -- for + testing the tagger).""" + + DUMMY = None + + def __init__(self, textlanguage, wfinder): + # FIXME: not needed, is it? 
(done in base class): + self.textlanguage = textlanguage + self.text = None + Text.__init__(self, self.textlanguage, wfinder) + return + + def setText(self, text): + self.text = text + return + + def setFilename(self, filename): + f = open(filename) + self.text = f.read() + f.close() + return + + def getBestTagSimple(self, tag_tuples): + """Return the most probable tag without taking context into + account. Only useful for testing and checking the baseline.""" + max_prob = 0 + best_tag = None + for tag_tuples_here in tag_tuples: + prob = tag_tuples_here[1] + if prob >= max_prob: + max_prob = prob + best_tag = tag_tuples_here[0] + return best_tag + + def checkBNCMatch(self, i, tagged_list_bnc, word, best_tag, data_table): + """Check for mismatches, i.e. POS tags that differ from the original + tag in BNC. Print out a warning for all those differences and return + 1, otherwise return 0. Note that the BNC's tags are only correct + in 97-98%. If the original tag is 'UNC' and this tagger's tag is + not 'unknown', this is still considered a mismatch.""" + if i >= len(tagged_list_bnc)-1: + print >> sys.stderr, "Index out of range..." + return 0 + if not tagged_list_bnc[i]: + return 0 + word_from_bnc, tags_from_bnc = tagged_list_bnc[i] + #print "%s ?= %s" % (word_from_bnc, word) + if best_tag == 'unknown': + # 'UNC' means unclassified in BNC, assume that this corresponds + # to out 'unknown': + best_tag = 'UNC' + guessed = 1 + if data_table.has_key(word): + guessed = 0 + if not word == word_from_bnc: + print >> sys.stderr, "*** word mismatch: '%s'/'%s'" % (word, word_from_bnc) + #sys.exit() + elif not (best_tag in tags_from_bnc) and \ + tags_from_bnc[0][0] != 'Y': # ignore punctuation tags + print >> sys.stderr, "*** tag mismatch (guessed=%d): got %s/%s, expected %s/%s" % \ + (guessed, word, best_tag, word_from_bnc, tags_from_bnc) + return 1 + #if word == word_from_bnc and guessed: + # print >> sys.stderr, "GOODGUESS" + return 0 + + def getStats(self, count_wrong_tags, is_bnc): + """Get some human-readable statistics about tagging success, + e.g. number and percentage of correctly tagged tokens.""" + sum = self.count_unknown + self.count_unambiguous + self.count_ambiguous + res = "" + if sum > 0: + res = "<!-- Statistics:\n" + res = res + "count_unknown = %d (%.2f%%)\n" % (self.count_unknown, float(self.count_unknown)/float(sum)*100) + res = res + "count_unambiguous = %d (%.2f%%)\n" % (self.count_unambiguous, float(self.count_unambiguous)/float(sum)*100) + res = res + "count_ambiguous = %d (%.2f%%)\n" % (self.count_ambiguous, float(self.count_ambiguous)/float(sum)*100) + #res = res + "sum = %d\n" % sum + if is_bnc: + res = res + "correct tags = %d (%.2f%%)\n" % (sum-count_wrong_tags, float(sum-count_wrong_tags)/float(sum)*100) + #res = res + "count_wrong_tags = %d (%.2f%%)\n" % (count_wrong_tags, float(count_wrong_tags)/float(sum)*100) + res = res + "-->" + return res + + def applyConstraints(self, prev_word, curr_word, next_word, tagged_tuples): + """Some hard-coded and manually written rules that prevent mistaggings by + the probabilistic tagger. Removes incorrect POS tags from tagged_tuples. + Returns nothing, as it works directly on tagged_tuples.""" + # demo rule just for the test cases: + if curr_word and curr_word.lower() == 'demodemo': + self.constrain(tagged_tuples, 'AA') + # ... + return + + def constrain(self, tagged_tuples, pos_tag): + """Remove the pos_tag reading from tagged_tuples. 
Returns nothing, + works directly on tagged_tuples.""" + i = 0 + for t in tagged_tuples: + if t[0] == pos_tag: + del tagged_tuples[i] + i = i + 1 + return + + def applyTagRules(self, curr_word, tagged_word, curr_tag): + """Some hard-coded and manually written rules that extent the + tagging. Returns a (word, normalized_word, tag) triple.""" + # ... + return None + + def tag(self, data_table, seqs_table_followed_by, seqs_table_follows): # z.164 texttag calls + """Tag self.text and return list of tuples + (word, normalized word, most probable tag)""" + self.text = self.expandEntities(self.text) + is_bnc = 0 + word_matches = self.getBNCTuples(self.text) + if len(word_matches) > 0: + # seems like this is a BNC text used for testing + is_bnc = 1 + print >> sys.stderr, "BNC text detected." + else: + word_matches = self.nonword.split(self.text) + # tktk splitted looks \xe1, etc... + # Put sentence end periods etc into an extra element. + # We cannot just split on periods etc. because that would + # break inner-sentence tokens like "... No. 5 ...": + # fixme: only work on the last element (not counting white space) + # FIXME: doesn't work here: "I cannot , she said." + if not is_bnc: + j = len(word_matches)-1 + while j >= 0: + w = word_matches[j] + s_end_match = self.sentence_end.search(w) + if s_end_match: + word_matches[j] = w[:len(w)-len(s_end_match.group(1))] + word_matches.insert(j+1, s_end_match.group(1)) + break + j = j - 1 + +# print "word_matches=%s" % word_matches + i = 0 + tagged_list = [self.DUMMY, self.DUMMY] + tagged_list_bnc = [self.DUMMY, self.DUMMY] + + while i < len(word_matches): + next_token = None + tags = None + if is_bnc: + # word_matches[i] is a (tag,word) tuple + (tag, word) = word_matches[i] + if i+1 < len(word_matches): + (next_token, foo) = word_matches[i+1] + word = self.normalise(word) + tags = self.splitBNCTag(tag) + else: + word = word_matches[i] + if i+1 < len(word_matches): + next_token = word_matches[i+1] + if self.textlanguage == 'en': + if i + 2 < len(word_matches): # english only + # BNC special case: "of course" and some others are tagged as one word! + tuple_word = "%s %s" % (word, word_matches[i+2]) # +2 = jump over whitespace + if data_table.has_key(tuple_word): + #print >> sys.stderr, "*** SPECIAL CASE %d '%s' ..." % (i, tuple_word) + word = tuple_word + i = i + 2 +# +# The next several (6-7) lines avoid not found words +# because of trailing dots. +# + if len(word) >= 1 and word[-1] in ( '.', ',', '?','!', ':', ';', '\'', '\"', '%', '='): + wordend = word[-1]; + word = word[0:-1] + r = Text.tagWord(self, word, data_table) + tagged_list.extend(r) + word = wordend + r = Text.tagWord(self, word, data_table) + tagged_list.extend(r) + + if is_bnc: + for el in r: + # happens e.g. with this (wrong?) 
markup in BNC: + #<W TYPE="CRD" TEIFORM="w">4's</W> + # My tagger tags <4> and <'s>, so there's an offset + # which makes futher comparisons BNC <-> tagger impossible, + # so use this pseudo-workaround and just re-use the tags + # for the <'s>, too: + #print "%s -> %s" % (el[0], tags) + tagged_list_bnc.append((el[0], tags)) + i = i + 1 + + tagged_list.append(self.DUMMY) + tagged_list.append(self.DUMMY) + + # test only: + #result_tuple_list = [] + #i = 0 + #count_wrong_tags = 0 + #for t in tagged_list: + # #print "t=%s" % t + # if t: + # best_tag = self.getBestTagSimple(t[2]) + # if is_bnc: + # wrong_tags = self.checkBNCMatch(i, tagged_list_bnc, t[0], best_tag, data_table) + # count_wrong_tags = count_wrong_tags + wrong_tags + # result_tuple_list.append((t[0], t[1], best_tag)) + # i = i + 1 + #stat = self.getStats(count_wrong_tags) + #print >> sys.stderr, stat + #print result_tuple_list + + ### Constraint-based part: + prev_word = None + next_word = None + i = 0 + for tag_tuples in tagged_list: + prev_word = self.getPrevWord(i, tagged_list) + next_word = self.getNextWord(i, tagged_list) + if tag_tuples and tag_tuples[1]: + self.applyConstraints(prev_word, tag_tuples[0], next_word, tag_tuples[2]) + i = i + 1 + + result_tuple_list = self.selectTagsByContext(tagged_list, seqs_table_followed_by, \ + seqs_table_follows, tagged_list_bnc, is_bnc, data_table) + + i = 0 + for tag_triple in result_tuple_list: + triple = self.applyTagRules(tag_triple[0], tag_triple[1], tag_triple[2]) + if triple: + result_tuple_list[i] = triple + if self.sentence_end.search(tag_triple[0]): + # make sure punctuation doesn't have tags: + result_tuple_list[i] = (tag_triple[0], None, None) + i = i + 1 + + return result_tuple_list + + def selectTagsByContext(self, tagged_list, seqs_table_followed_by, \ + seqs_table_follows, tagged_list_bnc, is_bnc, data_table): + + count_wrong_tags = 0 + tag_probs = {} + i = 0 + for tagged_triple in tagged_list: + if tagged_triple != None and tagged_triple[1] == None: + # ignore whitespace + i = i + 1 + continue + try: + one = tagged_list[i] + two = tagged_list[i+1] + whitespace_jump = 0 + if two and two[1] == None: + two = tagged_list[i+2] + whitespace_jump = whitespace_jump + 1 + two_pos = i + 1 + whitespace_jump + three = tagged_list[i+2+whitespace_jump] + if three and three[1] == None: + three = tagged_list[i+3+whitespace_jump] + whitespace_jump = whitespace_jump + 1 + three_pos = i + 2 + whitespace_jump + except IndexError: + # list end + break + + one_tags = [None] + if one: + one_tags = one[2] + two_tags = [None] + if two: two_tags = two[2] + three_tags = [None] + if three: three_tags = three[2] + + for one_tag in one_tags: + tag_one_prob = 0 + if one_tag: + tag_one_prob = one_tag[1] + + for two_tag in two_tags: + tag_two_prob = 0 + if two_tag: + tag_two_prob = two_tag[1] + + for three_tag in three_tags: + tag_three_prob = 0 + if three_tag: + tag_three_prob = three_tag[1] + + #print "** %s/%s/%s" % (one_tag, two_tag, three_tag) + one_tag_prob = None + if one_tag: one_tag_prob = one_tag[0] + two_tag_prob = None + if two_tag: two_tag_prob = two_tag[0] + three_tag_prob = None + if three_tag: three_tag_prob = three_tag[0] + + seq_prob = 0 + if one: + #print one[0], + #if two: + # print two[0] + try: + k1 = (one_tag_prob, two_tag_prob) + k2 = (two_tag_prob, three_tag_prob) + seq_prob = seqs_table_followed_by[k1] * \ + seqs_table_followed_by[k2] + #print "k1=%s, k2=%s" % (str(k1), str(k2)) + except KeyError: + pass + prob_combined = seq_prob * tag_one_prob + #print "%s, %s, %s: %.7f * 
%.7f = %.7f" % (one_tag_prob, two_tag_prob, \ + # three_tag_prob, seq_prob, tag_one_prob, prob_combined) + k1 = (i, one_tag[0]) + #print "%s = %.7f" % (str(k1), prob_combined) + try: + tag_probs[k1] = tag_probs[k1] + prob_combined + except KeyError: + tag_probs[k1] = prob_combined + if two: + try: + seq_prob = seqs_table_follows[(two_tag_prob, one_tag_prob)] * \ + seqs_table_followed_by[(two_tag_prob, three_tag_prob)] + except KeyError: + pass + prob_combined = seq_prob * tag_two_prob + k2 = (two_pos, two_tag[0]) + try: + tag_probs[k2] = tag_probs[k2] + prob_combined + except KeyError: + tag_probs[k2] = prob_combined + #print "%s = %.7f" % (str(k2), prob_combined) + if three: + try: + seq_prob = seqs_table_follows[(two_tag_prob, one_tag_prob)] * \ + seqs_table_follows[(three_tag_prob, two_tag_prob)] + except KeyError: + pass + prob_combined = seq_prob * tag_three_prob + k3 = (three_pos, three_tag[0]) + try: + tag_probs[k3] = tag_probs[k3] + prob_combined + except KeyError: + tag_probs[k3] = prob_combined + #print "%s = %.7f" % (str(k3), prob_combined) + + orig_word = None + norm_word = None + # the word that falls out of the window is assigned its final tag: + if one: + orig_word = one[0] + norm_word = one[1] + keys = tag_probs.keys() + max_prob = 0 + best_tag = None + for tag_prob in keys: + if tag_prob[0] == i and tag_probs[tag_prob] >= max_prob: + ###print " K=%s, V=%s" % (tag_prob, tag_probs[tag_prob]) + max_prob = tag_probs[tag_prob] + best_tag = tag_prob[1] + tagged_list[i] = (orig_word, norm_word, best_tag) + #print "BEST@%d: %s" % (i, best_tag) + + # this avoids inefficiencies, it's necessary because + # of the tag_probs.keys() call above (which becomes + # too slow otherwise): + for tag_prob in keys: + if tag_prob[0] <= i: + del tag_probs[tag_prob] + + if is_bnc and one: + orig_word = one[0] + if self.textlanguage == 'en': + wrong_tags = self.checkBNCMatch(i, tagged_list_bnc, orig_word, best_tag, data_table) + count_wrong_tags = count_wrong_tags + wrong_tags + + i = i + 1 + + stat = self.getStats(count_wrong_tags, is_bnc) + #print >> sys.stderr, stat + + # remove dummy entries: + tagged_list.pop(0) + tagged_list.pop(0) + tagged_list.pop() + tagged_list.pop() + + return tagged_list + + def getPrevWord(self, i, tagged_list): + """Find the token previous to the token at position i from tagged_list, + ignoring whitespace tokens. Return a tuple (word, tuple_list), + whereas tuple_list is a list of (tag, tag_probability) tuples.""" + j = i-1 + while j >= 0: + (orig_word_tmp, tagged_word_tmp, tag_tuples_tmp) = self.getTuple(tagged_list[j]) + j = j - 1 + if not tagged_word_tmp: + continue + else: + prev = tag_tuples_tmp + return orig_word_tmp + return None + + def getNextWord(self, i, tagged_list): + """Find the token next to the token at position i from tagged_list, + ignoring whitespace tokens. See self.getPrevToken()""" + j = i + 1 + while j < len(tagged_list): + (orig_word_tmp, tagged_word_tmp, tag_tuples_tmp) = self.getTuple(tagged_list[j]) + j = j + 1 + if not tagged_word_tmp: + continue + else: + next = tag_tuples_tmp + return orig_word_tmp + return None + + def getTuple(self, tagged_list_elem): + if not tagged_list_elem: + orig_word = None + tagged_word = None + tag_tuples = None + else: + (orig_word, tagged_word, tag_tuples) = tagged_list_elem + return (orig_word, tagged_word, tag_tuples) + + + def toXML(self, tagged_words): + "Show result as XML." 
+ xml_list = [] + for (orig_word, word, tag) in tagged_words: + # fast appending: + if not word and not tag: + xml_list.append(' <w>%s</w>\n' % orig_word) + else: + xml_list.append(' <w term="%s" type="%s">%s</w>\n' % (word, tag, orig_word)) + xml = "<taggedWords>\n" + string.join(xml_list, "") + "</taggedWords>\n" + return xml + + +class PreTaggedText(Text): + "Text from the BNC Sampler in XML format." + + def __init__(self, filename): + self.content = None + Text.__init__(self) + f = open(filename) + self.content = f.read() + f.close() + return + + def getTaggedWords(self): + "Returns list of tuples (word, tag)" + text = self.expandEntities(self.content) + word_matches = self.getBNCTuples(text) + tagged_words = [] + for (tag, word) in word_matches: + tagged_words.append((word, tag)) + return tagged_words + + +class WordData: + "A term and the frequency of its tags." + + def __init__(self, word, affix, table): + self.word = word + self.affix = affix + # table = tag / number of occurences + # deep copy the hash table (TODO: use deep copy functions): + self.table = {} + for el in table: + self.table[el] = table[el] + return + + def __str__(self): + "Show word data (debugging only!)" + string = self.word + ":\n" + for el in self.table: + string = string + "\t" + el + ": " + str(self.table[el]) + "\n" + return string diff --git a/languagetool/src/TaggerTest.py b/languagetool/src/TaggerTest.py new file mode 100644 index 0000000..c94f233 --- /dev/null +++ b/languagetool/src/TaggerTest.py @@ -0,0 +1,168 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import unittest +import Tagger + +import os +import sys + +class TaggerTestCase(unittest.TestCase): + + FILENAME_WORDS = os.path.join(sys.path[0], "data", "tag_test_words") + FILENAME_SEQ1 = os.path.join(sys.path[0], "data", "tag_test_sequences1") + FILENAME_SEQ2 = os.path.join(sys.path[0], "data", "tag_test_sequences2") + + def cleanList(self, l): + """Return a copy of the list with 'None' elements (e.g. whitespace) + removed. Also, only the first and last element of each triple is + copied.""" + new_list = [] + for el in l: + if el[1]: + new_list.append((el[0], el[2])) + return new_list + + def cleanListAll(self, l): + """Return a copy of the list with 'None' elements (e.g. whitespace) + removed. 
Also, only the last element of each triple is copied.""" + new_list = [] + for el in l: + if el[1]: + new_list.append(el[2]) + return new_list + + def tag(self, learn_text, text): + + # build data: + tagger = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2) + tagger.deleteData() + tagger.bindData() + tagger.buildDataFromString(learn_text) + tagger.commitData() + tagger = None + + # tag text: + tagger2 = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2) + tagger2.bindData() + res = tagger2.tagText(text) + res = self.cleanList(res) + tagger2.deleteData() + + return res + + def testExpandEntities(self): + tagger = Tagger.Text("en", None) + r = tagger.expandEntities("") + self.assertEqual(r, "") + r = tagger.expandEntities("bla &&") + self.assertEqual(r, "bla &&") + #r = tagger.expandEntities("bla £") + #self.assertEqual(r, u"bla £") + return + + def testGuess(self): + tagger = Tagger.Tagger("en", self.FILENAME_WORDS, self.FILENAME_SEQ1, self.FILENAME_SEQ2) + tagger.deleteData() + tagger.bindData() + tagger.buildDataFromString("") # don't learn at all! + tagger.commitData() + + tag = tagger.guessTagTest("") + self.assertEqual(tag, None) + + # numbers = CRD: + self.assertEqual(tagger.guessTagTest("0"), 'CRD') + self.assertEqual(tagger.guessTagTest("3123.1312"), 'CRD') + self.assertEqual(tagger.guessTagTest("00,99"), 'CRD') + self.assertEqual(tagger.guessTagTest("00/99"), 'CRD') + self.assertEqual(tagger.guessTagTest("1-99"), 'CRD') + + # BNC Sampler tags "$xx" as NNU, which is mapped to NN0 (same for £): + self.assertEqual(tagger.guessTagTest("$31.12"), 'NN0') + self.assertEqual(tagger.guessTagTest("HIV"), 'NN0') + self.assertEqual(tagger.guessTagTest("8.55pm"), 'AV0') + self.assertEqual(tagger.guessTagTest("10.10pm"), 'AV0') + self.assertEqual(tagger.guessTagTest(u"Großekathöfer"), 'NP0') + self.assertEqual(tagger.guessTagTest("jackerfoodom"), 'NN1') + self.assertEqual(tagger.guessTagTest("testious"), 'AJ0') + self.assertEqual(tagger.guessTagTest("testize"), 'VVI') + self.assertEqual(tagger.guessTagTest("foofooly"), 'AV0') + self.assertEqual(tagger.guessTagTest("unguessablexxx"), None) + self.assertEqual(tagger.guessTagTest("verboten"), None) + return + + def testLearningAndTagging(self): + + print "###########1" + + #FIXME: doesn't work: + r = self.tag("The/AT0 fat/AJ0 man/NN1", "The big man") + self.assertEqual(r, [('The', 'AT0'), ('big', 'unknown'), ('man', 'NN1')]) + + print "###########2" + return #FIXME + + r = self.tag("The/AT0 fat/AJ0 man/NN1", "the xxx") + # the/unknown because the tagger is case sensitive: + self.assertEqual(r, [('the', 'unknown'), ('xxx', 'unknown')]) + + r = self.tag("The/AT0 fat/AJ0 man/NN1", "The fat man") + self.assertEqual(r, [('The', 'AT0'), ('fat', 'AJ0'), ('man', 'NN1')]) + + r = self.tag("A/DET cool/AJ0 large/AJ0 car/NN1", "A cool car") + self.assertEqual(r, [('A', 'DET'), ('cool', 'AJ0'), ('car', 'NN1')]) + + # fat occurs 2 times as NN1 and 1 time as AJ0, but context decides: + r = self.tag("""The/DET fat/NN1 is/VB hot/AJ0 + The/DET fat/AJ0 guy/NN1 + A/DET man/NN1 used/VBD fat/NN1""", + "A fat man") + self.assertEqual(r, [('A', 'DET'), ('fat', 'AJ0'), ('man', 'NN1')]) + + # fat occurs 3 times as NN1 and 0 times as AJ0 -> tagged as NN1 of course: + r = self.tag("""The/DET fat/NN1 is/VB hot/AJ0 + A/DET fat/NN1 man/NN1 . 
+ He/PP used/VBD fat/NN1""", "A fat man") + self.assertEqual(r, [('A', 'DET'), ('fat', 'NN1'), ('man', 'NN1')]) + + # fat occurs 1 times as NN1 and 2 times as AJ0 -> tagged as AJ0 + r = self.tag("""The/DET fat/AJ0 is/VB hot/AJ0 + A/DET fat/AJ0 man/NN1 . + He/PP used/VBD fat/NN1""", "A fat man") + self.assertEqual(r, [('A', 'DET'), ('fat', 'AJ0'), ('man', 'NN1')]) + + r = self.tag("""The/DET fat/AJ0 man/NN is/VB fat/AJ0 ./PP""", + "A fat man he is fat.") + self.assertEqual(r, [('A', 'unknown'), ('fat', 'AJ0'), ('man', 'NN'), + ('he', 'unknown'), ('is', 'VB'), ('fat', 'AJ0')]) + + return + + #FIXME + #def testApplyConstraints(self): + # r = self.tag("A/X bla/X demodemo/AA demodemo/AA demodemo/BB bla/X bla/X", \ + # "demodemo") + # self.assertEqual(r, [('demodemo', 'BB')]) + # + # return + +if __name__ == "__main__": + unittest.main() diff --git a/languagetool/src/Tools.py b/languagetool/src/Tools.py new file mode 100644 index 0000000..5bed1b1 --- /dev/null +++ b/languagetool/src/Tools.py @@ -0,0 +1,58 @@ +# -*- coding: iso-8859-1 -*- +# Tools class +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import sys +import re + +class Tools: + + def __init__(self): + return + + def getXML(node, xmlstr=""): + """Get the XML content of a node, but only elements and text.""" + if node and node.nodeType == node.ELEMENT_NODE: + l = [] + for child in node.childNodes: + l.append(Tools.getXML(child, xmlstr)) + xmlstr = "<%s>%s</%s>" % (node.tagName, str.join('', l), node.tagName) + elif node and node.nodeType == node.TEXT_NODE: + xmlstr = "%s%s" % (xmlstr, node.data) + return xmlstr + + getXML = staticmethod(getXML) + + def countLinebreaks(s): + matches = re.findall("[\n\r]", s) + #print "#%s -> %s" % (s, len(matches)) + return len(matches) + + countLinebreaks = staticmethod(countLinebreaks) + + def getLanguageName(shortName): + if shortName == 'en': + return 'English' + elif shortName == 'de': + return 'German' + elif shortName == 'hu': + return 'Hungarian' + return None + + getLanguageName = staticmethod(getLanguageName) diff --git a/languagetool/src/Wfdeu.py b/languagetool/src/Wfdeu.py new file mode 100755 index 0000000..89b26fc --- /dev/null +++ b/languagetool/src/Wfdeu.py @@ -0,0 +1,70 @@ +# -*- coding: iso-8859-1 -*- +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2004 .... +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +import array +import codecs +import os +from string import * +import sys + +class Wfdeu: + + encoding = "latin1" + + def __init__(self): + return + + def getTyp(self,typ, oword, word): + if typ != "": + if typ == 'V' or typ == 'HV': + if oword[-4:] == 'ende' or oword[-5:-1] == 'ende': + typ = 'ADJV' + if typ == 'V' or typ == 'HV': + if oword[-1:] == 'e': + typ = typ + '11' + elif oword[-2:] == 'st': + typ = typ + '12' + elif oword[-2:] == 'en': + typ = typ + '14' + elif oword[-2:] == 'et': + typ = typ + '15' + elif oword[-1:] == 't': + typ = typ + '13' + elif typ == 'ADJ': + if oword[-2:] == 'er': + typ = 'ADJER' + elif oword[-2:] == 'en': + typ = 'ADJEN' + elif oword[-2:] == 'em': + typ = 'ADJEM' + elif oword[-2:] == 'es': + typ = 'ADJES' + elif oword[-1:] == 'e': + typ = 'ADJE' + elif typ == 'NMS': + if oword[-2:] == 'in': + typ = 'NFS' + elif oword[-5:] == 'innen': + typ = 'NF' + if typ[0] == 'N': + if word != oword and typ[-1:] == 'S': + typ = typ[0:-1] + return typ + + + diff --git a/languagetool/src/Wfhun.py b/languagetool/src/Wfhun.py new file mode 100755 index 0000000..3514ca1 --- /dev/null +++ b/languagetool/src/Wfhun.py @@ -0,0 +1,88 @@ +# -*- coding: iso-8859-1 -*- +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2004 .... +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +import array +import codecs +import os +from string import * +import sys + +class Wfhun: + + encoding = "latin1" + + def __init__(self): + return + + def getTyp(self,typ, oword, word): + dif = len(oword) - len(word) + if (typ[0] == 'V' or typ[0:2] == 'SI') and word != oword: + ik = '' + telo = 'SI' + if typ[0] == 'V': + telo = 'V' + if oword[0:2] != word[0:2]: + ik = 'IK' + if oword[-3:] in (u'iük','iuk', 'nak', 'nek','tak', 'tek') or oword[-2:] in (u'ák', u'ék'): + typ = ik + telo + '6' + elif oword[-3:] in ('tok','tek', u'tök'): + typ = ik + telo + '5' + elif oword[-3:] in (u'ünk','unk', u'ánk', u'énk') or oword[-2:] in ('uk', u'ük'): + typ = ik + telo + '4' + elif oword[-2:] in ('sz','od', 'ed', u'öd',u'éd','ad',u'ád'): + typ = ik + telo + '2' + elif oword[-2:] in ('ok','ek',u'ök','om','em',u'öm', u'ám', u'ém', 'am'): + typ = ik + telo + '1' + elif oword[-2:] in ('va', 've') or oword[-3:] in (u'ván', u'vén'): + typ = 'ADV' + elif oword[-2:] == 'ni': + typ = 'INF' + else: + typ = ik + telo + '3' + elif typ[0:3] == 'PP4': + if oword != 'mi': + typ = 'ADV' + elif typ[0:3] == 'ADJ': + if oword[-2:] in ('ek','ok', 'ak', u'ék', u'ák') and dif > 0 and (dif < 3 or ((word[0:1] != oword[0:1]) and dif < 9)): + typ = 'ADJP' + elif oword[-1:] in (u'é',u'á') and dif > 0 and (dif < 5 or ((word[0:1] != oword[0:1]) and dif < 12)): + typ = 'ADV' + elif oword[-2:] in ('an', 'en', 'bb','ul',u'ül') and dif == 2: + typ = 'ADV' + elif dif != 0: + typ = 'ADV' + elif typ[0] == 'N': + if oword[-1] == 'k' and oword[-2] in ('a',u'á', 'e',u'é','i',u'í','o',u'ó',u'ö',u'õ','u',u'ú',u'ü',u'û') and dif > 0 and dif < 3 : + typ = 'NP' + elif oword[-1:] == 'i' and dif == 1: + typ = 'DNA' + elif (oword[-1:] in(u'ú', u'û') and dif == 1) or (oword[-2:] in (u'jú', u'jû') and dif == 2): + typ = 'ADJS' + elif typ == 'N': + if oword[-1] == 'k' and oword == word: + typ = 'NP' + else: + typ = 'NS' + elif dif >= 2: + typ = 'N' + if typ[0] == 'N' and oword == word and word[-1] != 'k': + typ = typ+'N' + return typ + + + diff --git a/languagetool/src/Wfinder.py b/languagetool/src/Wfinder.py new file mode 100644 index 0000000..7ba1935 --- /dev/null +++ b/languagetool/src/Wfinder.py @@ -0,0 +1,568 @@ +# -*- coding: iso-8859-1 -*- +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2004 .... +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# usage python stem.py +# +# file test.txt contains are for example: +# carried +# worked +# play +# +# example aff file (dtest.aff) +# SFX D Y 4 +# SFX D 0 e d # abate->abated +# SFX D y ied [^aeiou]y # carry -> carried +# SFX D 0 ed [^ey] # work -> worked +# SFX D 0 ed [aeiuu]y # play -> played +# +# example dic file (dtest.dic) +# 3 +# carry/D +# work/D +# play/D +# +# reads words from the file test.txt +# +# Speed up 9 times by helding different +# append endings in different arrays 3.July, 2004 +# +# Speed improvement by 30% by doing the above +# also with the prefixes, and by helding +# affixes and prefixes in different lists. 4. July, 2004 +# + +import array +import codecs +import os +import Tagger +import Wfdeu +import Wfhun +from string import * +import time +import sys + + +#aff_file = "dtest.aff" +#dic_file = "dtest.dic" +#test_file = "test.txt" +yesno = {} +comment = "#" +condlist = [] +condlist1 = [] +alfab_conddic = {} +palfab_conddic = {} +alfab_condlist_group = [] +alfab2_condlist_group = [] +alfab2_conddic = {} +palfab2_conddic = {} +alfab2_condlist_group = [] +szodic = {} +typdic = {} + +class Wfinder: + + encoding = "latin1" + doubleflags = "" + doubleflagList="" + + def __init__(self, textlanguage): +# print time.strftime('%X %x %Z') + self.is_initialized = 0 + self.is_secondflag = 0 + self.textlanguage = textlanguage + self.wfdeu = Wfdeu.Wfdeu() + self.wfhun = Wfhun.Wfhun() + return + + def aff_read(self): + self.aff_file = os.path.join(sys.path[0], "data", Tagger.affFile) + condlist = [] + alfab_condlist_group = [] + alfab2_condlist_group = [] + faff = codecs.open(self.aff_file, "r", self.encoding) + l = " " + for i in range(0,256,1): + alfab_conddic[i] = [] + palfab_conddic[i] = [] + alfab2_conddic[i] = [] + palfab2_conddic[i] = [] + while l != "": + l = faff.readline() + ll = l.split() + if len(ll) <= 1: + continue + if ll[0][0] in comment: + continue + if ll[0][1:3] == "FX": + arrname = ll[1] + prefix = 0 + if ll[0][0] == 'P': + prefix = 1 + yesno[arrname] = ll[2] + for i in range(0, int(ll[3])): + l = faff.readline() + bb = l.split() +# print "%s %d" %(bb,len(bb)) +# print "l:%s bb[2]:%s arrname:%s" %(l,bb[2], arrname) + strip = bb[2] + if bb[2] == '0': + strip = ''; + appnd = bb[3] + if bb[3] == '0': + appnd = '' + appnd_last = '0' + else: + if prefix == 0: + appnd_last = appnd[-1] + else: + appnd_last = appnd[0] + if bb[4] != '.': + jj = 0 + while(jj < len(bb[4])): + condarr = array.array('B',range(256)) + insbit = 1; + for iii in range(0,256,1): + condarr[iii] = 0 + if bb[4][jj] == '[': + kk = 0; + jj = jj + 1 + if bb[4][jj] == '^': + jj = jj+1 + insbit = 0; + for iii in range(0,256,1): + condarr[iii] = 1 + while bb[4][jj] != ']': + condarr[ord(bb[4][jj])] = insbit; + jj = jj + 1 + if bb[4][jj] == ']': + jj = jj +1 + else: + condarr[ord(bb[4][jj])] = insbit; + jj = jj +1 + condlist.append(condarr) + secondflag = "" + if len(bb) >= 7: + secondflag = bb[6] + self.is_secondflag = 1 + if find(self.doubleflags,arrname) == -1: + self.doubleflags = self.doubleflags+arrname + for elem in secondflag: + if find(self.doubleflagList,elem) == -1: + self.doubleflagList = self.doubleflagList+elem +# print "is_sec:%d" % self.is_secondflag + alfab2_condlist_group.append(condlist) + alfab2_condlist_group.append(strip) + alfab2_condlist_group.append(appnd) + 
alfab2_condlist_group.append(arrname) + alfab2_condlist_group.append(secondflag) + if prefix == 0: + alfab2_conddic[ord(appnd_last)].append(alfab2_condlist_group) + else: + palfab2_conddic[ord(appnd_last)].append(alfab2_condlist_group) + alfab_condlist_group.append(condlist) + alfab_condlist_group.append(strip) + alfab_condlist_group.append(appnd) + alfab_condlist_group.append(arrname) + if prefix == 0: + alfab_conddic[ord(appnd_last)].append(alfab_condlist_group) + else: + palfab_conddic[ord(appnd_last)].append(alfab_condlist_group) +# print "appended %s to %s %d" %(appnd.encode('latin1'), appnd_last.encode('latin1'), ord(appnd_last)) + condlist = [] + alfab_condlist_group = [] + alfab2_condlist_group = [] + faff.close() +# print self.doubleflags +# for i in range (0,255,1): +# print len(alfab_conddic[i]) +# print alfab_conddic[ord('a')] + +# +# Now read the dictionary +# + def dic_read(self): + self.dic_file = os.path.join(sys.path[0], "data", Tagger.dicFile) + szoszam = 0; + fdic = codecs.open(self.dic_file, "r", self.encoding) + l = " " + szolista = [] + ujlista = [] + l = fdic.readline() + szoszam = int(l) + while l != "": + l = fdic.readline() + szolista = l.split("/") + for szo in szolista: + szo = szo.strip('\n \t') + ujlista.append(szo) + if len(ujlista) > 1: + szodic[ujlista[0]] = ujlista[1] + else: + szodic[ujlista[0]] = "" + if len(ujlista) > 2: + typdic[ujlista[0]] = ujlista[2] + else: + typdic[ujlista[0]] = "" + ujlista = [] + fdic.close() + + def do_keytest(self,l): + if l == "": + return "" + if szodic.has_key(l): + return "+ %s" %l + else: + return "- %s" %l + + def suffix2_search(self, l, oarrname, oword): + retval = "" + found = 0 + for windex in ord(l[-1]), ord('0'): + for elem in alfab2_conddic[windex]: + # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname +# print "s2_s l:%s oarr:%s elem[4]:%s app:%s strip:%s" % (l, oarrname, elem[4],elem[2],elem[1] ) + if found: + return retval + if find(elem[4], oarrname) == -1: + continue + # + # search first only suffixes + # since prefix is optional + # + appnd = elem[2] + if len(appnd): + if l[-len(appnd):] != appnd: + continue +# if len(appnd): + restoredWord = l[0:len(l)-len(appnd)] + else: + restoredWord = l + condlist = elem[0] + strip = elem[1] + if len(strip): + restoredWord = restoredWord + strip + break_it = 0 + if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk + substr = restoredWord[-len(condlist):] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue + + if szodic.has_key(restoredWord): + flags = szodic[restoredWord] +# print "s22_s: %s %d %s %s %s %s %s" % (restoredWord,szodic.has_key(restoredWord),elem[3], oarrname, elem[4], oarrname, flags) + if flags == "": # tktk + continue + else: + if find(flags, elem[3]) == -1: + continue + retval = "++ %s %s" %(oword,restoredWord) + found = 1 + return retval + return retval + + + def suffix_search(self, l, oldl, oarrname): + retval = "" + found = 0 + for windex in ord(l[-1]), ord('0'): + for elem in alfab_conddic[windex]: + # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname + if found: + return retval + # + # search first only suffixes + # since prefix is optional + # + appnd = elem[2] + if len(appnd): + if l[-len(appnd):] != appnd: + continue + restoredWord = l[0:len(l)-len(appnd)] + else: + restoredWord = l + condlist = elem[0] + strip = elem[1] + if len(strip): + restoredWord = restoredWord + strip + break_it = 0 +# print "%s %s %s %s" 
%(restoredWord,appnd,strip, elem[3]) + if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk + substr = restoredWord[-len(condlist):] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue + if szodic.has_key(restoredWord): + flags = szodic[restoredWord] + if flags == "": # tktk + continue + else: + if find(flags, elem[3]) == -1: + continue + if oarrname != "" and find(flags, oarrname) == -1: + continue + if oldl != "": + retval = "+++ %s %s %s" %(oldl, l,restoredWord) + else: + retval = "++ %s %s" %(l,restoredWord) + found = 1 + return retval + # print windex + return retval + + def suffix22_search(self, l, oldl, oarrname): + retval = "" + found = 0 + for windex in ord(l[-1]), ord('0'): + for elem in alfab_conddic[windex]: + # elem0: condlist, elem1: strip elem2 = append, elem3 = arrname +# print "s.d:%s e3:%s app:%s str:%s" % (self.doubleflags, elem[3], elem[2],elem[1]) + if find(self.doubleflagList, elem[3]) == -1: + continue + if found: + return retval + # + # search first only suffixes + # since prefix is optional + # +# print "s22x l:%s oldl:%s oarrname:%s appnd:%s strip:%s" % (l, oldl, oarrname, elem[2], elem[1]) + appnd = elem[2] + if len(appnd): + if l[-len(appnd):] != appnd: + continue + restoredWord = l[0:len(l)-len(appnd)] + else: + restoredWord = l + condlist = elem[0] + strip = elem[1] + if len(strip): + restoredWord = restoredWord + strip + break_it = 0 +# print "s22: %s %s %s %s" %(restoredWord,appnd,strip, elem[3]) + if len(condlist) > 0 and len(restoredWord) >= len(condlist): #tktk + substr = restoredWord[-len(condlist):] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue +# print "s->s2, rw:%s e3:%s" % (restoredWord, elem[3]) + rval = self.suffix2_search(restoredWord, elem[3], l) + if rval != "": + found = 1 + retval = rval + return rval + # print windex + return retval + + def prefix_search(self, l): + found = 0 + retval = "" + for windex in ord(l[0]), ord('0'): + for elem in palfab_conddic[windex]: + if found: + return retval + appnd = elem[2] + if appnd == l[:len(appnd)]: # cut the matching prefix + l1 = l[len(appnd):] + else: + continue + condlist = elem[0] + strip = elem[1] + if len(strip): + l1 = strip + l1 + break_it = 0 + if len(condlist) > 0 and len(l1) >= len(condlist): #tktk + substr = l1[0:len(condlist)] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue + # + # prefix without suffix + # + arrname = elem[3] + if szodic.has_key(l1): + flags1 = szodic[l1] + if flags1 != "": + if find(flags1, arrname) == -1: + continue + retval = "++ %s %s" %(l,l1) + found = 1 + return retval + + if lower(yesno[arrname]) == 'n': + continue +# +# check if this unprefixed word +# is a valid suffixed one +# + retval = self.suffix_search(l1, l, arrname) + if retval != "": + found = 1 + return retval + return retval + + def prefix22_search(self, l): + found = 0 + retval = "" + for windex in ord(l[0]), ord('0'): + for elem in palfab_conddic[windex]: + if found: + return retval +# print "str:%s app:%s e3:%s dfl:%s df:%s" % (elem[1],elem[2], elem[3],self.doubleflagList,self.doubleflags) + if find(self.doubleflagList, elem[3]) == -1 and find(self.doubleflags, elem[3]) == -1: + continue + appnd = elem[2] + if appnd == l[:len(appnd)]: # cut the matching prefix + l1 = l[len(appnd):] + else: + continue + condlist = elem[0] + strip = elem[1] 
+ if len(strip): + l1 = strip + l1 + break_it = 0 + if len(condlist) > 0 and len(l1) >= len(condlist): #tktk + substr = l1[0:len(condlist)] + for i in range(0, len(condlist), 1): #tktk + if condlist[i][ord(substr[i])] != 1: + break_it = 1 + break + if break_it: + continue + # + # prefix without suffix + # + arrname = elem[3] +# print "p22->s2 l1:%s e3:%s l:%s" %(l1,elem[3],l) + rval = self.suffix2_search(l1, elem[3],l) + if rval != "": + found = 1 + retval = rval + return rval + + if lower(yesno[arrname]) == 'n': + continue +# +# check if this unprefixed word +# is a valid suffixed one +# +# print "ps l1:%s l:%s arrn:%s" % (l1, l, arrname) + retval = self.suffix22_search(l1, "", "") + if retval != "": + found = 1 + return retval + return retval + + + def do_test(self,l): + if l == "": + return "" + else: + oldword = l + found = 0 +# print "ss l:%s" %l + retval = self.suffix_search(l, "", "") + if retval != "": + found = 1 + return retval +# +# searched all suffixes and not found +# now try to combine all prefixes with all suffixes +# that allow combinations +# +# print "sp l:%s" %l + retval = self.prefix_search(l) + if retval != "": + found = 1 + return retval + + if self.is_secondflag: +# print "s22 l:%s" %l + retval = self.suffix22_search(l, "", "") + if retval != "": + found = 1 + return retval +# print "p22 l:%s" %l + retval = self.prefix22_search(l) + if retval != "": + found = 1 + return retval + + return "- %s" % oldword + + def test_it(self,l): + if self.is_initialized == 0: + self.aff_read() + self.dic_read() + self.is_initialized = 1 + lcasetest = 0 + result = self.do_keytest(l) + if result[0] == '-': + lu = l[0] + if lu != lu.lower(): + l1 = lu[0].lower()+l[1:] + if l1 != l: + lcasetest = 1; + result = self.do_keytest(l1) + # + # in languages not German more likely to find + # a lower case word than an uppercase + # + if result[0] == '-' and self.textlanguage != 'de': + tmp = l1 + l1 = l + l = tmp + if result[0] == '-': + result = self.do_test(l) + if result[0] == '-' and lcasetest == 1: + result = self.do_test(l1) + typ = '' + if result[0] != '-': + src = result.split() + word = src[len(src) - 1] + oword = src[1] + typ = typdic[word] +# print typ + " " + oword[-1:] + " " +oword[-2:] +# +# Here are the language specific rules of each language +# + if self.textlanguage == 'de': + typ = self.wfdeu.getTyp(typ, oword, word) + elif self.textlanguage == 'hu': +# print word+" "+oword+" "+typ + typ = self.wfhun.getTyp(typ, oword, word) +# +# end of language specific rules for new languages +# +# print typ + result = result + " " + typ +# print result + return result + + diff --git a/languagetool/src/client.py b/languagetool/src/client.py new file mode 100644 index 0000000..c3826ba --- /dev/null +++ b/languagetool/src/client.py @@ -0,0 +1,28 @@ +#!/usr/bin/python +# daniel.naber@t-online.de, 2003-05-02 +# This is just a test to show how a TextChecker server can be called + +import socket + +sentence = "A sentence bigger then a short one." + +server_name = "127.0.0.1" +server_port = 50100 + +print "Test client for socket_server.py" +print "Connecting %s, port %d..." % (server_name, server_port) +s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +s.connect(("127.0.0.1", 50100)) +print "Connected." +cfg = '<config textlanguage="en" mothertongue="de" grammar="COMP_THAN" />\n' +s.sendall("%s<text>%s</text>" % (cfg, sentence)) +print "Data sent, waiting for reply..." 
+data = "" +while 1: + received = s.recv(1024) + data = "%s%s" % (data, received) + if not received: + break +s.close() +print "Received reply:" +print data diff --git a/languagetool/src/query.py b/languagetool/src/query.py new file mode 100644 index 0000000..b34a1ff --- /dev/null +++ b/languagetool/src/query.py @@ -0,0 +1,249 @@ +#!/usr/bin/python +# Query BNC data files in XML format +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# for debugging only +import cgitb +cgitb.enable() + +#import profile + +import cPickle +import cgi +import os +import re +import re +import sys +import time + +os.chdir(sys.path[0]) +sys.path.append(sys.path[0]) +import TagInfo + +data_dir = "/data/bnc/xml_data" +context = 4 +limit = 30 +tags_str = "AJ0,AJC,AJS,AT0,AV0,AVP,AVQ,CJC,CJS,CJT,\ +CRD,DPS,DT0,DTQ,EX0,ITJ,NN0,NN1,NN2,NP0,ORD,PNI,PNP,\ +PNQ,PNX,POS,PRF,PRP,PUL,PUN,PUQ,PUR,TO0,UNC,VBB,VBD,\ +VBG,VBI,VBN,VBZ,VDB,VDD,VDG,VDI,VDN,VDZ,VHB,VHD,VHG,\ +VHI,VHN,VHZ,VM0,VVB,VVD,VVG,VVI,VVN,VVZ,XX0,ZZ0" + +tags = re.split(",", tags_str) +sentence_count = 0 +word_count = 0 +matches = 0 +regex = re.compile("(<S.*?</S>)", re.DOTALL) +words_regex = re.compile("(<[WC].*?</[WC]>)", re.DOTALL) +type_regex = re.compile("TYPE=\"(.*?)\"") +word_regex = re.compile(">(.*?)</[WC]>") + +def query(search_tokens, filename): + global sentence_count + global word_count + global limit + global matches + global tags + t1 = time.time() + tokens = buildList(filename) + #print "T=%.2f<br>" % (time.time()-t1) + t1 = time.time() + #print tokens + match_pos = 0 + pos = 0 + for word,tag in tokens: + if tag == 'S_BEGIN': + sentence_count = sentence_count + 1 + word_count = word_count + 1 + if tags.count(search_tokens[match_pos]) > 0: + compare = tag + else: + compare = word + if compare == search_tokens[match_pos] or search_tokens[match_pos] == '_': + match_pos = match_pos + 1 + else: + match_pos = 0 + #print match_pos + if match_pos == len(search_tokens): + if matches+1 > limit: + return None + print "%d." 
% (matches+1) + print niceFormat(tokens[pos-context:pos+context], \ + context-len(search_tokens)+1, len(search_tokens)) + sys.stdout.flush() + matches = matches + 1 + match_pos = 0 + pos = pos + 1 + #print "T2=%.2f<br>" % (time.time()-t1) + return 1 + +def niceFormat(tokens, rel_pos, match_len): + l = [] + count = 0 + for word,tag in tokens: + if count >= rel_pos and count < rel_pos+match_len: + l.append('<b>%s<span class="tag">/%s</span></b>' % (word,tag)) + elif tag == 'PUN': + l.append(word) + else: + l.append('%s<span class="tag">/%s</span>' % (word,tag)) + count = count + 1 + return str.join(' ', l) + "<br>" + +def buildList(filename): + # Speed up: + pickle_filename = "%s.pickle" % filename + if os.path.exists(pickle_filename): + #print "Loading pickled data from %s<br>" % pickle_filename + t1 = time.time() + tokens = cPickle.load(open(pickle_filename)) + #print "Tpickle=%.2f<br>" % (time.time()-t1) + return tokens + + f = open(filename) + content = f.read() + f.close() + global regex + global words_regex + global type_regex + global word_regex + + sentences = regex.findall(content) + tokens = [] + for s in sentences: + #print "X" + words = words_regex.findall(s) + tokens.append(('', 'S_BEGIN')) + for w in words: + w = w.replace("\n", " ") + #print w + type_match = type_regex.search(w) + if not type_match: + print "*** no type_match!?" + continue + type_str = type_match.group(1) + word_match = word_regex.search(w) + word = word_match.group(1).strip() + #print "%s/%s" % (word, type_str) + tokens.append((word, type_str)) + tokens.append(('', 'S_END')) + # Prepare speed up for next search: + cPickle.dump(tokens, open(pickle_filename, 'w'), 1) + return tokens + +def queryFiles(tokens, dir_name): + os.chdir(dir_name) + dir_contents = os.listdir(".") + dir_contents.sort() + c = 0 + for filename in dir_contents: + if filename.endswith(".xml"): + c = c + 1 + print "Found %d *.xml files in %s<br>" % (c, dir_name) + w = 0 + s = 0 + m = 0 + f_count = 1 + for name in dir_contents: + if os.path.isdir(name): + queryFiles(tokens, name) + elif name.endswith(".xml"): + print "<strong>%.3d. %s</strong>, so far %d words, %d sentences<br>" % (f_count, name, word_count, sentence_count) + res = query(tokens, name) + if not res: + return + #global_file_count = global_file_count + 1 + #print "<hr />" + sys.stdout.flush() + f_count = f_count + 1 + # for profiling + #if word_count > 200000: + # return + os.chdir("..") + return + +def displayForm(): + taginfo = TagInfo.TagInfo() + print "Content-Type: text/html\n\n" + print """ + <html><head> + <title>BNC Query</title></head> + <body> + <h1>BNC Query</h1> + + <form action="query.py" method="get"> + <table border="0" cellspacing="0" cellpadding="0"> + <tr> + <td>Word/tag sequence:</td> + <td>Context:</td> + <td>Max. 
results:</td> + </tr> + <tr> + <td><input type="text" name="tokens"></td> + <td><select name="context"> + <option value="4">4 </option> + <option>6</option> + <option>8</option> + <option>10</option> + </select></td> + <td><input type="text" name="limit" value="30" size="6" /></input> + <td> </td> + <td><input type="submit" value="Query" /></td> + </tr> + </table> + </form> + <br /> + _ (underline) matches any word + %s + </body> + </html>""" % taginfo.getHTMLCode() + return + +def main(): + global limit + global context + form = cgi.FieldStorage() + if not form.getvalue("tokens"): + displayForm() + return + if form.getvalue("context"): + context = int(form.getvalue("context")) + if form.getvalue("limit"): + limit = int(form.getvalue("limit")) + print "Content-Type: text/html\n\n" + token_display = cgi.escape(form.getvalue("tokens"), 1) + print """<html><head> + <title>BNC query result for '%s'</title> + <style rel="stylesheet"> + <!-- + .tag { color:#999999; } + --> + </style></head> + <body> + <h1>BNC query result for '%s'</h1>""" % (token_display, token_display) + tokens = re.split("\s+", form.getvalue("tokens")) + queryFiles(tokens, data_dir) + print '<p>Queried %d words in %d sentences.' % (word_count, \ + sentence_count) + print '</body></html>' + #print '<pre>' # profiling + return + +main() +#profile.run('main()') diff --git a/languagetool/src/socket_server.py b/languagetool/src/socket_server.py new file mode 100644 index 0000000..81cac5b --- /dev/null +++ b/languagetool/src/socket_server.py @@ -0,0 +1,218 @@ +#!/usr/bin/python +# A server that uses TextChecker.py to check text for style +# and grammar errors +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import TextChecker + +import ConfigParser +import os +import re +import socket +import sys +import time + +sys.path.append(os.path.join(sys.path[0], "snakespell-1.01")) +from scriptfoundry.snakespell import iSpell + +server_name = "127.0.0.1" +server_port = 50100 +configfile = os.path.join(os.getenv('HOME'), ".kde/share/config/languagetool") + +def makeChecker(grammar_cfg=None, falsefriends_cfg=None, words_cfg=None, \ + builtin_cfg=None, textlanguage=None, mothertongue=None, \ + max_sentence_length=None): + """Create a new TextChecker object and return it.""" + checker = TextChecker.TextChecker(grammar_cfg, falsefriends_cfg, words_cfg, \ + builtin_cfg, textlanguage, mothertongue, max_sentence_length) + return checker + +def loadOptionList(config, enable_name, option_name): + val = None + if config.has_option("General", enable_name) and \ + config.getboolean("General", enable_name): + if config.has_option("General", option_name): + val = re.split(',', config.get("General", option_name)) + else: + val = ["NONE"] + return val + +def loadOptionBoolean(config, option_name): + if config.has_option("General", option_name) and config.getboolean("General", option_name): + return 1 + return None + +def loadOptionString(config, option_name, default): + val = default + if config.has_option("General", option_name): + val = config.get("General", option_name) + return val + +def readConfig(): + """Read the checker config from a KDE config file (INI style). + Return a checker which uses that config.""" + config = ConfigParser.ConfigParser() + try: + config.readfp(open(configfile)) + except IOError: + print "Couldn't load config file '%s', using defaults..." % configfile + grammar = loadOptionList(config, "EnableGrammar", "GrammarRules") + falsefriends = loadOptionList(config, "EnableFalseFriends", "FalseFriendsRules") + words = loadOptionList(config, "EnableWords", "WordsRules") + builtin = [] + if loadOptionBoolean(config, "EnableWhitespaceCheck"): + builtin.append("WHITESPACE") + if len(builtin) == 0: + builtin = None + textlanguage = loadOptionString(config, "TextLanguage", "en") + mothertongue = loadOptionString(config, "MotherTongue", "en") + sentence_length = 0 + if loadOptionBoolean(config, "EnableSentenceLength"): + if config.has_option("General", "MaxSentenceLength"): + sentence_length = config.getint("General", "MaxSentenceLength") + checker = makeChecker(grammar, falsefriends, words, builtin, \ + textlanguage, mothertongue, sentence_length) + return checker + +def getConfig(data): + """Get a new config in pseudo XML format from the client. + It needs to be at the beginning of the string that comes + from the client and must be of form <config ... />. + Returns a tuple with the a checker based on this config and + the 'data' string with the config section removed.""" + print "Receiving new config..." 
+ line_end_pos = data.find("/>") + cfg_str = data[:line_end_pos] + data = data[line_end_pos+3:] + grammar = getConfigValue(cfg_str, "grammar") + falsefriends = getConfigValue(cfg_str, "falsefriends") + words = getConfigValue(cfg_str, "words") + builtin = getConfigValue(cfg_str, "builtin") + textlanguage = getConfigValue(cfg_str, "textlanguage") + if textlanguage: + textlanguage = textlanguage[0] + mothertongue = getConfigValue(cfg_str, "mothertongue") + if mothertongue: + mothertongue = mothertongue[0] + sentence_length = getConfigValue(cfg_str, "max-sentence-length") + if not sentence_length: + sentence_length = 0 + else: + sentence_length = int(sentence_length[0]) + checker = makeChecker(grammar, falsefriends, words, builtin, \ + textlanguage, mothertongue, sentence_length) + return (checker, data) + +def getConfigValue(cfg_str, val): + m = re.compile('%s="(.*?)"' % val).search(cfg_str) + if not m: + return None + s = m.group(1) + l = re.split(',', s) + return l + +def main(): + print "Binding to '%s:%d'..." % (server_name, server_port) + s.bind((server_name, server_port)) + print "Listening..." + s.listen(1) + print "Setting up Checker..." + checker = readConfig() + print "Ready..." + while 1: + conn, addr = s.accept() + if addr[0] != "127.0.0.1": # security + print "Connection by '%s' refused" % addr[0] + conn.close() + continue + else: + print "Connected by '%s'" % addr[0] + + l = [] + limit = 1024 + while 1: + data = conn.recv(limit) + l.append(data) + #FIXME: need to look for separator, not just < limit! + if not data or len(data) < limit: + break + data = str.join('', l) + + print "Received '%s'" % data + if data.find("<config") != -1: + del checker + (checker, data) = getConfig(data) + print "New config activated" + t1 = time.time() + check_result = checkWords(checker, data) + t2 = time.time()-t1 + print "Replying (%.2fs) '%s'" % (t2, check_result.encode('utf8')) + #print "Replying (%.2fs)" % t2 + conn.send(check_result.encode('utf8')) + + conn.close() + s.close() + return + +def checkWordsTEST(words): + """Just for testing. Marks 'working' as incorrect.""" + words = re.split("\s+", words) + s = '<result>' + for w in words: + if w == "working": + s = s + '\t<error word="working" pos="5" corrections="Bohlen,Didda"/>' + s = s + '</result>' + return s + +def checkWords(checker, words): + result = u'<result>' + + ### Spelling: + ispell = iSpell() + words = words.replace("\n", " ") # iSpell works line by line + r = ispell.check(words) + if r > 0: + # fixme: escape word + for mistake in ispell.getMistakes(): + # TODO: make faster + pos = [] + for p in mistake.getPositions(): + result = u'%s<error from="%d" to="%d" word="%s" corrections="%s"/>' % \ + (result, p, p+len(mistake.getWord()), \ + unicode(mistake.getWord(), 'latin1'), \ + unicode(str.join(',', mistake.corrections), ('latin1'))) + + ### Grammar + Style: + (rule_matches, res, tags) = checker.check(words) + # FIXME: only if there's no overlap?! + result = result + res + + result = result + '</result>\n' + return result + +try: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + os.chdir(sys.path[0]) + main() +except KeyboardInterrupt: + # TODO: close explicitely, unfortunately we still get an + # 'Address already in use' error if we restart immediately: + s.shutdown(2) + s.close() + print "Stopped." 
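The reply sent back by socket_server.py is the &lt;result&gt; document assembled in checkWords() above. A minimal client-side sketch — not part of this commit; the attribute names (from, to, word, corrections) are taken from the spelling errors written by checkWords(), and grammar matches appended by TextChecker may carry different attributes — of how the reply collected in client.py could be unpacked:

import xml.dom.minidom

def parseReply(reply):
    # Parse the "<result>...</result>" string returned by the server and
    # collect the <error/> elements, e.g. parseReply(data) with the 'data'
    # string accumulated in client.py above.
    errors = []
    doc = xml.dom.minidom.parseString(reply.strip())
    for node in doc.getElementsByTagName("error"):
        errors.append((node.getAttribute("from"),
            node.getAttribute("to"),
            node.getAttribute("word"),
            node.getAttribute("corrections").split(",")))
    return errors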
diff --git a/languagetool/src/tag.py b/languagetool/src/tag.py new file mode 100644 index 0000000..7ab713b --- /dev/null +++ b/languagetool/src/tag.py @@ -0,0 +1,152 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# A frontend to a probabilistc part-of-speech tagger (see the QTag paper) +# +# LanguageTool -- A Rule-Based Style and Grammar Checker +# Copyright (C) 2002,2003,2004 Daniel Naber <daniel.naber@t-online.de> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Usage examples: +# 1) ./tag.py -b /data/bnc_sampler/train/* +# 2) ./tag.py -t /data/bnc_sampler/test/fcf + +import re +import sys +import string +import getopt +import profile + +import Tagger +import Entities + +class Controller: + "Main program." + + TAG = 0 + BUILD = 1 + TAGWORD = 2 + TAGSEQ = 3 + + def __init__(self): + return + + def usage(self): + print >> sys.stderr, "Usage: ./tagger.py <--build|--tag|--tagword> <filename...>" + print >> sys.stderr, " -h, --help this help information" + print >> sys.stderr, " -t, --tag tag any text files" + print >> sys.stderr, " -b, --build train the tagger using BNC XML files" + print >> sys.stderr, " -w, --wordtag tag any word" + print >> sys.stderr, " -s, --seqtag probability for any 2-tag-sequence" + # TODO: better help (e.g. 'build' adds to existing index (?)) + return + + def sanityCheck(self, filename, xml): + """Sanity check: all <w>...</w> together == original file?""" + words = re.compile("<w.*?>(.*?)</w>", re.DOTALL).findall(xml) + words_string = string.join(words, "") + # Load original file: + f = open(filename) + orig_contents = f.read() + f.close() + if orig_contents != words_string: + print >> sys.stderr, "*** Warning: joined output doesn't match original file!" + print >> sys.stderr, "*** (can be ignored if the file is a BNC file)" + return + + def run(self): + try: + (options, rest) = getopt.getopt(sys.argv[1:], 'htbws', + ['help', 'build', 'tag', 'wordtag', 'seqtag']) + except getopt.GetoptError, e: + print >> sys.stderr, "Error: %s" % e + self.usage() + sys.exit(1) + mode = self.TAG + for o, a in options: + if o in ("-h", "--help"): + self.usage() + sys.exit(0) + elif o in ("-t", "--tag"): + mode = self.TAG + elif o in ("-b", "--build"): + mode = self.BUILD + elif o in ("-w", "--wordtag"): + mode = self.TAGWORD + elif o in ("-s", "--seqtag"): + mode = self.TAGSEQ + if not rest: + self.usage() + sys.exit(1) + + if mode == self.BUILD: + tagger = Tagger.Tagger() + tagger.bindData() + tagger.buildData(rest) + tagger.commitData() + elif mode == self.TAG: + tagger = Tagger.Tagger() + tagger.bindData() + for filename in rest: + f = open(filename) + content = f.read() + f.close() + content = Entities.Entities.cleanEntities(content) + xml = tagger.tagTexttoXML(content) + self.sanityCheck(filename, xml) + print xml + print >> sys.stderr, "Done." 
+ elif mode == self.TAGWORD: + tagger = Tagger.Tagger() + tagger.bindData() + for word in rest: + r = tagger.tagWord(word) + print r + elif mode == self.TAGSEQ: + tagger = Tagger.Tagger() + tagger.bindData() + if len(rest) > 1 and rest[1] != '*': + key = (rest[0], rest[1]) + prob = tagger.tagSeq(key) + print prob + else: + # TODO: don't duplicate code from query.py: + tags_str = "AJ0,AJC,AJS,AT0,AV0,AVP,AVQ,CJC,CJS,CJT," + tags_str = tags_str + "CRD,DPS,DT0,DTQ,EX0,ITJ,NN0,NN1,NN2,NP0,ORD,PNI,PNP," + tags_str = tags_str + "PNQ,PNX,POS,PRF,PRP,PUL,PUN,PUQ,PUR,TO0,UNC,VBB,VBD," + tags_str = tags_str + "VBG,VBI,VBN,VBZ,VDB,VDD,VDG,VDI,VDN,VDZ,VHB,VHD,VHG," + tags_str = tags_str + "VHI,VHN,VHZ,VM0,VVB,VVD,VVG,VVI,VVN,VVZ,XX0,ZZ0," + # these are not in query.py: + tags_str = tags_str + "YBL,YBR,YCOL,YCOM,YDSH,YEX,YLIP,YQUE,YQUO,YSCOL,YSTP" + tags = re.split(",", tags_str) + sum = 0 + items = 0 + for tag in tags: + key = (rest[0], tag) + prob = tagger.tagSeq(key) + prob2 = tagger.tagSeq2(key) + if prob > 0 or prob2 > 0: + sum = sum + prob + print "%s followed by %s -> %.10f" % (key[0], key[1], prob) + print "%s follows %s -> %.10f" % (key[0], key[1], prob2) + items = items + 1 + print "items=%d, sum=%.5f" % (items, sum) + return + +### Main program + +prg = Controller() +prg.run() +#profile.run('prg.run()', 'fooprof') |
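Wfinder's affix search is only exercised indirectly through Tagger in this commit. A small stand-alone sketch — an assumption-laden illustration, not part of the commit, presuming the .aff/.dic files referenced through Tagger.affFile and Tagger.dicFile exist under data/ — of calling it directly:

import Wfinder

# "de" and "hu" additionally apply the Wfdeu/Wfhun type rules; "en" does not.
finder = Wfinder.Wfinder("en")
for word in ("carried", "worked", "play"):
    # test_it() reads the aff/dic data on first use and returns a string
    # such as "+ <word> <type>", "++ <word> <stem> <type>" or "- <word>"
    # when neither the dictionary nor any affix rule matches.
    print finder.test_it(word)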
