# Reconstructed from patch 7ddc4b0 ("Add postprocessing EUD script.",
# Mateusz Klimaszewski <mk.klimaszewski@gmail.com>, 2021-08-05):
# scripts/postprocessing.py
#
# TODO lemma remove punctuation - ukrainian
# TODO lemma remove punctuation - russian
# TODO consider handling multiple 'case'
"""
A script correcting automatically predicted enhanced dependency graphs (EUD).
Running the script: python postprocessing.py cs

You have to modify the paths to the input CoNLL-U file and the output file
(``input_file`` / ``output_file`` in ``main``).

The last argument (e.g. cs) corresponds to the language symbol.
All language symbols:
ar (Arabic), bg (Bulgarian), cs (Czech), nl (Dutch), en (English), et (Estonian),
fi (Finnish), fr (French), it (Italian), lv (Latvian), lt (Lithuanian),
pl (Polish), ru (Russian), sk (Slovak), sv (Swedish), ta (Tamil), uk (Ukrainian)

There are two main rules:
1) the first one adds case information to the following labels: nmod, obl, acl,
advcl.  The case information comes from case/mark dependents of the current
token and from the morphological feature Case.  Depending on the language, not
all information is added.  In some languages ('en', 'it', 'nl', 'sv') the lemma
of the coordinating conjunction (cc) is appended to the conjunct label (conj).
Functions: fix_nmod_deps, fix_obl_deps, fix_acl_deps, fix_advcl_deps and
fix_conj_deps.

2) the second rule corrects enhanced edges coming into function words labelled
ref, mark, punct, root, case, det, cc, cop, aux.  They should not be assigned
other functions.  For example, if a token, e.g. "and", is labelled cc
(coordinating conjunction), it cannot simultaneously be a subject (nsubj), and
if this wrong enhanced edge exists, it is removed from the graph.

There is one additional rule for Estonian:
if the label is nsubj:cop or csubj:cop, the cop sublabel is removed and we have
nsubj and csubj, respectively.
"""
import re
import sys

# Russian 'из-за' loses its internal hyphen when embedded in an edge label.
rus = re.compile('^из-за$')
# Matches empty-node ids such as '3.1' (kept from the original; not used below).
expand = re.compile(r'^\d+\.\d+$')

# Language symbol (e.g. 'cs'); set from sys.argv[1] in main().  Several fix_*
# functions read this module-level variable directly.
language = None


def fix_nmod_deps(dep, token, sentence, relation):
    """Extend an enhanced edge labelled 'nmod' with case information.

    dep      -- one (label, head) pair from token["deps"].
    token    -- the dependent token whose incoming edge is being fixed.
    sentence -- the whole sentence (iterable of tokens).
    relation -- label prefix to act on (here "nmod").

    Returns the possibly relabelled (label, head) pair; other edges are
    returned unchanged.
    """
    label, head = dep

    # Only labels starting with `relation` are checked.
    if not label.startswith(relation):
        return dep

    # case_lemma is a (complex) preposition labelled 'case', e.g. 'po' in
    # nmod:po:loc, or a (complex) subordinating conjunction labelled 'mark'.
    # Only the first case/mark dependent is used (see module-level TODO).
    case_tokens = []
    for t in sentence:
        if t["deprel"] in ("case", "mark") and t["head"] == token["id"]:
            case_tokens.append(t)
            break

    case_lemma = None
    if case_tokens:
        # 'fixed' dependents extend a case/mark word to a multiword
        # preposition; all lemmas are joined with '_' in surface order.
        fixed_tokens = [t for t in sentence
                        for c in case_tokens
                        if t["deprel"] == "fixed" and t["head"] == c["id"]]
        case_lemma = "_".join(rus.sub('изза', f["lemma"])
                              for f in quicksort(case_tokens + fixed_tokens))

    # case_val is the value of the Case feature, e.g. 'gen' in nmod:gen and
    # 'loc' in nmod:po:loc.
    case_val = None
    feats = token["feats"]
    if feats is not None and "Case" in feats:
        case_val = feats["Case"].lower()

    # TODO: check for other languages
    if language in ('fi',):
        # Finnish additionally rewrites nmod:poss.
        if label not in ('nmod', 'nmod:poss'):
            return dep
    elif label not in ('nmod',):
        return dep

    label_lst = [label]
    if case_lemma:
        label_lst.append(case_lemma)
    # TODO: check for other languages
    if case_val and language not in ('bg', 'en', 'nl', 'sv'):
        label_lst.append(case_val)
    return ":".join(label_lst), head


def fix_obl_deps(dep, token, sentence, relation):
    """Extend an enhanced edge labelled 'obl'/'obl:arg'/'obl:agent' with case
    information (same scheme as fix_nmod_deps).

    Returns the possibly relabelled (label, head) pair.
    """
    label, head = dep

    if not label.startswith(relation):
        return dep

    # case_lemma is a (complex) preposition labelled 'case', e.g. 'pod' in
    # obl:pod:loc, or a (complex) subordinating conjunction labelled 'mark'.
    # Only the first case/mark dependent is used.
    case_tokens = []
    for t in sentence:
        if t["deprel"] in ("case", "mark") and t["head"] == token["id"]:
            case_tokens.append(t)
            break

    case_lemma = None
    if case_tokens:
        # fixed tokens complete a complex preposition,
        # e.g. 'przypadek' in obl:w_przypadku:gen.
        fixed_tokens = [t for t in sentence
                        for c in case_tokens
                        if t["deprel"] == "fixed" and t["head"] == c["id"]]
        case_lemma = "_".join(rus.sub('изза', f["lemma"])
                              for f in quicksort(case_tokens + fixed_tokens))

    # case_val is the value of the Case feature, e.g. 'loc' in obl:pod:loc.
    case_val = None
    feats = token["feats"]
    if feats is not None and "Case" in feats:
        case_val = feats["Case"].lower()

    if label not in ('obl', 'obl:arg', 'obl:agent'):
        return dep

    label_lst = [label]
    if case_lemma:
        label_lst.append(case_lemma)
    if case_val:
        # TODO: check for other languages
        if language not in ('bg', 'en', 'lv', 'nl', 'sv'):
            label_lst.append(case_val)
    # TODO: check it for other languages
    # NOTE(review): when both this and the previous condition hold, case_val
    # is appended twice (e.g. 'obl:loc:loc'); preserved from the original —
    # confirm this is intended.
    if language not in ('pl', 'sv'):
        if case_val and not case_lemma and label == token['deprel']:
            label_lst.append(case_val)
    return ":".join(label_lst), head


def fix_acl_deps(dep, acl_token, sentence, acl, lang):
    """Extend an enhanced edge labelled 'acl' with the lemma of its 'mark'
    dependent; 'acl:relcl' is only rewritten for Ukrainian.

    Returns the possibly relabelled (label, head) pair.
    """
    label, head = dep

    if not label.startswith(acl):
        return dep
    if label.startswith("acl:relcl") and lang not in ('uk',):
        return dep

    # Only the first 'mark' dependent is considered.
    case_tokens = []
    for token in sentence:
        if token["deprel"] == "mark" and token["head"] == acl_token["id"]:
            case_tokens.append(token)
            break

    case_lemma = None
    if case_tokens:
        first = quicksort(case_tokens)[0]
        # 'fixed' dependents are only searched under the first mark token.
        fixed_tokens = [t for t in sentence
                        if t["deprel"] == "fixed" and t["head"] == first["id"]]
        if fixed_tokens:
            case_lemma = "_".join(t["lemma"]
                                  for t in quicksort(case_tokens + fixed_tokens))
        else:
            case_lemma = first["lemma"]

    allowed = ('acl', 'acl:relcl') if lang in ('uk',) else ('acl',)
    if label not in allowed:
        return dep

    label_lst = [label]
    if case_lemma:
        label_lst.append(case_lemma)
    return ":".join(label_lst), head


def fix_advcl_deps(dep, advcl_token, sentence, advcl):
    """Extend an enhanced edge labelled 'advcl' with the lemma(s) of its
    'mark' (for bg/lt also 'case') dependents.

    Returns the possibly relabelled (label, head) pair.
    """
    label, head = dep

    if not label.startswith(advcl):
        return dep

    # TODO: check for other languages
    wanted = ("mark", "case") if language in ('bg', 'lt') else ("mark",)
    # Unlike fix_nmod_deps/fix_obl_deps, ALL matching dependents are kept.
    case_tokens = [t for t in sentence
                   if t["deprel"] in wanted and t["head"] == advcl_token["id"]]

    case_lemma = None
    if case_tokens:
        fixed_tokens = []
        # TODO: check for other languages
        if language not in ('bg', 'nl'):
            for t in sentence:
                for case in quicksort(case_tokens):
                    if t["deprel"] == "fixed" and t["head"] == case["id"]:
                        fixed_tokens.append(t)
        case_lemma = "_".join(t["lemma"]
                              for t in quicksort(case_tokens + fixed_tokens))

    if label not in ('advcl',):
        return dep

    label_lst = [label]
    if case_lemma:
        label_lst.append(case_lemma)
    return ":".join(label_lst), head


def fix_conj_deps(dep, conj_token, sentence, conj):
    """Extend an enhanced edge labelled 'conj' with the lemma of the
    coordinating conjunction ('cc') attached to the conjunct.

    Returns the possibly relabelled (label, head) pair.
    """
    label, head = dep

    if not label.startswith(conj):
        return dep

    cc_tokens = [t for t in sentence
                 if t["deprel"] == "cc" and t["head"] == conj_token["id"]]

    case_lemma = None
    if cc_tokens:
        fixed_tokens = []
        for t in sentence:
            for case in quicksort(cc_tokens):
                if t["deprel"] == "fixed" and t["head"] == case["id"]:
                    fixed_tokens.append(t)
        case_lemma = "_".join(t["lemma"]
                              for t in quicksort(cc_tokens + fixed_tokens))

    if label not in ('conj',):
        return dep

    label_lst = [label]
    if case_lemma:
        label_lst.append(case_lemma)
    return ":".join(label_lst), head


def quicksort(tokens):
    """Return tokens ordered by their (integer) CoNLL-U id.

    Kept under its historical name; the original hand-rolled quicksort was
    order-preserving for equal keys, so a stable sort is equivalent.
    """
    return sorted(tokens, key=lambda t: int(t["id"]))


def _apply_fix(token, sentence, fixer, *extra):
    """Run `fixer` over every enhanced edge of `token`, updating them in place.

    Returns the number of edges whose label was changed (for the `errors`
    counter of main()).
    """
    changed = 0
    deps = token["deps"]
    for idx, dep in enumerate(deps):
        assert len(dep) == 2, dep
        new_dep = fixer(dep, token, sentence, *extra)
        deps[idx] = new_dep
        if new_dep[0] != dep[0]:
            changed += 1
    return changed


def main():
    """Command-line entry point: read, fix and write a CoNLL-U file.

    Usage: python postprocessing.py <lang>
    """
    global language

    # Imported lazily so the fix_* helpers above can be imported (and unit
    # tested) without the third-party `conllu` package installed.
    import conllu

    language = sys.argv[1]
    errors = 0

    input_file = f"./token_test/{language}_pred.fixed.conllu"
    output_file = f"./token_test/{language}.nofixed.conllu"
    # CoNLL-U files are UTF-8 by specification.
    with open(input_file, encoding="utf-8") as fh, \
            open(output_file, "w", encoding="utf-8") as oh:
        for sentence in conllu.parse_incr(fh):
            for token in sentence:
                deps = token["deps"]
                if not deps:
                    continue

                # ---- Rule 1: augment labels with case information. ----
                if language not in ['fr']:
                    errors += _apply_fix(token, sentence, fix_obl_deps, "obl")
                    errors += _apply_fix(token, sentence, fix_nmod_deps, "nmod")
                # TODO: check for other languages
                if language not in ['fr', 'lv']:
                    errors += _apply_fix(token, sentence, fix_acl_deps, "acl",
                                         language)
                    errors += _apply_fix(token, sentence, fix_advcl_deps,
                                         "advcl")
                # TODO: check for other languages
                if language in ['en', 'it', 'nl', 'sv']:
                    errors += _apply_fix(token, sentence, fix_conj_deps, "conj")

                # Estonian only: strip the ':cop' sublabel from
                # nsubj:cop / csubj:cop edges.
                if language in ['et']:
                    for idx, dep in enumerate(deps):
                        assert len(dep) == 2, dep
                        for rel in ('nsubj', 'csubj'):
                            if (token['deprel'] == rel + ':cop'
                                    and dep[0] == rel + ':cop'):
                                token["deps"][idx] = (rel, dep[1])
                                errors += 1

                # ---- Rule 2: function words (ref, mark, punct, root, case,
                # det, cc, cop, aux) keep only their own function. ----
                # All filters below read `deps` (the list captured BEFORE any
                # reassignment), so a later rule can override an earlier
                # token["deps"] assignment; this mirrors the original script.
                # TODO: to check for other languages
                if language in ['ar', 'bg', 'cs', 'en', 'et', 'fi', 'it', 'lt',
                                'lv', 'nl', 'pl', 'sk', 'sv', 'ru']:
                    refs = [s for s in deps if s[0] == 'ref']
                    if refs:  # note: no deprel gate for 'ref'
                        token["deps"] = refs

                # Rules gated on the token's own deprel; at most one can fire
                # per token, so table order is irrelevant among them.
                # TODO: to check for other languages (each list separately)
                simple_rules = [
                    (['ar', 'bg', 'en', 'et', 'fi', 'it', 'lt', 'nl', 'pl',
                      'sk', 'sv', 'ta', 'uk', 'fr'], 'mark'),
                    (['ar', 'lt', 'pl'], 'root'),
                    (['en', 'ar', 'bg', 'et', 'fi', 'it', 'lt', 'lv', 'nl',
                      'pl', 'sk', 'sv', 'ta', 'uk', 'fr'], 'case'),
                    (['en', 'ar', 'et', 'fi', 'it', 'lt', 'lv', 'nl', 'pl',
                      'sk', 'sv', 'ta', 'uk', 'fr', 'ru'], 'det'),
                    (['et', 'fi', 'it', 'lv', 'nl', 'pl', 'sk', 'sv', 'uk',
                      'fr', 'ar', 'ru', 'ta'], 'cc'),
                    (['bg', 'fi', 'et', 'it', 'sk', 'sv', 'uk', 'nl', 'fr',
                      'ru'], 'cop'),
                    (['bg', 'et', 'fi', 'it', 'lv', 'pl', 'sv'], 'aux'),
                ]
                for langs, rel in simple_rules:
                    if language in langs:
                        kept = [s for s in deps if s[0] == rel]
                        if kept and token['deprel'] == rel:
                            token["deps"] = kept

                # punct additionally requires the edge head to match the basic
                # head.
                # TODO: to check for other languages
                if language in ['ar', 'bg', 'cs', 'en', 'et', 'fi', 'lv', 'nl',
                                'pl', 'sk', 'sv', 'ta', 'uk', 'fr', 'ru']:
                    puncts = [s for s in deps
                              if s[0] == 'punct' and s[1] == token['head']]
                    if puncts and token['deprel'] == 'punct':
                        token["deps"] = puncts

                # conj: keep the conj edge matching the basic head, plus all
                # non-conj edges.
                # TODO: to check for other languages
                if language in ['ar', 'bg', 'cs', 'et', 'fi', 'fr', 'lt', 'lv',
                                'pl', 'sk', 'sv', 'uk', 'ru', 'ta']:
                    conjs = [s for s in deps
                             if s[0] == 'conj' and s[1] == token['head']]
                    other = [s for s in deps if s[0] != 'conj']
                    if conjs and token['deprel'] == 'conj':
                        token["deps"] = conjs + other

                # EXTRA rule 1: if any edge points at an empty (expanded) node
                # — conllu represents such heads as tuples — keep those edges
                # plus the edges that differ from the basic head.
                # TODO: to check for other languages
                if language in ['cs', 'et', 'fi', 'lv', 'pl', 'uk']:  # ar nl ru
                    # not used for: lt, bg, fr, sk, ta, sv, en
                    other_exp = [s for s in deps if isinstance(s[1], tuple)]
                    other_noexp = [s for s in deps
                                   if s[1] != token['head']
                                   and not isinstance(s[1], tuple)]
                    if other_exp:
                        token["deps"] = other_exp + other_noexp

                # EXTRA rule 2: a lone conj edge to the root token also
                # inherits the root's own edge.
                if language in ['cs', 'lt', 'pl', 'sk', 'uk']:  # ar nl ru
                    conjs = [s for s in deps
                             if s[0] == 'conj' and s[1] == token['head']]
                    if conjs and len(deps) == 1 and len(conjs) == 1:
                        for t in sentence:
                            if (t['id'] == conjs[0][1]
                                    and t['deprel'] == 'root'):
                                conjs.append((t['deprel'], t['head']))
                        token["deps"] = conjs

                # Tamil only: non-conj tokens with stray conj edges keep only
                # the edges that agree with the basic head.
                if language in ['ta']:
                    if token['deprel'] != 'conj':
                        conjs = [s for s in deps if s[0] == 'conj']
                        if conjs:
                            token["deps"] = [s for s in deps
                                             if s[1] == token['head']]

            oh.write(sentence.serialize())
    print(errors)


if __name__ == "__main__":
    main()