# TODO lemma remove punctuation - ukrainian
# TODO lemma remove punctuation - russian
# TODO consider handling multiple 'case'
import re
import sys

import conllu

# Matches the Russian preposition 'из-за' so its hyphen can be removed
# before the lemma is embedded in an enhanced-label (labels use ':' and
# '_' as separators, so '-' inside a lemma is normalised away).
rus = re.compile('^из-за$')
# Matches decimal token ids of empty nodes, e.g. '1.1'.
# NOTE(review): appears unused in this file — kept for compatibility.
expand = re.compile(r'^\d+\.\d+$')

'''
A script correcting automatically predicted enhanced dependency graphs.

Running the script: python postprocessing.py cs
You have to modify the paths to the input CoNLL-U file and the output file.
The last argument (e.g. cs) corresponds to the language symbol.
All language symbols: ar (Arabic), bg (Bulgarian), cs (Czech), nl (Dutch),
en (English), et (Estonian), fi (Finnish), fr (French), it (Italian),
lv (Latvian), lt (Lithuanian), pl (Polish), ru (Russian), sk (Slovak),
sv (Swedish), ta (Tamil), uk (Ukrainian)

There are two main rules:
1) the first one adds case information to the following labels:
   nmod, obl, acl, advcl.
   The case information comes from case/mark dependents of the current token
   and from the morphological feature Case. Depending on the language, not
   all information is added. In some languages ('en', 'it', 'nl', 'sv') the
   lemma of a coordinating conjunction (cc) is appended to the conjunct
   label (conj).
   Functions: fix_nmod_deps, fix_obl_deps, fix_acl_deps, fix_advcl_deps
   and fix_conj_deps
2) the second rule corrects enhanced edges coming into function words
   labelled ref, mark, punct, root, case, det, cc, cop, aux.
   They should not be assigned other functions. For example, if a token,
   e.g. "and", is labelled cc (coordinating conjunction), it cannot
   simultaneously be a subject (nsubj), and if this wrong enhanced edge
   exists, it should be removed from the graph.

There is one additional rule for Estonian: if the label is nsubj:cop or
csubj:cop, the cop sublabel is removed and we have nsubj and csubj,
respectively.
'''


def fix_nmod_deps(dep, token, sentence, relation):
    """Modify an enhanced edge labelled 'nmod'.

    Appends the lemma of a case/mark dependent (plus any 'fixed' multiword
    parts) and the value of the Case feature to the label, subject to
    per-language restrictions. Returns the (possibly new) (label, head) pair.
    """
    label, head = dep
    # All labels starting with 'relation' are checked.
    if not label.startswith(relation):
        return dep
    # case_lemma is a (complex) preposition labelled 'case', e.g. 'po' in
    # nmod:po:loc, or a (complex) subordinating conjunction labelled 'mark'.
    case_lemma = None
    case_tokens = []
    for t in sentence:
        if t["deprel"] in ["case", "mark"] and t["head"] == token["id"]:
            case_tokens.append(t)
            # Only the first case/mark dependent is used.
            break
    if case_tokens:
        # Collect 'fixed' children of the case/mark token: the remaining
        # words of a complex (multiword) preposition/conjunction.
        fixed_tokens = []
        for t in sentence:
            for c in case_tokens:
                if t["deprel"] == "fixed" and t["head"] == c["id"]:
                    fixed_tokens.append(t)
        # Lemmas are sorted by token id and joined with '_'; the hyphen in
        # Russian 'из-за' is removed so it survives as a label component.
        if fixed_tokens:
            case_lemma = "_".join(rus.sub('изза', f["lemma"])
                                  for f in quicksort(case_tokens + fixed_tokens))
        else:
            case_lemma = "_".join(rus.sub('изза', f["lemma"])
                                  for f in quicksort(case_tokens))
    # case_val is a value of Case, e.g. 'gen' in nmod:gen and 'loc' in nmod:po:loc.
    case_val = None
    if token['feats'] is not None:
        if 'Case' in token["feats"]:
            case_val = token["feats"]['Case'].lower()
    # TODO: check for other languages
    if language in ['fi'] and label not in ['nmod', 'nmod:poss']:
        return dep
    elif language not in ['fi'] and label not in ['nmod']:
        return dep
    else:
        label_lst = [label]
        if case_lemma:
            label_lst.append(case_lemma)
        if case_val:
            # TODO: check for other languages
            if language not in ['bg', 'en', 'nl', 'sv']:
                label_lst.append(case_val)
        label = ":".join(label_lst)
    # print(label, sentence.metadata["sent_id"])
    return label, head


def fix_obl_deps(dep, token, sentence, relation):
    """Modify an enhanced edge labelled 'obl', 'obl:arg' or 'obl:agent'.

    Same scheme as fix_nmod_deps: the case/mark lemma (with 'fixed' parts)
    and the Case feature value are appended to the label, with per-language
    exceptions. Returns the (possibly new) (label, head) pair.
    """
    label, head = dep
    if not label.startswith(relation):
        return dep
    # case_lemma is a (complex) preposition labelled 'case', e.g. 'pod' in
    # obl:pod:loc, or a (complex) subordinating conjunction labelled 'mark'.
    case_lemma = None
    case_tokens = []
    for t in sentence:
        if t["deprel"] in ["case", "mark"] and t["head"] == token["id"]:
            case_tokens.append(t)
            # Only the first case/mark dependent is used.
            break
    if case_tokens:
        # fixed tokens carry the rest of a complex preposition,
        # e.g. 'przypadek' in obl:w_przypadku:gen.
        fixed_tokens = []
        for t in sentence:
            for c in case_tokens:
                if t["deprel"] == "fixed" and t["head"] == c["id"]:
                    fixed_tokens.append(t)
        if fixed_tokens:
            case_lemma = "_".join(rus.sub('изза', f["lemma"])
                                  for f in quicksort(case_tokens + fixed_tokens))
        else:
            case_lemma = "_".join(rus.sub('изза', f["lemma"])
                                  for f in quicksort(case_tokens))
    # case_val is a value of the Case feature, e.g. 'loc' in obl:pod:loc.
    case_val = None
    if token['feats'] is not None:
        if 'Case' in token["feats"]:
            case_val = token["feats"]['Case'].lower()
    if label not in ['obl', 'obl:arg', 'obl:agent']:
        return dep
    else:
        label_lst = [label]
        if case_lemma:
            label_lst.append(case_lemma)
            if case_val:
                # TODO: check for other languages
                if language not in ['bg', 'en', 'lv', 'nl', 'sv']:
                    label_lst.append(case_val)
        # TODO: check it for other languages
        # Complementary rule: bare Case value (no preposition) is appended
        # only when the enhanced label agrees with the basic deprel.
        if language not in ['pl', 'sv']:
            if case_val and not case_lemma:
                if label == token['deprel']:
                    label_lst.append(case_val)
        label = ":".join(label_lst)
    # print(label, sentence.metadata["sent_id"])
    return label, head


def fix_acl_deps(dep, acl_token, sentence, acl, lang):
    """Modify an enhanced edge labelled 'acl'.

    Appends the lemma of a 'mark' dependent (with its 'fixed' parts) to the
    label. 'acl:relcl' is only touched for Ukrainian. Returns the (possibly
    new) (label, head) pair.
    """
    label, head = dep
    if not label.startswith(acl):
        return dep
    if label.startswith("acl:relcl"):
        # Relative clauses keep their label except in Ukrainian.
        if lang not in ['uk']:
            return dep
    case_lemma = None
    case_tokens = []
    for token in sentence:
        if token["deprel"] == "mark" and token["head"] == acl_token["id"]:
            case_tokens.append(token)
            # Only the first mark dependent is used.
            break
    if case_tokens:
        fixed_tokens = []
        for token in sentence:
            # Only 'fixed' children of the first (lowest-id) mark token count.
            if token["deprel"] == "fixed" and token["head"] == quicksort(case_tokens)[0]["id"]:
                fixed_tokens.append(token)
        if fixed_tokens:
            case_lemma = "_".join([t["lemma"]
                                   for t in quicksort(case_tokens + fixed_tokens)])
        else:
            case_lemma = quicksort(case_tokens)[0]["lemma"]
    if lang in ['uk']:
        if label not in ['acl', 'acl:relcl']:
            return dep
        else:
            label_lst = [label]
            if case_lemma:
                label_lst.append(case_lemma)
            label = ":".join(label_lst)
    else:
        if label not in ['acl']:
            return dep
        else:
            label_lst = [label]
            if case_lemma:
                label_lst.append(case_lemma)
            label = ":".join(label_lst)
    # print(label, sentence.metadata["sent_id"])
    return label, head


def fix_advcl_deps(dep, advcl_token, sentence, advcl):
    """Modify an enhanced edge labelled 'advcl'.

    Appends the lemma(s) of mark (and, for bg/lt, case) dependents — plus
    their 'fixed' parts except for bg/nl — to the label. Returns the
    (possibly new) (label, head) pair.
    """
    label, head = dep
    if not label.startswith(advcl):
        return dep
    case_lemma = None
    case_tokens = []
    # TODO: check for other languages
    if language in ['bg', 'lt']:
        # bg/lt also take 'case' dependents, and collect ALL of them.
        for token in sentence:
            if token["deprel"] in ["mark", "case"] and token["head"] == advcl_token["id"]:
                case_tokens.append(token)
    else:
        for token in sentence:
            if token["deprel"] == "mark" and token["head"] == advcl_token["id"]:
                case_tokens.append(token)
    if case_tokens:
        fixed_tokens = []
        # TODO: check for other languages
        if language not in ['bg', 'nl']:
            for token in sentence:
                for case in quicksort(case_tokens):
                    if token["deprel"] == "fixed" and token["head"] == case["id"]:
                        fixed_tokens.append(token)
        if fixed_tokens:
            case_lemma = "_".join([t["lemma"]
                                   for t in quicksort(case_tokens + fixed_tokens)])
        else:
            case_lemma = "_".join([t["lemma"] for t in quicksort(case_tokens)])
    if label not in ['advcl']:
        return dep
    else:
        label_lst = [label]
        if case_lemma:
            label_lst.append(case_lemma)
        label = ":".join(label_lst)
    # print(label, sentence.metadata["sent_id"])
    return label, head


def fix_conj_deps(dep, conj_token, sentence, conj):
    """Modify an enhanced edge labelled 'conj'.

    The conjunct label is assigned the lemma of its cc dependent (plus any
    'fixed' parts) as a sublabel. Returns the (possibly new) (label, head)
    pair.
    """
    label, head = dep
    if not label.startswith(conj):
        return dep
    case_lemma = None
    case_tokens = []
    for token in sentence:
        if token["deprel"] == "cc" and token["head"] == conj_token["id"]:
            case_tokens.append(token)
    if case_tokens:
        fixed_tokens = []
        for token in sentence:
            for case in quicksort(case_tokens):
                if token["deprel"] == "fixed" and token["head"] == case["id"]:
                    fixed_tokens.append(token)
        if fixed_tokens:
            case_lemma = "_".join([t["lemma"]
                                   for t in quicksort(case_tokens + fixed_tokens)])
        else:
            case_lemma = "_".join([t["lemma"] for t in quicksort(case_tokens)])
    if label not in ['conj']:
        return dep
    else:
        label_lst = [label]
        if case_lemma:
            label_lst.append(case_lemma)
        label = ":".join(label_lst)
    # print(label, sentence.metadata["sent_id"])
    return label, head


def quicksort(tokens):
    """Return the token list sorted ascending by integer token id."""
    if len(tokens) <= 1:
        return tokens
    else:
        return quicksort([x for x in tokens[1:] if int(x["id"]) < int(tokens[0]["id"])]) \
            + [tokens[0]] \
            + quicksort([y for y in tokens[1:] if int(y["id"]) >= int(tokens[0]["id"])])


language = sys.argv[1]
errors = 0  # counts edges whose label was changed by any rule

input_file = f"./token_test/{language}_pred.fixed.conllu"
output_file = f"./token_test/{language}.nofixed.conllu"

with open(input_file) as fh:
    with open(output_file, "w") as oh:
        for sentence in conllu.parse_incr(fh):
            for token in sentence:
                deps = token["deps"]
                if deps:
                    # --- Rule 1: enrich labels with case information ---
                    if language not in ['fr']:
                        for idx, dep in enumerate(deps):
                            assert len(dep) == 2, dep
                            new_dep = fix_obl_deps(dep, token, sentence, "obl")
                            token["deps"][idx] = new_dep
                            if new_dep[0] != dep[0]:
                                errors += 1
                    if language not in ['fr']:
                        for idx, dep in enumerate(deps):
                            assert len(dep) == 2, dep
                            new_dep = fix_nmod_deps(dep, token, sentence, "nmod")
                            token["deps"][idx] = new_dep
                            if new_dep[0] != dep[0]:
                                errors += 1
                    # TODO: check for other languages
                    if language not in ['fr', 'lv']:
                        for idx, dep in enumerate(deps):
                            assert len(dep) == 2, dep
                            new_dep = fix_acl_deps(dep, token, sentence, "acl", language)
                            token["deps"][idx] = new_dep
                            if new_dep[0] != dep[0]:
                                errors += 1
                    # TODO: check for other languages
                    if language not in ['fr', 'lv']:
                        for idx, dep in enumerate(deps):
                            assert len(dep) == 2, dep
                            new_dep = fix_advcl_deps(dep, token, sentence, "advcl")
                            token["deps"][idx] = new_dep
                            if new_dep[0] != dep[0]:
                                errors += 1
                    # TODO: check for other languages
                    if language in ['en', 'it', 'nl', 'sv']:
                        for idx, dep in enumerate(deps):
                            assert len(dep) == 2, dep
                            new_dep = fix_conj_deps(dep, token, sentence, "conj")
                            token["deps"][idx] = new_dep
                            if new_dep[0] != dep[0]:
                                errors += 1
                    # Estonian-only rule: strip the ':cop' sublabel from
                    # nsubj:cop / csubj:cop in the enhanced graph.
                    # TODO: check for other languages
                    if language in ['et']:
                        for idx, dep in enumerate(deps):
                            assert len(dep) == 2, dep
                            if token['deprel'] == 'nsubj:cop' and dep[0] == 'nsubj:cop':
                                new_dep = ('nsubj', dep[1])
                                token["deps"][idx] = new_dep
                                if new_dep[0] != dep[0]:
                                    errors += 1
                            if token['deprel'] == 'csubj:cop' and dep[0] == 'csubj:cop':
                                new_dep = ('csubj', dep[1])
                                token["deps"][idx] = new_dep
                                if new_dep[0] != dep[0]:
                                    errors += 1

                    # --- Rule 2: correction of the function words ---
                    # labelled ref, mark, punct, root, case, det, cc, cop, aux.
                    # They should not be assigned other functions.
                    # NOTE: each rule filters the ORIGINAL `deps` list even if
                    # an earlier rule already replaced token["deps"].
                    # TODO: to check for other languages
                    if language in ['ar', 'bg', 'cs', 'en', 'et', 'fi', 'it', 'lt',
                                    'lv', 'nl', 'pl', 'sk', 'sv', 'ru']:
                        refs = [s for s in deps if s[0] == 'ref']
                        if refs:
                            token["deps"] = refs
                    # TODO: to check for other languages
                    if language in ['ar', 'bg', 'en', 'et', 'fi', 'it', 'lt', 'nl',
                                    'pl', 'sk', 'sv', 'ta', 'uk', 'fr']:
                        marks = [s for s in deps if s[0] == 'mark']
                        if marks and token['deprel'] == 'mark':
                            token["deps"] = marks
                    # TODO: to check for other languages
                    if language in ['ar', 'bg', 'cs', 'en', 'et', 'fi', 'lv', 'nl',
                                    'pl', 'sk', 'sv', 'ta', 'uk', 'fr', 'ru']:
                        puncts = [s for s in deps if s[0] == 'punct' and s[1] == token['head']]
                        if puncts and token['deprel'] == 'punct':
                            token["deps"] = puncts
                    # TODO: to check for other languages
                    if language in ['ar', 'lt', 'pl']:
                        roots = [s for s in deps if s[0] == 'root']
                        if roots and token['deprel'] == 'root':
                            token["deps"] = roots
                    # TODO: to check for other languages
                    if language in ['en', 'ar', 'bg', 'et', 'fi', 'it', 'lt', 'lv',
                                    'nl', 'pl', 'sk', 'sv', 'ta', 'uk', 'fr']:
                        cases = [s for s in deps if s[0] == 'case']
                        if cases and token['deprel'] == 'case':
                            token["deps"] = cases
                    # TODO: to check for other languages
                    if language in ['en', 'ar', 'et', 'fi', 'it', 'lt', 'lv', 'nl',
                                    'pl', 'sk', 'sv', 'ta', 'uk', 'fr', 'ru']:
                        dets = [s for s in deps if s[0] == 'det']
                        if dets and token['deprel'] == 'det':
                            token["deps"] = dets
                    # TODO: to check for other languages
                    if language in ['et', 'fi', 'it', 'lv', 'nl', 'pl', 'sk', 'sv',
                                    'uk', 'fr', 'ar', 'ru', 'ta']:
                        ccs = [s for s in deps if s[0] == 'cc']
                        if ccs and token['deprel'] == 'cc':
                            token["deps"] = ccs
                    # TODO: to check for other languages
                    if language in ['bg', 'fi', 'et', 'it', 'sk', 'sv', 'uk', 'nl',
                                    'fr', 'ru']:
                        cops = [s for s in deps if s[0] == 'cop']
                        if cops and token['deprel'] == 'cop':
                            token["deps"] = cops
                    # TODO: to check for other languages
                    if language in ['bg', 'et', 'fi', 'it', 'lv', 'pl', 'sv']:
                        auxs = [s for s in deps if s[0] == 'aux']
                        if auxs and token['deprel'] == 'aux':
                            token["deps"] = auxs
                    # TODO: to check for other languages
                    if language in ['ar', 'bg', 'cs', 'et', 'fi', 'fr', 'lt', 'lv',
                                    'pl', 'sk', 'sv', 'uk', 'ru', 'ta']:
                        conjs = [s for s in deps if s[0] == 'conj' and s[1] == token['head']]
                        other = [s for s in deps if s[0] != 'conj']
                        if conjs and token['deprel'] == 'conj':
                            token["deps"] = conjs + other
                    # TODO: to check for other languages
                    # EXTRA rule 1: prefer edges pointing at empty nodes
                    # (heads encoded as tuples, e.g. 1.1) over plain edges.
                    if language in ['cs', 'et', 'fi', 'lv', 'pl', 'uk']:  # ar nl ru
                        # not use for: lt, bg, fr, sk, ta, sv, en
                        deprel = [s for s in deps if s[0] == token['deprel'] and s[1] == token['head']]
                        other_exp = [s for s in deps if type(s[1]) == tuple]
                        other_noexp = [s for s in deps
                                       if s[1] != token['head'] and type(s[1]) != tuple]
                        if other_exp:
                            token["deps"] = other_exp + other_noexp
                    # EXTRA rule 2: propagate the root edge to a lone conjunct.
                    if language in ['cs', 'lt', 'pl', 'sk', 'uk']:  # ar nl ru
                        conjs = [s for s in deps if s[0] == 'conj' and s[1] == token['head']]
                        if conjs and len(deps) == 1 and len(conjs) == 1:
                            for t in sentence:
                                if t['id'] == conjs[0][1] and t['deprel'] == 'root':
                                    conjs.append((t['deprel'], t['head']))
                                    token["deps"] = conjs
                    # Tamil-only rule: drop stray conj edges on non-conj tokens.
                    if language in ['ta']:
                        if token['deprel'] != 'conj':
                            conjs = [s for s in deps if s[0] == 'conj']
                            if conjs:
                                new_dep = [s for s in deps if s[1] == token['head']]
                                token["deps"] = new_dep
            oh.write(sentence.serialize())

print(errors)