From 2a072097890fa69c0283ceb98a07eb90b62ccf0c Mon Sep 17 00:00:00 2001
From: Colin Batchelor
Date: Fri, 13 Dec 2024 20:19:39 +0000
Subject: [PATCH] Code to update treebanks and provide additional validation
 for the 2.16 (May 2025) release

---
 fixed.gd              |   1 +
 update_ud2.16.py      |  60 +++++++++++++
 validate_gd_extras.py | 202 +++++++++++++++++++++++-------------------
 3 files changed, 174 insertions(+), 89 deletions(-)
 create mode 100644 update_ud2.16.py

diff --git a/fixed.gd b/fixed.gd
index b213b03..234c5c3 100644
--- a/fixed.gd
+++ b/fixed.gd
@@ -8,6 +8,7 @@
 's an e
 's e
 's i
+a bu
 a b'
 a bhòn-uiridh
 a chaoidh
diff --git a/update_ud2.16.py b/update_ud2.16.py
new file mode 100644
index 0000000..61039a0
--- /dev/null
+++ b/update_ud2.16.py
@@ -0,0 +1,60 @@
+import sys
+import pyconll
+
+corpus = pyconll.load_from_file(sys.argv[1])
+
+def ud_words(ud_sentence, condition = lambda x: True):
+    """
+    Returns the 'words' and their predecessors in the UD sense by rejecting multiword tokens.
+    """
+    prev_token = None
+    for word_token in [s for s in ud_sentence if not s.is_multiword()]:
+        # the condition may only apply to UD words
+        if condition(word_token):
+            yield word_token, prev_token
+        prev_token = word_token
+
+# maps XPOS adverb subcategories onto UD AdvType feature values
+advtype_mapping = { "Rs": "Loc", "Rt": "Tim", "Rg": "Man", "Uf": "Man", "Uq": "Man", "Xsi": "Loc" }
+
+with open(sys.argv[2], 'w') as clean:
+    for sentence in corpus:
+        cop_heads = [t.head for t, _ in ud_words(sentence, lambda t: t.deprel == "cop")]
+        cleft_heads = [t.head for t, _ in ud_words(sentence, lambda t: t.deprel in ["csubj:cleft", "csubj:outer"])]
+        case_heads = { t.head: t.form for t, _ in ud_words(sentence, lambda t: t.deprel == "case") }
+        for token, prev_token in ud_words(sentence):
+            # a token that heads both a cop and a csubj:cleft/csubj:outer is the focus of a cleft
+            if token.id in cop_heads and token.id in cleft_heads:
+                if token.upos == "ADJ":
+                    token.feats["CleftType"] = ["Adj"]
+                elif token.upos == "ADV":
+                    token.feats["CleftType"] = ["Adv"]
+                elif token.upos in ["NOUN", "NUM", "PART", "PRON", "PROPN"]:
+                    if token.upos == "PART" and "Pat" not in token.feats.get("PartType", []):
+                        print(f"{sentence.id} {token.id} {token.form} {token.upos} {token.feats}")
+                    elif token.id in case_heads:
+                        token.feats["CleftType"] = ["Obl"]
+                    else:
+                        token.feats["CleftType"] = ["Nom"]
+                elif token.upos == "VERB":
+                    token.feats["CleftType"] = ["Verb"]
+                else:
+                    print(f"{sentence.id} {token.id} {token.form} {token.upos}")
+            if token.upos == "ADV":
+                if token.xpos not in advtype_mapping:
+                    print(sentence.id, token.id, token.form, token.upos, token.xpos)
+                else:
+                    token.feats["AdvType"] = [advtype_mapping[token.xpos]]
+            if token.xpos == "Nt":
+                token.upos = "PROPN"
+                token.feats["NounType"] = ["Top"]
+                if token.deprel == "flat":
+                    token.deprel = "flat:name"
+                    token.misc["FlatType"] = ["Top"]
+            if prev_token is not None:
+                if token.deprel == "fixed" and prev_token.deprel != "fixed":
+                    if "ExtPos" not in prev_token.feats:
+                        prev_token.feats["ExtPos"] = [prev_token.upos]  # default to the head's own UPOS
+
+        clean.write(sentence.conll())
+        clean.write('\n\n')
diff --git a/validate_gd_extras.py b/validate_gd_extras.py
index 9853a08..af4a2da 100644
--- a/validate_gd_extras.py
+++ b/validate_gd_extras.py
@@ -1,4 +1,4 @@
-"""Checks for Gaelic-specific things that aren't covered by the standard UD validation tools."""
+"""Checks for Scottish Gaelic-specific things that aren't covered by the standard UD validation tools."""
 import sys
 import pyconll
 
@@ -34,33 +34,40 @@ def check_fixed(sentence):
     Prints errors and returns the error count.
""" - score = 0 + errors = 0 allowed = read_fixed() for token, prev_token in ud_words(sentence, lambda t: t.deprel == "fixed"): norm_token_form = token.form.lower().replace("‘", "'").replace("’", "'") norm_prev_token_form = prev_token.form.lower().replace("‘", "'").replace("’", "'") if norm_token_form not in allowed: - score +=1 + errors +=1 print(f"E {sentence.id} {token.id} '{token.form}' not in fixed list") elif norm_prev_token_form not in allowed[norm_token_form]: - score +=1 + errors +=1 print(f"E {sentence.id} {token.id} '{prev_token.form} {token.form}' not in fixed list") - return score + return errors def check_feats(sentence) -> int: """ - Checks the FEATS column for Scottish Gaelic-specific features (currrently AdvType). + Checks the FEATS column for + 1. ExtPos if the node is head of the fixed relation + 2. Scottish Gaelic-specific features (currently AdvType). Returns an integer with the number of errors found. """ - score = 0 + errors = 0 + for token, prev_token in ud_words(sentence, lambda t: t.deprel == "fixed"): + if prev_token.deprel != "fixed": + if "ExtPos" not in prev_token.feats: + errors += 1 + print(f"E {sentence.id} {prev_token.id} head of fixed should have ExtPos feature") for token in sentence: if "AdvType" in token.feats: for advtype in token.feats["AdvType"]: - if advtype not in ["Conj", "Loc", "Tim"]: - score += 1 + if advtype not in ["Conj", "Man", "Loc", "Tim"]: + errors += 1 print(f"E {sentence.id} {token.id} Unrecognised AdvType {advtype}") - return score + return errors def check_misc(sentence) -> int: """ @@ -68,65 +75,71 @@ def check_misc(sentence) -> int: Returns an integer with the number of errors found. """ - score = 0 + errors = 0 for token, _ in ud_words(sentence, lambda t: t.lemma in ["[Name]", "[Placename]"]): if "Anonymised" not in token.misc: - score += 1 + errors += 1 print(f"E {sentence.id} {token.id} Anonymised=Yes missing from MISC column") for token in sentence: if "FlatType" in token.misc: for flattype in token.misc["FlatType"]: - if flattype not in ["Borrow", "Date", "Top", "Num", "Redup", "Name", "Foreign"]: - score += 1 + if flattype not in ["Borrow", "Date", "Top", "Num", "Redup", "Name", "Foreign", "Time"]: + errors += 1 print(f"E {sentence.id} {token.id} Unrecognised FlatType {flattype}") - return score + return errors def check_others(sentence) -> int: """ Checks for things that don't fit in anywhere else. 
+    Specifically:
+    * that _ais_ is tagged as a NOUN
+    * that reflexives are tagged as nmod, fixed or obl
+    * that patronymics are tagged as part of a longer name
+    * that the mark deprel is only used for PART and SCONJ
+    * that the flat deprel is typed in the MISC column
     """
-    score = 0
+    errors = 0
     for token, prev_token in ud_words(sentence, lambda t: t.form in ["ais"] and t.upos != "NOUN"):
-        score +=1
+        errors += 1
        print(f"E {sentence.id} {token.id} UPOS for 'ais' should be NOUN")
     for token, prev_token in ud_words(sentence, lambda t: t.xpos == t.upos and t.feats == {}):
-        score +=1
+        errors += 1
         print(f"E {sentence.id} {token.id} XPOS {token.xpos} should not match UPOS if feats is empty")
     for token, prev_token in ud_words(sentence):
         if token.xpos == "Px" and token.deprel not in ["nmod", "fixed", "obl"]:
-            score += 1
+            errors += 1
             print(f"E {sentence.id} {token.id} {token.form} should be nmod or obl (or fixed)")
         if token.xpos == "Up" and token.deprel != "flat:name" and prev_token is not None and prev_token.xpos == "Nn":
-            score += 1
+            errors += 1
             print(f"E {sentence.id} {token.id} Patronymic should be flat:name")
         if token.deprel.startswith("mark") and token.upos not in ["PART", "SCONJ"]:
-            score += 1
+            errors += 1
             print(f"E {sentence.id} {token.id} mark should only be for PART or SCONJ")
         if token.deprel == "flat" and "FlatType" not in token.misc:
-            score += 1
+            errors += 1
             print(f"?E {sentence.id} {token.id} should be flat:name or flat:foreign, or FlatType should be specified")
-    return score
+    return errors
 
 def check_ranges(sentence) -> (int, int):
     """
     Checks that deprels that can only go in one direction go in that direction and does some sense checks on the length.
 
-    Numbers are difficult so there are special cases built in for _ceud_ 'hundred', _fichead_ 'twenty' and £.
+    Numbers are difficult so there are special cases built in for _ceud_ 'hundred', _fichead_ 'twenty' and symbols.
 
     Returns a tuple of the errors found and warnings found.
""" leftward_only = ["acl:relcl", "flat", "fixed"] rightward_only = ["case", "cc", "cop", "mark", "nummod"] short_range = {"compound": 2 ,"det": 3, "fixed": 2, "flat": 4} - score = 0 + errors = 0 warnings = 0 - head_forms = {} + head_upos = {} for token in sentence: - head_forms[token.id] = token.form + head_upos[token.id] = token.upos for token, prev_token in ud_words(sentence): deprel_range = abs(int(token.id) - int(token.head)) if token.deprel in leftward_only and int(token.head) > int(token.id): @@ -136,8 +149,8 @@ def check_ranges(sentence) -> (int, int): int(token.head) < int(token.id) and\ prev_token.xpos != "Uo" and\ token.form not in ["ceud", "fichead"] and\ - head_forms[token.head] != "£": - score += 1 + head_upos[token.head] != "SYM": + errors += 1 print(f"E {sentence.id} {token.id} {token.deprel} goes wrong way for gd") if token.deprel in short_range and\ @@ -147,7 +160,7 @@ def check_ranges(sentence) -> (int, int): warnings += 1 code = "W" else: - score += 1 + errors += 1 code = "E" print(f"{code} {sentence.id} {token.id} Too long a range ({deprel_range}) for {token.deprel}") if token.deprel in ["nsubj", "obj"] and\ @@ -156,9 +169,9 @@ def check_ranges(sentence) -> (int, int): if "ExtPos" in token.feats: pass else: - score +=1 + errors +=1 print(f"E {sentence.id} {token.id} nsubj and (rightward) obj should only be for NOUN, PART, PRON, PROPN, NUM, SYM or X") - return score, warnings + return errors, warnings def check_heads_for_upos(sentence) -> int: """ @@ -166,7 +179,7 @@ def check_heads_for_upos(sentence) -> int: Returns an integer number of errors found in the sentence """ - score = 0 + errors = 0 head_ids = {} heads = { "obl": ["VERB", "ADJ", "ADV"], @@ -182,23 +195,23 @@ def check_heads_for_upos(sentence) -> int: actual = token.upos correct = heads[head_ids[int(token.id)][0]] if actual not in correct: - score +=1 + errors +=1 print(f"E {sentence.id} {token.id} {head_ids[int(token.id)][1]} head of {head_ids[int(token.id)]} must be one of ({', '.join(correct)}) not {actual}") if token.form == "ais": - score +=1 + errors +=1 print(f"E {sentence.id} {token.id} 'ais' should not be a head") - return score + return errors def check_reported_speech(sentence) -> int: - score = 0 - return score + errors = 0 + return errors def check_target_deprels(sentence) -> int: """ Checks that, for example, cc connects a conjunction to a node that is linked to its parent by conj. 
""" - score = 0 + errors = 0 target_ids = {} targets = { "cc": ["conj"], @@ -213,28 +226,30 @@ def check_target_deprels(sentence) -> int: correct = [*targets[target_ids[int(token.id)]], "root", "parataxis", "reparandum",\ "appos", "orphan"] if actual not in correct: - score +=1 + errors +=1 print(f"E {sentence.id} {token.id} target of {target_ids[int(token.id)]} must be one of ({', '.join(correct)}) not {actual}") - return score + return errors def check_target_upos(sentence) -> int: """ Checks that, for example, the part of speech of a node linked by amod is ADJ """ - score = 0 + errors = 0 targets = { "amod": ["ADJ"], - # "flat:name": ["PART", "PROPN"], # consider when obl/nmod fixed + "flat:name": ["ADJ", "DET", "NUM", "PART", "PROPN"], "nmod": ["NOUN", "NUM", "PART", "PRON", "PROPN", "X"] } for token, _ in ud_words(sentence,\ lambda t: t.deprel in targets and t.upos not in targets[t.deprel]): - score += 1 + errors += 1 print(f"E {sentence.id} {token.id} UPOS for {token.deprel} must be one of ({', '.join(targets[token.deprel])}) not {token.upos}") - return score + return errors def ud_words(ud_sentence, condition = lambda x: True): - """Returns the 'words' and their predecessors in the UD sense by rejecting multiword tokens.""" + """ + Returns the 'words' and their predecessors in the UD sense by rejecting multiword tokens. + """ prev_token = None for word_token in [s for s in ud_sentence if not s.is_multiword()]: # the condition may only apply to UD words @@ -244,7 +259,7 @@ def ud_words(ud_sentence, condition = lambda x: True): def check_relatives(sentence) -> int: """Checks the possibilities for relative particles""" - score = 0 + errors = 0 heads = {} for token, prev_token in ud_words(sentence,\ lambda t: t.xpos in ["Q-r", "Qnr"] and\ @@ -252,21 +267,21 @@ def check_relatives(sentence) -> int: message_stub = f"E {sentence.id} {token.id} deprel for '{token.form}'" if prev_token is not None: if prev_token.upos == "ADP": - score += 1 + errors += 1 print(f"E {message_stub} should be obl, nmod or xcomp:pred") elif prev_token.lemma in ["carson", "ciamar", "cuin'"]: - score += 1 + errors += 1 print(f"E {message_stub} should be advmod or xcomp:pred") elif prev_token.upos not in ["CCONJ", "SCONJ"]: heads[token.head] = [] - score += 1 + errors += 1 print(f"E {message_stub} should usually be nsubj or obj") for token,_ in ud_words(sentence, lambda t: t.head in heads): heads[token.head].append(token.deprel) if heads != {}: for head in heads: print(f"{sentence.id} {head} {heads[head]} suggestion: {suggest_relative_deprel(heads[head])}") - return score + return errors def suggest_relative_deprel(deprels) -> str: """ @@ -285,9 +300,9 @@ def check_bi(sentence) -> int: Note that in the last case there are adverbs that won't be suitable if they are adverbs of time. We also use OblType in the MISC column for phrases like "mar eisimpleir" = 'for example'. - Returns an integer score. + Returns an integer errors. """ - score = 0 + errors = 0 ids = {} deprels = {} upos = {} @@ -310,12 +325,12 @@ def check_bi(sentence) -> int: stub = f"E {sentence.id} {key}" if "xcomp:pred" not in deprels[key] and "ccomp" not in deprels[key]: print(f"{stub} bi should have an xcomp:pred among {list(zip(ids[key], deprels[key]))}") - score += 1 + errors += 1 if "obj" in deprels[key] and "PART" not in upos[key]: # check what Irish does about obj of bi. 
-                score += 1
+                errors += 1
                 print(f"{stub} bi should not have obj")
-    return score
+    return errors
 
 def check_passive(sentence) -> int:
     """
@@ -329,35 +344,45 @@ def check_passive(sentence) -> int:
     coded in.
 
     Example n02_026 in test.
-    Returns an integer score
+    Returns an integer error count
     """
-    score = 0
+    errors = 0
     ids = {}
-    rach_ids = [t.id for t,_ in ud_words(sentence,\
+    rach_ids = [t.id for t, _ in ud_words(sentence,\
         lambda t: t.lemma == "rach" and t.upos != "NOUN")]
-
+    adps = {}
+    for t, _ in ud_words(sentence, lambda t: t.deprel == "case"):
+        adps[t.head] = t.lemma
     for token, _ in ud_words(sentence, lambda t: t.head in rach_ids):
         if token.head in ids:
             ids[token.head].append(token.id)
         else:
             ids[token.head] = [token.id]
     for key in ids:
-        deprels = [sentence[i].deprel for i in ids[key]]
+        indexed_deprels = [(i, sentence[i].deprel) for i in ids[key]]
+        deprels = [d[1] for d in indexed_deprels]
         if "xcomp" in deprels and "nsubj" not in deprels:
-            for token in ids[key]:
-                if sentence[token].deprel == "xcomp":
-                    message_stub = f"E {sentence.id} {sentence[token].id} '{sentence[token].form}'"
-                    print(f"{message_stub} should be the head")
-                    score +=1
-    return score
+            rach_aig = False
+            if "obl" in deprels:
+                for i, deprel in indexed_deprels:
+                    if deprel == "obl" and adps.get(i) == "aig":
+                        rach_aig = True
+            if not rach_aig:
+                for token in ids[key]:
+                    if sentence[token].deprel == "xcomp":
+                        message_stub = f"E {sentence.id} {sentence[token].id} '{sentence[token].form}'"
+                        print(f"{message_stub} should be the head")
+                        errors += 1
+    return errors
 
 def check_clauses(sentence) -> (int, int):
     """
     Checks that mark and mark:prt and ccomp, advcl and acl:relcl work together properly.
+    For example, if the head of a clause or complement is marked with both a mark and a mark:prt, mark takes precedence.
 
     Returns an (int, int) tuple of the number of errors and number of warnings found.
""" - score = 0 + errors = 0 warnings = 0 ids = {} @@ -379,7 +404,6 @@ def check_clauses(sentence) -> (int, int): deprels[token.head] = [token.deprel] feats[token.head] = [token.feats] for key in deprels: - # mark beats mark:prt if 'mark' in deprels[key]: if sentence[key].deprel != "advcl": warnings += 1 @@ -395,11 +419,11 @@ def check_clauses(sentence) -> (int, int): warnings += 1 print(f"W {sentence.id} {key} deprel should be acl:relcl") - return score, warnings + return errors, warnings def validate_corpus(corpus): """Prints a number of errors and a number of warnings.""" - total_score = 0 + total_errors = 0 total_warnings = 0 old_id = "" @@ -407,29 +431,29 @@ def validate_corpus(corpus): doc_id = tree.id.split("_")[0] if doc_id != old_id and not tree.meta_present("newdoc id"): print(f"E newdoc id declaration missing for {tree.id}") - total_score += 1 + total_errors += 1 old_id = doc_id - total_score += check_others(tree) - total_score += check_feats(tree) - total_score += check_misc(tree) - total_score += check_fixed(tree) - score, warnings = check_ranges(tree) - total_score += score + total_errors += check_others(tree) + total_errors += check_feats(tree) + total_errors += check_misc(tree) + total_errors += check_fixed(tree) + errors, warnings = check_ranges(tree) + total_errors += errors total_warnings += warnings - total_score += check_heads_for_upos(tree) - total_score += check_target_deprels(tree) - total_score += check_target_upos(tree) - total_score += check_bi(tree) - total_score += check_reported_speech(tree) - total_score += check_passive(tree) - total_score += check_relatives(tree) - score, warnings = check_clauses(tree) - total_score += score + total_errors += check_heads_for_upos(tree) + total_errors += check_target_deprels(tree) + total_errors += check_target_upos(tree) + total_errors += check_bi(tree) + total_errors += check_reported_speech(tree) + total_errors += check_passive(tree) + total_errors += check_relatives(tree) + errors, warnings = check_clauses(tree) + total_errors += errors total_warnings += warnings - if total_score == 0: + if total_errors == 0: print("*** PASSED ***") else: - print("*** FAILED *** with %s error%s" % (total_score, "s" if total_score > 1 else "")) + print("*** FAILED *** with %s error%s" % (total_errors, "s" if total_errors > 1 else "")) validate_corpus(pyconll.load_from_file(sys.argv[1]))