code to update treebanks and provide additional validation for 2.16 (May 2025) release
colinbatchelor committed Dec 13, 2024
1 parent 89b95ca commit 2a07209
Showing 3 changed files with 174 additions and 89 deletions.
1 change: 1 addition & 0 deletions fixed.gd
@@ -8,6 +8,7 @@
's an e
's e
's i
a bu
a b'
a bhòn-uiridh
a chaoidh
60 changes: 60 additions & 0 deletions update_ud2.16.py
@@ -0,0 +1,60 @@
import re
import sys
import pyconll

corpus = pyconll.load_from_file(sys.argv[1])
trees = []

def ud_words(ud_sentence, condition = lambda x: True):
"""
Returns the 'words' and their predecessors in the UD sense by rejecting multiword tokens.
"""
prev_token = None
for word_token in [s for s in ud_sentence if not s.is_multiword()]:
# the condition may only apply to UD words
if condition(word_token):
yield word_token, prev_token
prev_token = word_token

advtype_mapping = { "Rs": "Loc", "Rt": "Tim", "Rg": "Man", "Uf": "Man", "Uq": "Man", "Xsi": "Loc" }

with open(sys.argv[2],'w') as clean:
    for sentence in corpus:
        # tokens that head a copula, a cleft/outer clausal subject or a case marker
        cop_heads = [t.head for t, _ in ud_words(sentence, lambda t: t.deprel == "cop")]
        cleft_heads = [t.head for t, _ in ud_words(sentence, lambda t: t.deprel in ["csubj:cleft", "csubj:outer"])]
        case_heads = { t.head: t.form for t, _ in ud_words(sentence, lambda t: t.deprel == "case") }
        for token, prev_token in ud_words(sentence, lambda t: t):
            # a token that heads both a copula and a cleft subject is a clefted predicate:
            # record its CleftType based on its part of speech
            if token.id in cop_heads and token.id in cleft_heads:
                if token.upos == "ADJ":
                    token.feats["CleftType"] = ["Adj"]
                elif token.upos == "ADV":
                    token.feats["CleftType"] = ["Adv"]
                elif token.upos in ["NOUN", "NUM", "PART", "PRON", "PROPN"]:
                    # flag particles without PartType=Pat for manual review
                    if token.upos == "PART" and "Pat" not in token.feats["PartType"]:
                        print(f"{sentence.id} {token.id} {token.form} {token.upos} {token.feats}")
                    elif token.id in case_heads:
                        token.feats["CleftType"] = ["Obl"]
                    else:
                        token.feats["CleftType"] = ["Nom"]
                elif token.upos == "VERB":
                    token.feats["CleftType"] = ["Verb"]
                else:
                    print(f"{sentence.id} {token.id} {token.form} {token.upos}")
            # map ARCOSG adverb XPOS tags to AdvType and report any unmapped tags
            if token.upos == "ADV":
                if token.xpos not in advtype_mapping:
                    print(sentence.id, token.id, token.form, token.upos, token.xpos)
                else:
                    token.feats["AdvType"] = [advtype_mapping[token.xpos]]
            # Nt tokens are place names: retag as PROPN and relabel flat as flat:name
            if token.xpos == "Nt":
                token.upos = "PROPN"
                token.feats["NounType"] = ["Top"]
                if token.deprel == "flat":
                    token.deprel = "flat:name"
                    token.misc["FlatType"] = ["Top"]
            # the word before the first token of a fixed expression heads it;
            # give it an ExtPos feature if it does not already have one
            if prev_token is not None:
                if token.deprel == "fixed" and prev_token.deprel != "fixed":
                    if "ExtPos" not in prev_token.feats:
                        prev_token.feats["ExtPos"] = [prev_token.upos]

        clean.write(sentence.conll())
        clean.write('\n\n')
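
A minimal sketch (not part of the commit) of the AdvType step in isolation, to show what the mapping does to a single token. The one-word sentence below and its tags are invented for illustration, and it assumes pyconll's load_from_string API:

import pyconll

# same mapping as in update_ud2.16.py
advtype_mapping = {"Rs": "Loc", "Rt": "Tim", "Rg": "Man", "Uf": "Man", "Uq": "Man", "Xsi": "Loc"}

# a hypothetical one-token CoNLL-U sentence: "a-muigh" tagged ADV with XPOS Rs
fields = ["1", "a-muigh", "a-muigh", "ADV", "Rs", "_", "0", "root", "_", "_"]
sample = "# sent_id = example-1\n# text = a-muigh\n" + "\t".join(fields) + "\n"

sentence = pyconll.load_from_string(sample)[0]
for token in sentence:
    if token.upos == "ADV" and token.xpos in advtype_mapping:
        token.feats["AdvType"] = [advtype_mapping[token.xpos]]

print(sentence.conll())  # the ADV token now carries AdvType=Loc

In the script itself the same update runs inside the sentence loop, with the treebank read from the .conllu file given as sys.argv[1] and the updated sentences written to sys.argv[2].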
