code to update treebanks and provide additional validation for 2.16 (May 2025) release
Commit 2a07209 (1 parent: 89b95ca)
Showing 3 changed files with 174 additions and 89 deletions.
@@ -8,6 +8,7 @@
's an e
's e
's i
a bu
a b'
a bhòn-uiridh
a chaoidh
@@ -0,0 +1,60 @@
import re
import sys
import pyconll

corpus = pyconll.load_from_file(sys.argv[1])
trees = []


def ud_words(ud_sentence, condition=lambda x: True):
    """
    Yields the 'words' in the UD sense (multiword tokens are rejected),
    each paired with its predecessor.
    """
    prev_token = None
    for word_token in [s for s in ud_sentence if not s.is_multiword()]:
        # the condition may only apply to UD words
        if condition(word_token):
            yield word_token, prev_token
        prev_token = word_token


# XPOS-to-AdvType mapping, applied only to ADV tokens below
advtype_mapping = {"Rs": "Loc", "Rt": "Tim", "Rg": "Man", "Uf": "Man", "Uq": "Man", "Xsi": "Loc"}

with open(sys.argv[2], 'w') as clean:
    for sentence in corpus:
        # heads of copulas and of cleft/outer clausal subjects; a token that
        # appears in both lists is the predicate of a cleft construction
        cop_heads = [t.head for t, _ in ud_words(sentence, lambda t: t.deprel == "cop")]
        cleft_heads = [t.head for t, _ in ud_words(sentence, lambda t: t.deprel in ["csubj:cleft", "csubj:outer"])]
        case_heads = {t.head: t.form for t, _ in ud_words(sentence, lambda t: t.deprel == "case")}
        for token, prev_token in ud_words(sentence, lambda t: t):
            if token.id in cop_heads and token.id in cleft_heads:
                # assign CleftType according to the UPOS of the cleft predicate
                if token.upos == "ADJ":
                    token.feats["CleftType"] = ["Adj"]
                elif token.upos == "ADV":
                    token.feats["CleftType"] = ["Adv"]
                elif token.upos in ["NOUN", "NUM", "PART", "PRON", "PROPN"]:
                    if token.upos == "PART" and "Pat" not in token.feats["PartType"]:
                        print(f"{sentence.id} {token.id} {token.form} {token.upos} {token.feats}")
                    elif token.id in case_heads:
                        token.feats["CleftType"] = ["Obl"]
                    else:
                        token.feats["CleftType"] = ["Nom"]
                elif token.upos == "VERB":
                    token.feats["CleftType"] = ["Verb"]
                else:
                    # unhandled UPOS: report for manual inspection
                    print(f"{sentence.id} {token.id} {token.form} {token.upos}")
            if token.upos == "ADV":
                if token.xpos not in advtype_mapping:
                    print(sentence.id, token.id, token.form, token.upos, token.xpos)
                else:
                    token.feats["AdvType"] = [advtype_mapping[token.xpos]]
            if token.xpos == "Nt":
                # place names tagged Nt become PROPN with NounType=Top
                token.upos = "PROPN"
                token.feats["NounType"] = ["Top"]
                if token.deprel == "flat":
                    token.deprel = "flat:name"
                    token.misc["FlatType"] = ["Top"]
            if prev_token is not None:
                # the first word of a fixed expression carries ExtPos
                if token.deprel == "fixed" and prev_token.deprel != "fixed":
                    if "ExtPos" not in prev_token.feats:
                        prev_token.feats["ExtPos"] = [prev_token.upos]

        clean.write(sentence.conll())
        clean.write('\n\n')
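The script reads the treebank named by its first argument and writes the revised sentences to the file named by its second, so a run looks roughly like this (the script and file names here are placeholders, not taken from the commit):

    python update_treebank.py input.conllu output.conllu

Anything the script cannot classify, such as an adverb whose XPOS is missing from advtype_mapping, is printed to standard output for manual review rather than changed.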