Skip to content

Commit

Permalink
Merge pull request #1087 from ajdapretnar/import-conllu-id
Browse files Browse the repository at this point in the history
Import Documents: add options for meta matching in CoNLL-U
  • Loading branch information
markotoplak authored Dec 6, 2024
2 parents 4293a2c + f056910 commit 220ccc1
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 3 deletions.
10 changes: 7 additions & 3 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ def make_text_data(self):
class ImportDocuments:
META_DATA_FILE_KEY = "Text file"
# Candidate metadata columns to merge CoNLL-U data on; the first one present
# in the metadata is used. TODO: make this a user-set variable.
CONLLU_META_DATA = "ID"
CONLLU_META_DATA = ["ID", "Text_ID"]

def __init__(
self,
Expand Down Expand Up @@ -520,13 +520,17 @@ def _add_metadata(self, corpus: Corpus) -> Corpus:
or self._meta_data is None
or (
self.META_DATA_FILE_KEY not in self._meta_data.columns
and self.CONLLU_META_DATA not in self._meta_data.columns
and not any(i in self._meta_data.columns for i in
self.CONLLU_META_DATA)
)
):
return corpus

if self.is_conllu:
df = self._meta_data.set_index(self.CONLLU_META_DATA)
# find the first matching column
match_id = next((idx for idx in self.CONLLU_META_DATA if idx in
self._meta_data.columns))
df = self._meta_data.set_index(match_id)
path_column = corpus.get_column("utterance")
else:
df = self._meta_data.set_index(
Expand Down
2 changes: 2 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ def test_conllu_reader(self):
self.assertEqual(len(corpus), len(lemma))
self.assertEqual(len(corpus), len(pos))
self.assertEqual(len(corpus), len(ner))
self.assertTrue(np.any(~np.isnan(corpus.get_column(
"Speaker_birth"))))

@patch(SF_LIST, return_value=SPECIAL_CHAR_FILES)
@patch(PATCH_METHOD, side_effect=ConnectTimeout("test message", request=""))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ID Title From To House Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_type Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth
ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u1 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Regular Session 1, 1.8.2014 2014-08-01 2014-08-01 Lower house 7 1 Reference Chairperson MP DeSUS Demokratična stranka upokojencev Slovenije Kotnik Poropat, Marjana F 1944
ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u2 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Regular Session 1, 1.8.2014 2014-08-01 2014-08-01 Lower house 7 1 Reference Regular MP SD Socialni demokrati Veber, Janko M 1960
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ID Title From To House Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_type Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth
ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u1 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Chairperson MP SMC Stranka Mira Cerarja Brglez, Milan M 1967
ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u2 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Chairperson MP SMC Stranka Mira Cerarja Brglez, Milan M 1967
ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u3 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Regular MP SD Socialni demokrati Židan, Dejan M 1967

0 comments on commit 220ccc1

Please sign in to comment.