From 1919edce1f9f70862a851aa79cdb7ddfa5cc6a81 Mon Sep 17 00:00:00 2001
From: vgkz <43313785+vgkz@users.noreply.github.com>
Date: Mon, 10 Jun 2024 15:29:43 +0200
Subject: [PATCH 01/16] add functions to add context to sequences and get
 context sequences for a full protocol

---
 pyriksdagen/utils.py | 115 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py
index a876434..8eb9168 100644
--- a/pyriksdagen/utils.py
+++ b/pyriksdagen/utils.py
@@ -15,6 +15,7 @@ import zipfile
 import os
 from trainerlog import get_logger
+import re

 LOGGER = get_logger("pyriksdagen")
 XML_NS = "{http://www.w3.org/XML/1998/namespace}"
@@ -334,3 +335,117 @@ def get_data_location(partition):
     d["metadata"] = os.environ.get("METADATA_PATH", "data")
     return d[partition]

+def remove_whitespace_from_sequence(text_seq):
+    # function to remove whitespace from string to get comparable text between corpus and kblab
+    text_seq = text_seq.split()
+    text_seq_list = [s for s in text_seq if s != '']
+    text_seq_string = ' '.join(text_seq_list)
+    return text_seq_string
+
+def add_context_to_sequence(previous_sequence, current_sequence, next_sequence, context_type, target_length = 120):
+    # if the previous sequence is long, truncate it so that the
+    # current sequence is not unnecessarily shortened
+    if context_type == 'left_context':
+        max_previous_length = target_length//2
+    elif context_type == 'full_context':
+        max_previous_length = target_length//3
+
+    # remove whitespace from sequences
+    previous_sequence = remove_whitespace_from_sequence(str(previous_sequence))
+    current_sequence = remove_whitespace_from_sequence(str(current_sequence))
+    next_sequence = remove_whitespace_from_sequence(str(next_sequence))
+
+
+    previous_as_list = re.split(r'([.!?])', previous_sequence)
+    if (previous_as_list[-1] == '') & (len(previous_as_list) != 1):
+        prev_last_sentence = previous_as_list[-3:]
+        prev_last_sentence = ''.join(prev_last_sentence)
+    else:
+        prev_last_sentence = previous_as_list[-1]
+
+    next_as_list = re.split(r'([.!?])', next_sequence)
+    if len(next_as_list) != 1:
+        next_first_sentence = next_as_list[:2]
+        next_first_sentence = ''.join(next_first_sentence)
+    else:
+        next_first_sentence = next_as_list[0]
+
+    # regardless of sequence type, we combine prev last sentence with curr sequence
+    prev_last_sentence_as_list = prev_last_sentence.split(' ')
+    n_words = len(prev_last_sentence_as_list)
+    if n_words > max_previous_length:
+        prev_last_sentence_as_list = prev_last_sentence_as_list[-max_previous_length:]
+        prev_last_sentence = ' '.join(prev_last_sentence_as_list)
+    # use new line (/n) as token to signify where current sequence begins
+    left_context_sequence = prev_last_sentence + ' /n ' + current_sequence
+
+    if context_type == 'left_context':
+        return left_context_sequence
+    elif context_type == 'full_context':
+        # add next first sentence to left context sequence to get full context
+        full_context_sequence = left_context_sequence + ' /n ' + next_first_sentence
+        return full_context_sequence
+
+def get_context_sequences_for_protocol(protocol, context_type, max_length = 120):
+    # returns dictionary with ids and context sequences for a complete protocol
+    id_list = []
+    context_sequence_list = []
+
+    id_key = f'{XML_NS}id'
+    parser = etree.XMLParser(remove_blank_text=True)
+    root = etree.parse(protocol, parser).getroot()
+
+    prev_elem_is_text_seq = False
+    elem_idx = ''
+    prev_sequence = ''
+    next_sequence = ''
+    prev_elem_sequence = ''
+    for tag, elem in elem_iter(root):
+        if tag == 'note':
+            elem_sequence = elem.text
+            elem_idx = elem.attrib[id_key]
+
+            if prev_elem_is_text_seq == True:
+                next_sequence = elem_sequence
+                context_sequence = add_context_to_sequence(prev_sequence, curr_sequence, next_sequence, context_type, max_length)
+
+                id_list.append(idx)
+                context_sequence_list.append(context_sequence)
+
+
+            idx = elem_idx
+            curr_sequence = elem_sequence
+            prev_sequence = prev_elem_sequence
+
+            prev_elem_sequence = elem_sequence
+            prev_elem_is_text_seq = True
+        elif tag == 'u':
+            for child in elem.getchildren():
+                elem_sequence = child.text
+                elem_idx = child.values()[0]
+
+                if prev_elem_is_text_seq == True:
+                    next_sequence = elem_sequence
+                    context_sequence = add_context_to_sequence(prev_sequence, curr_sequence, next_sequence, context_type, max_length)
+
+                    id_list.append(idx)
+                    context_sequence_list.append(context_sequence)
+
+
+                idx = elem_idx
+                curr_sequence = elem_sequence
+                prev_sequence = prev_elem_sequence
+
+                prev_elem_sequence = elem_sequence
+                prev_elem_is_text_seq = True
+
+    next_sequence = ''
+    context_sequence = add_context_to_sequence(prev_sequence, curr_sequence, next_sequence, context_type, max_length)
+
+    id_list.append(idx)
+    context_sequence_list.append(context_sequence)
+
+
+    output_dict = {'id' : id_list,
+                   'context_sequence' : context_sequence_list}
+    return output_dict
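A minimal usage sketch of the helpers this patch adds (illustrative only; the input strings are invented, and '/n' is the literal separator token used above, not a newline):

    from pyriksdagen.utils import remove_whitespace_from_sequence, add_context_to_sequence

    prev = "Första punkten behandlades.  Överläggningen är härmed  avslutad."
    curr = "Vi övergår   till dagens första punkt."

    remove_whitespace_from_sequence(curr)
    # -> 'Vi övergår till dagens första punkt.'

    add_context_to_sequence(prev, curr, "", "left_context")
    # -> ' Överläggningen är härmed avslutad. /n Vi övergår till dagens första punkt.'
    # (the leading space comes from the punctuation split)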
From 664bd55e53917bb13b18521ae35b0dbe98f842a1 Mon Sep 17 00:00:00 2001
From: vgkz
Date: Thu, 27 Jun 2024 13:49:08 +0200
Subject: [PATCH 02/16] get_context_sequences_for_protocol() now handles case
 where protocol includes no text sequences

---
 pyriksdagen/utils.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py
index 8eb9168..be497f5 100644
--- a/pyriksdagen/utils.py
+++ b/pyriksdagen/utils.py
@@ -397,6 +397,7 @@ def get_context_sequences_for_protocol(protocol, context_type, max_length = 120)

     prev_elem_is_text_seq = False
     elem_idx = ''
+    curr_sequence = None
     prev_sequence = ''
     next_sequence = ''
     prev_elem_sequence = ''
@@ -440,12 +441,12 @@ def get_context_sequences_for_protocol(protocol, context_type, max_length = 120)
             prev_elem_is_text_seq = True

     next_sequence = ''
-    context_sequence = add_context_to_sequence(prev_sequence, curr_sequence, next_sequence, context_type, max_length)
-
-    id_list.append(idx)
-    context_sequence_list.append(context_sequence)
+    if curr_sequence:
+        context_sequence = add_context_to_sequence(prev_sequence, curr_sequence, next_sequence, context_type, max_length)
+        id_list.append(idx)
+        context_sequence_list.append(context_sequence)

     output_dict = {'id' : id_list,
-                   'context_sequence' : context_sequence_list}
+                   'text' : context_sequence_list}
     return output_dict
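With this guard in place, a record without any note/seg text yields empty lists instead of failing on an undefined curr_sequence. A call sketch (the record path is hypothetical):

    from pyriksdagen.utils import get_context_sequences_for_protocol

    out = get_context_sequences_for_protocol("corpus/records/199192/prot-199192--001.xml",
                                             context_type="full_context")
    assert len(out["id"]) == len(out["text"])  # parallel lists: element ids and context strings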
From c2a6f319907a8845447a83a4b184468c2dc6b794 Mon Sep 17 00:00:00 2001
From: vgkz
Date: Tue, 2 Jul 2024 17:16:36 +0200
Subject: [PATCH 03/16] context_sequence now extracted from elem

---
 pyriksdagen/utils.py | 103 +++++++++++++++++--------------------------
 1 file changed, 40 insertions(+), 63 deletions(-)

diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py
index be497f5..b43fa0c 100644
--- a/pyriksdagen/utils.py
+++ b/pyriksdagen/utils.py
@@ -342,7 +342,14 @@ def remove_whitespace_from_sequence(text_seq):
     text_seq_string = ' '.join(text_seq_list)
     return text_seq_string

-def add_context_to_sequence(previous_sequence, current_sequence, next_sequence, context_type, target_length = 120):
+def get_sequence_from_elem_list(elem_list):
+    # parses output from elem neighbor search into a text sequence
+    sequence = ''
+    if len(elem_list) != 0:
+        sequence = str(elem_list[0].text)
+    return sequence
+
+def extract_context_sequence(elem, context_type, target_length = 128, sep_char = '/n'):
     # if the previous sequence is long, truncate it so that the
     # current sequence is not unnecessarily shortened
     if context_type == 'left_context':
         max_previous_length = target_length//2
     elif context_type == 'full_context':
         max_previous_length = target_length//3

-    # remove whitespace from sequences
-    previous_sequence = remove_whitespace_from_sequence(str(previous_sequence))
-    current_sequence = remove_whitespace_from_sequence(str(current_sequence))
-    next_sequence = remove_whitespace_from_sequence(str(next_sequence))
-
+    # find previous and next sequence using xpath, remove whitespace
+    previous_elem_list = elem.xpath("preceding::*[local-name() = 'note' or local-name() = 'seg'][1]")
+    previous_sequence = get_sequence_from_elem_list(previous_elem_list)
+    previous_sequence = remove_whitespace_from_sequence(previous_sequence)
+    if context_type == 'full_context':
+        next_elem_list = elem.xpath("following::*[local-name() = 'note' or local-name() = 'seg'][1]")
+        next_sequence = get_sequence_from_elem_list(next_elem_list)
+        next_sequence = remove_whitespace_from_sequence(next_sequence)
+
+    # current sequence is elem.text
+    current_sequence = elem.text
+    current_sequence = remove_whitespace_from_sequence(current_sequence)
+
+    # split by punctuation
     previous_as_list = re.split(r'([.!?])', previous_sequence)
     if (previous_as_list[-1] == '') & (len(previous_as_list) != 1):
         prev_last_sentence = previous_as_list[-3:]
         prev_last_sentence = ''.join(prev_last_sentence)
     else:
         prev_last_sentence = previous_as_list[-1]
-
-    next_as_list = re.split(r'([.!?])', next_sequence)
-    if len(next_as_list) != 1:
-        next_first_sentence = next_as_list[:2]
-        next_first_sentence = ''.join(next_first_sentence)
-    else:
-        next_first_sentence = next_as_list[0]
+
+    if context_type == 'full_context':
+        next_as_list = re.split(r'([.!?])', next_sequence)
+        if len(next_as_list) != 1:
+            next_first_sentence = next_as_list[:2]
+            next_first_sentence = ''.join(next_first_sentence)
+        else:
+            next_first_sentence = next_as_list[0]

     # regardless of sequence type, we combine prev last sentence with curr sequence
     prev_last_sentence_as_list = prev_last_sentence.split(' ')
@@ -377,16 +393,16 @@ def add_context_to_sequence(previous_sequence, current_sequence, next_sequence,
         prev_last_sentence_as_list = prev_last_sentence_as_list[-max_previous_length:]
         prev_last_sentence = ' '.join(prev_last_sentence_as_list)
     # use new line (/n) as token to signify where current sequence begins
-    left_context_sequence = prev_last_sentence + ' /n ' + current_sequence
+    left_context_sequence = prev_last_sentence + f' {sep_char} ' + current_sequence

     if context_type == 'left_context':
         return left_context_sequence
     elif context_type == 'full_context':
         # add next first sentence to left context sequence to get full context
-        full_context_sequence = left_context_sequence + ' /n ' + next_first_sentence
+        full_context_sequence = left_context_sequence + f' {sep_char} ' + next_first_sentence
         return full_context_sequence

-def get_context_sequences_for_protocol(protocol, context_type, max_length = 120):
+def get_context_sequences_for_protocol(protocol, context_type, target_length = 128, sep_char = '/n'):
     # returns dictionary with ids and context sequences for a complete protocol
     id_list = []
     context_sequence_list = []

@@ 
-395,57 +411,18 @@ def get_context_sequences_for_protocol(protocol, context_type, max_length = 120) parser = etree.XMLParser(remove_blank_text=True) root = etree.parse(protocol, parser).getroot() - prev_elem_is_text_seq = False - elem_idx = '' - curr_sequence = None - prev_sequence = '' - next_sequence = '' - prev_elem_sequence = '' for tag, elem in elem_iter(root): if tag == 'note': - elem_sequence = elem.text - elem_idx = elem.attrib[id_key] - - if prev_elem_is_text_seq == True: - next_sequence = elem_sequence - context_sequence = add_context_to_sequence(prev_sequence, curr_sequence, next_sequence, context_type, max_length) - - id_list.append(idx) - context_sequence_list.append(context_sequence) - - - idx = elem_idx - curr_sequence = elem_sequence - prev_sequence = prev_elem_sequence - - prev_elem_sequence = elem_sequence - prev_elem_is_text_seq = True + elem_id = elem.get(id_key) + id_list.append(elem_id) + context_sequence = extract_context_sequence(elem, context_type = context_type, target_length = target_length, sep_char = sep_char) + context_sequence_list.append(context_sequence) elif tag == 'u': for child in elem.getchildren(): - elem_sequence = child.text - elem_idx = child.values()[0] - - if prev_elem_is_text_seq == True: - next_sequence = elem_sequence - context_sequence = add_context_to_sequence(prev_sequence, curr_sequence, next_sequence, context_type, max_length) - - id_list.append(idx) - context_sequence_list.append(context_sequence) - - - idx = elem_idx - curr_sequence = elem_sequence - prev_sequence = prev_elem_sequence - - prev_elem_sequence = elem_sequence - prev_elem_is_text_seq = True - - next_sequence = '' - if curr_sequence: - context_sequence = add_context_to_sequence(prev_sequence, curr_sequence, next_sequence, context_type, max_length) - id_list.append(idx) - context_sequence_list.append(context_sequence) - + child_id = child.get(id_key) + id_list.append(child_id) + context_sequence = extract_context_sequence(child, context_type=context_type, target_length = target_length, sep_char = sep_char) + context_sequence_list.append(context_sequence) output_dict = {'id' : id_list, 'text' : context_sequence_list} From ba6e56b54016ee2393431aea9c0148eef1d79ffa Mon Sep 17 00:00:00 2001 From: vgkz Date: Wed, 3 Jul 2024 17:50:04 +0200 Subject: [PATCH 04/16] rewrite context functions to reduce number of lines and add docstring --- pyriksdagen/utils.py | 75 ++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 45 deletions(-) diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py index b43fa0c..fd7188b 100644 --- a/pyriksdagen/utils.py +++ b/pyriksdagen/utils.py @@ -336,74 +336,59 @@ def get_data_location(partition): return d[partition] def remove_whitespace_from_sequence(text_seq): - # function to remove whitespace from string to get comparable text between corpus and kblab + """ + Remove whitespace from string to get comparable text between corpus and kblab. + Input is string and output is string. + """ text_seq = text_seq.split() text_seq_list = [s for s in text_seq if s != ''] text_seq_string = ' '.join(text_seq_list) return text_seq_string def get_sequence_from_elem_list(elem_list): - # parses output from elem neighbor search into a text sequence + """ + Get sequence from first elem in list. + Returns string. If list is empty, returns empty string. 
+    """
     sequence = ''
     if len(elem_list) != 0:
         sequence = str(elem_list[0].text)
     return sequence

 def extract_context_sequence(elem, context_type, target_length = 128, sep_char = '/n'):
-    # if the previous sequence is long, truncate it so that the
-    # current sequence is not unnecessarily shortened
+    """
+    Get sequence with context from xml element. Returns string.
+    """
+    sequence_to_list_by_punctuation = lambda sequence_string: list(filter(None, re.split(r'([.!?])', sequence_string)))
+
+    current_sequence = remove_whitespace_from_sequence(elem.text)
+
+    previous_elem_list = elem.xpath("preceding::*[local-name() = 'note' or local-name() = 'seg'][1]")
+    previous_sequence = remove_whitespace_from_sequence(get_sequence_from_elem_list(previous_elem_list))
+    previous_sequence_as_list = sequence_to_list_by_punctuation(previous_sequence)
+    previous_last_sentence = ''.join(previous_sequence_as_list[-2:]).lstrip('.!? ')
+
     if context_type == 'left_context':
         max_previous_length = target_length//2
     elif context_type == 'full_context':
         max_previous_length = target_length//3
-
-    # find previous and next sequence using xpath, remove whitespace
-    previous_elem_list = elem.xpath("preceding::*[local-name() = 'note' or local-name() = 'seg'][1]")
-    previous_sequence = get_sequence_from_elem_list(previous_elem_list)
-    previous_sequence = remove_whitespace_from_sequence(previous_sequence)
     if context_type == 'full_context':
         next_elem_list = elem.xpath("following::*[local-name() = 'note' or local-name() = 'seg'][1]")
-        next_sequence = get_sequence_from_elem_list(next_elem_list)
-        next_sequence = remove_whitespace_from_sequence(next_sequence)
-
-    # current sequence is elem.text
-    current_sequence = elem.text
-    current_sequence = remove_whitespace_from_sequence(current_sequence)
-
-    # split by punctuation
-    previous_as_list = re.split(r'([.!?])', previous_sequence)
-    if (previous_as_list[-1] == '') & (len(previous_as_list) != 1):
-        prev_last_sentence = previous_as_list[-3:]
-        prev_last_sentence = ''.join(prev_last_sentence)
-    else:
-        prev_last_sentence = previous_as_list[-1]
-
-    if context_type == 'full_context':
-        next_as_list = re.split(r'([.!?])', next_sequence)
-        if len(next_as_list) != 1:
-            next_first_sentence = next_as_list[:2]
-            next_first_sentence = ''.join(next_first_sentence)
-        else:
-            next_first_sentence = next_as_list[0]
+        next_sequence = remove_whitespace_from_sequence(get_sequence_from_elem_list(next_elem_list))
+        next_sequence_as_list = sequence_to_list_by_punctuation(next_sequence)
+        next_first_sentence = ''.join(next_sequence_as_list[:2])

-    # regardless of sequence type, we combine prev last sentence with curr sequence
-    prev_last_sentence_as_list = prev_last_sentence.split(' ')
-    n_words = len(prev_last_sentence_as_list)
-    if n_words > max_previous_length:
-        prev_last_sentence_as_list = prev_last_sentence_as_list[-max_previous_length:]
-        prev_last_sentence = ' '.join(prev_last_sentence_as_list)
-    # use new line (/n) as token to signify where current sequence begins
-    left_context_sequence = prev_last_sentence + f' {sep_char} ' + current_sequence
+    previous_last_sentence = ' '.join(previous_last_sentence.split(' ')[-max_previous_length:]) # truncate sequence if too long
+    left_context_sequence = previous_last_sentence + f' {sep_char} ' + current_sequence
+
     if context_type == 'left_context':
         return left_context_sequence
     elif context_type == 'full_context':
-        # add next first sentence to left context sequence to get full context
-        full_context_sequence = left_context_sequence + f' {sep_char} ' + next_first_sentence
-        return full_context_sequence
+        return left_context_sequence + f' {sep_char} ' + next_first_sentence

 def get_context_sequences_for_protocol(protocol, context_type, target_length = 128, sep_char = '/n'):
-    # returns dictionary with ids and context sequences for a complete protocol
+    """
+    Gets context sequences for a protocol. Returns dictionary with ids and corresponding context sequences.
+    """
     id_list = []
     context_sequence_list = []
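For reference, the punctuation split wrapped in sequence_to_list_by_punctuation above keeps the delimiters, so a sentence can be reassembled with its final punctuation (standalone sketch; strings invented):

    import re

    parts = list(filter(None, re.split(r'([.!?])', 'Första meningen. Andra meningen!')))
    # parts == ['Första meningen', '.', ' Andra meningen', '!']
    ''.join(parts[-2:]).lstrip('.!? ')
    # -> 'Andra meningen!'  (the last sentence, as used for the left context)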
From 320ccef580890d42e2827ea83141d978ea4e2bc1 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Wed, 7 Aug 2024 15:51:05 +0300
Subject: [PATCH 05/16] fix: reset speaker and next/prev notation for
 'commentSection's

---
 pyriksdagen/refine.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pyriksdagen/refine.py b/pyriksdagen/refine.py
index 148cb36..84491c4 100644
--- a/pyriksdagen/refine.py
+++ b/pyriksdagen/refine.py
@@ -137,7 +137,7 @@ def detect_mps(root, names_ids, pattern_db, mp_db=None, minister_db=None, minist
     for tag, elem in elem_iter(root):
         parent = elem.getparent()
-        if "type" not in parent.attrib or ("type" in parent.attrib and parent.attrib['type'] != "commentSection"): #ignore where people don't talk
+        if parent.attrib.get("type") != "commentSection": #ignore where people don't talk
             if tag == "u":
                 # Deleting and adding attributes changes their order;
                 # Mark as 'delete' instead and delete later
@@ -195,7 +195,16 @@ def detect_mps(root, names_ids, pattern_db, mp_db=None, minister_db=None, minist

             if current_speaker is None:
                 unknowns.append([protocol_id, elem.attrib.get(f'{xml_ns}id')] + [d.get(key, "") for key in unknown_variables])
-
+        else:
+            # If the whole section has no speeches, reset speaker and next/prev notation
+            if tag == "u":
+                elem.set("prev", "delete")
+                elem.set("next", "delete")
+                elem.set("who", "unknown")
+
+            current_speaker = None
+            prev = None
+
     # Do two loops to preserve attribute order
     for tag, elem in elem_iter(root):
         if tag == "u":

From c6538010b303f917b0ae4ab79a64c57308d9bc40 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Wed, 7 Aug 2024 15:53:27 +0300
Subject: [PATCH 06/16] chore: v1.2.1

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 035e9c3..825d455 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pyriksdagen"
-version = "0.17.0"
+version = "1.2.1"
 description = "Access the Riksdagen corpus"
 authors = ["ninpnin "]
 repository = "https://github.com/welfare-state-analytics/riksdagen-corpus"

From 0f7d62f15c11050d22ad097f983cc98b4bf83725 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Fri, 9 Aug 2024 14:02:50 +0300
Subject: [PATCH 07/16] refactor: simplify code

---
 pyriksdagen/utils.py | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py
index fd7188b..94b6f12 100644
--- a/pyriksdagen/utils.py
+++ b/pyriksdagen/utils.py
@@ -335,27 +335,25 @@ def get_data_location(partition):
     d["metadata"] = os.environ.get("METADATA_PATH", "data")
     return d[partition]

-def remove_whitespace_from_sequence(text_seq):
+def remove_whitespace_from_sequence(text):
     """
-    Remove whitespace from string to get comparable text between corpus and kblab.
+    Remove repeated whitespace and replace all whitespace with spaces.
     Input is string and output is string.
""" - text_seq = text_seq.split() - text_seq_list = [s for s in text_seq if s != ''] - text_seq_string = ' '.join(text_seq_list) - return text_seq_string + text_seq = text.split() + text_seq = [s for s in text_seq if s != ''] + return ' '.join(text_seq) def get_sequence_from_elem_list(elem_list): """ - Get sequence from first elem in list. + Get sequence from first elem in list. Returns string. If list is empty, returns empty string. """ - sequence = '' - if len(elem_list) != 0: - sequence = str(elem_list[0].text) - return sequence + if len(elem_list) > 0: + return str(elem_list[0].text) + return "" -def extract_context_sequence(elem, context_type, target_length = 128, sep_char = '/n'): +def extract_context_sequence(elem, context_type, target_length = 128, separator = '/n'): """ Get sequence with context from xml element. Returns string. """ @@ -389,26 +387,24 @@ def get_context_sequences_for_protocol(protocol, context_type, target_length = 1 """ Gets context sequences for a protocol. Returns dictionary with ids and corresponding context sequences. """ - id_list = [] - context_sequence_list = [] + id_list, texts_with_contexts = [], [] - id_key = f'{XML_NS}id' parser = etree.XMLParser(remove_blank_text=True) root = etree.parse(protocol, parser).getroot() for tag, elem in elem_iter(root): if tag == 'note': - elem_id = elem.get(id_key) + elem_id = elem.get(f'{XML_NS}id') id_list.append(elem_id) context_sequence = extract_context_sequence(elem, context_type = context_type, target_length = target_length, sep_char = sep_char) - context_sequence_list.append(context_sequence) + texts_with_contexts.append(context_sequence) elif tag == 'u': - for child in elem.getchildren(): - child_id = child.get(id_key) + for child in elem: + child_id = child.get(f'{XML_NS}id') id_list.append(child_id) context_sequence = extract_context_sequence(child, context_type=context_type, target_length = target_length, sep_char = sep_char) - context_sequence_list.append(context_sequence) + texts_with_contexts.append(context_sequence) output_dict = {'id' : id_list, - 'text' : context_sequence_list} + 'text' : texts_with_contexts} return output_dict From 9fb4fde5af3c330400e7e23fe95ad820cb0ca212 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Fri, 9 Aug 2024 14:10:56 +0300 Subject: [PATCH 08/16] fix: variable name --- pyriksdagen/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py index 94b6f12..ebad701 100644 --- a/pyriksdagen/utils.py +++ b/pyriksdagen/utils.py @@ -376,14 +376,14 @@ def extract_context_sequence(elem, context_type, target_length = 128, separator next_first_sentence = ''.join(next_sequence_as_list[:2]) previous_last_sentence = ' '.join(previous_last_sentence.split(' ')[-max_previous_length:]) # truncate sequence if too long - left_context_sequence = previous_last_sentence + f' {sep_char} ' + current_sequence + left_context_sequence = previous_last_sentence + f' {separator} ' + current_sequence if context_type == 'left_context': return left_context_sequence elif context_type == 'full_context': - return left_context_sequence + f' {sep_char} ' + next_first_sentence + return left_context_sequence + f' {separator} ' + next_first_sentence -def get_context_sequences_for_protocol(protocol, context_type, target_length = 128, sep_char = '/n'): +def get_context_sequences_for_protocol(protocol, context_type, target_length = 128, separator = '/n'): """ Gets context sequences for a protocol. 
Returns dictionary with ids and corresponding context sequences.
     """
@@ -396,13 +396,13 @@ def get_context_sequences_for_protocol(protocol, context_type, target_length = 1
         if tag == 'note':
             elem_id = elem.get(f'{XML_NS}id')
             id_list.append(elem_id)
-            context_sequence = extract_context_sequence(elem, context_type = context_type, target_length = target_length, sep_char = sep_char)
+            context_sequence = extract_context_sequence(elem, context_type = context_type, target_length = target_length, separator = separator)
             texts_with_contexts.append(context_sequence)
         elif tag == 'u':
             for child in elem:
                 child_id = child.get(f'{XML_NS}id')
                 id_list.append(child_id)
-                context_sequence = extract_context_sequence(child, context_type=context_type, target_length = target_length, sep_char = sep_char)
+                context_sequence = extract_context_sequence(child, context_type=context_type, target_length = target_length, separator = separator)
                 texts_with_contexts.append(context_sequence)

     output_dict = {'id' : id_list,
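After the rename, a full call reads as follows (sketch; the record path is hypothetical):

    from pyriksdagen.utils import get_context_sequences_for_protocol

    out = get_context_sequences_for_protocol(
        "corpus/records/199192/prot-199192--001.xml",
        context_type="left_context",
        target_length=128,
        separator="/n",
    )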
""" match_error = False dates = [] - tei_ns = ".//{http://www.tei-c.org/ns/1.0}" - xml_ns = "{http://www.w3.org/XML/1998/namespace}" - parser = etree.XMLParser(remove_blank_text=True) - root = etree.parse(protocol, parser).getroot() - date_elems = root.findall(f"{tei_ns}docDate") + if type(protocol) == str: + root, ns = parse_tei(protocol) + elif type(protocol) == etree._Element: + root = protocol + ns = fetch_ns() + else: + raise TypeError(f"You need to pass a string or etree Element, not {type(protocol)}") + date_elems = root.findall(f"{ns['tei_ns']}docDate") for de in date_elems: when_attrib = de.get("when") elem_text = de.text @@ -360,9 +371,8 @@ def parse_tei(_path, get_ns=True) -> tuple: parser = etree.XMLParser(remove_blank_text=True) root = etree.parse(_path, parser).getroot() if get_ns: - tei_ns = "{http://www.tei-c.org/ns/1.0}" - xml_ns = "{http://www.w3.org/XML/1998/namespace}" - return root, {"tei_ns":tei_ns, "xml_ns":xml_ns} + ns = fetch_ns() + return root, ns else: return root @@ -426,3 +436,5 @@ def get_gh_link(_file, line_number = elem.sourceline gh = f"https://github.com/{username}/{repo}/blob/{branch}/{_file}/#L{line_number}" return gh + + From 4b530d814f8467cb8c2b77ea0b8fccc7f3d260a5 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Wed, 23 Oct 2024 12:39:09 +0200 Subject: [PATCH 10/16] feat: add compiled db to get_data_location() opts --- pyriksdagen/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py index f46381e..fd988ff 100644 --- a/pyriksdagen/utils.py +++ b/pyriksdagen/utils.py @@ -404,6 +404,7 @@ def get_data_location(partition): d["records"] = os.environ.get("RECORDS_PATH", "data") d["motions"] = os.environ.get("MOTIONS_PATH", "data") d["metadata"] = os.environ.get("METADATA_PATH", "data") + d["metadata_db"] = os.environ.get("METADATA_DB", "data") # path to csv or pkl of compiled Corpus() d["interpellations"] = os.environ.get("INTERPELLATIONS_PATH", "data") return d[partition] From 5b3def89c6a4698a02d9db44edc0fecca79a6876 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Wed, 23 Oct 2024 12:56:35 +0200 Subject: [PATCH 11/16] style: cleanup whitespace --- pyriksdagen/metadata.py | 707 ++++++++++++++++++++-------------------- 1 file changed, 354 insertions(+), 353 deletions(-) diff --git a/pyriksdagen/metadata.py b/pyriksdagen/metadata.py index ae4c534..8c40181 100644 --- a/pyriksdagen/metadata.py +++ b/pyriksdagen/metadata.py @@ -6,402 +6,403 @@ import calendar from .utils import get_data_location + + + def increase_date_precision(date, start=True): - if pd.isna(date): - return date - # Year - if len(date) == 4 and start: - return date + '-01-01' - if len(date) == 4 and not start: - return date + '-12-31' - # Month - if len(date) == 7 and start: - return date + '-01' - if len(date) == 7 and start: - last_day = calendar.monthrange(int(date[0]), int(date[1]))[1] - return date + f'-{last_day}' - # Day - if len(date) == 10: - return date + if pd.isna(date): + return date + # Year + if len(date) == 4 and start: + return date + '-01-01' + if len(date) == 4 and not start: + return date + '-12-31' + # Month + if len(date) == 7 and start: + return date + '-01' + if len(date) == 7 and start: + last_day = calendar.monthrange(int(date[0]), int(date[1]))[1] + return date + f'-{last_day}' + # Day + if len(date) == 10: + return date def check_date_overlap(start1, end1, start2, end2): - latest_start = max(start1, start2) - earliest_end = min(end1, end2) - delta = (earliest_end - latest_start).days + 1 - overlap = max(0, delta) - 
return True if overlap > 0 else False + latest_start = max(start1, start2) + earliest_end = min(end1, end2) + delta = (earliest_end - latest_start).days + 1 + overlap = max(0, delta) + return True if overlap > 0 else False def impute_member_date(db, gov_db, from_gov='Regeringen Löfven I'): - gov_start = gov_db.loc[gov_db['government'] == from_gov, 'start'].iloc[0] - idx = (db['source'] == 'member_of_parliament') &\ - (db['start'] > gov_start) &\ - (db['end'].isna()) - db.loc[idx, 'end'] = gov_db['end'].max() - return db + gov_start = gov_db.loc[gov_db['government'] == from_gov, 'start'].iloc[0] + idx = (db['source'] == 'member_of_parliament') &\ + (db['start'] > gov_start) &\ + (db['end'].isna()) + db.loc[idx, 'end'] = gov_db['end'].max() + return db def impute_member_dates(db, metadata_folder): - def _fill_na(row, **kwargs): - if row['start'] == 'nan' and row['end'] == 'nan': - return row - elif pd.isna(row['start']) and pd.isna(row['end']): - return row - else: - riksmote = pd.read_csv(f"{metadata_folder}/riksdag-year.csv") - if pd.isna(row['start']) or row['start'] == 'nan': - try: - py = riksmote.loc[ - (riksmote['start'] <= row['end']) & - (riksmote['end'] >= row['end']) - ].copy() - row['start'] = py['start'].unique()[0] - except: - #pass - print("no bueno ---------------------> end:", row['end'], row['person_id']) - elif pd.isna(row['end']) or row['end'] == 'nan': - if int(row['start'][:4]) < 1867: - return row - try: - py = riksmote.loc[ - (riksmote['start'] <= row['start']) & - (riksmote['end'] > row['start']) - ].copy() - row['end'] = py['end'].unique()[0] - except: - py = riksmote.loc[ - riksmote['end'].str.startswith(row['start'][:4]) - ].copy() - rs = sorted(py['end'].unique(), reverse=True)[0] - if rs < row['start']: - py = riksmote.loc[ - riksmote['end'].str.startswith( - str(int(row['start'][:4])+1)) - ].copy() - rs = sorted(py['end'].unique(), reverse=True)[0] - row['end'] = rs - return row - - def _impute_start(date, **kwargs): - riksmote = kwargs['riksmote'] - if len(date) == 10: - return date - elif len(date) == 7: - s = sorted(list(riksmote.loc[riksmote['start'].str.startswith(date, na=False), 'start'])) - if len(s) > 0: - return s[0] - else: - return date + "-01" - else: - s = sorted(list(riksmote.loc[riksmote['start'].str.startswith(date, na=False), 'start'])) - if len(s) > 0: - return s[0] - else: - print(f"Problem with start date: {date} not in riksmote") - return date + '-01-01' - - def _impute_end(date, **kwargs): - riksmote = kwargs['riksmote'] - if len(date) == 10: - return date - elif len(date) == 7: - s = sorted(list(riksmote.loc[riksmote['end'].str.startswith(date, na=False), 'end']), reverse=True) - if len(s) > 0: - return s[0] - else: - date_year, date_month = date.split("-") - last_day_of_the_month = calendar.monthrange(int(date_year), int(date_month))[1] - return date + f'-{last_day_of_the_month}' - else: - s = sorted(list(riksmote.loc[riksmote['end'].str.startswith(date, na=False), 'end']), reverse=True) - if len(s) > 0: - return s[0] - else: - print(f"Problem with end date: {date} not in riksmote") - return date + '-12-31' - - riksmote = pd.read_csv(f"{metadata_folder}/riksdag-year.csv") - riksmote[['start', 'end']] = riksmote[['start', 'end']].astype(str) - - idx = (db['source'] == 'member_of_parliament') &\ - (((pd.isna(db['start'])) | (db['start'] == 'nan')) |\ - ((pd.isna(db['end'])) | (db['end'] == 'nan'))) - db.loc[idx] = db.loc[idx].apply(lambda x: _fill_na(x, riksmote=riksmote), axis = 1) - - idx = (db['source'] == 'member_of_parliament') &\ - 
(pd.notnull(db['start'])) & (db['start'] != 'nan') - db.loc[idx, 'start'] = db.loc[idx, 'start'].apply(_impute_start, riksmote=riksmote) - - idx = (db['source'] == 'member_of_parliament') &\ - (pd.notnull(db['start'])) &\ - (pd.notnull(db['end'])) & (db['end'] != 'nan') - db.loc[idx, 'end'] = db.loc[idx, 'end'].apply(_impute_end, riksmote=riksmote) - return db - + def _fill_na(row, **kwargs): + if row['start'] == 'nan' and row['end'] == 'nan': + return row + elif pd.isna(row['start']) and pd.isna(row['end']): + return row + else: + riksmote = pd.read_csv(f"{metadata_folder}/riksdag-year.csv") + if pd.isna(row['start']) or row['start'] == 'nan': + try: + py = riksmote.loc[ + (riksmote['start'] <= row['end']) & + (riksmote['end'] >= row['end']) + ].copy() + row['start'] = py['start'].unique()[0] + except: + #pass + print("no bueno ---------------------> end:", row['end'], row['person_id']) + elif pd.isna(row['end']) or row['end'] == 'nan': + if int(row['start'][:4]) < 1867: + return row + try: + py = riksmote.loc[ + (riksmote['start'] <= row['start']) & + (riksmote['end'] > row['start']) + ].copy() + row['end'] = py['end'].unique()[0] + except: + py = riksmote.loc[ + riksmote['end'].str.startswith(row['start'][:4]) + ].copy() + rs = sorted(py['end'].unique(), reverse=True)[0] + if rs < row['start']: + py = riksmote.loc[ + riksmote['end'].str.startswith( + str(int(row['start'][:4])+1)) + ].copy() + rs = sorted(py['end'].unique(), reverse=True)[0] + row['end'] = rs + return row + + def _impute_start(date, **kwargs): + riksmote = kwargs['riksmote'] + if len(date) == 10: + return date + elif len(date) == 7: + s = sorted(list(riksmote.loc[riksmote['start'].str.startswith(date, na=False), 'start'])) + if len(s) > 0: + return s[0] + else: + return date + "-01" + else: + s = sorted(list(riksmote.loc[riksmote['start'].str.startswith(date, na=False), 'start'])) + if len(s) > 0: + return s[0] + else: + print(f"Problem with start date: {date} not in riksmote") + return date + '-01-01' + + def _impute_end(date, **kwargs): + riksmote = kwargs['riksmote'] + if len(date) == 10: + return date + elif len(date) == 7: + s = sorted(list(riksmote.loc[riksmote['end'].str.startswith(date, na=False), 'end']), reverse=True) + if len(s) > 0: + return s[0] + else: + date_year, date_month = date.split("-") + last_day_of_the_month = calendar.monthrange(int(date_year), int(date_month))[1] + return date + f'-{last_day_of_the_month}' + else: + s = sorted(list(riksmote.loc[riksmote['end'].str.startswith(date, na=False), 'end']), reverse=True) + if len(s) > 0: + return s[0] + else: + print(f"Problem with end date: {date} not in riksmote") + return date + '-12-31' + + riksmote = pd.read_csv(f"{metadata_folder}/riksdag-year.csv") + riksmote[['start', 'end']] = riksmote[['start', 'end']].astype(str) + + idx = (db['source'] == 'member_of_parliament') &\ + (((pd.isna(db['start'])) | (db['start'] == 'nan')) |\ + ((pd.isna(db['end'])) | (db['end'] == 'nan'))) + db.loc[idx] = db.loc[idx].apply(lambda x: _fill_na(x, riksmote=riksmote), axis = 1) + + idx = (db['source'] == 'member_of_parliament') &\ + (pd.notnull(db['start'])) & (db['start'] != 'nan') + db.loc[idx, 'start'] = db.loc[idx, 'start'].apply(_impute_start, riksmote=riksmote) + + idx = (db['source'] == 'member_of_parliament') &\ + (pd.notnull(db['start'])) &\ + (pd.notnull(db['end'])) & (db['end'] != 'nan') + db.loc[idx, 'end'] = db.loc[idx, 'end'].apply(_impute_end, riksmote=riksmote) + return db def impute_minister_date(db, gov_db): - def _impute_minister_date(minister, 
gov_db): - if pd.isna(minister['start']): - minister['start'] = gov_db.loc[gov_db['government'] == minister['government'], 'start'].iloc[0] - if pd.isna(minister['end']): - minister['end'] = gov_db.loc[gov_db['government'] == minister['government'], 'end'].iloc[0] - return minister - - # Impute missing minister dates using government dates - if 'source' in db.columns: - db.loc[db['source'] == 'minister'] =\ - db.loc[db['source'] == 'minister'].apply(partial(_impute_minister_date, gov_db=gov_db), axis=1) - else: - db = db.apply(partial(_impute_minister_date, gov_db=gov_db), axis=1) - return db + def _impute_minister_date(minister, gov_db): + if pd.isna(minister['start']): + minister['start'] = gov_db.loc[gov_db['government'] == minister['government'], 'start'].iloc[0] + if pd.isna(minister['end']): + minister['end'] = gov_db.loc[gov_db['government'] == minister['government'], 'end'].iloc[0] + return minister + + # Impute missing minister dates using government dates + if 'source' in db.columns: + db.loc[db['source'] == 'minister'] =\ + db.loc[db['source'] == 'minister'].apply(partial(_impute_minister_date, gov_db=gov_db), axis=1) + else: + db = db.apply(partial(_impute_minister_date, gov_db=gov_db), axis=1) + return db def impute_speaker_date(db): - if "source" in db.columns: - idx = (db['source'] == 'speaker') &\ - (db['end'].isna()) &\ - (db['role'].str.contains('kammare') == False) - db.loc[idx, 'end'] = db.loc[idx, 'start'] + datetime.timedelta(days = 365*4) - else: - idx = (db['end'].isna()) &\ - (db['role'].str.contains('kammare') == False) - db.loc[idx, 'end'] = db.loc[idx, 'start'] + datetime.timedelta(days = 365*4) - return db + if "source" in db.columns: + idx = (db['source'] == 'speaker') &\ + (db['end'].isna()) &\ + (db['role'].str.contains('kammare') == False) + db.loc[idx, 'end'] = db.loc[idx, 'start'] + datetime.timedelta(days = 365*4) + else: + idx = (db['end'].isna()) &\ + (db['role'].str.contains('kammare') == False) + db.loc[idx, 'end'] = db.loc[idx, 'start'] + datetime.timedelta(days = 365*4) + return db def impute_date(db, metadata_folder): - db[["start", "end"]] = db[["start", "end"]].astype(str) - if 'source' in db.columns: - sources = set(db['source']) - if 'member_of_parliament' in sources: - #db = impute_member_date(db, gov_db) - db = impute_member_dates(db, metadata_folder) - - db['start'] = db['start'].apply(increase_date_precision, start=True) - db['end'] = db['end'].apply(increase_date_precision, start=False) - db[["start", "end"]] = db[["start", "end"]].apply(pd.to_datetime, format='%Y-%m-%d') - - gov_db = pd.read_csv(f'{metadata_folder}/government.csv') - gov_db[["start", "end"]] = gov_db[["start", "end"]].apply(pd.to_datetime, format='%Y-%m-%d') - idx = gov_db['start'].idxmax() - gov_db.loc[idx, 'end'] = gov_db.loc[idx, 'start'] + datetime.timedelta(days = 365*4) - - if 'member_of_parliament' in sources: - db = impute_member_date(db, gov_db) - if 'minister' in sources: - db = impute_minister_date(db, gov_db) - if 'speaker' in sources: - db = impute_speaker_date(db) - - else: - db['start'] = db['start'].apply(increase_date_precision, start=True) - db['end'] = db['end'].apply(increase_date_precision, start=False) - db[["start", "end"]] = db[["start", "end"]].apply(pd.to_datetime, format='%Y-%m-%d') - return db + db[["start", "end"]] = db[["start", "end"]].astype(str) + if 'source' in db.columns: + sources = set(db['source']) + if 'member_of_parliament' in sources: + #db = impute_member_date(db, gov_db) + db = impute_member_dates(db, metadata_folder) + + 
db['start'] = db['start'].apply(increase_date_precision, start=True) + db['end'] = db['end'].apply(increase_date_precision, start=False) + db[["start", "end"]] = db[["start", "end"]].apply(pd.to_datetime, format='%Y-%m-%d') + + gov_db = pd.read_csv(f'{metadata_folder}/government.csv') + gov_db[["start", "end"]] = gov_db[["start", "end"]].apply(pd.to_datetime, format='%Y-%m-%d') + idx = gov_db['start'].idxmax() + gov_db.loc[idx, 'end'] = gov_db.loc[idx, 'start'] + datetime.timedelta(days = 365*4) + + if 'member_of_parliament' in sources: + db = impute_member_date(db, gov_db) + if 'minister' in sources: + db = impute_minister_date(db, gov_db) + if 'speaker' in sources: + db = impute_speaker_date(db) + + else: + db['start'] = db['start'].apply(increase_date_precision, start=True) + db['end'] = db['end'].apply(increase_date_precision, start=False) + db[["start", "end"]] = db[["start", "end"]].apply(pd.to_datetime, format='%Y-%m-%d') + return db def impute_party(db, party): - if 'party' not in db.columns: - db['party'] = pd.Series(dtype=str) - data = [] - for i, row in db[db['party'].isnull()].iterrows(): - parties = party[party['person_id'] == row['person_id']] - if len(set(parties['party'])) == 1: - db.loc[i,'party'] = parties['party'].iloc[0] - if len(set(parties['party'])) >= 2: - for j, sow in parties.iterrows(): - try: - res = check_date_overlap(row['start'], sow['start'], row['end'], sow['end']) - except: - print("Impute dates on Corpus using impute_date() before imputing parties!\n") - raise - if res: - m = row.copy() - m['party'] = sow['party'] - data.append(m) - db = pd.concat([db, pd.DataFrame(data)]).reset_index(drop=True) - return db + if 'party' not in db.columns: + db['party'] = pd.Series(dtype=str) + data = [] + for i, row in db[db['party'].isnull()].iterrows(): + parties = party[party['person_id'] == row['person_id']] + if len(set(parties['party'])) == 1: + db.loc[i,'party'] = parties['party'].iloc[0] + if len(set(parties['party'])) >= 2: + for j, sow in parties.iterrows(): + try: + res = check_date_overlap(row['start'], sow['start'], row['end'], sow['end']) + except: + print("Impute dates on Corpus using impute_date() before imputing parties!\n") + raise + if res: + m = row.copy() + m['party'] = sow['party'] + data.append(m) + db = pd.concat([db, pd.DataFrame(data)]).reset_index(drop=True) + return db def abbreviate_party(db, party): - party = {row['party']:row['abbreviation'] for _, row in party.iterrows()} - db["party_abbrev"] = db["party"].fillna('').map(party) - return db + party = {row['party']:row['abbreviation'] for _, row in party.iterrows()} + db["party_abbrev"] = db["party"].fillna('').map(party) + return db def clean_name(db): - idx = db['name'].notna() - db.loc[idx, 'name'] = db.loc[idx, 'name'].str.lower() - db.loc[idx, 'name'] = db.loc[idx, 'name'].astype(str).apply(multiple_replace) - db.loc[idx, 'name'] = db.loc[idx, 'name'].str.replace('-', ' ', regex=False) - db.loc[idx, 'name'] = db.loc[idx, 'name'].str.replace(r'[^a-zåäö\s\-]', '', regex=True) - return db + idx = db['name'].notna() + db.loc[idx, 'name'] = db.loc[idx, 'name'].str.lower() + db.loc[idx, 'name'] = db.loc[idx, 'name'].astype(str).apply(multiple_replace) + db.loc[idx, 'name'] = db.loc[idx, 'name'].str.replace('-', ' ', regex=False) + db.loc[idx, 'name'] = db.loc[idx, 'name'].str.replace(r'[^a-zåäö\s\-]', '', regex=True) + return db def infer_chamber(db): - def _infer_chamber(role): - d = {'första': 1, 'andra': 2} - match = re.search(r'([a-zåäö]+)\s*(?:kammar)', role) - return d[match.group(1)] if 
match else 0 - db['chamber'] = db['role'].apply(_infer_chamber).astype(dtype=pd.Int8Dtype()) - return db + def _infer_chamber(role): + d = {'första': 1, 'andra': 2} + match = re.search(r'([a-zåäö]+)\s*(?:kammar)', role) + return d[match.group(1)] if match else 0 + db['chamber'] = db['role'].apply(_infer_chamber).astype(dtype=pd.Int8Dtype()) + return db def format_member_role(db): - db['role'] = db['role'].str.extract(r'(ledamot)') - return db + db['role'] = db['role'].str.extract(r'(ledamot)') + return db def format_minister_role(db): - db["role"] = db["role"].str.replace('Sveriges ', '').str.lower() - return db + db["role"] = db["role"].str.replace('Sveriges ', '').str.lower() + return db def format_speaker_role(db): - def _format_speaker_role(role): - match = re.search(r'(andre |förste |tredje )?(vice )?talman', role) - return match.group(0) - db['role'] = db['role'].apply(_format_speaker_role) - return db + def _format_speaker_role(role): + match = re.search(r'(andre |förste |tredje )?(vice )?talman', role) + return match.group(0) + db['role'] = db['role'].apply(_format_speaker_role) + return db class Corpus(pd.DataFrame): - """ - Store corpus metadata as a single pandas DataFrame where - the column 'source' indicates the type of the row - """ - def __init__(self, *args, **kwargs): - super(Corpus, self).__init__(*args, **kwargs) - - @property - def _constructor(self): - return Corpus - - def _load_metadata(self, file, metadata_folder="corpus/metadata", source=False): - df = pd.read_csv(f"{metadata_folder}/{file}.csv") - - # Adjust to new structure where party information - # is not included in member_of_parliament.csv - if file == "member_of_parliament": - print(df) - columns = list(df.columns) + ["party"] - party_df = pd.read_csv(f"{metadata_folder}/party_affiliation.csv") - party_df = party_df[party_df["start"].notnull()] - party_df = party_df[party_df["end"].notnull()] - df = df.merge(party_df, on=["person_id", "start", "end"], how="left") - df = df[columns] - print(df) - print(df[df["party"].notnull()]) - if source: - df['source'] = file - return df - - def add_mps(self, metadata_folder="corpus/metadata"): - df = self._load_metadata('member_of_parliament', metadata_folder=metadata_folder, source=True) - df = infer_chamber(df) - df = format_member_role(df) - return Corpus(pd.concat([self, df])) - - def add_ministers(self, metadata_folder="corpus/metadata"): - df = self._load_metadata('minister', metadata_folder=metadata_folder, source=True) - df = format_minister_role(df) - return Corpus(pd.concat([self, df])) - - def add_speakers(self, metadata_folder="corpus/metadata"): - df = self._load_metadata('speaker', metadata_folder=metadata_folder, source=True) - df = infer_chamber(df) - df = format_speaker_role(df) - return Corpus(pd.concat([self, df])) - - def add_persons(self, metadata_folder="corpus/metadata"): - df = self._load_metadata('person', metadata_folder=metadata_folder) - return self.merge(df, on='person_id', how='left') - - def add_location_specifiers(self, metadata_folder="corpus/metadata"): - df = self._load_metadata('location_specifier', metadata_folder=metadata_folder) - return self.merge(df, on='person_id', how='left') - - def add_names(self, metadata_folder="corpus/metadata"): - df = self._load_metadata('name', metadata_folder=metadata_folder) - return self.merge(df, on='person_id', how='left') - - def impute_dates(self, metadata_folder="corpus/metadata"): - return impute_date(self, metadata_folder) - - def impute_parties(self, metadata_folder="corpus/metadata"): - df = 
self._load_metadata('party_affiliation', metadata_folder=metadata_folder) - df = impute_date(df, metadata_folder) - return impute_party(self, df) - - def abbreviate_parties(self, metadata_folder="corpus/metadata"): - df = self._load_metadata('party_abbreviation', metadata_folder=metadata_folder) - return abbreviate_party(self, df) - - def add_twitter(self, metadata_folder="corpus/metadata"): - df = self._load_metadata('twitter', metadata_folder=metadata_folder) - return self.merge(df, on='person_id', how='left') - - def clean_names(self): - return clean_name(self) - + """ + Store corpus metadata as a single pandas DataFrame where + the column 'source' indicates the type of the row + """ + def __init__(self, *args, **kwargs): + super(Corpus, self).__init__(*args, **kwargs) + + @property + def _constructor(self): + return Corpus + + def _load_metadata(self, file, metadata_folder="corpus/metadata", source=False): + df = pd.read_csv(f"{metadata_folder}/{file}.csv") + + # Adjust to new structure where party information + # is not included in member_of_parliament.csv + if file == "member_of_parliament": + print(df) + columns = list(df.columns) + ["party"] + party_df = pd.read_csv(f"{metadata_folder}/party_affiliation.csv") + party_df = party_df[party_df["start"].notnull()] + party_df = party_df[party_df["end"].notnull()] + df = df.merge(party_df, on=["person_id", "start", "end"], how="left") + df = df[columns] + print(df) + print(df[df["party"].notnull()]) + if source: + df['source'] = file + return df + + def add_mps(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('member_of_parliament', metadata_folder=metadata_folder, source=True) + df = infer_chamber(df) + df = format_member_role(df) + return Corpus(pd.concat([self, df])) + + def add_ministers(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('minister', metadata_folder=metadata_folder, source=True) + df = format_minister_role(df) + return Corpus(pd.concat([self, df])) + + def add_speakers(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('speaker', metadata_folder=metadata_folder, source=True) + df = infer_chamber(df) + df = format_speaker_role(df) + return Corpus(pd.concat([self, df])) + + def add_persons(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('person', metadata_folder=metadata_folder) + return self.merge(df, on='person_id', how='left') + + def add_location_specifiers(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('location_specifier', metadata_folder=metadata_folder) + return self.merge(df, on='person_id', how='left') + + def add_names(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('name', metadata_folder=metadata_folder) + return self.merge(df, on='person_id', how='left') + + def impute_dates(self, metadata_folder="corpus/metadata"): + return impute_date(self, metadata_folder) + + def impute_parties(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('party_affiliation', metadata_folder=metadata_folder) + df = impute_date(df, metadata_folder) + return impute_party(self, df) + + def abbreviate_parties(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('party_abbreviation', metadata_folder=metadata_folder) + return abbreviate_party(self, df) + + def add_twitter(self, metadata_folder="corpus/metadata"): + df = self._load_metadata('twitter', metadata_folder=metadata_folder) + return self.merge(df, on='person_id', how='left') + + def clean_names(self): + return clean_name(self) 
def load_Corpus_metadata(metadata_folder=None, read_db_from=None): - """ - Populates Corpus object - """ - if read_db_from is not None: - print("Reading metadata db from a file.") - try: - corpus = pd.read_csv(read_db_from) - except: - corpus = pd.read_pickle(read_db_from) - else: - print("Compiling metadata db from source.") - if metadata_folder is None: - metadata_folder = get_data_location("metadata") - - corpus = Corpus() - - corpus = corpus.add_mps(metadata_folder=metadata_folder) - corpus = corpus.add_ministers(metadata_folder=metadata_folder) - corpus = corpus.add_speakers(metadata_folder=metadata_folder) - - corpus = corpus.add_persons(metadata_folder=metadata_folder) - corpus = corpus.add_location_specifiers(metadata_folder=metadata_folder) - corpus = corpus.add_names(metadata_folder=metadata_folder) - - corpus = corpus.impute_dates(metadata_folder=metadata_folder) - corpus = corpus.impute_parties(metadata_folder=metadata_folder) - corpus = corpus.abbreviate_parties(metadata_folder=metadata_folder) - corpus = corpus.add_twitter(metadata_folder=metadata_folder) - corpus = corpus.clean_names() - - # Clean up speaker role formatting - corpus["role"] = corpus["role"].replace({ - 'Sveriges riksdags talman':'speaker', - 'andra kammarens andre vice talman':'ak_2_vice_speaker', - 'andra kammarens förste vice talman':'ak_1_vice_speaker', - 'andra kammarens talman':'ak_speaker', - 'andra kammarens vice talman':'ak_1_vice_speaker', - 'andre vice talman i första kammaren':'fk_2_vice_speaker', - 'första kammarens talman':'fk_speaker', - 'första kammarens vice talman':'fk_1_vice_speaker', - 'förste vice talman i första kammaren':'fk_1_vice_speaker' - }) - - # Temporary ids - corpus['person_id'] = corpus['person_id'] - - # Drop individuals with missing names - corpus = corpus[corpus['name'].notna()] - - # Remove redundancy and split file - corpus = corpus.drop_duplicates() - #print( corpus.loc[(pd.isna(corpus['start'])) | (pd.isna(corpus['end']))] ) - corpus = corpus.dropna(subset=['name', 'start', 'end']) - corpus = corpus.sort_values(['person_id', 'start', 'end', 'name']) - - return corpus + """ + Populates Corpus object + """ + if read_db_from is not None: + print("Reading metadata db from a file.") + try: + corpus = pd.read_csv(read_db_from) + except: + corpus = pd.read_pickle(read_db_from) + else: + print("Compiling metadata db from source.") + if metadata_folder is None: + metadata_folder = get_data_location("metadata") + + corpus = Corpus() + + corpus = corpus.add_mps(metadata_folder=metadata_folder) + corpus = corpus.add_ministers(metadata_folder=metadata_folder) + corpus = corpus.add_speakers(metadata_folder=metadata_folder) + + corpus = corpus.add_persons(metadata_folder=metadata_folder) + corpus = corpus.add_location_specifiers(metadata_folder=metadata_folder) + corpus = corpus.add_names(metadata_folder=metadata_folder) + + corpus = corpus.impute_dates(metadata_folder=metadata_folder) + corpus = corpus.impute_parties(metadata_folder=metadata_folder) + corpus = corpus.abbreviate_parties(metadata_folder=metadata_folder) + corpus = corpus.add_twitter(metadata_folder=metadata_folder) + corpus = corpus.clean_names() + + # Clean up speaker role formatting + corpus["role"] = corpus["role"].replace({ + 'Sveriges riksdags talman':'speaker', + 'andra kammarens andre vice talman':'ak_2_vice_speaker', + 'andra kammarens förste vice talman':'ak_1_vice_speaker', + 'andra kammarens talman':'ak_speaker', + 'andra kammarens vice talman':'ak_1_vice_speaker', + 'andre vice talman i första 
kammaren':'fk_2_vice_speaker', + 'första kammarens talman':'fk_speaker', + 'första kammarens vice talman':'fk_1_vice_speaker', + 'förste vice talman i första kammaren':'fk_1_vice_speaker' + }) + + # Temporary ids + corpus['person_id'] = corpus['person_id'] + + # Drop individuals with missing names + corpus = corpus[corpus['name'].notna()] + + # Remove redundancy and split file + corpus = corpus.drop_duplicates() + #print( corpus.loc[(pd.isna(corpus['start'])) | (pd.isna(corpus['end']))] ) + corpus = corpus.dropna(subset=['name', 'start', 'end']) + corpus = corpus.sort_values(['person_id', 'start', 'end', 'name']) + + return corpus From e08f2e7bbaf8ff4c277e37f8e982cf56dc331c1c Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Wed, 23 Oct 2024 13:00:17 +0200 Subject: [PATCH 12/16] style: cleanup package imports --- pyriksdagen/metadata.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pyriksdagen/metadata.py b/pyriksdagen/metadata.py index 8c40181..ef318eb 100644 --- a/pyriksdagen/metadata.py +++ b/pyriksdagen/metadata.py @@ -1,10 +1,15 @@ -import pandas as pd -import re -from .match_mp import multiple_replace +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Class declaration and functions related to corpus metadata +""" from functools import partial -import datetime +from pyriksdagen.match_mp import multiple_replace +from pyriksdagen.utils import get_data_location +import pandas as pd import calendar -from .utils import get_data_location +import datetime +import re From 21864f9437f9bb8817f4afb47641b24fdca63f70 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Wed, 23 Oct 2024 16:14:41 +0200 Subject: [PATCH 13/16] feat: add fetch_person_*() functions --- pyriksdagen/metadata.py | 90 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/pyriksdagen/metadata.py b/pyriksdagen/metadata.py index ef318eb..09d7211 100644 --- a/pyriksdagen/metadata.py +++ b/pyriksdagen/metadata.py @@ -411,3 +411,93 @@ def load_Corpus_metadata(metadata_folder=None, read_db_from=None): corpus = corpus.sort_values(['person_id', 'start', 'end', 'name']) return corpus + + +def fetch_person_name(person_id, corpus, primary=True): + """ + Get a person's primary name or list of names. + + Args: + person_id (str): a swerik-style person ID + corpus (df): compiled metadata DB + primary (bool): return only person's primary name + + Returns: + name or names (str or list of str) + """ + def _names(df): + return df["name"].unique() + + if person_id is None or person_id == "unknown": + return None + df = corpus.loc[corpus["person_id"] == person_id].copy() + if primary == True: + df = df.loc[corpus["primary_name"] == True].copy() + names = _names(df) + if len(names) == 1: + return names[0] + else: + raise ValueError(f"There should be exactly one primary name for {person_id}, but: {names}") + else: + return _names(df) + + +def fetch_person_gender(person_id, corpus): + """ + Get Person's gender. 
+
+    Args:
+        person_id (str): a swerik-style person ID
+        corpus (df): compiled metadata DB
+
+    Returns:
+        gender (str): man or woman
+    """
+    if person_id is None or person_id == "unknown":
+        return None
+    df = corpus.loc[(corpus["person_id"] == person_id) & (pd.notnull(corpus["gender"]))].copy()
+    genders = df["gender"].unique()
+    if len(genders) == 1:
+        return genders[0]
+    elif len(genders) == 0:
+        return None
+    else:
+        raise ValueError(f"There probably shouldn't be multiple genders for {person_id}, but {genders}")
+
+
+def fetch_person_party(person_id, corpus, date=None):
+    """
+    Get a person's party affiliation (during a particular period, if a date is given).
+
+    Args:
+        person_id (str): a swerik-style person ID
+        corpus (df): compiled metadata DB
+        date (str): (yyyy-mm-dd) formatted date string.
+            If the date arg is not None, the fn will try to match
+            party affiliations that overlap with that date. If none
+            are found, it defaults to all party affiliations
+
+    Returns:
+        party/ies: str or list of str
+    """
+    def _party(df):
+        parties = df["party"].unique()
+        if len(parties) == 1:
+            return parties[0]
+        elif len(parties) == 0:
+            return None
+        else:
+            return parties
+
+    if person_id is None or person_id == "unknown":
+        return None
+
+    df = corpus.loc[(corpus["person_id"] == person_id) & (pd.notnull(corpus["party"]))].copy()
+    if date is not None:
+        df_date = df.loc[(df["start"] <= date) & (df["end"] >= date)]
+        parties = _party(df_date)
+        if parties is not None:
+            return parties
+    return _party(df)
+
+
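A sketch of how the new accessors compose (the person id is an invented placeholder; a compiled metadata DB is required):

    from pyriksdagen.metadata import load_Corpus_metadata, fetch_person_name, fetch_person_party

    corpus = load_Corpus_metadata(metadata_folder="corpus/metadata")
    pid = "i-E6QJoKBVPZcyYoDbovdQTs"  # hypothetical swerik-style person ID
    fetch_person_name(pid, corpus)                       # primary name; raises ValueError if ambiguous
    fetch_person_party(pid, corpus, date="1975-01-15")   # affiliation(s) overlapping that date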
From c8aedcbb327674b9df7bc6d558110424eee4b3fc Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Wed, 23 Oct 2024 16:50:09 +0200
Subject: [PATCH 14/16] refactor: rm valid partition list, rework test

---
 pyriksdagen/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py
index fd988ff..a0be12c 100644
--- a/pyriksdagen/utils.py
+++ b/pyriksdagen/utils.py
@@ -398,14 +398,13 @@ def get_data_location(partition):
     RECORDS_PATH, MOTIONS_PATH and METADATA_PATH.
     If those are not set, the default "data" is returned for each partition.
     """
-    valid_partitions = ["records", "motions", "metadata", "interpellations"]
-    assert partition in valid_partitions, f"Provide valid partition of the dataset ({valid_partitions})"
     d = {}
     d["records"] = os.environ.get("RECORDS_PATH", "data")
     d["motions"] = os.environ.get("MOTIONS_PATH", "data")
     d["metadata"] = os.environ.get("METADATA_PATH", "data")
     d["metadata_db"] = os.environ.get("METADATA_DB", "data") # path to csv or pkl of compiled Corpus()
     d["interpellations"] = os.environ.get("INTERPELLATIONS_PATH", "data")
+    assert partition in d, f"Provide valid partition of the dataset ({list(d.keys())})"
     return d[partition]

From 09d267194acd8cf06857c68560041d0886efb232 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Wed, 23 Oct 2024 16:50:44 +0200
Subject: [PATCH 15/16] refactor: improve db loading

---
 pyriksdagen/metadata.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pyriksdagen/metadata.py b/pyriksdagen/metadata.py
index 09d7211..f298718 100644
--- a/pyriksdagen/metadata.py
+++ b/pyriksdagen/metadata.py
@@ -9,6 +9,7 @@
 import pandas as pd
 import calendar
 import datetime
+import os
 import re


@@ -354,16 +355,22 @@
     def clean_names(self):
         return clean_name(self)


-def load_Corpus_metadata(metadata_folder=None, read_db_from=None):
+def load_Corpus_metadata(metadata_folder=None, read_db=False, read_db_from=None):
     """
     Populates Corpus object
     """
-    if read_db_from is not None:
+    if read_db or read_db_from is not None:
+        if read_db_from is None:
+            read_db_from = get_data_location("metadata_db")
+        if not os.path.exists(read_db_from):
+            raise FileNotFoundError(f"File not found at {read_db_from}. Try compiling the database or set the METADATA_DB variable in your environment.")
+
         print("Reading metadata db from a file.")
         try:
             corpus = pd.read_csv(read_db_from)
         except:
             corpus = pd.read_pickle(read_db_from)
+        corpus = Corpus(corpus)  # pd.read_csv returns a plain DataFrame; coerce it back to Corpus
     else:
         print("Compiling metadata db from source.")
         if metadata_folder is None:

From eaf1f83c3ed935c7cd77535abca123abdfcb3348 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Wed, 30 Oct 2024 12:01:10 +0100
Subject: [PATCH 16/16] chore: compromise

---
 pyriksdagen/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyriksdagen/utils.py b/pyriksdagen/utils.py
index a0be12c..2a4f16b 100644
--- a/pyriksdagen/utils.py
+++ b/pyriksdagen/utils.py
@@ -26,9 +26,10 @@
 XML_NS = "{http://www.w3.org/XML/1998/namespace}"
 TEI_NS = "{http://www.tei-c.org/ns/1.0}"

+
 def fetch_ns():
-    return {"tei_ns": "{http://www.tei-c.org/ns/1.0}",
-            "xml_ns": "{http://www.w3.org/XML/1998/namespace}"}
+    return {"tei_ns": TEI_NS,
+            "xml_ns": XML_NS}


 def elem_iter(root, ns="{http://www.tei-c.org/ns/1.0}"):
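Taken together, patches 10, 14 and 15 allow a precompiled metadata DB to be loaded through the environment (paths hypothetical):

    import os
    os.environ["METADATA_DB"] = "data/corpus_metadata.csv"  # a previously compiled Corpus dump

    from pyriksdagen.metadata import load_Corpus_metadata
    corpus = load_Corpus_metadata(read_db=True)  # resolves the path via get_data_location("metadata_db")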