Skip to content

Commit

Permalink
chore: merge pull request #12 from swerik-project/dev
Browse files Browse the repository at this point in the history
  • Loading branch information
ninpnin authored Sep 12, 2024
2 parents 6b7c006 + 65f3082 commit f50f777
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 45 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/validate-cff.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Workflow that checks CITATION.cff with the cff-validator action.
# NOTE(review): indentation appears to have been stripped in this rendering;
# the nesting described in these comments is inferred from key order --
# confirm against the file in the repository.
name: "Pyriksdagen: Validate CITATION cff"

# Trigger: pushes, filtered to the two paths listed below (the citation file
# and this workflow file itself).
on:
push:
paths:
- CITATION.cff
- .github/workflows/validate-cff.yml

jobs:
# Single job, run on the ubuntu-latest runner image.
Validate-CITATION-cff:
runs-on: ubuntu-latest
env:
# Makes the workflow's built-in token available as GITHUB_PAT.
# Presumably read by the validator action -- TODO confirm.
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
# Fetch the repository contents so CITATION.cff is present on the runner.
- name: Checkout
uses: actions/checkout@v4

# Run the validator action against the checked-out repository.
- name: Validate CITATION.cff
uses: dieghernan/cff-validator@v3
20 changes: 20 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
cff-version: 1.2.0
message: To cite this repository, please use these metadata.
title: "The Swedish Parliament Corpus: pyriksdagen"
version: v1.2.0
authors:
- given-names: Väinö
family-names: Yrjänäinen
alias: ninpnin
- family-names: Borges
given-names: Robert
orcid: "https://orcid.org/0000-0002-7647-4048"
alias: BobBorges
date-released: 2024-04-25
identifiers:
- description: Repository basename
type: other
value: pyriksdagen
repository-code: "https://github.com/swerik-project/pyriksdagen"
url: "https://github.com/swerik-project/the-swedish-parliament-corpus"
type: dataset
96 changes: 52 additions & 44 deletions pyriksdagen/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,52 +348,60 @@ def clean_names(self):



def load_Corpus_metadata(metadata_folder=None, read_db_from=None):
    """
    Populates Corpus object

    Either loads a previously saved metadata database from a file, or
    compiles it from the metadata source files.

    Args:
        metadata_folder: location of the metadata source files. When None,
            ``get_data_location("metadata")`` is used. Not used when
            ``read_db_from`` is given.
        read_db_from: optional path to a saved metadata database. The file
            is read as CSV first and, if that fails, as a pickle. When
            given, the loaded table is returned as-is: none of the
            compilation or cleaning steps below are applied to it.

    Returns:
        The metadata table. With ``read_db_from`` this is whatever pandas
        reads from the file; otherwise it is the compiled corpus with roles
        normalised, incomplete rows dropped, duplicates removed and rows
        sorted by person_id, start, end and name.
    """
    if read_db_from is not None:
        print("Reading metadata db from a file.")
        try:
            return pd.read_csv(read_db_from)
        except Exception:
            # A pickle fed to the CSV reader can fail in several ways (decode
            # errors, parser errors), so fall back on any ordinary error --
            # but not on KeyboardInterrupt/SystemExit, which a bare
            # ``except:`` would also have swallowed.
            # NOTE(review): unpickling can execute arbitrary code; only pass
            # files from a trusted source here.
            return pd.read_pickle(read_db_from)

    print("Compiling metadata db from source.")
    if metadata_folder is None:
        metadata_folder = get_data_location("metadata")

    corpus = Corpus()

    corpus = corpus.add_mps(metadata_folder=metadata_folder)
    corpus = corpus.add_ministers(metadata_folder=metadata_folder)
    corpus = corpus.add_speakers(metadata_folder=metadata_folder)

    corpus = corpus.add_persons(metadata_folder=metadata_folder)
    corpus = corpus.add_location_specifiers(metadata_folder=metadata_folder)
    corpus = corpus.add_names(metadata_folder=metadata_folder)

    corpus = corpus.impute_dates(metadata_folder=metadata_folder)
    corpus = corpus.impute_parties(metadata_folder=metadata_folder)
    corpus = corpus.abbreviate_parties(metadata_folder=metadata_folder)
    corpus = corpus.add_twitter(metadata_folder=metadata_folder)
    corpus = corpus.clean_names()

    # Clean up speaker role formatting
    corpus["role"] = corpus["role"].replace({
        'Sveriges riksdags talman':'speaker',
        'andra kammarens andre vice talman':'ak_2_vice_speaker',
        'andra kammarens förste vice talman':'ak_1_vice_speaker',
        'andra kammarens talman':'ak_speaker',
        'andra kammarens vice talman':'ak_1_vice_speaker',
        'andre vice talman i första kammaren':'fk_2_vice_speaker',
        'första kammarens talman':'fk_speaker',
        'första kammarens vice talman':'fk_1_vice_speaker',
        'förste vice talman i första kammaren':'fk_1_vice_speaker'
    })

    # Temporary ids (currently a self-assignment; kept as a placeholder)
    corpus['person_id'] = corpus['person_id']

    # Drop individuals with missing names
    corpus = corpus[corpus['name'].notna()]

    # Remove redundancy, then drop rows lacking a name or either date
    corpus = corpus.drop_duplicates()
    corpus = corpus.dropna(subset=['name', 'start', 'end'])
    corpus = corpus.sort_values(['person_id', 'start', 'end', 'name'])

    return corpus
2 changes: 1 addition & 1 deletion pyriksdagen/swerik_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _identifier_template(auth_code, identifier:str) -> dict:
"name": "Swedish Portrait Archive ID"
},
"UpUnAlID": { # P6821
"formatter-url": "https://www.alvin-portal.org/alvin/view.jsf?pid=alvin-person%{}",
"formatter-url": "https://www.alvin-portal.org/alvin/view.jsf?pid={}",
"name": "Uppsala University Alvin ID"
},
"WiDaID": {
Expand Down

0 comments on commit f50f777

Please sign in to comment.