diff --git a/data/processed/environment_statements/prepared/EnvironmentStatementsInstitutionLevel.parquet b/data/processed/environment_statements/EnvironmentStatementsInstitutionLevel.parquet similarity index 99% rename from data/processed/environment_statements/prepared/EnvironmentStatementsInstitutionLevel.parquet rename to data/processed/environment_statements/EnvironmentStatementsInstitutionLevel.parquet index f340a6b..0ac55eb 100644 Binary files a/data/processed/environment_statements/prepared/EnvironmentStatementsInstitutionLevel.parquet and b/data/processed/environment_statements/EnvironmentStatementsInstitutionLevel.parquet differ diff --git a/data/processed/environment_statements/prepared/EnvironmentStatementsUnitLevel.parquet b/data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet similarity index 99% rename from data/processed/environment_statements/prepared/EnvironmentStatementsUnitLevel.parquet rename to data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet index d9403a0..339a3bb 100644 Binary files a/data/processed/environment_statements/prepared/EnvironmentStatementsUnitLevel.parquet and b/data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet differ diff --git a/logs/EnvironmentStatementsInstitutionLevel.log b/logs/EnvironmentStatementsInstitutionLevel.log index 6572852..f3071f3 100644 --- a/logs/EnvironmentStatementsInstitutionLevel.log +++ b/logs/EnvironmentStatementsInstitutionLevel.log @@ -1,4 +1,11 @@ -2024-02-06 17:05:39,048 [INFO] EnvironmentStatementsInstitutionLevel - read data from 'data/processed/environment_statements/extracted/institution/' -2024-02-06 17:05:39,048 [INFO] EnvironmentStatementsInstitutionLevel - statements: 143, sections: 4 -2024-02-06 17:05:39,485 [INFO] EnvironmentStatementsInstitutionLevel - prepared institution statements: 143 records, 5 columns -2024-02-06 17:05:39,508 [INFO] EnvironmentStatementsInstitutionLevel - write dataset to 'data/processed/environment_statements/prepared/EnvironmentStatementsInstitutionLevel.parquet' +2024-02-08 19:40:09,081 [INFO] EnvironmentStatementsInstitutionLevel - read data from 'data/processed/environment_statements/extracted/institution/' +2024-02-08 19:40:09,081 [INFO] EnvironmentStatementsInstitutionLevel - statements: 143, sections: 4 +2024-02-08 19:40:09,082 [INFO] EnvironmentStatementsInstitutionLevel - split statements into lines +2024-02-08 19:40:09,082 [INFO] EnvironmentStatementsInstitutionLevel - deleted empty lines +2024-02-08 19:40:09,082 [INFO] EnvironmentStatementsInstitutionLevel - replaced tabs with spaces +2024-02-08 19:40:09,082 [INFO] EnvironmentStatementsInstitutionLevel - replaced multiple spaces with a single space +2024-02-08 19:40:09,083 [INFO] EnvironmentStatementsInstitutionLevel - deleted lines with page numbers +2024-02-08 19:40:09,084 [INFO] EnvironmentStatementsInstitutionLevel - deleted lines equal to any of ['Institutional level environment template (REF5a)', 'Institutional level environment template (REF5b)', 'Unit-level environment template (REF5a)', 'Unit-level environment template (REF5b)', 'REF5a - Institution Environment Statement', 'Institutional-Level Environment Statement (REF5a)'] +2024-02-08 19:40:09,558 [INFO] EnvironmentStatementsInstitutionLevel - processed all 143 available statements +2024-02-08 19:40:09,559 [INFO] EnvironmentStatementsInstitutionLevel - make categorical ['Institution name'] +2024-02-08 19:40:09,609 [INFO] EnvironmentStatementsInstitutionLevel - write dataset to 'data/processed/environment_statements/EnvironmentStatementsInstitutionLevel.parquet' diff --git a/logs/EnvironmentStatementsUnitLevel.log b/logs/EnvironmentStatementsUnitLevel.log index 8a52f5e..2bd1a99 100644 --- a/logs/EnvironmentStatementsUnitLevel.log +++ b/logs/EnvironmentStatementsUnitLevel.log @@ -1,4 +1,11 @@ -2024-02-06 17:05:39,030 [INFO] EnvironmentStatementsUnitLevel - read data from 'data/processed/environment_statements/extracted/unit/' -2024-02-06 17:05:39,030 [INFO] EnvironmentStatementsUnitLevel - statements: 1874, sections: 4 -2024-02-06 17:05:57,989 [INFO] EnvironmentStatementsUnitLevel - prepared statements: 1874 records -2024-02-06 17:05:58,524 [INFO] EnvironmentStatementsUnitLevel - write dataset to 'data/processed/environment_statements/prepared/EnvironmentStatementsUnitLevel.parquet' +2024-02-08 19:40:09,059 [INFO] EnvironmentStatementsUnitLevel - read data from 'data/processed/environment_statements/extracted/unit/' +2024-02-08 19:40:09,060 [INFO] EnvironmentStatementsUnitLevel - statements: 1874, sections: 4 +2024-02-08 19:40:09,061 [INFO] EnvironmentStatementsUnitLevel - split statements into lines +2024-02-08 19:40:09,061 [INFO] EnvironmentStatementsUnitLevel - deleted empty lines +2024-02-08 19:40:09,061 [INFO] EnvironmentStatementsUnitLevel - replaced tabs with spaces +2024-02-08 19:40:09,062 [INFO] EnvironmentStatementsUnitLevel - replaced multiple spaces with a single space +2024-02-08 19:40:09,063 [INFO] EnvironmentStatementsUnitLevel - deleted lines with page numbers +2024-02-08 19:40:09,063 [INFO] EnvironmentStatementsUnitLevel - deleted lines equal to any of ['Institutional level environment template (REF5a)', 'Institutional level environment template (REF5b)', 'Unit-level environment template (REF5a)', 'Unit-level environment template (REF5b)', 'REF5a - Institution Environment Statement', 'Institutional-Level Environment Statement (REF5a)'] +2024-02-08 19:40:28,754 [INFO] EnvironmentStatementsUnitLevel - processed all 1874 available statements +2024-02-08 19:40:28,754 [INFO] EnvironmentStatementsUnitLevel - make categorical ['Institution name', 'Unit of assessment name', 'Multiple submission letter'] +2024-02-08 19:40:29,145 [INFO] EnvironmentStatementsUnitLevel - write dataset to 'data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet' diff --git a/logs/ImpactCaseStudies.log b/logs/ImpactCaseStudies.log index 6cf960f..a88eef1 100644 --- a/logs/ImpactCaseStudies.log +++ b/logs/ImpactCaseStudies.log @@ -1,10 +1,10 @@ -2024-02-06 17:05:42,732 [INFO] ImpactCaseStudies - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-06 17:05:43,321 [INFO] ImpactCaseStudies - parsed sheet: 6361 records -2024-02-06 17:05:43,322 [INFO] ImpactCaseStudies - rename 'Main panel' to 'Main panel code' -2024-02-06 17:05:43,324 [INFO] ImpactCaseStudies - replace '['/', ':']' with '_' in 'Institution name' -2024-02-06 17:05:43,325 [INFO] ImpactCaseStudies - add columns for panel names -2024-02-06 17:05:43,326 [INFO] ImpactCaseStudies - shift columns from title to the left to fix raw data issue -2024-02-06 17:05:46,539 [INFO] ImpactCaseStudies - replace styling characters in ['1. Summary of the impact', '2. Underpinning research', '3. References to the research', '4. Details of the impact', '5. Sources to corroborate the impact'] -2024-02-06 17:05:46,543 [INFO] ImpactCaseStudies - drop columns '['Researcher ORCIDs', 'Institution UKPRN code', '5. Sources to corroborate the impact', 'Unit of assessment number', 'Global research identifiers', 'Main panel code', '3. References to the research', 'Formal partners', 'Is continued from 2014', 'Grant funding', '2. Underpinning research', 'Countries']' -2024-02-06 17:05:46,566 [INFO] ImpactCaseStudies - make categorical ['Institution name', 'Main panel name', 'Joint submission', 'Unit of assessment name', 'Multiple submission letter', 'Multiple submission name'] -2024-02-06 17:05:46,765 [INFO] ImpactCaseStudies - write dataset to 'data/processed/sheets/ImpactCaseStudies.parquet' +2024-02-08 19:40:12,766 [INFO] ImpactCaseStudies - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-08 19:40:13,319 [INFO] ImpactCaseStudies - parsed sheet: 6361 records +2024-02-08 19:40:13,319 [INFO] ImpactCaseStudies - rename 'Main panel' to 'Main panel code' +2024-02-08 19:40:13,321 [INFO] ImpactCaseStudies - replace '['/', ':']' with '_' in 'Institution name' +2024-02-08 19:40:13,323 [INFO] ImpactCaseStudies - add columns for panel names +2024-02-08 19:40:13,324 [INFO] ImpactCaseStudies - shift columns from title to the left to fix raw data issue +2024-02-08 19:40:16,184 [INFO] ImpactCaseStudies - replace styling characters in ['1. Summary of the impact', '2. Underpinning research', '3. References to the research', '4. Details of the impact', '5. Sources to corroborate the impact'] +2024-02-08 19:40:16,187 [INFO] ImpactCaseStudies - drop columns '['3. References to the research', '5. Sources to corroborate the impact', 'Researcher ORCIDs', 'Institution UKPRN code', 'Is continued from 2014', 'Unit of assessment number', 'Countries', 'Grant funding', 'Main panel code', 'Formal partners', 'Global research identifiers', '2. Underpinning research']' +2024-02-08 19:40:16,215 [INFO] ImpactCaseStudies - make categorical ['Main panel name', 'Institution name', 'Multiple submission letter', 'Unit of assessment name', 'Multiple submission name', 'Joint submission'] +2024-02-08 19:40:16,429 [INFO] ImpactCaseStudies - write dataset to 'data/processed/sheets/ImpactCaseStudies.parquet' diff --git a/logs/Outputs.log b/logs/Outputs.log index d661b9b..76a4e5c 100644 --- a/logs/Outputs.log +++ b/logs/Outputs.log @@ -1,12 +1,12 @@ -2024-02-06 17:05:42,713 [INFO] Outputs - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-06 17:05:59,201 [INFO] Outputs - parsed sheet: 185353 records -2024-02-06 17:05:59,221 [INFO] Outputs - rename 'Main panel' to 'Main panel code' -2024-02-06 17:05:59,252 [INFO] Outputs - rename 'Output type' to 'Output type code' -2024-02-06 17:05:59,298 [INFO] Outputs - replace '['/', ':']' with '_' in 'Institution name' -2024-02-06 17:05:59,338 [INFO] Outputs - add columns for panel names -2024-02-06 17:06:00,062 [INFO] Outputs - replace styling characters in ['Title'] -2024-02-06 17:06:00,097 [INFO] Outputs - add columns for output types names -2024-02-06 17:06:00,098 [INFO] Outputs - make output year categorical -2024-02-06 17:06:00,141 [INFO] Outputs - drop columns '['Main panel code', 'Institution UKPRN code', 'Unit of assessment number', 'Output type code']' -2024-02-06 17:06:00,155 [INFO] Outputs - make categorical ['Institution name', 'Multiple submission letter', 'Research group', 'Delayed by COVID19', 'Interdisciplinary', 'Output type', 'Joint submission', 'Is reserve output', 'Open access status', 'Cross-referral requested', 'Propose double weighting', 'Citations applicable', 'Main panel name', 'Unit of assessment name', 'Multiple submission name', 'Forensic science', 'Non-English', 'Criminology'] -2024-02-06 17:06:00,469 [INFO] Outputs - write dataset to 'data/processed/sheets/Outputs.parquet' +2024-02-08 19:40:12,758 [INFO] Outputs - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-08 19:40:29,215 [INFO] Outputs - parsed sheet: 185353 records +2024-02-08 19:40:29,240 [INFO] Outputs - rename 'Main panel' to 'Main panel code' +2024-02-08 19:40:29,273 [INFO] Outputs - rename 'Output type' to 'Output type code' +2024-02-08 19:40:29,319 [INFO] Outputs - replace '['/', ':']' with '_' in 'Institution name' +2024-02-08 19:40:29,363 [INFO] Outputs - add columns for panel names +2024-02-08 19:40:30,092 [INFO] Outputs - replace styling characters in ['Title'] +2024-02-08 19:40:30,128 [INFO] Outputs - add columns for output types names +2024-02-08 19:40:30,132 [INFO] Outputs - make output year categorical +2024-02-08 19:40:30,175 [INFO] Outputs - drop columns '['Institution UKPRN code', 'Unit of assessment number', 'Output type code', 'Main panel code']' +2024-02-08 19:40:30,189 [INFO] Outputs - make categorical ['Unit of assessment name', 'Institution name', 'Multiple submission name', 'Interdisciplinary', 'Cross-referral requested', 'Main panel name', 'Multiple submission letter', 'Forensic science', 'Open access status', 'Citations applicable', 'Propose double weighting', 'Non-English', 'Is reserve output', 'Delayed by COVID19', 'Joint submission', 'Research group', 'Output type', 'Criminology'] +2024-02-08 19:40:30,511 [INFO] Outputs - write dataset to 'data/processed/sheets/Outputs.parquet' diff --git a/logs/ResearchDoctoralDegreesAwarded.log b/logs/ResearchDoctoralDegreesAwarded.log index 1acb7bd..17b78cf 100644 --- a/logs/ResearchDoctoralDegreesAwarded.log +++ b/logs/ResearchDoctoralDegreesAwarded.log @@ -1,9 +1,9 @@ -2024-02-06 17:05:42,746 [INFO] ResearchDoctoralDegreesAwarded - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-06 17:05:42,829 [INFO] ResearchDoctoralDegreesAwarded - parsed sheet: 1888 records -2024-02-06 17:05:42,830 [INFO] ResearchDoctoralDegreesAwarded - rename 'Main panel' to 'Main panel code' -2024-02-06 17:05:42,831 [INFO] ResearchDoctoralDegreesAwarded - replace '['/', ':']' with '_' in 'Institution name' -2024-02-06 17:05:42,832 [INFO] ResearchDoctoralDegreesAwarded - add columns for panel names -2024-02-06 17:05:42,833 [INFO] ResearchDoctoralDegreesAwarded - calculate total number of degrees awarded -2024-02-06 17:05:42,833 [INFO] ResearchDoctoralDegreesAwarded - drop columns '['Unit of assessment number', 'Main panel code', 'Institution UKPRN code']' -2024-02-06 17:05:42,833 [INFO] ResearchDoctoralDegreesAwarded - make categorical ['Multiple submission name', 'Joint submission', 'Main panel name', 'Multiple submission letter', 'Unit of assessment name', 'Institution name'] -2024-02-06 17:05:42,851 [INFO] ResearchDoctoralDegreesAwarded - write dataset to 'data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet' +2024-02-08 19:40:12,755 [INFO] ResearchDoctoralDegreesAwarded - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-08 19:40:12,839 [INFO] ResearchDoctoralDegreesAwarded - parsed sheet: 1888 records +2024-02-08 19:40:12,839 [INFO] ResearchDoctoralDegreesAwarded - rename 'Main panel' to 'Main panel code' +2024-02-08 19:40:12,840 [INFO] ResearchDoctoralDegreesAwarded - replace '['/', ':']' with '_' in 'Institution name' +2024-02-08 19:40:12,841 [INFO] ResearchDoctoralDegreesAwarded - add columns for panel names +2024-02-08 19:40:12,842 [INFO] ResearchDoctoralDegreesAwarded - calculate total number of degrees awarded +2024-02-08 19:40:12,842 [INFO] ResearchDoctoralDegreesAwarded - drop columns '['Unit of assessment number', 'Institution UKPRN code', 'Main panel code']' +2024-02-08 19:40:12,843 [INFO] ResearchDoctoralDegreesAwarded - make categorical ['Unit of assessment name', 'Multiple submission name', 'Joint submission', 'Main panel name', 'Multiple submission letter', 'Institution name'] +2024-02-08 19:40:12,856 [INFO] ResearchDoctoralDegreesAwarded - write dataset to 'data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet' diff --git a/logs/ResearchGroups.log b/logs/ResearchGroups.log index 4ab2e6e..09d38ff 100644 --- a/logs/ResearchGroups.log +++ b/logs/ResearchGroups.log @@ -1,9 +1,9 @@ -2024-02-06 17:05:42,734 [INFO] ResearchGroups - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-06 17:05:42,800 [INFO] ResearchGroups - parsed sheet: 2036 records -2024-02-06 17:05:42,800 [INFO] ResearchGroups - rename 'Main panel' to 'Main panel code' -2024-02-06 17:05:42,801 [INFO] ResearchGroups - replace '['/', ':']' with '_' in 'Institution name' -2024-02-06 17:05:42,802 [INFO] ResearchGroups - add columns for panel names -2024-02-06 17:05:42,803 [INFO] ResearchGroups - make group code categorical -2024-02-06 17:05:42,803 [INFO] ResearchGroups - drop columns '['Main panel code', 'Unit of assessment number', 'Institution UKPRN code']' -2024-02-06 17:05:42,803 [INFO] ResearchGroups - make categorical ['Institution name', 'Joint submission', 'Main panel name', 'Unit of assessment name', 'Multiple submission name', 'Multiple submission letter'] -2024-02-06 17:05:42,817 [INFO] ResearchGroups - write dataset to 'data/processed/sheets/ResearchGroups.parquet' +2024-02-08 19:40:12,754 [INFO] ResearchGroups - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-08 19:40:12,808 [INFO] ResearchGroups - parsed sheet: 2036 records +2024-02-08 19:40:12,809 [INFO] ResearchGroups - rename 'Main panel' to 'Main panel code' +2024-02-08 19:40:12,810 [INFO] ResearchGroups - replace '['/', ':']' with '_' in 'Institution name' +2024-02-08 19:40:12,811 [INFO] ResearchGroups - add columns for panel names +2024-02-08 19:40:12,811 [INFO] ResearchGroups - make group code categorical +2024-02-08 19:40:12,812 [INFO] ResearchGroups - drop columns '['Institution UKPRN code', 'Unit of assessment number', 'Main panel code']' +2024-02-08 19:40:12,812 [INFO] ResearchGroups - make categorical ['Unit of assessment name', 'Main panel name', 'Multiple submission name', 'Joint submission', 'Multiple submission letter', 'Institution name'] +2024-02-08 19:40:12,824 [INFO] ResearchGroups - write dataset to 'data/processed/sheets/ResearchGroups.parquet' diff --git a/logs/ResearchIncome.log b/logs/ResearchIncome.log index 79a6eb3..7095e83 100644 --- a/logs/ResearchIncome.log +++ b/logs/ResearchIncome.log @@ -1,9 +1,9 @@ -2024-02-06 17:05:42,714 [INFO] ResearchIncome - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-06 17:05:43,917 [INFO] ResearchIncome - parsed sheet: 28637 records -2024-02-06 17:05:43,918 [INFO] ResearchIncome - rename 'Main panel' to 'Main panel code' -2024-02-06 17:05:43,926 [INFO] ResearchIncome - replace '['/', ':']' with '_' in 'Institution name' -2024-02-06 17:05:43,931 [INFO] ResearchIncome - add columns for panel names -2024-02-06 17:05:43,934 [INFO] ResearchIncome - make income source categorical -2024-02-06 17:05:43,936 [INFO] ResearchIncome - drop columns '['Unit of assessment number', 'Institution UKPRN code', 'Main panel code']' -2024-02-06 17:05:43,936 [INFO] ResearchIncome - make categorical ['Unit of assessment name', 'Multiple submission name', 'Joint submission', 'Multiple submission letter', 'Main panel name', 'Institution name'] -2024-02-06 17:05:43,965 [INFO] ResearchIncome - write dataset to 'data/processed/sheets/ResearchIncome.parquet' +2024-02-08 19:40:12,755 [INFO] ResearchIncome - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-08 19:40:13,935 [INFO] ResearchIncome - parsed sheet: 28637 records +2024-02-08 19:40:13,936 [INFO] ResearchIncome - rename 'Main panel' to 'Main panel code' +2024-02-08 19:40:13,942 [INFO] ResearchIncome - replace '['/', ':']' with '_' in 'Institution name' +2024-02-08 19:40:13,945 [INFO] ResearchIncome - add columns for panel names +2024-02-08 19:40:13,947 [INFO] ResearchIncome - make income source categorical +2024-02-08 19:40:13,948 [INFO] ResearchIncome - drop columns '['Institution UKPRN code', 'Main panel code', 'Unit of assessment number']' +2024-02-08 19:40:13,949 [INFO] ResearchIncome - make categorical ['Unit of assessment name', 'Multiple submission letter', 'Joint submission', 'Multiple submission name', 'Main panel name', 'Institution name'] +2024-02-08 19:40:13,973 [INFO] ResearchIncome - write dataset to 'data/processed/sheets/ResearchIncome.parquet' diff --git a/logs/ResearchIncomeInKind.log b/logs/ResearchIncomeInKind.log index ab25a18..a9dedbb 100644 --- a/logs/ResearchIncomeInKind.log +++ b/logs/ResearchIncomeInKind.log @@ -1,9 +1,9 @@ -2024-02-06 17:05:42,756 [INFO] ResearchIncomeInKind - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-06 17:05:42,964 [INFO] ResearchIncomeInKind - parsed sheet: 4093 records -2024-02-06 17:05:42,965 [INFO] ResearchIncomeInKind - rename 'Main panel' to 'Main panel code' -2024-02-06 17:05:42,967 [INFO] ResearchIncomeInKind - replace '['/', ':']' with '_' in 'Institution name' -2024-02-06 17:05:42,968 [INFO] ResearchIncomeInKind - add columns for panel names -2024-02-06 17:05:42,969 [INFO] ResearchIncomeInKind - make income source categorical -2024-02-06 17:05:42,969 [INFO] ResearchIncomeInKind - drop columns '['Unit of assessment number', 'Main panel code', 'Institution UKPRN code']' -2024-02-06 17:05:42,969 [INFO] ResearchIncomeInKind - make categorical ['Multiple submission name', 'Main panel name', 'Joint submission', 'Unit of assessment name', 'Multiple submission letter', 'Institution name'] -2024-02-06 17:05:42,983 [INFO] ResearchIncomeInKind - write dataset to 'data/processed/sheets/ResearchIncomeInKind.parquet' +2024-02-08 19:40:12,767 [INFO] ResearchIncomeInKind - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-08 19:40:12,974 [INFO] ResearchIncomeInKind - parsed sheet: 4093 records +2024-02-08 19:40:12,974 [INFO] ResearchIncomeInKind - rename 'Main panel' to 'Main panel code' +2024-02-08 19:40:12,975 [INFO] ResearchIncomeInKind - replace '['/', ':']' with '_' in 'Institution name' +2024-02-08 19:40:12,976 [INFO] ResearchIncomeInKind - add columns for panel names +2024-02-08 19:40:12,977 [INFO] ResearchIncomeInKind - make income source categorical +2024-02-08 19:40:12,977 [INFO] ResearchIncomeInKind - drop columns '['Unit of assessment number', 'Institution UKPRN code', 'Main panel code']' +2024-02-08 19:40:12,977 [INFO] ResearchIncomeInKind - make categorical ['Unit of assessment name', 'Multiple submission letter', 'Joint submission', 'Institution name', 'Main panel name', 'Multiple submission name'] +2024-02-08 19:40:12,989 [INFO] ResearchIncomeInKind - write dataset to 'data/processed/sheets/ResearchIncomeInKind.parquet' diff --git a/logs/Results.log b/logs/Results.log index 0508c77..a51ce33 100644 --- a/logs/Results.log +++ b/logs/Results.log @@ -1,43 +1,43 @@ -2024-02-06 17:06:00,957 [INFO] Results - read sheet from 'data/raw/REF-2021-Results-All-2022-05-06.xlsx' -2024-02-06 17:06:01,360 [INFO] Results - parsed sheet: 7552 records -2024-02-06 17:06:01,360 [INFO] Results - rename 'Main panel' to 'Main panel code' -2024-02-06 17:06:01,362 [INFO] Results - replace '['/', ':']' with '_' in 'Institution name' -2024-02-06 17:06:01,364 [INFO] Results - add columns for panel names -2024-02-06 17:06:01,369 [INFO] Results - replace '-' with na in ['4 stars', '3 stars', '2 stars', '1 star', 'Unclassified'] -2024-02-06 17:06:01,371 [INFO] Results - bin percentages for ['4 stars', '3 stars', '2 stars', '1 star', 'Unclassified'] -2024-02-06 17:06:01,372 [INFO] Results - drop columns '['Institution code (UKPRN)', 'Institution sort order']' -2024-02-06 17:06:01,381 [INFO] Results - pivot to make wide format for ratings per profile to enable analyses -2024-02-06 17:06:01,382 [INFO] Results - make categorical ['Overall evaluation - 4 stars (binned)', 'Outputs evaluation - 4 stars (binned)', 'Outputs evaluation - 1 star (binned)', 'Overall evaluation - Unclassified (binned)', 'Overall evaluation - 1 star (binned)', 'Environment evaluation - Unclassified (binned)', 'Overall evaluation - 3 stars (binned)', 'Environment evaluation - 2 stars (binned)', 'Environment evaluation - 3 stars (binned)', 'Impact evaluation - 2 stars (binned)', 'Outputs evaluation - 3 stars (binned)', 'Outputs evaluation - Unclassified (binned)', 'Impact evaluation - 4 stars (binned)', 'Environment evaluation - 1 star (binned)', 'Outputs evaluation - 2 stars (binned)', 'Overall evaluation - 2 stars (binned)', 'Impact evaluation - Unclassified (binned)', 'Impact evaluation - 3 stars (binned)', 'Environment evaluation - 4 stars (binned)', 'Impact evaluation - 1 star (binned)'] -2024-02-06 17:06:01,411 [INFO] Results - read dataset from 'data/processed/sheets/ResearchGroups.parquet' -2024-02-06 17:06:01,415 [INFO] Results - added column 'Research group submissions (added)' -2024-02-06 17:06:01,573 [INFO] Results - read dataset from 'data/processed/sheets/Outputs.parquet' -2024-02-06 17:06:01,580 [INFO] Results - added column 'Output submissions (added)' -2024-02-06 17:06:01,585 [INFO] Results - added column 'Output submissions - Chapter in book (added)' -2024-02-06 17:06:01,593 [INFO] Results - added column 'Output submissions - Journal article (added)' -2024-02-06 17:06:01,598 [INFO] Results - added column 'Output submissions - Authored book (added)' -2024-02-06 17:06:01,603 [INFO] Results - added column 'Output submissions - Edited book (added)' -2024-02-06 17:06:01,607 [INFO] Results - added column 'Output submissions - Exhibition (added)' -2024-02-06 17:06:01,611 [INFO] Results - added column 'Output submissions - Performance (added)' -2024-02-06 17:06:01,615 [INFO] Results - added column 'Output submissions - Digital or visual media (added)' -2024-02-06 17:06:01,619 [INFO] Results - added column 'Output submissions - Conference contribution (added)' -2024-02-06 17:06:01,624 [INFO] Results - added column 'Output submissions - Scholarly edition (added)' -2024-02-06 17:06:01,629 [INFO] Results - added column 'Output submissions - Other (added)' -2024-02-06 17:06:01,634 [INFO] Results - added column 'Output submissions - Working paper (added)' -2024-02-06 17:06:01,638 [INFO] Results - added column 'Output submissions - Patent/ published patent application (added)' -2024-02-06 17:06:01,642 [INFO] Results - added column 'Output submissions - Composition (added)' -2024-02-06 17:06:01,646 [INFO] Results - added column 'Output submissions - Website content (added)' -2024-02-06 17:06:01,649 [INFO] Results - added column 'Output submissions - Design (added)' -2024-02-06 17:06:01,653 [INFO] Results - added column 'Output submissions - Artefact (added)' -2024-02-06 17:06:01,657 [INFO] Results - added column 'Output submissions - Research report for external body (added)' -2024-02-06 17:06:01,661 [INFO] Results - added column 'Output submissions - Research data sets and databases (added)' -2024-02-06 17:06:01,665 [INFO] Results - added column 'Output submissions - Translation (added)' -2024-02-06 17:06:01,669 [INFO] Results - added column 'Output submissions - Software (added)' -2024-02-06 17:06:01,673 [INFO] Results - added column 'Output submissions - Devices and products (added)' -2024-02-06 17:06:01,776 [INFO] Results - read dataset from 'data/processed/sheets/ImpactCaseStudies.parquet' -2024-02-06 17:06:01,782 [INFO] Results - added column 'Impact case study submissions (added)' -2024-02-06 17:06:01,791 [INFO] Results - read dataset from 'data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet' -2024-02-06 17:06:01,795 [INFO] Results - added columns '['Total number of doctoral degrees awarded (added)']' -2024-02-06 17:06:01,952 [INFO] Results - read dataset from '/Users/mihaela/Documents/work/ssi_work/ref-2021-analysis/data/processed/environment_statements/prepared/EnvironmentStatementsUnitLevel.parquet' -2024-02-06 17:06:01,956 [INFO] Results - merged with unit environment statements: 1888 records -2024-02-06 17:06:01,957 [INFO] Results - make categorical ['Multiple submission letter', 'Main panel name', 'Unit of assessment name', 'Institution name', 'Joint submission', 'Multiple submission name'] -2024-02-06 17:06:02,345 [INFO] Results - write dataset to 'data/processed/sheets/Results.parquet' +2024-02-08 19:40:30,887 [INFO] Results - read sheet from 'data/raw/REF-2021-Results-All-2022-05-06.xlsx' +2024-02-08 19:40:31,284 [INFO] Results - parsed sheet: 7552 records +2024-02-08 19:40:31,284 [INFO] Results - rename 'Main panel' to 'Main panel code' +2024-02-08 19:40:31,286 [INFO] Results - replace '['/', ':']' with '_' in 'Institution name' +2024-02-08 19:40:31,288 [INFO] Results - add columns for panel names +2024-02-08 19:40:31,293 [INFO] Results - replace '-' with na in ['4 stars', '3 stars', '2 stars', '1 star', 'Unclassified'] +2024-02-08 19:40:31,296 [INFO] Results - bin percentages for ['4 stars', '3 stars', '2 stars', '1 star', 'Unclassified'] +2024-02-08 19:40:31,297 [INFO] Results - drop columns '['Institution code (UKPRN)', 'Institution sort order']' +2024-02-08 19:40:31,307 [INFO] Results - pivot to make wide format for ratings per profile to enable analyses +2024-02-08 19:40:31,307 [INFO] Results - make categorical ['Environment evaluation - 4 stars (binned)', 'Impact evaluation - 2 stars (binned)', 'Overall evaluation - Unclassified (binned)', 'Outputs evaluation - 2 stars (binned)', 'Environment evaluation - 3 stars (binned)', 'Impact evaluation - Unclassified (binned)', 'Environment evaluation - 2 stars (binned)', 'Impact evaluation - 3 stars (binned)', 'Environment evaluation - Unclassified (binned)', 'Overall evaluation - 4 stars (binned)', 'Outputs evaluation - 3 stars (binned)', 'Outputs evaluation - Unclassified (binned)', 'Overall evaluation - 1 star (binned)', 'Overall evaluation - 3 stars (binned)', 'Environment evaluation - 1 star (binned)', 'Overall evaluation - 2 stars (binned)', 'Outputs evaluation - 1 star (binned)', 'Outputs evaluation - 4 stars (binned)', 'Impact evaluation - 1 star (binned)', 'Impact evaluation - 4 stars (binned)'] +2024-02-08 19:40:31,349 [INFO] Results - read dataset from 'data/processed/sheets/ResearchGroups.parquet' +2024-02-08 19:40:31,355 [INFO] Results - added column 'Research group submissions (added)' +2024-02-08 19:40:31,506 [INFO] Results - read dataset from 'data/processed/sheets/Outputs.parquet' +2024-02-08 19:40:31,514 [INFO] Results - added column 'Output submissions (added)' +2024-02-08 19:40:31,519 [INFO] Results - added column 'Output submissions - Chapter in book (added)' +2024-02-08 19:40:31,527 [INFO] Results - added column 'Output submissions - Journal article (added)' +2024-02-08 19:40:31,532 [INFO] Results - added column 'Output submissions - Authored book (added)' +2024-02-08 19:40:31,536 [INFO] Results - added column 'Output submissions - Edited book (added)' +2024-02-08 19:40:31,539 [INFO] Results - added column 'Output submissions - Exhibition (added)' +2024-02-08 19:40:31,543 [INFO] Results - added column 'Output submissions - Performance (added)' +2024-02-08 19:40:31,547 [INFO] Results - added column 'Output submissions - Digital or visual media (added)' +2024-02-08 19:40:31,551 [INFO] Results - added column 'Output submissions - Conference contribution (added)' +2024-02-08 19:40:31,555 [INFO] Results - added column 'Output submissions - Scholarly edition (added)' +2024-02-08 19:40:31,559 [INFO] Results - added column 'Output submissions - Other (added)' +2024-02-08 19:40:31,562 [INFO] Results - added column 'Output submissions - Working paper (added)' +2024-02-08 19:40:31,566 [INFO] Results - added column 'Output submissions - Patent/ published patent application (added)' +2024-02-08 19:40:31,570 [INFO] Results - added column 'Output submissions - Composition (added)' +2024-02-08 19:40:31,573 [INFO] Results - added column 'Output submissions - Website content (added)' +2024-02-08 19:40:31,577 [INFO] Results - added column 'Output submissions - Design (added)' +2024-02-08 19:40:31,581 [INFO] Results - added column 'Output submissions - Artefact (added)' +2024-02-08 19:40:31,585 [INFO] Results - added column 'Output submissions - Research report for external body (added)' +2024-02-08 19:40:31,588 [INFO] Results - added column 'Output submissions - Research data sets and databases (added)' +2024-02-08 19:40:31,592 [INFO] Results - added column 'Output submissions - Translation (added)' +2024-02-08 19:40:31,596 [INFO] Results - added column 'Output submissions - Software (added)' +2024-02-08 19:40:31,600 [INFO] Results - added column 'Output submissions - Devices and products (added)' +2024-02-08 19:40:31,697 [INFO] Results - read dataset from 'data/processed/sheets/ImpactCaseStudies.parquet' +2024-02-08 19:40:31,701 [INFO] Results - added column 'Impact case study submissions (added)' +2024-02-08 19:40:31,708 [INFO] Results - read dataset from 'data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet' +2024-02-08 19:40:31,711 [INFO] Results - added columns '['Total number of doctoral degrees awarded (added)']' +2024-02-08 19:40:31,860 [INFO] Results - read dataset from '/Users/mihaela/Documents/work/ssi_work/ref-2021-analysis/data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet' +2024-02-08 19:40:31,866 [INFO] Results - merged with unit environment statements: 1888 records +2024-02-08 19:40:31,866 [INFO] Results - make categorical ['Multiple submission letter', 'Joint submission', 'Main panel name', 'Multiple submission name', 'Unit of assessment name', 'Institution name'] +2024-02-08 19:40:32,238 [INFO] Results - write dataset to 'data/processed/sheets/Results.parquet' diff --git a/src/REF2021_processing/process_envstatements.py b/src/REF2021_processing/process_envstatements.py index 1d6acd0..019ba6d 100644 --- a/src/REF2021_processing/process_envstatements.py +++ b/src/REF2021_processing/process_envstatements.py @@ -10,15 +10,12 @@ # include all that was noticed in the actual statements TO_DELETE_HEADERS = [ - item.replace(" ", "").lower() - for item in [ - "Institutional level environment template (REF5a)", - "Institutional level environment template (REF5b)", - "Unit-level environment template (REF5a)", - "Unit-level environment template (REF5b)", - "REF5a - Institution Environment Statement", - "Institutional-Level Environment Statement (REF5a)", - ] + "Institutional level environment template (REF5a)", + "Institutional level environment template (REF5b)", + "Unit-level environment template (REF5a)", + "Unit-level environment template (REF5b)", + "REF5a - Institution Environment Statement", + "Institutional-Level Environment Statement (REF5a)", ] CHARS_TO_DELETE_FROM_HEADER = [" ", "\t", ".", ",", ":", ";", "-", "and", "&"] @@ -166,7 +163,7 @@ } -def get_and_clean_lines(statement): +def get_and_clean_lines(statement, sname=None): """Splits the statement into lines and cleans them. Args: @@ -178,23 +175,41 @@ def get_and_clean_lines(statement): # split the statement into lines lines = statement.splitlines() + if sname is not None: + logging.info("%s - split statements into lines", sname) # delete empty lines lines = [line for line in lines if line.strip()] + if sname is not None: + logging.info("%s - deleted empty lines", sname) # replace tabs with spaces lines = [line.replace("\t", " ") for line in lines] + if sname is not None: + logging.info("%s - replaced tabs with spaces", sname) # replace multiple spaces with a single space lines = [" ".join(line.split()) for line in lines] + if sname is not None: + logging.info("%s - replaced multiple spaces with a single space", sname) - # delete lines with specified content + # delete lines with page numbers lines = [ line for line in lines if line.replace(" ", "").replace("\t", "").lower() not in TO_DELETE_PAGES ] - lines = [line for line in lines if line.lower() not in TO_DELETE_HEADERS] + if sname is not None: + logging.info("%s - deleted lines with page numbers", sname) + + lines = [ + line + for line in lines + if line.lower() + not in [item.replace(" ", "").lower() for item in TO_DELETE_HEADERS] + ] + if sname is not None: + logging.info("%s - deleted lines equal to any of %s", sname, TO_DELETE_HEADERS) return lines @@ -216,7 +231,7 @@ def clean_header(header): return header -def section_indices(statement, sections): +def section_indices(statement, sections, sname=None): """Finds the indices of the sections in the statement. Args: @@ -229,7 +244,7 @@ def section_indices(statement, sections): indices = [None for section in sections] - lines = get_and_clean_lines(statement) + lines = get_and_clean_lines(statement, sname=sname) for isection, (section, headers) in enumerate(sections.items()): for header in [clean_header(header) for header in headers]: @@ -273,15 +288,20 @@ def prepare_institution_statements(): # initialise the dataset and the counts dset = pd.DataFrame() counts = [0 for section in SECTIONS_INSTITUTION] - for institution_name in fnames: + for institution_index, institution_name in enumerate(fnames): infname = os.path.join( source_config["extracted_path"], f"{source_config['prefix']}{institution_name}{source_config['input_extension']}", ) + sname_for_logging = None + if institution_index == 0: + sname_for_logging = source_config["name"] with open(infname, "r+", encoding="utf-8") as file: statement = file.read() - (indices, lines) = section_indices(statement, SECTIONS_INSTITUTION) + (indices, lines) = section_indices( + statement, SECTIONS_INSTITUTION, sname_for_logging + ) # assign the institution name data = {cb.COL_INST_NAME: [institution_name]} @@ -309,23 +329,27 @@ def prepare_institution_statements(): # add the current extracted data to the dataset dset = pd.concat([dset, pd.DataFrame(data)], ignore_index=True) - logging.info( - "%s - prepared institution statements: %d records, %d columns", - source_config["name"], - dset.shape[0], - dset.shape[1], - ) - - # report mistamatches in the number of prepared statements + # report mismatches in the number of processed statements if dset.shape[0] != len(fnames): logging.warning( - "%s - prepared statements %d/%d statements", + "%s - processed only %d of %d available statements", source_config["name"], dset.shape[0], len(fnames), ) + else: + logging.info( + "%s - processed all %d available statements", + source_config["name"], + len(fnames), + ) + + # make categorical + dset = utils.make_columns_categorical( + dset, cb.COLUMNS_TO_CATEGORY, source_config["name"] + ) - # set the index name and save the prepared data + # set the index name and save the processed data dset.index.name = "Record" rw.export_dataframe( dset, @@ -365,13 +389,16 @@ def prepare_unit_statements(): # initialise dataset dset = pd.DataFrame() - for fname in fnames: + for fname_index, fname in enumerate(fnames): ( institution_name, unit_code, multiple_submission_letter, ) = utils.get_info_from_filename(fname) + sname_for_logging = None + if fname_index == 0: + sname_for_logging = source_config["name"] with open( os.path.join( source_config["extracted_path"], @@ -380,7 +407,9 @@ def prepare_unit_statements(): "r+", encoding="utf-8", ) as file: - (indices, lines) = section_indices(file.read(), SECTIONS_UNIT) + (indices, lines) = section_indices( + file.read(), SECTIONS_UNIT, sname_for_logging + ) data = { cb.COL_INST_NAME: [institution_name], @@ -410,20 +439,27 @@ def prepare_unit_statements(): # add the current extracted data to the dataset dset = pd.concat([dset, pd.DataFrame(data)], ignore_index=True) - logging.info( - "%s - prepared statements: %d records", source_config["name"], dset.shape[0] - ) - - # report mistamatches in the number of prepared statements + # report mismatches in the number of processed statements if dset.shape[0] != len(fnames): logging.warning( - "%s - prepared statements %d/%d statements", + "%s - processed only %d of %d available statements", source_config["name"], dset.shape[0], len(fnames), ) + else: + logging.info( + "%s - processed all %d available statements", + source_config["name"], + len(fnames), + ) + + # make categorical + dset = utils.make_columns_categorical( + dset, cb.COLUMNS_TO_CATEGORY, source_config["name"] + ) - # set the index name and save the prepared data + # set the index name and save the processed data dset.index.name = "Record" rw.export_dataframe( dset, diff --git a/src/REF2021_processing/process_submissions_and_results.py b/src/REF2021_processing/process_submissions_and_results.py index c4e0e40..a1b6ad2 100644 --- a/src/REF2021_processing/process_submissions_and_results.py +++ b/src/REF2021_processing/process_submissions_and_results.py @@ -238,7 +238,7 @@ def preprocess_impacts(dset): return dset -def preprocess_sheet(source, output_path): +def process_sheet(source, output_path): """Preprocess a sheet from the raw data. Args: @@ -267,7 +267,7 @@ def preprocess_sheet(source, output_path): dset = utils.rename_columns(dset, sname) # preprocess institution name - dset = utils.preprocess_inst_name(dset, sname) + dset = utils.process_inst_name(dset, sname) # assign names where we only have codes and make categorical dset[cb.COL_PANEL_NAME] = pd.Categorical( @@ -346,7 +346,7 @@ def preprocess_sheet(source, output_path): # run pre-processing if STATUS: - preprocess_sheet(source_name, OUTPUT_PATH) + process_sheet(source_name, OUTPUT_PATH) else: print(f"{utils.FAILED_ICON} failed: setup logger") diff --git a/src/REF2021_processing/read_write.py b/src/REF2021_processing/read_write.py index c4b4eaa..d404686 100644 --- a/src/REF2021_processing/read_write.py +++ b/src/REF2021_processing/read_write.py @@ -53,7 +53,7 @@ "prefix": "Unit environment statement - ", "input_extension": ".txt", "name": "EnvironmentStatementsUnitLevel", - "output_path": "data/processed/environment_statements/prepared/", + "output_path": "data/processed/environment_statements/", "tests": { "records": 1874, }, @@ -63,7 +63,7 @@ "name": "EnvironmentStatementsInstitutionLevel", "input_extension": ".txt", "prefix": "Institution environment statement - ", - "output_path": "data/processed/environment_statements/prepared/", + "output_path": "data/processed/environment_statements/", "tests": { "records": 143, }, @@ -184,13 +184,13 @@ def rule_all(): config.append( f"{logs_path}{SOURCES['submissions']['sheets'][source]}{logs_extension}" ) - # extracted and prepared environment statements + # extracted and processed environment statements for source, _ in SOURCES["environment_statements"].items(): config.append( f"{logs_path}{SOURCES['environment_statements'][source]['name']}" f"{logs_extension}" ) - # extracted and prepared results + # extracted and processed results log_fname = f"{SOURCES['results']['sheet']}{logs_extension}" config.append(os.path.join(logs_path, log_fname)) @@ -288,12 +288,12 @@ def rule_results(config_name): config = [ os.path.join(SOURCES["results"]["raw_path"], SOURCES["results"]["filename"]) ] - # extracted and prepared sheets + # extracted and processed sheets for source, _ in SOURCES["submissions"]["sheets"].items(): config.append( f"{logs_path}{SOURCES['submissions']['sheets'][source]}{logs_extension}", ) - # extracted and prepared environment statements + # extracted and processed environment statements for source, _ in SOURCES["environment_statements"].items(): config.append( f"{logs_path}{SOURCES['environment_statements'][source]['name']}" diff --git a/src/REF2021_processing/utils.py b/src/REF2021_processing/utils.py index 142e1f7..efc0b5b 100644 --- a/src/REF2021_processing/utils.py +++ b/src/REF2021_processing/utils.py @@ -270,8 +270,8 @@ def rename_columns(dset, sname): return dset -def preprocess_inst_name(dset, sname): - """Preprocess the institution name column +def process_inst_name(dset, sname): + """Process the institution name column to replace characters that are not allowed in filenames. Args: