diff --git a/data/processed/sheets/ImpactCaseStudies.parquet b/data/processed/sheets/ImpactCaseStudies.parquet index d9af615..09a0c1d 100644 Binary files a/data/processed/sheets/ImpactCaseStudies.parquet and b/data/processed/sheets/ImpactCaseStudies.parquet differ diff --git a/data/processed/sheets/Outputs.parquet b/data/processed/sheets/Outputs.parquet index 7625732..863b297 100644 Binary files a/data/processed/sheets/Outputs.parquet and b/data/processed/sheets/Outputs.parquet differ diff --git a/data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet b/data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet index 6170ca8..5d56ca2 100644 Binary files a/data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet and b/data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet differ diff --git a/data/processed/sheets/ResearchGroups.parquet b/data/processed/sheets/ResearchGroups.parquet index 32fbc2e..5e374b6 100644 Binary files a/data/processed/sheets/ResearchGroups.parquet and b/data/processed/sheets/ResearchGroups.parquet differ diff --git a/data/processed/sheets/ResearchIncome.parquet b/data/processed/sheets/ResearchIncome.parquet index 188f977..245732e 100644 Binary files a/data/processed/sheets/ResearchIncome.parquet and b/data/processed/sheets/ResearchIncome.parquet differ diff --git a/data/processed/sheets/ResearchIncomeInKind.parquet b/data/processed/sheets/ResearchIncomeInKind.parquet index 6e77b8a..bad3d1c 100644 Binary files a/data/processed/sheets/ResearchIncomeInKind.parquet and b/data/processed/sheets/ResearchIncomeInKind.parquet differ diff --git a/data/processed/sheets/Results.parquet b/data/processed/sheets/Results.parquet index fd5b7fd..f56bc84 100644 Binary files a/data/processed/sheets/Results.parquet and b/data/processed/sheets/Results.parquet differ diff --git a/logs/EnvironmentStatementsInstitutionLevel.log b/logs/EnvironmentStatementsInstitutionLevel.log index f460394..f9f6dcb 100644 --- a/logs/EnvironmentStatementsInstitutionLevel.log +++ b/logs/EnvironmentStatementsInstitutionLevel.log @@ -1,8 +1,8 @@ -2024-02-12 10:22:32,935 [INFO] EnvironmentStatementsInstitutionLevel - read data from 'data/processed/environment_statements/extracted/institution/' -2024-02-12 10:22:32,935 [INFO] EnvironmentStatementsInstitutionLevel - statements: 143, sections: 4 -2024-02-12 10:22:32,937 [INFO] EnvironmentStatementsInstitutionLevel - split statements into lines -2024-02-12 10:22:32,938 [INFO] EnvironmentStatementsInstitutionLevel - deleted lines with page numbers -2024-02-12 10:22:32,939 [INFO] EnvironmentStatementsInstitutionLevel - deleted lines equal to any of ['Institutional level environment template (REF5a)', 'Institutional level environment template (REF5b)', 'Unit-level environment template (REF5a)', 'Unit-level environment template (REF5b)', 'REF5a - Institution Environment Statement', 'Institutional-Level Environment Statement (REF5a)'] -2024-02-12 10:22:33,409 [INFO] EnvironmentStatementsInstitutionLevel - processed all 143 available statements -2024-02-12 10:22:33,409 [INFO] EnvironmentStatementsInstitutionLevel - make categorical ['Institution name'] -2024-02-12 10:22:33,461 [INFO] EnvironmentStatementsInstitutionLevel - write dataset to 'data/processed/environment_statements/EnvironmentStatementsInstitutionLevel.parquet' +2024-02-22 17:24:56,642 [INFO] EnvironmentStatementsInstitutionLevel - read data from 'data/processed/environment_statements/extracted/institution/' +2024-02-22 17:24:56,642 [INFO] EnvironmentStatementsInstitutionLevel - statements: 143, sections: 4 +2024-02-22 17:24:56,643 [INFO] EnvironmentStatementsInstitutionLevel - split statements into lines +2024-02-22 17:24:56,644 [INFO] EnvironmentStatementsInstitutionLevel - deleted lines with page numbers +2024-02-22 17:24:56,645 [INFO] EnvironmentStatementsInstitutionLevel - deleted lines equal to any of ['Institutional level environment template (REF5a)', 'Institutional level environment template (REF5b)', 'Unit-level environment template (REF5a)', 'Unit-level environment template (REF5b)', 'REF5a - Institution Environment Statement', 'Institutional-Level Environment Statement (REF5a)'] +2024-02-22 17:24:57,170 [INFO] EnvironmentStatementsInstitutionLevel - processed all 143 available statements +2024-02-22 17:24:57,170 [INFO] EnvironmentStatementsInstitutionLevel - make categorical ['Institution name'] +2024-02-22 17:24:57,218 [INFO] EnvironmentStatementsInstitutionLevel - write dataset to 'data/processed/environment_statements/EnvironmentStatementsInstitutionLevel.parquet' diff --git a/logs/EnvironmentStatementsUnitLevel.log b/logs/EnvironmentStatementsUnitLevel.log index c6e6e7b..ec1e10c 100644 --- a/logs/EnvironmentStatementsUnitLevel.log +++ b/logs/EnvironmentStatementsUnitLevel.log @@ -1,8 +1,8 @@ -2024-02-12 10:22:32,936 [INFO] EnvironmentStatementsUnitLevel - read data from 'data/processed/environment_statements/extracted/unit/' -2024-02-12 10:22:32,937 [INFO] EnvironmentStatementsUnitLevel - statements: 1874, sections: 4 -2024-02-12 10:22:32,937 [INFO] EnvironmentStatementsUnitLevel - split statements into lines -2024-02-12 10:22:32,938 [INFO] EnvironmentStatementsUnitLevel - deleted lines with page numbers -2024-02-12 10:22:32,939 [INFO] EnvironmentStatementsUnitLevel - deleted lines equal to any of ['Institutional level environment template (REF5a)', 'Institutional level environment template (REF5b)', 'Unit-level environment template (REF5a)', 'Unit-level environment template (REF5b)', 'REF5a - Institution Environment Statement', 'Institutional-Level Environment Statement (REF5a)'] -2024-02-12 10:22:52,274 [INFO] EnvironmentStatementsUnitLevel - processed all 1874 available statements -2024-02-12 10:22:52,274 [INFO] EnvironmentStatementsUnitLevel - make categorical ['Multiple submission letter', 'Unit of assessment name', 'Institution name'] -2024-02-12 10:22:52,773 [INFO] EnvironmentStatementsUnitLevel - write dataset to 'data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet' +2024-02-22 17:24:56,659 [INFO] EnvironmentStatementsUnitLevel - read data from 'data/processed/environment_statements/extracted/unit/' +2024-02-22 17:24:56,659 [INFO] EnvironmentStatementsUnitLevel - statements: 1874, sections: 4 +2024-02-22 17:24:56,660 [INFO] EnvironmentStatementsUnitLevel - split statements into lines +2024-02-22 17:24:56,661 [INFO] EnvironmentStatementsUnitLevel - deleted lines with page numbers +2024-02-22 17:24:56,662 [INFO] EnvironmentStatementsUnitLevel - deleted lines equal to any of ['Institutional level environment template (REF5a)', 'Institutional level environment template (REF5b)', 'Unit-level environment template (REF5a)', 'Unit-level environment template (REF5b)', 'REF5a - Institution Environment Statement', 'Institutional-Level Environment Statement (REF5a)'] +2024-02-22 17:25:17,122 [INFO] EnvironmentStatementsUnitLevel - processed all 1874 available statements +2024-02-22 17:25:17,122 [INFO] EnvironmentStatementsUnitLevel - make categorical ['Institution name', 'Multiple submission letter', 'Unit of assessment name'] +2024-02-22 17:25:17,544 [INFO] EnvironmentStatementsUnitLevel - write dataset to 'data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet' diff --git a/logs/ImpactCaseStudies.log b/logs/ImpactCaseStudies.log index a88eef1..fee2e0c 100644 --- a/logs/ImpactCaseStudies.log +++ b/logs/ImpactCaseStudies.log @@ -1,10 +1,10 @@ -2024-02-08 19:40:12,766 [INFO] ImpactCaseStudies - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-08 19:40:13,319 [INFO] ImpactCaseStudies - parsed sheet: 6361 records -2024-02-08 19:40:13,319 [INFO] ImpactCaseStudies - rename 'Main panel' to 'Main panel code' -2024-02-08 19:40:13,321 [INFO] ImpactCaseStudies - replace '['/', ':']' with '_' in 'Institution name' -2024-02-08 19:40:13,323 [INFO] ImpactCaseStudies - add columns for panel names -2024-02-08 19:40:13,324 [INFO] ImpactCaseStudies - shift columns from title to the left to fix raw data issue -2024-02-08 19:40:16,184 [INFO] ImpactCaseStudies - replace styling characters in ['1. Summary of the impact', '2. Underpinning research', '3. References to the research', '4. Details of the impact', '5. Sources to corroborate the impact'] -2024-02-08 19:40:16,187 [INFO] ImpactCaseStudies - drop columns '['3. References to the research', '5. Sources to corroborate the impact', 'Researcher ORCIDs', 'Institution UKPRN code', 'Is continued from 2014', 'Unit of assessment number', 'Countries', 'Grant funding', 'Main panel code', 'Formal partners', 'Global research identifiers', '2. Underpinning research']' -2024-02-08 19:40:16,215 [INFO] ImpactCaseStudies - make categorical ['Main panel name', 'Institution name', 'Multiple submission letter', 'Unit of assessment name', 'Multiple submission name', 'Joint submission'] -2024-02-08 19:40:16,429 [INFO] ImpactCaseStudies - write dataset to 'data/processed/sheets/ImpactCaseStudies.parquet' +2024-02-22 17:25:00,260 [INFO] ImpactCaseStudies - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-22 17:25:00,831 [INFO] ImpactCaseStudies - parsed sheet: 6361 records +2024-02-22 17:25:00,835 [INFO] ImpactCaseStudies - rename 'Main panel' to 'Main panel code' +2024-02-22 17:25:00,837 [INFO] ImpactCaseStudies - replace '['/', ':']' with '_' in 'Institution name' +2024-02-22 17:25:00,839 [INFO] ImpactCaseStudies - add columns for panel names +2024-02-22 17:25:00,840 [INFO] ImpactCaseStudies - shift columns from title to the left to fix raw data issue +2024-02-22 17:25:03,838 [INFO] ImpactCaseStudies - replace styling characters in ['1. Summary of the impact', '2. Underpinning research', '3. References to the research', '4. Details of the impact', '5. Sources to corroborate the impact'] +2024-02-22 17:25:03,839 [INFO] ImpactCaseStudies - drop columns '['2. Underpinning research', '3. References to the research', 'Formal partners', 'Global research identifiers', 'Institution UKPRN code', 'Unit of assessment number', 'Countries', 'Is continued from 2014', 'Researcher ORCIDs', '5. Sources to corroborate the impact', 'Grant funding', 'Main panel code']' +2024-02-22 17:25:03,862 [INFO] ImpactCaseStudies - make categorical ['Main panel name', 'Institution name', 'Joint submission', 'Multiple submission name', 'Unit of assessment name', 'Multiple submission letter'] +2024-02-22 17:25:04,071 [INFO] ImpactCaseStudies - write dataset to 'data/processed/sheets/ImpactCaseStudies.parquet' diff --git a/logs/Outputs.log b/logs/Outputs.log index 76a4e5c..82ec373 100644 --- a/logs/Outputs.log +++ b/logs/Outputs.log @@ -1,12 +1,12 @@ -2024-02-08 19:40:12,758 [INFO] Outputs - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-08 19:40:29,215 [INFO] Outputs - parsed sheet: 185353 records -2024-02-08 19:40:29,240 [INFO] Outputs - rename 'Main panel' to 'Main panel code' -2024-02-08 19:40:29,273 [INFO] Outputs - rename 'Output type' to 'Output type code' -2024-02-08 19:40:29,319 [INFO] Outputs - replace '['/', ':']' with '_' in 'Institution name' -2024-02-08 19:40:29,363 [INFO] Outputs - add columns for panel names -2024-02-08 19:40:30,092 [INFO] Outputs - replace styling characters in ['Title'] -2024-02-08 19:40:30,128 [INFO] Outputs - add columns for output types names -2024-02-08 19:40:30,132 [INFO] Outputs - make output year categorical -2024-02-08 19:40:30,175 [INFO] Outputs - drop columns '['Institution UKPRN code', 'Unit of assessment number', 'Output type code', 'Main panel code']' -2024-02-08 19:40:30,189 [INFO] Outputs - make categorical ['Unit of assessment name', 'Institution name', 'Multiple submission name', 'Interdisciplinary', 'Cross-referral requested', 'Main panel name', 'Multiple submission letter', 'Forensic science', 'Open access status', 'Citations applicable', 'Propose double weighting', 'Non-English', 'Is reserve output', 'Delayed by COVID19', 'Joint submission', 'Research group', 'Output type', 'Criminology'] -2024-02-08 19:40:30,511 [INFO] Outputs - write dataset to 'data/processed/sheets/Outputs.parquet' +2024-02-22 17:25:00,260 [INFO] Outputs - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-22 17:25:16,736 [INFO] Outputs - parsed sheet: 185353 records +2024-02-22 17:25:16,864 [INFO] Outputs - rename 'Main panel' to 'Main panel code' +2024-02-22 17:25:16,895 [INFO] Outputs - rename 'Output type' to 'Output type code' +2024-02-22 17:25:16,942 [INFO] Outputs - replace '['/', ':']' with '_' in 'Institution name' +2024-02-22 17:25:16,983 [INFO] Outputs - add columns for panel names +2024-02-22 17:25:17,729 [INFO] Outputs - replace styling characters in ['Title'] +2024-02-22 17:25:17,765 [INFO] Outputs - add columns for output types names +2024-02-22 17:25:17,766 [INFO] Outputs - make output year categorical +2024-02-22 17:25:17,809 [INFO] Outputs - drop columns '['Unit of assessment number', 'Institution UKPRN code', 'Main panel code', 'Output type code']' +2024-02-22 17:25:17,822 [INFO] Outputs - make categorical ['Non-English', 'Criminology', 'Main panel name', 'Unit of assessment name', 'Cross-referral requested', 'Institution name', 'Output type', 'Is reserve output', 'Multiple submission name', 'Delayed by COVID19', 'Multiple submission letter', 'Joint submission', 'Interdisciplinary', 'Open access status', 'Forensic science', 'Propose double weighting', 'Research group', 'Citations applicable'] +2024-02-22 17:25:18,126 [INFO] Outputs - write dataset to 'data/processed/sheets/Outputs.parquet' diff --git a/logs/ResearchDoctoralDegreesAwarded.log b/logs/ResearchDoctoralDegreesAwarded.log index 17b78cf..b3f8e5d 100644 --- a/logs/ResearchDoctoralDegreesAwarded.log +++ b/logs/ResearchDoctoralDegreesAwarded.log @@ -1,9 +1,9 @@ -2024-02-08 19:40:12,755 [INFO] ResearchDoctoralDegreesAwarded - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-08 19:40:12,839 [INFO] ResearchDoctoralDegreesAwarded - parsed sheet: 1888 records -2024-02-08 19:40:12,839 [INFO] ResearchDoctoralDegreesAwarded - rename 'Main panel' to 'Main panel code' -2024-02-08 19:40:12,840 [INFO] ResearchDoctoralDegreesAwarded - replace '['/', ':']' with '_' in 'Institution name' -2024-02-08 19:40:12,841 [INFO] ResearchDoctoralDegreesAwarded - add columns for panel names -2024-02-08 19:40:12,842 [INFO] ResearchDoctoralDegreesAwarded - calculate total number of degrees awarded -2024-02-08 19:40:12,842 [INFO] ResearchDoctoralDegreesAwarded - drop columns '['Unit of assessment number', 'Institution UKPRN code', 'Main panel code']' -2024-02-08 19:40:12,843 [INFO] ResearchDoctoralDegreesAwarded - make categorical ['Unit of assessment name', 'Multiple submission name', 'Joint submission', 'Main panel name', 'Multiple submission letter', 'Institution name'] -2024-02-08 19:40:12,856 [INFO] ResearchDoctoralDegreesAwarded - write dataset to 'data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet' +2024-02-22 17:25:00,302 [INFO] ResearchDoctoralDegreesAwarded - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-22 17:25:00,399 [INFO] ResearchDoctoralDegreesAwarded - parsed sheet: 1888 records +2024-02-22 17:25:00,401 [INFO] ResearchDoctoralDegreesAwarded - rename 'Main panel' to 'Main panel code' +2024-02-22 17:25:00,401 [INFO] ResearchDoctoralDegreesAwarded - replace '['/', ':']' with '_' in 'Institution name' +2024-02-22 17:25:00,402 [INFO] ResearchDoctoralDegreesAwarded - add columns for panel names +2024-02-22 17:25:00,403 [INFO] ResearchDoctoralDegreesAwarded - calculate total number of degrees awarded +2024-02-22 17:25:00,403 [INFO] ResearchDoctoralDegreesAwarded - drop columns '['Main panel code', 'Unit of assessment number', 'Institution UKPRN code']' +2024-02-22 17:25:00,403 [INFO] ResearchDoctoralDegreesAwarded - make categorical ['Joint submission', 'Main panel name', 'Multiple submission name', 'Multiple submission letter', 'Unit of assessment name', 'Institution name'] +2024-02-22 17:25:00,416 [INFO] ResearchDoctoralDegreesAwarded - write dataset to 'data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet' diff --git a/logs/ResearchGroups.log b/logs/ResearchGroups.log index 09d38ff..5610488 100644 --- a/logs/ResearchGroups.log +++ b/logs/ResearchGroups.log @@ -1,9 +1,9 @@ -2024-02-08 19:40:12,754 [INFO] ResearchGroups - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-08 19:40:12,808 [INFO] ResearchGroups - parsed sheet: 2036 records -2024-02-08 19:40:12,809 [INFO] ResearchGroups - rename 'Main panel' to 'Main panel code' -2024-02-08 19:40:12,810 [INFO] ResearchGroups - replace '['/', ':']' with '_' in 'Institution name' -2024-02-08 19:40:12,811 [INFO] ResearchGroups - add columns for panel names -2024-02-08 19:40:12,811 [INFO] ResearchGroups - make group code categorical -2024-02-08 19:40:12,812 [INFO] ResearchGroups - drop columns '['Institution UKPRN code', 'Unit of assessment number', 'Main panel code']' -2024-02-08 19:40:12,812 [INFO] ResearchGroups - make categorical ['Unit of assessment name', 'Main panel name', 'Multiple submission name', 'Joint submission', 'Multiple submission letter', 'Institution name'] -2024-02-08 19:40:12,824 [INFO] ResearchGroups - write dataset to 'data/processed/sheets/ResearchGroups.parquet' +2024-02-22 17:25:00,260 [INFO] ResearchGroups - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-22 17:25:00,329 [INFO] ResearchGroups - parsed sheet: 2036 records +2024-02-22 17:25:00,332 [INFO] ResearchGroups - rename 'Main panel' to 'Main panel code' +2024-02-22 17:25:00,332 [INFO] ResearchGroups - replace '['/', ':']' with '_' in 'Institution name' +2024-02-22 17:25:00,333 [INFO] ResearchGroups - add columns for panel names +2024-02-22 17:25:00,333 [INFO] ResearchGroups - make group code categorical +2024-02-22 17:25:00,334 [INFO] ResearchGroups - drop columns '['Main panel code', 'Institution UKPRN code', 'Unit of assessment number']' +2024-02-22 17:25:00,334 [INFO] ResearchGroups - make categorical ['Institution name', 'Main panel name', 'Multiple submission letter', 'Multiple submission name', 'Unit of assessment name', 'Joint submission'] +2024-02-22 17:25:00,353 [INFO] ResearchGroups - write dataset to 'data/processed/sheets/ResearchGroups.parquet' diff --git a/logs/ResearchIncome.log b/logs/ResearchIncome.log index 7095e83..98e1b24 100644 --- a/logs/ResearchIncome.log +++ b/logs/ResearchIncome.log @@ -1,9 +1,9 @@ -2024-02-08 19:40:12,755 [INFO] ResearchIncome - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-08 19:40:13,935 [INFO] ResearchIncome - parsed sheet: 28637 records -2024-02-08 19:40:13,936 [INFO] ResearchIncome - rename 'Main panel' to 'Main panel code' -2024-02-08 19:40:13,942 [INFO] ResearchIncome - replace '['/', ':']' with '_' in 'Institution name' -2024-02-08 19:40:13,945 [INFO] ResearchIncome - add columns for panel names -2024-02-08 19:40:13,947 [INFO] ResearchIncome - make income source categorical -2024-02-08 19:40:13,948 [INFO] ResearchIncome - drop columns '['Institution UKPRN code', 'Main panel code', 'Unit of assessment number']' -2024-02-08 19:40:13,949 [INFO] ResearchIncome - make categorical ['Unit of assessment name', 'Multiple submission letter', 'Joint submission', 'Multiple submission name', 'Main panel name', 'Institution name'] -2024-02-08 19:40:13,973 [INFO] ResearchIncome - write dataset to 'data/processed/sheets/ResearchIncome.parquet' +2024-02-22 17:25:00,318 [INFO] ResearchIncome - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-22 17:25:01,468 [INFO] ResearchIncome - parsed sheet: 28637 records +2024-02-22 17:25:01,477 [INFO] ResearchIncome - rename 'Main panel' to 'Main panel code' +2024-02-22 17:25:01,483 [INFO] ResearchIncome - replace '['/', ':']' with '_' in 'Institution name' +2024-02-22 17:25:01,486 [INFO] ResearchIncome - add columns for panel names +2024-02-22 17:25:01,488 [INFO] ResearchIncome - make income source categorical +2024-02-22 17:25:01,489 [INFO] ResearchIncome - drop columns '['Institution UKPRN code', 'Main panel code', 'Unit of assessment number']' +2024-02-22 17:25:01,489 [INFO] ResearchIncome - make categorical ['Unit of assessment name', 'Institution name', 'Main panel name', 'Joint submission', 'Multiple submission letter', 'Multiple submission name'] +2024-02-22 17:25:01,512 [INFO] ResearchIncome - write dataset to 'data/processed/sheets/ResearchIncome.parquet' diff --git a/logs/ResearchIncomeInKind.log b/logs/ResearchIncomeInKind.log index a9dedbb..6328184 100644 --- a/logs/ResearchIncomeInKind.log +++ b/logs/ResearchIncomeInKind.log @@ -1,9 +1,9 @@ -2024-02-08 19:40:12,767 [INFO] ResearchIncomeInKind - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' -2024-02-08 19:40:12,974 [INFO] ResearchIncomeInKind - parsed sheet: 4093 records -2024-02-08 19:40:12,974 [INFO] ResearchIncomeInKind - rename 'Main panel' to 'Main panel code' -2024-02-08 19:40:12,975 [INFO] ResearchIncomeInKind - replace '['/', ':']' with '_' in 'Institution name' -2024-02-08 19:40:12,976 [INFO] ResearchIncomeInKind - add columns for panel names -2024-02-08 19:40:12,977 [INFO] ResearchIncomeInKind - make income source categorical -2024-02-08 19:40:12,977 [INFO] ResearchIncomeInKind - drop columns '['Unit of assessment number', 'Institution UKPRN code', 'Main panel code']' -2024-02-08 19:40:12,977 [INFO] ResearchIncomeInKind - make categorical ['Unit of assessment name', 'Multiple submission letter', 'Joint submission', 'Institution name', 'Main panel name', 'Multiple submission name'] -2024-02-08 19:40:12,989 [INFO] ResearchIncomeInKind - write dataset to 'data/processed/sheets/ResearchIncomeInKind.parquet' +2024-02-22 17:25:00,252 [INFO] ResearchIncomeInKind - read sheet from 'data/raw/REF-2021-Submissions-All-2022-07-27.xlsx' +2024-02-22 17:25:00,455 [INFO] ResearchIncomeInKind - parsed sheet: 4093 records +2024-02-22 17:25:00,458 [INFO] ResearchIncomeInKind - rename 'Main panel' to 'Main panel code' +2024-02-22 17:25:00,459 [INFO] ResearchIncomeInKind - replace '['/', ':']' with '_' in 'Institution name' +2024-02-22 17:25:00,460 [INFO] ResearchIncomeInKind - add columns for panel names +2024-02-22 17:25:00,461 [INFO] ResearchIncomeInKind - make income source categorical +2024-02-22 17:25:00,461 [INFO] ResearchIncomeInKind - drop columns '['Institution UKPRN code', 'Unit of assessment number', 'Main panel code']' +2024-02-22 17:25:00,461 [INFO] ResearchIncomeInKind - make categorical ['Unit of assessment name', 'Multiple submission letter', 'Main panel name', 'Multiple submission name', 'Joint submission', 'Institution name'] +2024-02-22 17:25:00,474 [INFO] ResearchIncomeInKind - write dataset to 'data/processed/sheets/ResearchIncomeInKind.parquet' diff --git a/logs/Results.log b/logs/Results.log index d8aa0ec..a74e538 100644 --- a/logs/Results.log +++ b/logs/Results.log @@ -1,39 +1,39 @@ -2024-02-22 15:44:03,758 [INFO] Results - read sheet from 'data/raw/REF-2021-Results-All-2022-05-06.xlsx' -2024-02-22 15:44:04,148 [INFO] Results - parsed sheet: 7552 records -2024-02-22 15:44:04,149 [INFO] Results - rename 'Main panel' to 'Main panel code' -2024-02-22 15:44:04,152 [INFO] Results - replace '['/', ':']' with '_' in 'Institution name' -2024-02-22 15:44:04,154 [INFO] Results - add columns for panel names -2024-02-22 15:44:04,159 [INFO] Results - replace '-' with na in ['4 stars', '3 stars', '2 stars', '1 star', 'Unclassified'] -2024-02-22 15:44:04,160 [INFO] Results - drop columns '['Institution sort order', 'Institution code (UKPRN)']' -2024-02-22 15:44:04,166 [INFO] Results - pivot to make wide format for ratings per profile to enable analyses -2024-02-22 15:44:04,653 [INFO] Results - read dataset from 'data/processed/sheets/Outputs.parquet' -2024-02-22 15:44:04,664 [INFO] Results - added column 'Output submissions (added)' -2024-02-22 15:44:04,668 [INFO] Results - added column 'Output submissions - Chapter in book (added)' -2024-02-22 15:44:04,675 [INFO] Results - added column 'Output submissions - Journal article (added)' -2024-02-22 15:44:04,678 [INFO] Results - added column 'Output submissions - Authored book (added)' -2024-02-22 15:44:04,681 [INFO] Results - added column 'Output submissions - Edited book (added)' -2024-02-22 15:44:04,684 [INFO] Results - added column 'Output submissions - Exhibition (added)' -2024-02-22 15:44:04,686 [INFO] Results - added column 'Output submissions - Performance (added)' -2024-02-22 15:44:04,689 [INFO] Results - added column 'Output submissions - Digital or visual media (added)' -2024-02-22 15:44:04,691 [INFO] Results - added column 'Output submissions - Conference contribution (added)' -2024-02-22 15:44:04,694 [INFO] Results - added column 'Output submissions - Scholarly edition (added)' -2024-02-22 15:44:04,696 [INFO] Results - added column 'Output submissions - Other (added)' -2024-02-22 15:44:04,699 [INFO] Results - added column 'Output submissions - Working paper (added)' -2024-02-22 15:44:04,701 [INFO] Results - added column 'Output submissions - Patent/ published patent application (added)' -2024-02-22 15:44:04,704 [INFO] Results - added column 'Output submissions - Composition (added)' -2024-02-22 15:44:04,706 [INFO] Results - added column 'Output submissions - Website content (added)' -2024-02-22 15:44:04,709 [INFO] Results - added column 'Output submissions - Design (added)' -2024-02-22 15:44:04,711 [INFO] Results - added column 'Output submissions - Artefact (added)' -2024-02-22 15:44:04,714 [INFO] Results - added column 'Output submissions - Research report for external body (added)' -2024-02-22 15:44:04,716 [INFO] Results - added column 'Output submissions - Research data sets and databases (added)' -2024-02-22 15:44:04,719 [INFO] Results - added column 'Output submissions - Translation (added)' -2024-02-22 15:44:04,721 [INFO] Results - added column 'Output submissions - Software (added)' -2024-02-22 15:44:04,723 [INFO] Results - added column 'Output submissions - Devices and products (added)' -2024-02-22 15:44:04,843 [INFO] Results - read dataset from 'data/processed/sheets/ImpactCaseStudies.parquet' -2024-02-22 15:44:04,846 [INFO] Results - added column 'Impact case study submissions (added)' -2024-02-22 15:44:04,858 [INFO] Results - read dataset from 'data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet' -2024-02-22 15:44:04,860 [INFO] Results - added columns '['Total number of doctoral degrees awarded (added)']' -2024-02-22 15:44:05,034 [INFO] Results - read dataset from '/Users/mihaela/Documents/work/ssi_work/ref-2021-analysis/data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet' -2024-02-22 15:44:05,041 [INFO] Results - merged with unit environment statements: 1888 records -2024-02-22 15:44:05,041 [INFO] Results - make categorical ['Main panel name', 'Institution name', 'Multiple submission name', 'Unit of assessment name', 'Joint submission', 'Multiple submission letter'] -2024-02-22 15:44:05,476 [INFO] Results - write dataset to 'data/processed/sheets/Results.parquet' +2024-02-22 17:26:59,653 [INFO] Results - read sheet from 'data/raw/REF-2021-Results-All-2022-05-06.xlsx' +2024-02-22 17:27:00,050 [INFO] Results - parsed sheet: 7552 records +2024-02-22 17:27:00,053 [INFO] Results - rename 'Main panel' to 'Main panel code' +2024-02-22 17:27:00,055 [INFO] Results - replace '['/', ':']' with '_' in 'Institution name' +2024-02-22 17:27:00,056 [INFO] Results - add columns for panel names +2024-02-22 17:27:00,061 [INFO] Results - replace '-' with na in ['4 stars', '3 stars', '2 stars', '1 star', 'Unclassified'] +2024-02-22 17:27:00,062 [INFO] Results - drop columns '['Institution sort order', 'Institution code (UKPRN)']' +2024-02-22 17:27:00,067 [INFO] Results - pivot to make wide format for ratings per profile to enable analyses +2024-02-22 17:27:00,270 [INFO] Results - read dataset from 'data/processed/sheets/Outputs.parquet' +2024-02-22 17:27:00,277 [INFO] Results - added column 'Output submissions (added)' +2024-02-22 17:27:00,281 [INFO] Results - added column 'Output submissions - Chapter in book (added)' +2024-02-22 17:27:00,288 [INFO] Results - added column 'Output submissions - Journal article (added)' +2024-02-22 17:27:00,291 [INFO] Results - added column 'Output submissions - Authored book (added)' +2024-02-22 17:27:00,294 [INFO] Results - added column 'Output submissions - Edited book (added)' +2024-02-22 17:27:00,296 [INFO] Results - added column 'Output submissions - Exhibition (added)' +2024-02-22 17:27:00,299 [INFO] Results - added column 'Output submissions - Performance (added)' +2024-02-22 17:27:00,301 [INFO] Results - added column 'Output submissions - Digital or visual media (added)' +2024-02-22 17:27:00,303 [INFO] Results - added column 'Output submissions - Conference contribution (added)' +2024-02-22 17:27:00,306 [INFO] Results - added column 'Output submissions - Scholarly edition (added)' +2024-02-22 17:27:00,308 [INFO] Results - added column 'Output submissions - Other (added)' +2024-02-22 17:27:00,310 [INFO] Results - added column 'Output submissions - Working paper (added)' +2024-02-22 17:27:00,313 [INFO] Results - added column 'Output submissions - Patent/ published patent application (added)' +2024-02-22 17:27:00,315 [INFO] Results - added column 'Output submissions - Composition (added)' +2024-02-22 17:27:00,317 [INFO] Results - added column 'Output submissions - Website content (added)' +2024-02-22 17:27:00,320 [INFO] Results - added column 'Output submissions - Design (added)' +2024-02-22 17:27:00,322 [INFO] Results - added column 'Output submissions - Artefact (added)' +2024-02-22 17:27:00,324 [INFO] Results - added column 'Output submissions - Research report for external body (added)' +2024-02-22 17:27:00,326 [INFO] Results - added column 'Output submissions - Research data sets and databases (added)' +2024-02-22 17:27:00,329 [INFO] Results - added column 'Output submissions - Translation (added)' +2024-02-22 17:27:00,331 [INFO] Results - added column 'Output submissions - Software (added)' +2024-02-22 17:27:00,333 [INFO] Results - added column 'Output submissions - Devices and products (added)' +2024-02-22 17:27:00,448 [INFO] Results - read dataset from 'data/processed/sheets/ImpactCaseStudies.parquet' +2024-02-22 17:27:00,452 [INFO] Results - added column 'Impact case study submissions (added)' +2024-02-22 17:27:00,463 [INFO] Results - read dataset from 'data/processed/sheets/ResearchDoctoralDegreesAwarded.parquet' +2024-02-22 17:27:00,464 [INFO] Results - added columns '['Total number of doctoral degrees awarded (added)']' +2024-02-22 17:27:00,636 [INFO] Results - read dataset from '/Users/mihaela/Documents/work/ssi_work/ref-2021-analysis/data/processed/environment_statements/EnvironmentStatementsUnitLevel.parquet' +2024-02-22 17:27:00,644 [INFO] Results - merged with unit environment statements: 1888 records +2024-02-22 17:27:00,654 [INFO] Results - make categorical ['Multiple submission name', 'Institution name', 'Joint submission', 'Unit of assessment name', 'Multiple submission letter', 'Main panel name'] +2024-02-22 17:27:00,664 [INFO] Results - write dataset to 'data/processed/sheets/Results.parquet' diff --git a/src/REF2021_processing/process_submissions_and_results.py b/src/REF2021_processing/process_submissions_and_results.py index 450ca01..3d52df7 100644 --- a/src/REF2021_processing/process_submissions_and_results.py +++ b/src/REF2021_processing/process_submissions_and_results.py @@ -36,7 +36,6 @@ def preprocess_results(dset): # do not bin these percentages as it is not actually useful # # bin percentages # # --------------- - # # COL_RESULTS_PERC_STAFF_SUBMITTED, COL_RESULTS_TOTAL_FTE_SUBMITTED_JOINT not binned, not < 100% # for column in cb.COLUMNS_STARS: # dset = utils.bin_percentages( # dset, column, f"{column}{cb.COLUMN_NAME_BINNED_SUFFIX}" diff --git a/src/REF2021_processing/read_write.py b/src/REF2021_processing/read_write.py index 7ab94d7..cafde4f 100644 --- a/src/REF2021_processing/read_write.py +++ b/src/REF2021_processing/read_write.py @@ -3,6 +3,7 @@ import logging import pandas as pd +import REF2021_processing.codebook as cb PROJECT_PATH = os.path.dirname(os.path.abspath(__name__)) @@ -106,6 +107,11 @@ def extract_sheet(fpath, sname, header, index_col=None): dset = dobj.parse(sname, header=header, index_col=index_col) logging.info("%s - parsed sheet: %d records", sname, dset.shape[0]) + dset[cb.COL_UOA_NAME] = ( + dset[cb.COL_UOA_NUMBER].astype(str).str.zfill(2) + ": " + dset[cb.COL_UOA_NAME] + ) + dset[cb.COL_UOA_NAME] = dset[cb.COL_UOA_NAME].astype('category') + return dset diff --git a/src/REF2021_processing/utils.py b/src/REF2021_processing/utils.py index 30de559..1e8d8e0 100644 --- a/src/REF2021_processing/utils.py +++ b/src/REF2021_processing/utils.py @@ -305,7 +305,7 @@ def bin_percentages_labels(): """ bins = PERCENTAGE_BINS - labels = [f"{bins[i]} to {bins[i+1]} %" for i in range(len(bins) - 1)] + labels = [f"{bins[i]} to {bins[i + 1]} %" for i in range(len(bins) - 1)] # labels = [f"({bins[i]}, {bins[i+1]}]" for i in range(len(bins)-1)] # labels[0] = f"[{bins[0]}, {bins[1]}]"