-
Notifications
You must be signed in to change notification settings - Fork 0
/
configuration.ini
161 lines (120 loc) · 8.53 KB
/
configuration.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
[General]
# Specifies the environment in which the script will run
# local: Sequential processing, suitable for a personal computer
# hpc: Parallel processing using multiple processors, suitable for High Performance Computing (HPC) environments
run_mode = local # options: local or hpc. Select your preference between the two here.
# only applicable for HPC. Select how many CPUs you would like to use to perfom the tasks in parallel.
num_processors = 18
[Directories]
# Directory where all historical weather data sheet images are stored in folders per station.
# Here the folder name is the station number, e.g. 203
# Within these folders (stations), the images are labelled with format "STN_YYYYMM_SF" or "STN_YYYYMM_HD" using the data inventory, where:
#STN is the three digit station number,
#YYYY is the year
#MM is the month,
#SF represents Standard Format
#HD represents a hand drawn form / or photocopied form of the standard format
## See example sheets in this GitHub repository: https://github.com/VUB-HYDR/MeteoSaver/tree/101a4c7b37487d542118696103eae69f2e65553a/data
full_datadir = data/00_post1960_DRC_hydroclimate_datasheet_images/
# Directory where pre-QA/QC transcribed data is stored
pre_QA_QC_transcribed_hydroclimate_data_dir = results/01_pre_QA_QC_transcribed_hydroclimate_data/
# Directory where post-QA/QC transcribed data is stored
post_QA_QC_transcribed_hydroclimate_data_dir = results/02_post_QA_QC_transcribed_hydroclimate_data/
# Directory for the final refined daily hydroclimate data (after all quality checks)
final_refined_daily_hydroclimate_data_dir = results/04_final_refined_daily_hydroclimate_data/
# Directory to store transient transcription output during processing
transient_transcription_output_dir = results/05_transient_transcription_output
# Directory for manually transcribed data (used for validation)
manually_transcribed_data_dir = results/06_manually_transcribed_hydroclimate_data/
# Directory for validation results comparing manually transcribed and the MeteoSaver transcribed data
validation_dir = results/03_validation_transcibed_data/
# Directory for all the stations metadata
metadata_file_path = data\01_metadata_INERA_stations\metadata_INERA_stations.xlsx
[TableAndCellDetection]
# Values for further clipping the detected table in the image. (Dimensions are in pixels)
# Here the clip_up, clip_down, clip_left, and clip_right ensure clipping of the HEADERS and ROW LABELS (Date & Pentad no. in our case) from the entire detected table (table detected using ML).
#Adjust this to your case study. Incase you would like to maintain the full detected table, set clip_up, clip_down, clip_left, clip_right = 0
clip_up = 430
clip_down = 270
clip_left = 200
clip_right = 150
# Maximum expected (detected) table width and height dimensions in pixels, for digitized sheets under same protocol. This is to avoid cases where the table is not detected by the software and the entire image is returned. Replace these values with your case study pixels
max_table_width = 3900
max_table_height = 3600
# Minimum and maximum allowed width and height in pixels for each cell (boundary box) in the detected table. This is done to reduce noise in form of small or very large detected bounding boxes
min_cell_width_threshold = 50
max_cell_width_threshold = 200
min_cell_height_threshold = 28
max_cell_height_threshold = 90
# The expected number of rows and columns in the detected tables. Adjust these values to your expected table configurations as in the sheets
# Number of rows excluding headers
no_of_rows = 43
# Number of columns with data (excluding date and pentad no.)
no_of_columns = 24
# Number of rows including headers
no_of_rows_including_headers = 46
# In order to avoid missed cells (boundary boxes) in columns, we input parameters to identify this.
# Minimum height space between the bounding boxes in a column in case of a missing bounding box. Modify as needed for your case
space_height_threshold = 50
# Minimum width space between the bounding boxes in a row in case of a missing bounding box. Modify as needed for your case
space_width_threshold = 120
# Set height for a new box (missing box) to add to the bounding boxes list
max_cell_height_per_box = 50
[Transcription]
# Optical Character Recognition/ Handwritten Text Recognition (OCR/HTR) model to use for transcription.
# Options could be: Tesseract-OCR, EasyOCR, PaddleOCR.
ocr_model = Tesseract-OCR
# For Tesseract-OCR, you need to specify the location of the tesseract.exe file on your personal computer (local) or HPC infrastructure.
# Incase of running this on a personal computer, download and Install Tesseract first as shown here: https://github.com/UB-Mannheim/tesseract/wiki , then specify the location of the tesseract.exe in your program files
tesseract_path = C:/Program Files/Tesseract-OCR/tesseract.exe
# Incase of HPC infrastructure, contact your HPC admin to locate the Tesseract excecution file or to load the module on the HPC. Example HPC path is commented below:
# tesseract_path = /apps/brussel/RL8/skylake-ib/software/tesseract/5.3.4-GCCcore-12.3.0/bin/tesseract
# Additionally for Tesseract-OCR, you have to specify the path to the language data path
# First go to the path to the system tessdata directory (for system-wide language files) e.g. C:/Program Files/Tesseract-OCR/tessdata
# Directory where the trained Optical Character Recognition/Handwritten Text Recognition (OCR/HTR) models are stored
system_tessdata_dir = C:/Program Files/Tesseract-OCR/tessdata
# Then COPY the custom trained language model in our repository (in this folder: /OCR_HTR_models) to be used (in this case, cobecore-V6.traineddata) and PASTE this cobecore-V6.traineddata file into the tessdata directory above for your system (e.g., C:/Program Files/Tesseract-OCR/tessdata). This path below is specific to this GitHub repo.
# Incase of HPC infrastructure, place the line of code below in your job bash script:
# export TESSDATA_PREFIX= "/OCR_HTR_models/"
[QAQC]
#Quality Assessment and Quality Control (QA/QC) checks
# Here we identify which columns (in spreadsheet column labels terms) we would like to perform the QA/QC checks
# Columns to focus on: D = Max Temp, E = Min Temp, F = Average Temp. Check how these are used in the quality assessement and quality control module.
columns_to_check = D,E,F
# Columns to focus on: D = Max Temp, E = Min Temp, F = Average Temp, G = Diurnal Temperature range
columns_to_check_with_extra_variable = D,E,F,G
# Thresholds for temperatures during post-processing. Adjust these depending on your study area and available literature
# Maximum temperature threshold (in °C)
max_temperature_threshold = 40
#Minimum temperature threshold (in °C)
min_temperature_threshold = 5
# Specify the number of decimal places on your sheets (1 for divide by 10, 2 for divide by 100, etc.). In this example, all the data values on the sheet had one decimal point
decimal_places = 1
# Define the uncertainty margin used in temperature calculations
uncertainty_margin = 0.2
# Number of header rows in the sheet
header_rows = 3
# Flag to indicate if multi day totals or averages (e.g., 5-day or 6-day totals) should be handled
# Whether to check and process multi day totals e.g., 5/6-day totals. Change this to 'False' if you dont have multi-day totals per sheet but only have one final total at the end of the sheet
multi_day_totals = True
# Whether to check and process multi day averages e.g., 5/6-day averages. This is if they are recorded in the row immediately below that of the multi-day totals. Change this to 'False' if you dont have multi-day averages
multi_day_averages = True
# Maximum number of days contained in multi-day totals. In this example, we have both 5-day and 6-day totals
# Specify. In case you have only one total input the total number of data entries used in calculation of total
max_days_for_multi_day_total = 6
# Rows to be processed for multi-day totals if multi_day_totals is True
# Example row numbers for multi-day totals
multi_day_totals_rows = 9,16,23,30,37,45
# Rows for the final totals (in case multi_day_totals is False)
# Example row numbers for the final totals
final_totals_rows = 45
# Rows and columns to exclude during some steps during processing
# Headers and multi-day (5/6) day totals
excluded_rows = 1,2,3,9,16,23,30,37,45
# Example of middle columns to exclude: e.g., with columns with Date, Pentad no., Bellani and U (relative humidity)
excluded_columns = 1,2,3,15,20,25,26,27
# multi-day averages
additional_excluded_rows = 10,17,24,31,38,46
[DataFormatting]
# Date is in column B. This column had the date.
date_column = B