-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
144 lines (117 loc) · 4.66 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import random
import pandas as pd
from PIL import Image
from tensorflow.keras.preprocessing.image import load_img
from typing import Any, Dict, Tuple
from .global_variables import CLASS_LABEL_TO_INDEX_MAP, NUM_CLASSES
def _get_reference_class_label(slide_metadata: Dict[str, Any]) -> str:
    """
    Gets the reference class label of a certain slide.

    Parameters
    ----------
    slide_metadata: dict
        Metadata of the slide of interest, accessed by key (e.g. the dict
        built from one BigQuery result row). Must contain 'tissue_type' and,
        for every tissue type other than 'normal', 'cancer_subtype'.

    Returns
    -------
    str
        'normal' if the slide's tissue type is 'normal', otherwise the value
        stored under 'cancer_subtype'.
    """
    tissue_type = slide_metadata['tissue_type']
    if tissue_type == 'normal':
        return tissue_type
    # Any non-normal tissue is labelled by its cancer subtype.
    return slide_metadata['cancer_subtype']
def create_slides_metadata(bq_results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Builds a dataframe comprising all slides' metadata.

    Rows sharing the same 'digital_slide_id' are merged into a single slide
    entry. The slide-level values are taken from the first row seen for that
    slide; the level-specific values of every row are collected in the
    slide's 'levels' list.

    Parameters
    ----------
    bq_results_df: pd.DataFrame
        Dataframe obtained from BigQuery. Contains one DICOM file (one level of a slide) per row.

    Returns
    -------
    pd.DataFrame
        Slides metadata table with one row per slide, indexed by
        'digital_slide_id'. Besides the slide-level columns it holds
        'reference_class_label', 'levels' (list of per-level dicts sorted by
        ascending pixel spacing) and the 'width' / 'height' of the level with
        the smallest pixel spacing.
    """
    # Columns that describe a single level rather than the whole slide.
    level_keys = ('width', 'height', 'pixel_spacing', 'compression',
                  'crdc_instance_uuid', 'gcs_url')

    slides_metadata = {}
    for _, row in bq_results_df.iterrows():
        slide_metadata = row.to_dict()
        image_id = slide_metadata['digital_slide_id']

        # Move level specific values out of the slide-level dict through "pop()"
        level_data = {key: slide_metadata.pop(key, None) for key in level_keys}

        if image_id not in slides_metadata:
            slides_metadata[image_id] = slide_metadata
            slide_metadata['reference_class_label'] = _get_reference_class_label(slide_metadata)
            slide_metadata['levels'] = []
        slides_metadata[image_id]['levels'].append(level_data)

    for slide_metadata in slides_metadata.values():
        levels = slide_metadata['levels']
        levels.sort(key=lambda level: level['pixel_spacing'])
        if levels:
            # After sorting, the first level has the smallest pixel spacing;
            # its dimensions become the slide's dimensions.
            base_level = levels[0]
            slide_metadata['width'] = base_level['width']
            slide_metadata['height'] = base_level['height']

    return pd.DataFrame.from_records(list(slides_metadata.values()),
                                     index=list(slides_metadata.keys()))
def get_stratified_subsample(slides_metadata: pd.DataFrame, num_slides: int, random_state: int) -> pd.DataFrame:
    """
    Draws a class-balanced random subsample from the slides metadata table.

    For every class label in CLASS_LABEL_TO_INDEX_MAP the same number of
    slides (num_slides // NUM_CLASSES) is sampled, and the per-class samples
    are concatenated in the iteration order of that map.

    Parameters
    ----------
    slides_metadata: pd.DataFrame
        Slides metadata table with one row per slide. Must contain the
        column 'reference_class_label'.
    num_slides: int
        Total number of slides to return. Has to be divisible by NUM_CLASSES.
    random_state: int
        Seed passed to pd.DataFrame.sample for each class.

    Returns
    -------
    pd.DataFrame
        Subsample of the slides metadata table.
    """
    assert num_slides % NUM_CLASSES == 0
    slides_per_class = num_slides // NUM_CLASSES

    per_class_samples = []
    for class_label in CLASS_LABEL_TO_INDEX_MAP.keys():
        belongs_to_class = slides_metadata['reference_class_label'] == class_label
        class_slides = slides_metadata.loc[belongs_to_class]
        per_class_samples.append(
            class_slides.sample(slides_per_class, random_state=random_state))
    return pd.concat(per_class_samples)
def get_tile_path(tiles_dir: str, tile_info: Tuple[str, Tuple[int,int]]) -> Tuple[str,str]:
    """
    Composes the subfolder and the full file path under which a tile is stored.

    The subfolder is <tiles_dir>/<image ID>/<second tile position component>
    and the file inside it is named <first tile position component>.png.

    Parameters
    ----------
    tiles_dir: str
        Directory for all tiles.
    tile_info: tuple
        Tuple containing the image ID and the tile position of the tile of interest.

    Returns
    -------
    tuple
        Directory and full path where to store the tile.
    """
    image_id, tile_pos = tile_info
    file_name = '{y_pos}.{ext}'.format(y_pos=tile_pos[0], ext='png')
    subfolder = os.path.join(tiles_dir, image_id, str(tile_pos[1]))
    return subfolder, os.path.join(subfolder, file_name)
def load_tile(tiles_dir: str, tile_info: Tuple[str, Tuple[int,int]]) -> Image.Image:
    """
    Reads a single tile image from disk.

    The file location is resolved with get_tile_path and the image is loaded
    through load_img in 'rgb' color mode.

    Parameters
    ----------
    tiles_dir: str
        Directory for all tiles.
    tile_info: tuple
        Tuple containing the image ID and the tile position of the tile of interest.

    Returns
    -------
    Image.Image
        Requested tile.
    """
    # Only the full file path is needed here, not the containing directory.
    tile_path = get_tile_path(tiles_dir, tile_info)[1]
    return load_img(tile_path, color_mode='rgb')