PR tests - macos/ubuntu failing (#13)

* Added test autopipeline and modalities, solved some autopipeline bugs, read_dicom_series and pet now supports series_id * PT/RTDOSE metadata to csv * fixed some bugs in autopipeline.py * now the pipeline saves on exit * deleted data * now checks for existing subject id * uncommented one line pytest * uncommented one line pytest * Added dataset class which can load from nrrds or directly from the dataset and convert to pytorch dataset * bug fixes_1.0 * test and autopipe fixed * bug fixes 2 * fixed pipeline tests * clean tests * added workflow * yml * yml * matplotlib * trying other patient to avoid memoryerror * set roi_names to avoid memoryerror * cave * indents * Update manual-test.yml Co-authored-by: Vishwesh <vishweshramanathan@gmail.com> Former-commit-id: 10a90b4
bhklab · Dec 10, 2021 · 1de1d92 · 1de1d92
1 parent 2e915fb
commit 1de1d92
Show file tree

Hide file tree

Showing 13 changed files with 374 additions and 99 deletions.
diff --git a/.github/workflows/manual-test.yml b/.github/workflows/manual-test.yml
@@ -9,11 +9,15 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: [3.7, 3.8, 3.9]
+        os: 
+          - ubuntu-latest
+          - macos-latest
+          - windows-latest
+        python-version: 
+          - 3.7
+          - 3.8
+          - 3.9
 
-
-
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -32,3 +36,8 @@ jobs:
     - name: Run pytest
       run: |
         pytest tests
+    - name: Slack Notification
+      if: ${{always() && matrix.os == 'ubuntu-latest'}}
+      uses: rtCamp/action-slack-notify@v2
+      env:
+        SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 # data
+data
 examples/data/tcia_n*
 
 # macOS

diff --git a/imgtools/autopipeline.py b/imgtools/autopipeline.py
@@ -1,4 +1,10 @@
 import os
+import shutil
+import warnings
+import glob
+import ast
+import datetime
+import json
 
 from argparse import ArgumentParser
 
@@ -7,10 +13,11 @@
 
 import SimpleITK as sitk
 import pandas as pd
-import warnings
+import numpy as np
+
 from joblib import Parallel, delayed
-import glob
-import ast
+
+
 ###############################################################
 # Example usage:
 # python radcure_simple.py ./data/RADCURE/data ./RADCURE_output
@@ -61,6 +68,10 @@ def __init__(self,
         # output ops
         self.output = ImageAutoOutput(self.output_directory, self.output_streams)
 
+        # Create a hidden ".temp" directory inside the output directory if it does not already exist
+        if not os.path.exists(os.path.join(self.output_directory,".temp")):
+            os.mkdir(os.path.join(self.output_directory,".temp"))
+
 
     def process_one_subject(self, subject_id):
         """Define the processing operations for one subject.
@@ -75,7 +86,7 @@ def process_one_subject(self, subject_id):
            The ID of subject to process
         """
         #Check if the subject_id has already been processed
-        if os.path.exists(os.path.join(self.output_directory,f'temp_{subject_id}.txt')):
+        if os.path.exists(os.path.join(self.output_directory,".temp",f'temp_{subject_id}.json')):
             print(f"{subject_id} already processed")
             return 
 
@@ -96,7 +107,9 @@ def process_one_subject(self, subject_id):
             output_stream = ("_").join([item for item in colname.split("_") if item != "1"])
 
             #If there are multiple connections existing, multiple connections means two modalities connected to one modality. They end with _1
-            mult_conn = colname.split("_")[-1] == "1"
+            mult_conn = colname.split("_")[-1].isnumeric()
+            num = colname.split("_")[-1]
+
             print(output_stream)
 
             if read_results[i] is None:
@@ -130,6 +143,8 @@ def process_one_subject(self, subject_id):
                 else:
                     counter[modality] = counter[modality]+1
                     self.output(f"{subject_id}_{counter[modality]}", doses, output_stream)
+                    # self.output(f"{subject_id}_{num}", doses, output_stream)
+
                 metadata[f"size_{output_stream}"] = str(doses.GetSize())
                 metadata[f"metadata_{output_stream}"] = str(read_results[i].get_metadata())
                 print(subject_id, " SAVED DOSE")
@@ -152,7 +167,10 @@ def process_one_subject(self, subject_id):
                 else:
                     counter[modality] = counter[modality] + 1
                     self.output(f"{subject_id}_{counter[modality]}", mask, output_stream)
+                    # self.output(f"{subject_id}_{num}", mask, output_stream)
+
                 metadata[f"roi_names_{output_stream}"] = str(structure_set.roi_names)
+                # metadata[f"metadata_{colname}"] = [structure_set.roi_names]
 
                 print(subject_id, "SAVED MASK ON", conn_to)
             elif modality == "PT":
@@ -175,7 +193,26 @@ def process_one_subject(self, subject_id):
         with open(os.path.join(self.output_directory,f'temp_{subject_id}.txt'),'w') as f:
             f.write(str(metadata))
         return 
+
+    #                 self.output(f"{subject_id}_{num}", pet, output_stream)
+    #             metadata[f"size_{output_stream}"] = str(pet.GetSize())
+    #             metadata[f"metadata_{colname}"] = [read_results[i].get_metadata()]
+    #             print(subject_id, " SAVED PET")
+    #     #Saving all the metadata in multiple text files
+    #     with open(os.path.join(self.output_directory,".temp",f'temp_{subject_id}.json'),'w') as f:
+    #         json.dump(metadata,f)
+    #     return 
 
+    # def save_data(self):
+    #     files = glob.glob(os.path.join(self.output_directory,".temp","*.json"))
+    #     for file in files:
+    #         subject_id = ("_").join(file.replace("/","_").replace(".","_").split("_")[-3:-1])
+    #         with open(file) as f:
+    #             metadata = json.load(f)
+    #         self.output_df.loc[subject_id, list(metadata.keys())] = list(metadata.values())
+    #     self.output_df.to_csv(self.output_df_path)
+    #     shutil.rmtree(os.path.join(self.output_directory,".temp"))
+
     def save_data(self):
         files = glob.glob(os.path.join(self.output_directory,"*.txt"))
         for file in files:

diff --git a/imgtools/io/__init__.py b/imgtools/io/__init__.py
@@ -1,3 +1,4 @@
 from .common import *
 from .loaders import *
 from .writers import *
+from .dataset import *
diff --git a/imgtools/io/common.py b/imgtools/io/common.py
@@ -1,4 +1,5 @@
 import os
+from typing import Dict
 
 from pydicom.misc import is_dicom
 
@@ -34,3 +35,17 @@ def find_dicom_paths(root_path: str, yield_directories: bool = False) -> str:
                 fpath = os.path.join(root, f)
                 if is_dicom(fpath):
                     yield fpath
+
+def file_name_convention() -> Dict:
+    """
+    Return the file name taxonomy shared by ImageAutoOutput and the Dataset class.
+
+    Each key is an output stream name (a modality, optionally followed by the modality
+    it is connected to) and each value is the naming component used for that stream.
+    A new dictionary is built on every call.
+    """
+    stream_name_pairs = (
+        ("CT", "image"),
+        ("RTDOSE_CT", "dose"),
+        ("RTSTRUCT_CT", "mask_ct.seg"),
+        ("RTSTRUCT_PT", "mask_pt.seg"),
+        ("PT_CT", "pet"),
+        ("PT", "pet"),
+        ("RTDOSE", "dose"),
+        ("RTSTRUCT", "mask.seg"),
+    )
+    return dict(stream_name_pairs)
diff --git a/imgtools/io/dataset.py b/imgtools/io/dataset.py
@@ -0,0 +1,183 @@
+from genericpath import exists
+import os
+import numpy as np
+from typing import List, Sequence, Optional, Callable, Iterable, Dict,Tuple
+import torchio as tio
+import pandas as pd
+# from . import file_name_convention
+# from ..ops import StructureSetToSegmentation, ImageAutoInput, Resample, BaseOp
+from imgtools.io import file_name_convention
+from imgtools.ops import StructureSetToSegmentation, ImageAutoInput, Resample, BaseOp
+from tqdm import tqdm
+from joblib import Parallel, delayed
+import SimpleITK as sitk
+import warnings
+from imgtools.pipeline import Pipeline
+
+class Dataset(tio.SubjectsDataset):
+    """
+    Dataset of torchio.Subject objects built either from the nrrd files written by the
+    autopipeline script or directly from a raw dataset directory.
+
+    This class inherits from torchio.SubjectsDataset, so it supports transforms and can be
+    handed to a torch DataLoader.
+    Read more about torchio from https://torchio.readthedocs.io/quickstart.html and torchio.SubjectsDataset from https://github.com/fepegar/torchio/blob/3e07b78da16d6db4da7193325b3f9cb31fc0911a/torchio/data/dataset.py#L101
+    """
+    def __init__(
+        self,
+        subjects: Sequence[tio.Subject],
+        transform: Optional[Callable] = None,
+        load_getitem: bool = True
+        ) -> None:
+        """
+        Parameters
+            subjects: Subject instances that make up the dataset
+            transform: optional callable forwarded unchanged to torchio.SubjectsDataset
+            load_getitem: flag forwarded unchanged to torchio.SubjectsDataset
+        """
+        super().__init__(subjects,transform,load_getitem)
+
+    @staticmethod
+    def _parse_metadata_cell(value):
+        """
+        Convert a metadata cell read from dataset.csv back into a Python object.
+
+        Cells are stored as the string form of a Python literal. Non-string values and
+        strings that are not valid literals are returned unchanged, so a malformed cell
+        never aborts loading. A single-element list is unwrapped to its only element.
+        """
+        import ast  # local import keeps this helper self-contained
+
+        if not isinstance(value, str):
+            return value
+        try:
+            parsed = ast.literal_eval(value)
+        except (ValueError, SyntaxError):
+            return value
+        if isinstance(parsed, list) and len(parsed) == 1:
+            return parsed[0]
+        return parsed
+
+    @classmethod
+    def load_from_nrrd(
+            cls,
+            path:str,
+            transform: Optional[Callable] = None,
+            load_getitem: bool = True
+            ) -> "Dataset":
+        """
+        Based on the given path, parses the processed nrrd files present in the directory and the metadata associated with them, and builds a dataset of Subject instances
+        Parameters
+            path: Path to the output directory passed to the autopipeline script. The output directory should have all the user mentioned modalities processed and present in their folder. The directory
+                  should additionally have dataset.csv which stores all the metadata
+            transform: optional callable forwarded to the dataset constructor
+            load_getitem: flag forwarded to the dataset constructor
+        Returns a Dataset holding one Subject per row of dataset.csv
+        Raises
+            ValueError: if dataset.csv is not present in path
+            KeyError: if a stream found in dataset.csv is missing from the file name taxonomy
+        """
+        path_metadata = os.path.join(path,"dataset.csv")
+        if not os.path.exists(path_metadata):
+            raise ValueError("The specified path has no file name {}".format(path_metadata))
+        df_metadata = pd.read_csv(path_metadata,index_col=0)
+        #Every "folder_<stream>" column names one output stream
+        output_streams = [("_").join(cols.split("_")[1:]) for cols in df_metadata.columns if cols.split("_")[0]=="folder"]
+        #Only columns whose first token is exactly "metadata" are attached to the subjects
+        imp_metadata = {cols for cols in df_metadata.columns if cols.split("_")[0]=="metadata"}
+        #Based on the file naming taxonomy
+        file_names = file_name_convention()
+        subject_id_list = list(df_metadata.index)
+        subjects = []
+        for subject_id in tqdm(subject_id_list):
+            temp = {}
+            for col in output_streams:
+                parts = col.split("_")
+                #A trailing number marks an additional connection of the same stream; the taxonomy
+                #is keyed without it and the number becomes part of the file name instead
+                if parts[-1].isnumeric():
+                    extra = parts[-1]+"_"
+                    stream = ("_").join(parts[:-1])
+                else:
+                    extra = ""
+                    stream = col
+                extension = file_names[stream]
+                path_mod = os.path.join(path,extension.split(".")[0],f"{subject_id}_{extra}{extension}.nrrd")
+                #All modalities except RTSTRUCT should be of type torchIO.ScalarImage.
+                #The check uses the modality token so that connected streams such as RTSTRUCT_CT are label maps too
+                if parts[0]=="RTSTRUCT":
+                    temp[f"mod_{col}"] = tio.LabelMap(path_mod)
+                else:
+                    temp[f"mod_{col}"] = tio.ScalarImage(path_mod)
+                #For including metadata
+                metadata_name = f"metadata_{col}"
+                if metadata_name in imp_metadata:
+                    #convert string to proper datatype
+                    temp[metadata_name] = cls._parse_metadata_cell(df_metadata.loc[subject_id,metadata_name])
+            subjects.append(tio.Subject(temp))
+        return cls(subjects,transform,load_getitem)
+
+    @classmethod
+    def load_directly(
+            cls,
+            path:str,
+            modalities: str,
+            n_jobs: int = -1,
+            spacing: Tuple = (1., 1., 0.),
+            transform: Optional[Callable] = None,
+            load_getitem: bool = True
+            ) -> "Dataset":
+        """
+        Based on the given path, imgtools crawls through the directory, forms datagraph and picks the user defined modalities. These paths are processed into sitk.Image.
+        The images and the metadata associated with them are used to build a dataset of Subject instances
+        Parameters
+            path: Path to the directory of the dataset
+            modalities: modalities to load, passed unchanged to ImageAutoInput
+            n_jobs: number of parallel jobs, used both by ImageAutoInput and for processing the subjects
+            spacing: spacing passed to the Resample operation
+            transform: optional callable forwarded to the dataset constructor
+            load_getitem: flag forwarded to the dataset constructor
+        Returns a Dataset holding one Subject per row of the crawled metadata
+        """
+        auto_input = ImageAutoInput(path, modalities, n_jobs)
+        df_metadata = auto_input.df_combined
+        output_streams = auto_input.output_streams
+        subject_id_list = list(df_metadata.index)
+        # basic image processing ops
+        resample = Resample(spacing=spacing)
+        make_binary_mask = StructureSetToSegmentation(roi_names=[], continuous=False)
+        subjects = Parallel(n_jobs=n_jobs)(
+            delayed(cls.process_one_subject)(auto_input,subject_id,output_streams,resample,make_binary_mask)
+            for subject_id in tqdm(subject_id_list)
+        )
+        return cls(subjects,transform,load_getitem)
+
+    @staticmethod
+    def process_one_subject(
+            input: ImageAutoInput,
+            subject_id: str,
+            output_streams: List[str],
+            resample: BaseOp,
+            make_binary_mask: BaseOp,
+            ) -> tio.Subject:
+        """
+        Process all modalities for one subject
+        Parameters:
+            input: ImageAutoInput class which helps in loading the respective DICOMs
+            subject_id: subject id of the data
+            output_streams: the modalities that are being considered, Note that there can be multiple items of same modality based on their relations with different modalities
+            resample: transformation which resamples sitk.Image
+            make_binary_mask: transformation useful in making binary mask for rtstructs
+        Returns tio.Subject instance for a particular subject id
+        Raises
+            ValueError: if an RTSTRUCT stream has no loaded CT or PT/PET reference image to map contours to
+        """
+        temp = {}
+        #Reference images; they stay None until the matching stream has been read for this subject
+        image = None
+        pet = None
+        read_results = input(subject_id)
+        for i,colname in enumerate(output_streams):
+            modality = colname.split("_")[0]
+            #Drop numeric connection suffixes (_1, _2, ...) to recover the plain stream name
+            output_stream = ("_").join([item for item in colname.split("_") if not item.isnumeric()])
+
+            if read_results[i] is None:
+                temp[f"mod_{colname}"] = None
+            elif modality == "CT":
+                image = read_results[i]
+                if len(image.GetSize()) == 4:
+                    assert image.GetSize()[-1] == 1, f"There is more than one volume in this CT file for {subject_id}."
+                    #Collapse the singleton 4th dimension so that the image becomes 3D
+                    extractor = sitk.ExtractImageFilter()
+                    extractor.SetSize([*image.GetSize()[:3], 0])
+                    extractor.SetIndex([0, 0, 0, 0])
+                    image = extractor.Execute(image)
+                image = resample(image)
+                temp[f"mod_{colname}"] = tio.ScalarImage.from_sitk(image)
+            elif modality == "RTDOSE":
+                doses = read_results[i]
+                if image is None:
+                    #For cases with no image present
+                    warnings.warn("No CT image present. Returning dose image without resampling")
+                else:
+                    try:
+                        doses = read_results[i].resample_dose(image)
+                    except Exception as err:
+                        #Best effort: keep the dose on its own grid rather than dropping the subject
+                        warnings.warn(f"Could not resample dose image for {subject_id} ({err}). Returning dose image without resampling")
+                temp[f"mod_{colname}"] = tio.ScalarImage.from_sitk(doses)
+                temp[f"metadata_{colname}"] = read_results[i].get_metadata()
+            elif modality == "RTSTRUCT":
+                #For RTSTRUCT, you need image or PT
+                structure_set = read_results[i]
+                conn_to = output_stream.split("_")[-1]
+                # make_binary_mask relative to ct/pet
+                if conn_to == "CT":
+                    reference = image
+                elif conn_to == "PT":
+                    reference = pet
+                else:
+                    reference = None
+                if reference is None:
+                    raise ValueError("You need to pass a reference CT or PT/PET image to map contours to.")
+                mask = make_binary_mask(structure_set, reference)
+                temp[f"mod_{colname}"] = tio.LabelMap.from_sitk(mask)
+                temp[f"metadata_{colname}"] = structure_set.roi_names
+            elif modality == "PT":
+                pet = read_results[i]
+                if image is None:
+                    #For cases with no image present
+                    warnings.warn("No CT image present. Returning PT/PET image without resampling.")
+                else:
+                    try:
+                        pet = read_results[i].resample_pet(image)
+                    except Exception as err:
+                        #Best effort: keep the PT/PET image on its own grid rather than dropping the subject
+                        warnings.warn(f"Could not resample PT/PET image for {subject_id} ({err}). Returning PT/PET image without resampling.")
+                temp[f"mod_{colname}"] = tio.ScalarImage.from_sitk(pet)
+                temp[f"metadata_{colname}"] = read_results[i].get_metadata()
+        return tio.Subject(temp)
+
+if __name__=="__main__":
+    # Manual smoke test: load a dataset straight from DICOMs, batch it and print one modality.
+    from torch.utils.data import DataLoader
+    # Alternative entry point, loading from previously processed nrrd files:
+    # output_path = "/cluster/projects/radiomics/Temp/vishwesh/HN-CT_RTdose_test2"
+    # subjects_dataset = Dataset.load_from_nrrd(output_path,transform=transform)
+    data_root = "/cluster/home/ramanav/imgtools/examples/data_test"
+    resize_transform = tio.Compose([tio.Resize(256)])
+    dataset = Dataset.load_directly(
+        data_root,
+        modalities="CT,RTDOSE,PT",
+        n_jobs=4,
+        transform=resize_transform,
+    )
+    print(len(dataset))
+    loader = DataLoader(dataset, batch_size=4)
+    first_batch = next(iter(loader))
+    print(first_batch["mod_RTDOSE_CT"])