From 232eec3ae6271b34f94bbdd4553275ca2bc58de8 Mon Sep 17 00:00:00 2001 From: Continuous Integration Date: Wed, 17 Jan 2024 15:05:04 +0000 Subject: [PATCH 01/19] build: Bumped version for apply-flatfield-plugin from 2.0.0-dev9 to 2.0.0 --- transforms/images/apply-flatfield-plugin/VERSION | 1 + transforms/images/apply-flatfield-tool/README.md | 4 ++++ transforms/images/apply-flatfield-tool/plugin.json | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 transforms/images/apply-flatfield-plugin/VERSION diff --git a/transforms/images/apply-flatfield-plugin/VERSION b/transforms/images/apply-flatfield-plugin/VERSION new file mode 100644 index 000000000..227cea215 --- /dev/null +++ b/transforms/images/apply-flatfield-plugin/VERSION @@ -0,0 +1 @@ +2.0.0 diff --git a/transforms/images/apply-flatfield-tool/README.md b/transforms/images/apply-flatfield-tool/README.md index aa08fb55e..4eaf93481 100644 --- a/transforms/images/apply-flatfield-tool/README.md +++ b/transforms/images/apply-flatfield-tool/README.md @@ -1,4 +1,8 @@ +<<<<<<< HEAD:transforms/images/apply-flatfield-tool/README.md # Apply Flatfield Plugin (v2.0.1-dev0) +======= +# Apply Flatfield Plugin (v2.0.0) +>>>>>>> 0172c986 (build: Bumped version for apply-flatfield-plugin from 2.0.0-dev9 to 2.0.0):transforms/images/apply-flatfield-plugin/README.md This WIPP plugin applies a flatfield operation on every image in a collection. The algorithm used to apply the flatfield is as follows: diff --git a/transforms/images/apply-flatfield-tool/plugin.json b/transforms/images/apply-flatfield-tool/plugin.json index e086239e8..09495876b 100644 --- a/transforms/images/apply-flatfield-tool/plugin.json +++ b/transforms/images/apply-flatfield-tool/plugin.json @@ -91,4 +91,4 @@ "description": "Preview the output images' names without actually running computation" } ] -} +} \ No newline at end of file From 1bcb25537a6956d0a385c2b8d771620b46f75581 Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 25 Jan 2024 12:35:26 -0600 Subject: [PATCH 02/19] synced with remote forked --- transforms/images/apply-flatfield-plugin/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/images/apply-flatfield-plugin/VERSION b/transforms/images/apply-flatfield-plugin/VERSION index 227cea215..b2484da91 100644 --- a/transforms/images/apply-flatfield-plugin/VERSION +++ b/transforms/images/apply-flatfield-plugin/VERSION @@ -1 +1 @@ -2.0.0 +2.0.0-dev9 From 954789db5975594b62b85fe51743177e48d4ed06 Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Wed, 17 Jan 2024 08:29:58 -0600 Subject: [PATCH 03/19] renamed plugin --- .../feature-subsetting-plugin/Dockerfile | 20 ++ .../feature-subsetting-plugin/README.md | 56 ++++ clustering/feature-subsetting-plugin/VERSION | 1 + .../feature-subsetting-plugin/build-docker.sh | 4 + .../feature-subsetting-plugin/bumpversion.cfg | 27 ++ .../package-release.sh | 0 .../feature-subsetting-plugin/plugin.json | 144 +++++++++ .../feature-subsetting-plugin/pyproject.toml | 30 ++ .../feature-subsetting-plugin/run-docker.sh | 0 .../feature-subsetting-plugin/src/main.py | 288 ++++++++++++++++++ 10 files changed, 570 insertions(+) create mode 100644 clustering/feature-subsetting-plugin/Dockerfile create mode 100644 clustering/feature-subsetting-plugin/README.md create mode 100644 clustering/feature-subsetting-plugin/VERSION create mode 100644 clustering/feature-subsetting-plugin/build-docker.sh create mode 100644 clustering/feature-subsetting-plugin/bumpversion.cfg create mode 100644 
clustering/feature-subsetting-plugin/package-release.sh
 create mode 100644 clustering/feature-subsetting-plugin/plugin.json
 create mode 100644 clustering/feature-subsetting-plugin/pyproject.toml
 create mode 100644 clustering/feature-subsetting-plugin/run-docker.sh
 create mode 100644 clustering/feature-subsetting-plugin/src/main.py

diff --git a/clustering/feature-subsetting-plugin/Dockerfile b/clustering/feature-subsetting-plugin/Dockerfile
new file mode 100644
index 000000000..fa0026dce
--- /dev/null
+++ b/clustering/feature-subsetting-plugin/Dockerfile
@@ -0,0 +1,20 @@
+FROM polusai/bfio:2.3.3
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_IMG_EXT=".ome.tif"
+ENV POLUS_TAB_EXT=".arrow"
+
+
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+
+RUN pip3 install ${EXEC_DIR} --no-cache
+
+
+ENTRYPOINT ["python3", "-m", "polus.plugins.clustering.feature_subsetting"]
+CMD ["--help"]
\ No newline at end of file
diff --git a/clustering/feature-subsetting-plugin/README.md b/clustering/feature-subsetting-plugin/README.md
new file mode 100644
index 000000000..24ccba663
--- /dev/null
+++ b/clustering/feature-subsetting-plugin/README.md
@@ -0,0 +1,56 @@
+# Feature Data Subset
+
+This WIPP plugin subsets data based on a given feature. It works in conjunction with the `polus-feature-extraction-plugin`, where the feature extraction plugin can be used to extract features such as the mean intensity of every image in the input image collection.
+
+# Usage
+The details and usage of the plugin inputs are provided in the section below. In addition to the subsetted data, the output directory also contains a `summary.txt` file which records which images were kept and their new filenames if they were renamed.
+
+### Explanation of inputs
+Some of the inputs are pretty straightforward and are used commonly across most WIPP plugins. This section provides some details and examples of the inputs that may be a little complicated. The image collection with the following pattern will be used as an example: `r{r+}_t{t+}_p{p+}_z{z+}_c{c+}.ome.tif`, where r, t, p, z, and c stand for replicate, timepoint, position, z-position, and channel respectively. Assume we have 5 replicates, 3 timepoints, 50 positions, 10 z-planes, and 4 channels.
+
+1. `inpDir` - This contains the path to the input image collection to subset data from.
+2. `filePattern` - Filepattern of the input images.
+3. `groupVar` - This is a mandatory input across which to subset data. It can take either 1 or 2 variables as input; if 2 variables are provided, the second variable is treated as the minor grouping variable. In our example, if `z` is provided as input, then within a subcollection the mean of the feature value is taken over all images with the same z. The z positions are then filtered based on the `percentile` and `removeDirection` inputs. If `z,c` is provided instead, then `c` is treated as the minor grouping variable, which means that the mean is taken over all images with the same z for each channel. The plugin also ensures that the same z positions are filtered out across c.
+4. `csvDir` - This contains the path to the csv collection containing the feature values for each image. This can be the output of the feature extraction plugin.
+5. `feature` - The column name from the csv file that will be used to filter images.
+6. `percentile` and `removeDirection` - These two variables denote the criteria with which images are filtered. For example, if percentile is `0.1` and removeDirection is set to `Below`, then images with a feature value below the 10th percentile will be removed. On the other hand, if removeDirection is set to `Above`, then all images with a feature value greater than the 10th percentile will be removed. This enables data subsetting from both `brightfield` and `darkfield` microscopy images.
+
+ **Optional Arguments**
+
+8. `sectionVar` - This is an optional input to segregate the input image collection into sub-collections. The analysis will be done separately for each sub-collection. In our example, if the user enters `r,t` as the sectionVar, then we will have 15 sub-collections (5*3), one for each combination of timepoint and replicate. If the user enters `r` as sectionVar, then we will have 5 sub-collections, one for each replicate. If the user wants to consider the whole image collection as a single section, then no input is required. NOTE: As a post-processing step, the same number of images will be subsetted across different sections.
+9. `padding` - This is an optional variable with a default value of 0. A padding of 3 means that 3 additional planes will be captured on either side of the subsetted data. This can be used as a sanity check to ensure that the subsetted data captures the images we want. For example, if the z values 5,6,7 were initially filtered out, then a padding of 3 means that the output dataset will have z positions 2,3,4,5,6,7,8,9,10, if all of them exist.
+10. `writeOutput` - This is an optional argument with default value `True`. If it is set to true, then both the output image collection and the `summary.txt` file will be created. If it is set to false, then the output directory will only contain `summary.txt`. This option enables the user to tune hyperparameters such as percentile, removeDirection, and feature without actually creating the output image collection.
+
+
+Contact [Gauhar Bains](mailto:gauhar.bains@labshare.org) for more information.
+
+For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp).
+
+## Building
+
+To build the Docker image for this plugin, run
+`./build-docker.sh`.
+
+## Install WIPP Plugin
+
+If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.
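+## Sample docker command
+
+As a reference, a hypothetical invocation is sketched below using only the flags documented under Options; the host paths are placeholders and `MEAN` is just an example feature column name from the csv collection:
+
+```bash
+docker run -v /path/to/data:/data polusai/feature-subsetting-plugin:0.2.0-dev \
+  --inpDir /data/images \
+  --csvDir /data/features \
+  --filePattern "r{r+}_t{t+}_p{p+}_z{z+}_c{c+}.ome.tif" \
+  --groupVar "z,c" \
+  --feature "MEAN" \
+  --percentile 0.1 \
+  --removeDirection "Below" \
+  --outDir /data/output
+```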
+## Options
+
+This plugin takes ten input arguments and one output argument:
+
+| Name | Description | I/O | Type |
+| ------------------- | ----------------------------------------------------- | ------ | ------------- |
+| `--csvDir` | CSV collection containing features | Input | csvCollection |
+| `--padding` | Number of images to capture outside the cutoff | Input | int |
+| `--feature` | Feature to use to subset data | Input | string |
+| `--filePattern` | Filename pattern used to separate data | Input | string |
+| `--groupVar` | variables to group by in a section | Input | string |
+| `--inpDir` | Input image collection to be processed by this plugin | Input | collection |
+| `--percentile` | Percentile to remove | Input | int |
+| `--removeDirection` | remove direction above or below percentile | Input | string |
+| `--sectionVar` | variables to divide larger sections | Input | string |
+| `--writeOutput` | write output image collection or not | Input | boolean |
+| `--outDir` | Output collection | Output | collection |
+
diff --git a/clustering/feature-subsetting-plugin/VERSION b/clustering/feature-subsetting-plugin/VERSION
new file mode 100644
index 000000000..b4f09dd42
--- /dev/null
+++ b/clustering/feature-subsetting-plugin/VERSION
@@ -0,0 +1 @@
+0.2.0-dev
\ No newline at end of file
diff --git a/clustering/feature-subsetting-plugin/build-docker.sh b/clustering/feature-subsetting-plugin/build-docker.sh
new file mode 100644
index 000000000..d9ad13705
--- /dev/null
+++ b/clustering/feature-subsetting-plugin/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+version=$(<VERSION)
+docker build . -t polusai/feature-subsetting-plugin:${version}
diff --git a/clustering/feature-subsetting-plugin/bumpversion.cfg b/clustering/feature-subsetting-plugin/bumpversion.cfg
new file mode 100644
--- /dev/null
+++ b/clustering/feature-subsetting-plugin/bumpversion.cfg
@@ -0,0 +1,27 @@
+[bumpversion]
+current_version = 0.2.0-dev
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+    {major}.{minor}.{patch}-{release}{dev}
+    {major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+    dev
+    _
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:src/polus/plugins/clustering/feature_subsetting/__init__.py]
\ No newline at end of file
diff --git a/clustering/feature-subsetting-plugin/package-release.sh b/clustering/feature-subsetting-plugin/package-release.sh
new file mode 100644
index 000000000..e69de29bb
diff --git a/clustering/feature-subsetting-plugin/plugin.json b/clustering/feature-subsetting-plugin/plugin.json
new file mode 100644
index 000000000..a60a0f7f8
--- /dev/null
+++ b/clustering/feature-subsetting-plugin/plugin.json
@@ -0,0 +1,144 @@
+{
+  "name": "Feature Subsetting",
+  "version": "0.2.0-dev",
+  "title": "Feature Subsetting",
+  "description": "Subset data using a given feature",
+  "author": "Gauhar Bains (gauhar.bains@labshare.org) and Hamdah Shafqat Abbasi (hamdahshafqat.abbasi@nih.gov)",
+  "institution": "National Center for Advancing Translational Sciences, National Institutes of Health",
+  "repository": "https://github.com/PolusAI/polus-plugin",
+  "website": "https://ncats.nih.gov/preclinical/core/informatics",
+  "citation": "",
+  "containerId": "polusai/feature-subsetting-plugin:0.2.0-dev",
+  "baseCommand": [
+    "python3",
+    "-m",
+    "polus.plugins.clustering.feature_subsetting"
+  ],
+  "inputs": [
+    {
+      "name": "inpDir",
+      "type": "collection",
+      "description": "Input image collection to be processed by this plugin",
+      "required": true
+    },
+    {
+      "name": "filePattern",
+      "type": "string",
+      "description": "Filename pattern used to separate data",
+      "required": true
+    },
+    {
+      "name": "groupVar",
+      "type":
"string", + "description": "variables to group by in a section", + "required": true + }, + { + "name": "sectionVar", + "type": "string", + "description": "variables to divide larger sections", + "required": false + }, + { + "name": "csvDir", + "type": "csvCollection", + "description": "CSV collection containing features", + "required": true + }, + { + "name": "feature", + "type": "string", + "description": "Feature to use to subset data", + "required": true + }, + { + "name": "percentile", + "type": "number", + "description": "Percentile to remove", + "required": true + }, + { + "name": "removeDirection", + "type": "enum", + "options": { + "values": [ + "Below", + "Above" + ] + }, + "description": "remove direction above or below percentile", + "required": true + }, + { + "name": "padding", + "type": "string", + "description": "Number of images to capture outside the cutoff", + "required": false + }, + { + "name": "writeOutput", + "type": "boolean", + "description": "write output image collection or not", + "required": false + } + ], + "outputs": [ + { + "name": "outDir", + "type": "collection", + "description": "Output collection" + } + ], + "ui": [ + { + "key": "inputs.inpDir", + "title": "Input collection", + "description": "Input image collection to be processed by this plugin" + }, + { + "key": "inputs.csvDir", + "title": "CSV collection", + "description": "CSV collection containing features" + }, + { + "key": "inputs.feature", + "title": "Feature", + "description": "Feature to use to subset data" + }, + { + "key": "inputs.percentile", + "title": "Percentile", + "description": "Percentile to remove" + }, + { + "key": "inputs.removeDirection", + "title": "Remove Direction", + "description": "remove direction above or below percentile" + }, + { + "key": "inputs.filePattern", + "title": "Filename pattern", + "description": "Filename pattern used to separate data" + }, + { + "key": "inputs.groupVar", + "title": "Grouping Variables", + "description": "variables to group by in a section" + }, + { + "key": "inputs.sectionVar", + "title": "Section Variables", + "description": "variables to divide larger sections" + }, + { + "key": "inputs.padding", + "title": "Padding", + "description": "Number of images to capture outside the cutoff" + }, + { + "key": "inputs.writeOutput", + "title": "Write Output", + "description": "write output image collection or not" + } + ] +} \ No newline at end of file diff --git a/clustering/feature-subsetting-plugin/pyproject.toml b/clustering/feature-subsetting-plugin/pyproject.toml new file mode 100644 index 000000000..22e810895 --- /dev/null +++ b/clustering/feature-subsetting-plugin/pyproject.toml @@ -0,0 +1,30 @@ +[tool.poetry] +name = "polus-plugins-clustering-feature-subsetting" +version = "0.2.0-dev" +description = "Subset data using a given feature." 
+authors = [
+  "Gauhar Bains <gauhar.bains@labshare.org>",
+  "Hamdah Shafqat Abbasi <hamdahshafqat.abbasi@nih.gov>"
+  ]
+readme = "README.md"
+packages = [{include = "polus", from = "src"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.12"
+filepattern = "^2.0.4"
+typer = "^0.7.0"
+tqdm = "^4.64.1"
+preadator="0.4.0.dev2"
+vaex = "^4.17.0"
+
+[tool.poetry.group.dev.dependencies]
+pre-commit = "^3.3.3"
+bump2version = "^1.0.1"
+pytest = "^7.3.2"
+pytest-xdist = "^3.3.1"
+pytest-sugar = "^0.9.7"
+ipykernel = "^6.28.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
\ No newline at end of file
diff --git a/clustering/feature-subsetting-plugin/run-docker.sh b/clustering/feature-subsetting-plugin/run-docker.sh
new file mode 100644
index 000000000..e69de29bb
diff --git a/clustering/feature-subsetting-plugin/src/main.py b/clustering/feature-subsetting-plugin/src/main.py
new file mode 100644
index 000000000..a942d67c2
--- /dev/null
+++ b/clustering/feature-subsetting-plugin/src/main.py
@@ -0,0 +1,288 @@
+import argparse, logging, subprocess, time, multiprocessing, sys
+import os
+import filepattern
+import pandas as pd
+import shutil
+from pathlib import Path
+import traceback
+
+def filter_planes(feature_dict, removeDirection, percentile):
+    """Filter planes by the criteria specified by removeDirection
+    and percentile.
+
+    Args:
+        feature_dict (dictionary): planes and respective feature value
+        removeDirection (string): remove above or below percentile
+        percentile (int): cutoff percentile
+
+    Returns:
+        set: planes that fit the criteria
+    """
+    planes = list(feature_dict.keys())
+    feat_value = [feature_dict[i] for i in planes]
+    thresh = min(feat_value) + percentile * (max(feat_value) - min(feat_value))
+
+    # filter planes
+    if removeDirection == 'Below':
+        keep_planes = [z for z in planes if feature_dict[z] >= thresh]
+    else:
+        keep_planes = [z for z in planes if feature_dict[z] <= thresh]
+
+    return set(keep_planes)
+
+def make_uniform(planes_dict, uniques, padding):
+    """Ensure each section has the same number of images.
+
+    This function makes the output collection uniform in
+    the sense that it preserves the same number of planes across
+    sections. It also captures additional planes based
+    on the value of the padding variable.
+
+    Args:
+        planes_dict (dict): planes to keep in different sections
+        uniques (list): unique values for the major grouping variable
+        padding (int): additional images to capture outside cutoff
+
+    Returns:
+        dictionary: dictionary containing planes to keep
+    """
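+    # Strategy: pad every section's kept planes up to the count of the
+    # longest section, plus `padding` extra planes on each side, clamped
+    # to the range of plane values that actually exist in `uniques`.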
+    # max no. of planes
+    max_len = max([len(i) for i in planes_dict.values()])
+
+    # max planes that can be added on each side
+    min_ind = min([min(planes_dict[k]) for k in planes_dict])
+    max_ind = max([max(planes_dict[k]) for k in planes_dict])
+    max_add_left = uniques.index(min_ind)
+    max_add_right = len(uniques) - (uniques.index(max_ind)+1)
+
+    # add planes in each section based on padding and max number of planes
+    for section_id, planes in planes_dict.items():
+        len_to_add = max_len - len(planes)
+        len_add_left = min(int(len_to_add)/2+padding, max_add_left)
+        len_add_right = min(len_to_add - len_add_left+padding, max_add_right)
+        left_ind = int(uniques.index(min(planes)) - len_add_left)
+        right_ind = int(uniques.index(max(planes)) + len_add_right)+1
+        planes_dict[section_id] = uniques[left_ind:right_ind]
+    return planes_dict
+
+def main(inpDir,csvDir,outDir,filePattern,groupVar,percentile,
+         removeDirection,sectionVar,feature,padding,writeOutput):
+    """Function containing the main logic to subset data.
+
+    Args:
+        inpDir (string): path to input image collection
+        csvDir (string): path to csv file containing features
+        outDir (string): path to output collection
+        filePattern (string): input image filepattern
+        groupVar (string): grouping variables
+        percentile (float): cutoff feature percentile
+        removeDirection (string): subset above or below percentile
+        sectionVar (string): sectioning variable
+        feature (string): feature to subset using
+        padding (int): capture additional images outside of cutoff
+        writeOutput (boolean): write output image collection or not
+    """
+
+    # Get all file names in the csvDir collection
+    csvDir_files = [f.name for f in Path(csvDir).iterdir() if f.is_file() and "".join(f.suffixes)=='.csv']
+
+    # Get all file names in the inpDir image collection
+    inpDir_files = [f.name for f in Path(inpDir).iterdir() if f.is_file() and "".join(f.suffixes)=='.ome.tif']
+
+    # read and concat all csv files
+    for ind, file in enumerate(csvDir_files):
+        if ind == 0:
+            feature_df = pd.read_csv(os.path.join(csvDir, file), header=0)
+        else:
+            feature_df = pd.concat([feature_df, pd.read_csv(os.path.join(csvDir, file), header=0)])
+
+    # store image name and its feature value
+    feature_dict = {k:v for k,v in zip(feature_df['Image'], feature_df[feature])}
+
+    # separate filepattern variables into different categories
+    _,var = filepattern.get_regex(filePattern)
+    grouping_variables = groupVar.split(',')
+    section_variables = sectionVar.split(',')
+    sub_section_variables = [v for v in var if v not in grouping_variables+section_variables]
+
+    # initialize filepattern object
+    fp = filepattern.FilePattern(inpDir, pattern=filePattern)
+    uniques = fp.uniques
+
+    [maj_grouping_var, min_grouping_var] = grouping_variables if len(grouping_variables)>1 else grouping_variables+[None]
+    keep_planes = {}
+
+    logger.info('Iterating over sections...')
+    # a single iteration of this loop gives all images in one section
+    for file in fp(group_by=sub_section_variables+grouping_variables):
+
+        section_feat_dict = {}
+        section_keep_planes = []
+        section_id = tuple([file[0][i] for i in section_variables]) if section_variables[0] else 1
+
+        # iterate over files in one section
+        for f in file:
+            if min_grouping_var == None:
+                f[min_grouping_var] = None
+
+            # store feature values for images
+            if f[min_grouping_var] not in section_feat_dict:
+                section_feat_dict[f[min_grouping_var]] = {}
+
+            if f[maj_grouping_var] not in section_feat_dict[f[min_grouping_var]]:
+                section_feat_dict[f[min_grouping_var]][f[maj_grouping_var]] = []
+
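+            # feature values are collected per (minor, major) group here,
+            # then averaged per group before the planes are filtered below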
+ section_feat_dict[f[min_grouping_var]][f[maj_grouping_var]].append(feature_dict[f['file'].name]) + + # average feature value by grouping variable + for key1 in section_feat_dict: + for key2 in section_feat_dict[key1]: + section_feat_dict[key1][key2] = sum(section_feat_dict[key1][key2])/len(section_feat_dict[key1][key2]) + + # find planes to keep based on specified criteria + section_keep_planes.append(filter_planes(section_feat_dict[key1],removeDirection, percentile)) + + # keep same planes within a section, across the minor grouping variable + section_keep_planes = list(section_keep_planes[0].union(*section_keep_planes)) + section_keep_planes = [i for i in range(min(section_keep_planes), max(section_keep_planes)+1) if i in uniques[maj_grouping_var]] + keep_planes[section_id] = section_keep_planes + + # keep same number of planes across different sections + keep_planes = make_uniform(keep_planes, uniques[maj_grouping_var], padding) + + # start writing summary.txt + summary = open(os.path.join(outDir, 'metadata_files', 'summary.txt'), 'w') + + logger.info('renaming subsetted data') + + # reinitialize filepattern object + fp = filepattern.FilePattern(inpDir, pattern=filePattern) + + # rename subsetted data + for file in fp(group_by=sub_section_variables+grouping_variables): + section_id = tuple([file[0][i] for i in section_variables]) if section_variables[0] else 1 + section_keep_planes = keep_planes[section_id] + rename_map = {k:v for k,v in zip(keep_planes[section_id], uniques[maj_grouping_var])} + + # update summary.txt with section renaming info + summary.write('------------------------------------------------ \n') + if sectionVar.strip(): + summary.write('Section : {} \n'.format({k:file[0][k] for k in section_variables})) + logger.info('Renaming files from section : {} \n'.format({k:file[0][k] for k in section_variables})) + summary.write('\nThe following values of "{}" variable have been renamed: \n'.format(maj_grouping_var)) + for k,v in rename_map.items(): + summary.write('{} ---> {} \n'.format(k,v)) + summary.write('\n Files : \n \n') + + # rename and write output + for f in file: + if f[maj_grouping_var] not in keep_planes[section_id]: + continue + + # old and new file name + old_file_name = f['file'].name + file_name_dict = {k.upper():v for k,v in f.items() if k!='file'} + file_name_dict[maj_grouping_var.upper()] = rename_map[f[maj_grouping_var]] + new_file_name = fp.get_matching(**file_name_dict)[0]['file'].name + + # if write output collection + if writeOutput: + shutil.copy2(os.path.join(inpDir, old_file_name),os.path.join(outDir, 'images', new_file_name)) + + summary.write('{} -----> {} \n'.format(old_file_name, new_file_name)) + summary.close() + +if __name__=="__main__": + # Initialize the logger + logging.basicConfig(format='%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s', + datefmt='%d-%b-%y %H:%M:%S') + logger = logging.getLogger("main") + logger.setLevel(logging.INFO) + + ''' Argument parsing ''' + logger.info("Parsing arguments...") + parser = argparse.ArgumentParser(prog='main', description='Subset data using a given feature') + + # Input arguments + parser.add_argument('--csvDir', dest='csvDir', type=str, + help='CSV collection containing features', required=True) + parser.add_argument('--padding', dest='padding', type=str, + help='Number of images to capture outside the cutoff', required=False) + parser.add_argument('--feature', dest='feature', type=str, + help='Feature to use to subset data', required=True) + parser.add_argument('--filePattern', 
dest='filePattern', type=str,
+                        help='Filename pattern used to separate data', required=True)
+    parser.add_argument('--groupVar', dest='groupVar', type=str,
+                        help='variables to group by in a section', required=True)
+    parser.add_argument('--inpDir', dest='inpDir', type=str,
+                        help='Input image collection to be processed by this plugin', required=True)
+    parser.add_argument('--percentile', dest='percentile', type=str,
+                        help='Percentile to remove', required=True)
+    parser.add_argument('--removeDirection', dest='removeDirection', type=str,
+                        help='remove direction above or below percentile', required=True)
+    parser.add_argument('--sectionVar', dest='sectionVar', type=str,
+                        help='variables to divide larger sections', required=False)
+    parser.add_argument('--writeOutput', dest='writeOutput', type=str,
+                        help='write output image collection or not', required=False)
+    # Output arguments
+    parser.add_argument('--outDir', dest='outDir', type=str,
+                        help='Output collection', required=True)
+
+    # Parse the arguments
+    args = parser.parse_args()
+    csvDir = args.csvDir
+    logger.info('csvDir = {}'.format(csvDir))
+    padding = args.padding
+    padding = 0 if padding is None else int(padding)
+    logger.info('padding = {}'.format(padding))
+    feature = args.feature
+    logger.info('feature = {}'.format(feature))
+    filePattern = args.filePattern
+    logger.info('filePattern = {}'.format(filePattern))
+    groupVar = args.groupVar
+    logger.info('groupVar = {}'.format(groupVar))
+    inpDir = args.inpDir
+    if (Path.is_dir(Path(args.inpDir).joinpath('images'))):
+        # switch to the images folder if present
+        inpDir = str(Path(args.inpDir).joinpath('images').absolute())
+    logger.info('inpDir = {}'.format(inpDir))
+    percentile = float(args.percentile)
+    logger.info('percentile = {}'.format(percentile))
+    removeDirection = args.removeDirection
+    logger.info('removeDirection = {}'.format(removeDirection))
+    sectionVar = args.sectionVar
+    sectionVar = '' if sectionVar is None else sectionVar
+    logger.info('sectionVar = {}'.format(sectionVar))
+    writeOutput = True if args.writeOutput is None else args.writeOutput == 'true'
+    logger.info('writeOutput = {}'.format(writeOutput))
+    outDir = args.outDir
+    logger.info('outDir = {}'.format(outDir))
+
+    # create metadata and images folders in outDir
+    if not os.path.isdir(os.path.join(outDir, 'images')):
+        os.mkdir(os.path.join(outDir, 'images'))
+    if not os.path.isdir(os.path.join(outDir, 'metadata_files')):
+        os.mkdir(os.path.join(outDir, 'metadata_files'))
+
+    # Surround with try/finally for proper error catching
+    try:
+        main(inpDir=inpDir,
+             csvDir=csvDir,
+             outDir=outDir,
+             filePattern=filePattern,
+             groupVar=groupVar,
+             percentile=percentile,
+             removeDirection=removeDirection,
+             sectionVar=sectionVar,
+             feature=feature,
+             padding=padding,
+             writeOutput=writeOutput)
+
+    except Exception:
+        traceback.print_exc()
+
+    finally:
+        logger.info('exiting workflow..')
+        # Exit the program
+        sys.exit()
\ No newline at end of file

From 8a5252fb76ded52fcadd0c25af939da4af9294b4 Mon Sep 17 00:00:00 2001
From: hamshkhawar
Date: Thu, 18 Jan 2024 07:50:32 -0600
Subject: [PATCH 04/19] created plugin structure

---
 .../package-release.sh                        |  16 ++
 .../feature-subsetting-plugin/run-docker.sh   |  23 +++
 .../clustering/feature_subsetting/__init__.py |   2 +
 .../clustering/feature_subsetting/__main__.py | 156 ++++++++++++++++++
 .../feature_subsetting/feature_subset.py}     |   1 +
 .../tests/__init__.py                         |   0
 .../tests/conftest.py                         |   0
 .../tests/test_cli.py                         |   0
 .../tests/test_feature_subsetting.py          |   0
 9 files changed, 198
insertions(+) create mode 100644 clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__init__.py create mode 100644 clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py rename clustering/feature-subsetting-plugin/src/{main.py => polus/plugins/clustering/feature_subsetting/feature_subset.py} (99%) create mode 100644 clustering/feature-subsetting-plugin/tests/__init__.py create mode 100644 clustering/feature-subsetting-plugin/tests/conftest.py create mode 100644 clustering/feature-subsetting-plugin/tests/test_cli.py create mode 100644 clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py diff --git a/clustering/feature-subsetting-plugin/package-release.sh b/clustering/feature-subsetting-plugin/package-release.sh index e69de29bb..8e53414b9 100644 --- a/clustering/feature-subsetting-plugin/package-release.sh +++ b/clustering/feature-subsetting-plugin/package-release.sh @@ -0,0 +1,16 @@ +# This script is designed to help package a new version of a plugin + +# Get the new version +version=$( None: + """Cluster data using HDBSCAN.""" + logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--filePattern = {file_pattern}") + # Regular expression for grouping. + logger.info(f"--groupingPattern = {grouping_pattern}") + # Whether to average data for each group. + logger.info(f"--averageGroups = {average_groups}") + # Name of column to use for grouping. + logger.info(f"--labelCol = {label_col}") + # Minimum cluster size for clustering using HDBSCAN. + logger.info(f"--minClusterSize = {min_cluster_size}") + # Set outlier cluster id as 1. + logger.info(f"--incrementOutlierId = {increment_outlier_id}") + logger.info(f"--outDir = {out_dir}") + + inp_dir = inp_dir.resolve() + out_dir = out_dir.resolve() + + assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again" + assert ( + out_dir.exists() + ), f"{out_dir} does not exist!! Please check output path again" + + num_workers = max([cpu_count(), 2]) + + files = fp.FilePattern(inp_dir, file_pattern) + + if files is None: + msg = f"No tabular files found. 
Please check {file_pattern} again" + raise ValueError(msg) + + if preview: + with Path.open(Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[str, Any] = { + "filepattern": file_pattern, + "outDir": [], + } + for file in files(): + out_name = file[1][0].name.replace( + "".join(file[1][0].suffixes), + f"_hdbscan{hd.POLUS_TAB_EXT}", + ) + out_json["outDir"].append(out_name) + json.dump(out_json, jfile, indent=2) + else: + with preadator.ProcessManager( + name="Cluster data using HDBSCAN", + num_processes=num_workers, + threads_per_process=2, + ) as pm: + for file in tqdm( + files(), + total=len(files()), + desc="Clustering data", + mininterval=5, + initial=0, + unit_scale=True, + colour="cyan", + ): + pm.submit_process( + hd.hdbscan_clustering, + file[1][0], + min_cluster_size, + out_dir, + grouping_pattern, + label_col, + average_groups, + increment_outlier_id, + ) + pm.join_processes() + + +if __name__ == "__main__": + app() \ No newline at end of file diff --git a/clustering/feature-subsetting-plugin/src/main.py b/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py similarity index 99% rename from clustering/feature-subsetting-plugin/src/main.py rename to clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py index a942d67c2..ccc179e74 100644 --- a/clustering/feature-subsetting-plugin/src/main.py +++ b/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py @@ -1,3 +1,4 @@ +"""Feature Subsetting Plugin.""" import argparse, logging, subprocess, time, multiprocessing, sys import os import filepattern diff --git a/clustering/feature-subsetting-plugin/tests/__init__.py b/clustering/feature-subsetting-plugin/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/clustering/feature-subsetting-plugin/tests/conftest.py b/clustering/feature-subsetting-plugin/tests/conftest.py new file mode 100644 index 000000000..e69de29bb diff --git a/clustering/feature-subsetting-plugin/tests/test_cli.py b/clustering/feature-subsetting-plugin/tests/test_cli.py new file mode 100644 index 000000000..e69de29bb diff --git a/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py b/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py new file mode 100644 index 000000000..e69de29bb From 0f3ddee814d0e6fc5c1e5fffd5a149e403ca22bc Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 25 Jan 2024 11:21:08 -0600 Subject: [PATCH 05/19] updating filepattern and vaex package --- .../feature-subsetting-plugin/README.md | 52 +- .../example/summary.txt | 14 + .../feature-subsetting-plugin/plugin.json | 255 +++++----- .../feature-subsetting-plugin/run-docker.sh | 31 +- .../clustering/feature_subsetting/__main__.py | 174 ++++--- .../feature_subsetting/feature_subset.py | 468 +++++++++--------- .../tests/__init__.py | 1 + .../tests/conftest.py | 58 +++ .../tests/test_cli.py | 92 ++++ .../tests/test_feature_subsetting.py | 41 ++ 10 files changed, 721 insertions(+), 465 deletions(-) create mode 100644 clustering/feature-subsetting-plugin/example/summary.txt diff --git a/clustering/feature-subsetting-plugin/README.md b/clustering/feature-subsetting-plugin/README.md index 24ccba663..3ba6ec8d8 100644 --- a/clustering/feature-subsetting-plugin/README.md +++ b/clustering/feature-subsetting-plugin/README.md @@ -1,24 +1,25 @@ -# Feature Data Subset +# Feature Data Subset(v0.2.0-dev) -This WIPP plugin subsets data based on a given 
feature. It works in conjunction with the `polus-feature-extraction-plugin`, where the feature extraction plugin can be used to extract features such as the mean intensity of every image in the input image collection.
+This WIPP plugin subsets data based on a given feature. It works in conjunction with the `polus-feature-extraction-plugin`, where the feature extraction plugin can be used to extract features such as the mean intensity of every image in the input image collection.

 # Usage
-The details and usage of the plugin inputs are provided in the section below. In addition to the subsetted data, the output directory also contains a `summary.txt` file which records which images were kept and their new filenames if they were renamed.
-
-### Explanation of inputs
-Some of the inputs are pretty straightforward and are used commonly across most WIPP plugins. This section provides some details and examples of the inputs that may be a little complicated. The image collection with the following pattern will be used as an example: `r{r+}_t{t+}_p{p+}_z{z+}_c{c+}.ome.tif`, where r, t, p, z, and c stand for replicate, timepoint, position, z-position, and channel respectively. Assume we have 5 replicates, 3 timepoints, 50 positions, 10 z-planes, and 4 channels.
+The details and usage of the plugin inputs are provided in the section below. In addition to the subsetted data, the output directory also contains a `summary.txt` file which records which images were kept and their new filenames if they were renamed.
+
+### Explanation of inputs
+Some of the inputs are pretty straightforward and are used commonly across most WIPP plugins. This section provides some details and examples of the inputs that may be a little complicated.
The image collection with the following pattern will be used as an example: `r{r+}_t{t+}_p{p+}_z{z+}_c{c+}.ome.tif`, where r, t, p, z, and c stand for replicate, timepoint, position, z-position, and channel respectively. Assume we have 5 replicates, 3 timepoints, 50 positions, 10 z-planes, and 4 channels.
+
+1. `inpDir` - This contains the path to the input image collection to subset data from.
+2. `tabularDir` - This contains the path to the tabular files with file formats (`.csv`, `.arrow`, `.parquet`) containing the feature values for each image. This can be the output of the feature extraction or Nyxus plugin.
+3. `filePattern` - Filepattern of the input images.
+4. `imageFeature` - Feature in the tabular data containing image filenames.
+5. `tabularFeature` - Tabular feature that will be used to filter images.
+6. `groupVar` - This is a mandatory input across which to subset data. It can take either 1 or 2 variables as input; if 2 variables are provided, the second variable is treated as the minor grouping variable. In our example, if `z` is provided as input, then within a subcollection the mean of the feature value is taken over all images with the same z. The z positions are then filtered based on the `percentile` and `removeDirection` inputs. If `z,c` is provided instead, then `c` is treated as the minor grouping variable, which means that the mean is taken over all images with the same z for each channel. The plugin also ensures that the same z positions are filtered out across c.
+7. `percentile` and `removeDirection` - These two variables denote the criteria with which images are filtered. For example, if percentile is `0.1` and removeDirection is set to `Below`, then images with a feature value below the 10th percentile will be removed. On the other hand, if removeDirection is set to `Above`, then all images with a feature value greater than the 10th percentile will be removed. This enables data subsetting from both `brightfield` and `darkfield` microscopy images.
+
+ **Optional Arguments**
+
 8. `sectionVar` - This is an optional input to segregate the input image collection into sub-collections. The analysis will be done separately for each sub-collection. In our example, if the user enters `r,t` as the sectionVar, then we will have 15 sub-collections (5*3), one for each combination of timepoint and replicate. If the user enters `r` as sectionVar, then we will have 5 sub-collections, one for each replicate. If the user wants to consider the whole image collection as a single section, then no input is required. NOTE: As a post-processing step, the same number of images will be subsetted across different sections.
 9. `padding` - This is an optional variable with a default value of 0. A padding of 3 means that 3 additional planes will be captured on either side of the subsetted data. This can be used as a sanity check to ensure that the subsetted data captures the images we want.
For example, if the z values 5,6,7 were initially filtered out, then a padding of 3 means that the output dataset will have z positions 2,3,4,5,6,7,8,9,10, if all of them exist.
 10. `writeOutput` - This is an optional argument with default value `True`. If it is set to true, then both the output image collection and the `summary.txt` file will be created. If it is set to false, then the output directory will only contain `summary.txt`. This option enables the user to tune hyperparameters such as percentile, removeDirection, and feature without actually creating the output image collection.
@@ -38,19 +39,20 @@ If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.

 ## Options

-This plugin takes ten input arguments and one output argument:
+This plugin takes twelve input arguments and one output argument:

 | Name | Description | I/O | Type |
 | ------------------- | ----------------------------------------------------- | ------ | ------------- |
-| `--csvDir` | CSV collection containing features | Input | csvCollection |
-| `--padding` | Number of images to capture outside the cutoff | Input | int |
-| `--feature` | Feature to use to subset data | Input | string |
+| `--inpDir` | Input image collection to be processed by this plugin | Input | collection |
+| `--tabularDir` | Path to tabular data | Input | genericData |
 | `--filePattern` | Filename pattern used to separate data | Input | string |
+| `--imageFeature` | Feature in tabular data with image filenames | Input | string |
+| `--tabularFeature` | Tabular feature to filter image files | Input | string |
+| `--padding` | Number of images to capture outside the cutoff | Input | integer |
 | `--groupVar` | variables to group by in a section | Input | string |
-| `--inpDir` | Input image collection to be processed by this plugin | Input | collection |
-| `--percentile` | Percentile to remove | Input | int |
+| `--percentile` | Percentile to remove | Input | float |
 | `--removeDirection` | remove direction above or below percentile | Input | string |
 | `--sectionVar` | variables to divide larger sections | Input | string |
 | `--writeOutput` | write output image collection or not | Input | boolean |
-| `--outDir` | Output collection | Output | collection |
-
+| `--outDir` | Output collection | Output | genericData |
+| `--preview` | Generate a JSON file with outputs | Output | JSON |
diff --git a/clustering/feature-subsetting-plugin/example/summary.txt b/clustering/feature-subsetting-plugin/example/summary.txt
new file mode 100644
index 000000000..7e9662eb2
--- /dev/null
+++ b/clustering/feature-subsetting-plugin/example/summary.txt
@@ -0,0 +1,14 @@
+------------------------------------------------
+
+ Files :
+
+x00_y01_p03_c1.ome.tif -----> x00_y01_p01_c1.ome.tif
+x00_y01_p03_c2.ome.tif -----> x00_y01_p01_c2.ome.tif
+x00_y01_p03_c3.ome.tif -----> x00_y01_p01_c3.ome.tif
+x00_y01_p03_c4.ome.tif -----> x00_y01_p01_c4.ome.tif
+x00_y01_p03_c5.ome.tif -----> x00_y01_p01_c5.ome.tif
+x00_y01_p04_c1.ome.tif -----> x00_y01_p02_c1.ome.tif
+x00_y01_p04_c2.ome.tif -----> x00_y01_p02_c2.ome.tif
+x00_y01_p04_c3.ome.tif -----> x00_y01_p02_c3.ome.tif
+x00_y01_p04_c4.ome.tif -----> x00_y01_p02_c4.ome.tif
+x00_y01_p04_c5.ome.tif -----> x00_y01_p02_c5.ome.tif
diff --git a/clustering/feature-subsetting-plugin/plugin.json b/clustering/feature-subsetting-plugin/plugin.json
index a60a0f7f8..68e7c463c 100644
--- a/clustering/feature-subsetting-plugin/plugin.json
+++ b/clustering/feature-subsetting-plugin/plugin.json
@@
-2,10 +2,10 @@ "name": "Feature Subsetting", "version": "0.2.0-dev", "title": "Feature Subsetting", - "description": "Subset data using a given feature", + "description": "Subset data using a given feature.", "author": "Gauhar Bains (gauhar.bains@labshare.org) and Hamdah Shafqat Abbasi (hamdahshafqat.abbasi@nih.gov)", "institution": "National Center for Advancing Translational Sciences, National Institutes of Health", - "repository": "https://github.com/PolusAI/polus-plugin", + "repository": "https://github.com/PolusAI/polus-plugins", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", "containerId": "polusai/feature-subsetting-plugin:0.2.0-dev", @@ -14,131 +14,160 @@ "-m", "polus.plugins.clustering.feature_subsetting" ], - "inputs": [ - { - "name": "inpDir", + "inputs": { + "inpDir": { "type": "collection", - "description": "Input image collection to be processed by this plugin", - "required": true + "title": "Input image directory", + "description": "Input image directory.", + "required": "True" }, - { - "name": "filePattern", + "tabularDir": { + "type": "genericData", + "title": "Input tabular directory", + "description": "Path to directory containing tabular data.", + "required": "True" + }, + "filePattern": { + "type": "string", + "title": "Filename pattern", + "description": "Filename pattern used to separate data.", + "required": "True" + }, + "imageFeature": { "type": "string", - "description": "Filename pattern used to separate data", - "required": true + "title": "imageFeature", + "description": "Feature in tabular data containing image filenames.", + "required": "True" }, - { - "name": "groupVar", + "tabularFeature": { "type": "string", - "description": "variables to group by in a section", - "required": true + "title": "tabularFeature", + "description": "Feature in tabular data to subset image data.", + "required": "True" }, - { - "name": "sectionVar", + "padding": { + "type": "integer", + "title": "padding", + "description": "Number of images to capture outside the cutoff.", + "required": "False" + }, + "groupVar": { "type": "string", - "description": "variables to divide larger sections", - "required": false + "title": "groupVar", + "description": "variables to group by in a section.", + "required": "True" }, - { - "name": "csvDir", - "type": "csvCollection", - "description": "CSV collection containing features", - "required": true + "percentile": { + "type": "float", + "title": "percentile", + "description": "Percentile to remove.", + "required": "True" }, - { - "name": "feature", + "removeDirection": { "type": "string", - "description": "Feature to use to subset data", - "required": true - }, - { - "name": "percentile", - "type": "number", - "description": "Percentile to remove", - "required": true - }, - { - "name": "removeDirection", - "type": "enum", - "options": { - "values": [ - "Below", - "Above" - ] - }, - "description": "remove direction above or below percentile", - "required": true - }, - { - "name": "padding", + "title": "removeDirection", + "description": "Remove direction above or below percentile.", + "required": "False", + "default": "Below" + }, + "sectionVar": { "type": "string", - "description": "Number of images to capture outside the cutoff", - "required": false + "title": "sectionVar", + "description": "Variables to divide larger sections.", + "required": "False" + }, + "writeOutput": { + "type": "boolean", + "title": "writeOutput", + "description": "Write output image collection or not.", + "required": "False" }, - { - 
"name": "writeOutput", + "preview": { "type": "boolean", - "description": "write output image collection or not", - "required": false + "title": "Preview", + "description": "Generate an output preview.", + "required": "False" } - ], - "outputs": [ - { - "name": "outDir", - "type": "collection", - "description": "Output collection" + }, + "outputs": { + "outDir": { + "type": "genericData", + "description": "Output collection." } - ], - "ui": [ - { - "key": "inputs.inpDir", - "title": "Input collection", - "description": "Input image collection to be processed by this plugin" - }, - { - "key": "inputs.csvDir", - "title": "CSV collection", - "description": "CSV collection containing features" - }, - { - "key": "inputs.feature", - "title": "Feature", - "description": "Feature to use to subset data" - }, - { - "key": "inputs.percentile", - "title": "Percentile", - "description": "Percentile to remove" - }, - { - "key": "inputs.removeDirection", - "title": "Remove Direction", - "description": "remove direction above or below percentile" - }, - { - "key": "inputs.filePattern", + }, + "ui": { + "inpDir": { + "type": "collection", + "title": "Input image directory", + "description": "Input image directory.", + "required": "True" + }, + "tabularDir": { + "type": "genericData", + "title": "Input tabular directory", + "description": "Path to directory containing tabular data.", + "required": "True" + }, + "filePattern": { + "type": "string", "title": "Filename pattern", - "description": "Filename pattern used to separate data" - }, - { - "key": "inputs.groupVar", - "title": "Grouping Variables", - "description": "variables to group by in a section" - }, - { - "key": "inputs.sectionVar", - "title": "Section Variables", - "description": "variables to divide larger sections" - }, - { - "key": "inputs.padding", - "title": "Padding", - "description": "Number of images to capture outside the cutoff" - }, - { - "key": "inputs.writeOutput", - "title": "Write Output", - "description": "write output image collection or not" + "description": "Filename pattern used to separate data.", + "required": "True" + }, + "imageFeature": { + "type": "string", + "title": "imageFeature", + "description": "Feature in tabular data containing image filenames.", + "required": "True" + }, + "tabularFeature": { + "type": "string", + "title": "tabularFeature", + "description": "Feature in tabular data to subset image data.", + "required": "True" + }, + "padding": { + "type": "integer", + "title": "padding", + "description": "Number of images to capture outside the cutoff.", + "required": "False" + }, + "groupVar": { + "type": "string", + "title": "groupVar", + "description": "variables to group by in a section.", + "required": "True" + }, + "percentile": { + "type": "float", + "title": "percentile", + "description": "Percentile to remove.", + "required": "True" + }, + "removeDirection": { + "type": "string", + "title": "removeDirection", + "description": "Remove direction above or below percentile.", + "required": "False", + "default": "Below" + }, + "sectionVar": { + "type": "string", + "title": "sectionVar", + "description": "Variables to divide larger sections.", + "required": "False" + }, + "writeOutput": { + "type": "boolean", + "title": "writeOutput", + "description": "Write output image collection or not.", + "required": "False" + }, + "preview": { + "type": "boolean", + "title": "Preview", + "description": "Generate an output preview.", + "required": "False" } - ] -} \ No newline at end of file + } +} diff --git 
a/clustering/feature-subsetting-plugin/run-docker.sh b/clustering/feature-subsetting-plugin/run-docker.sh index ced143837..36cec9de8 100644 --- a/clustering/feature-subsetting-plugin/run-docker.sh +++ b/clustering/feature-subsetting-plugin/run-docker.sh @@ -5,19 +5,28 @@ datapath=$(readlink --canonicalize data) echo ${datapath} # Inputs -inpDir=${datapath}/input -filePattern=".*.csv" -groupingPattern="\w+$" -labelCol="species" -minClusterSize=3 +inpDir=${datapath}/input/images +tabularDir=${datapath}/input/tabular +filePattern="x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" +imageFeature="intensity_image" +padding=0 +groupVar="p,c" +percentile=0.8 +removeDirection="Below" +writeOutput=true outDir=${datapath}/output docker run -v ${datapath}:${datapath} \ - polusai/hdbscan-clustering-plugin:${version} \ + polusai/feature-subsetting-plugin:${version} \ --inpDir ${inpDir} \ + --tabularDir ${tabularDir} \ --filePattern ${filePattern} \ - --groupingPattern ${groupingPattern} \ - --labelCol ${labelCol} \ - --minClusterSize ${minClusterSize} \ - --incrementOutlierId \ - --outDir ${outDir} \ No newline at end of file + --imageFeature${imageFeature} \ + --tabularFeature ${tabularFeature} \ + --padding ${padding} \ + --groupVar ${groupVar} \ + --percentile ${percentile} \ + --groupVar ${groupVar} \ + --removeDirection ${removeDirection} \ + --writeOutput \ + --outDir ${outDir} diff --git a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py b/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py index 4402091d6..beab04c5b 100644 --- a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py +++ b/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py @@ -1,17 +1,11 @@ -"""Hdbscan Clustering Plugin.""" - -import json +"""Feature Subsetting Plugin.""" import logging -from multiprocessing import cpu_count +import shutil from pathlib import Path -from typing import Any from typing import Optional -import filepattern as fp import polus.plugins.clustering.feature_subsetting.feature_subset as fs -import preadator import typer -from tqdm import tqdm app = typer.Typer() @@ -24,13 +18,29 @@ logger.setLevel(logging.INFO) +def generate_preview( + out_dir: Path, +) -> None: + """Generate preview of the plugin outputs.""" + shutil.copy( + Path(__file__).parents[4].joinpath("example/summary.txt"), + out_dir, + ) + + @app.command() def main( # noqa: PLR0913 inp_dir: Path = typer.Option( ..., "--inpDir", "-i", - help="Path to folder with tabular files", + help="Path to the collection of input images.", + ), + tabular_dir: Path = typer.Option( + ..., + "--tabularDir", + "-t", + help="Path to the collection of tabular files containing features.", ), file_pattern: Optional[str] = typer.Option( ".*", @@ -38,35 +48,53 @@ def main( # noqa: PLR0913 "-f", help="Pattern use to parse filenames", ), - grouping_pattern: Optional[str] = typer.Option( + image_feature: str = typer.Option( None, - "--groupingPattern", - "-g", - help="Regular expression to group rows to capture groups.", + "--imageFeature", + "-if", + help="Image filenames feature in tabular data.", ), - average_groups: Optional[bool] = typer.Option( - False, - "--averageGroups", - "-a", - help="Whether to average data across groups. 
Requires capture groups.", - ), - label_col: Optional[str] = typer.Option( + tabular_feature: str = typer.Option( None, - "--labelCol", - "-l", - help="Name of column containing labels. Required only for grouping operations.", + "--tabularFeature", + "-tf", + help="Select tabular feature to subset data.", + ), + padding: Optional[int] = typer.Option( + 0, + "--padding", + "-p", + help="Number of images to capture outside the cutoff.", ), - min_cluster_size: int = typer.Option( + group_var: str = typer.Option( ..., - "--minClusterSize", - "-m", - help="Minimum cluster size.", + "--groupVar", + "-g", + help="variables to group by in a section.", + ), + percentile: float = typer.Option( + None, + "--percentile", + "-pc", + help="Percentile to remove.", + ), + remove_direction: Optional[str] = typer.Option( + "Below", + "--removeDirection", + "-r", + help="Remove direction above or below percentile.", + ), + section_var: Optional[str] = typer.Option( + None, + "--sectionVar", + "-s", + help="Variables to divide larger sections.", ), - increment_outlier_id: Optional[bool] = typer.Option( + write_output: Optional[bool] = typer.Option( False, - "--incrementOutlierId", - "-io", - help="Increments outlier ID to 1.", + "--writeOutput", + "-w", + help="Write output image collection or not.", ), out_dir: Path = typer.Option( ..., @@ -80,19 +108,18 @@ def main( # noqa: PLR0913 help="Output a JSON preview of files", ), ) -> None: - """Cluster data using HDBSCAN.""" + """Subset data using a given feature.""" logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--tabularDir = {tabular_dir}") + logger.info(f"--imageFeature = {image_feature}") + logger.info(f"--tabularFeature = {tabular_feature}") logger.info(f"--filePattern = {file_pattern}") - # Regular expression for grouping. - logger.info(f"--groupingPattern = {grouping_pattern}") - # Whether to average data for each group. - logger.info(f"--averageGroups = {average_groups}") - # Name of column to use for grouping. - logger.info(f"--labelCol = {label_col}") - # Minimum cluster size for clustering using HDBSCAN. - logger.info(f"--minClusterSize = {min_cluster_size}") - # Set outlier cluster id as 1. - logger.info(f"--incrementOutlierId = {increment_outlier_id}") + logger.info(f"--padding = {padding}") + logger.info(f"--groupVar = {group_var}") + logger.info(f"--percentile = {percentile}") + logger.info(f"--removeDirection = {remove_direction}") + logger.info(f"--sectionVar = {section_var}") + logger.info(f"--writeOutput = {write_output}") logger.info(f"--outDir = {out_dir}") inp_dir = inp_dir.resolve() @@ -103,54 +130,25 @@ def main( # noqa: PLR0913 out_dir.exists() ), f"{out_dir} does not exist!! Please check output path again" - num_workers = max([cpu_count(), 2]) - - files = fp.FilePattern(inp_dir, file_pattern) - - if files is None: - msg = f"No tabular files found. 
Please check {file_pattern} again" - raise ValueError(msg) - if preview: - with Path.open(Path(out_dir, "preview.json"), "w") as jfile: - out_json: dict[str, Any] = { - "filepattern": file_pattern, - "outDir": [], - } - for file in files(): - out_name = file[1][0].name.replace( - "".join(file[1][0].suffixes), - f"_hdbscan{hd.POLUS_TAB_EXT}", - ) - out_json["outDir"].append(out_name) - json.dump(out_json, jfile, indent=2) + generate_preview(out_dir) + else: - with preadator.ProcessManager( - name="Cluster data using HDBSCAN", - num_processes=num_workers, - threads_per_process=2, - ) as pm: - for file in tqdm( - files(), - total=len(files()), - desc="Clustering data", - mininterval=5, - initial=0, - unit_scale=True, - colour="cyan", - ): - pm.submit_process( - hd.hdbscan_clustering, - file[1][0], - min_cluster_size, - out_dir, - grouping_pattern, - label_col, - average_groups, - increment_outlier_id, - ) - pm.join_processes() + fs.feature_subset( + inp_dir, + tabular_dir, + out_dir, + file_pattern, + group_var, + percentile, + remove_direction, + section_var, + image_feature, + tabular_feature, + padding, + write_output, + ) if __name__ == "__main__": - app() \ No newline at end of file + app() diff --git a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py b/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py index ccc179e74..a84455e8e 100644 --- a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py +++ b/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py @@ -1,20 +1,32 @@ """Feature Subsetting Plugin.""" -import argparse, logging, subprocess, time, multiprocessing, sys +import logging import os -import filepattern -import pandas as pd import shutil from pathlib import Path -import traceback +from typing import Any + +import filepattern +import vaex +from tqdm import tqdm + +CHUNK_SIZE = 10000 + +logger = logging.getLogger(__name__) +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") + -def filter_planes(feature_dict, removeDirection, percentile): - """filter planes by the criteria specified by removeDirection - and percentile +def filter_planes( + feature_dict: dict, + remove_direction: str, + percentile: float, +) -> set[Any]: + """Filter planes by the criteria specified by remove_direction and percentile. 
Args: - feature_dict (dictionary): planes and respective feature value - removeDirection (string): remove above or below percentile - percentile (int): cutoff percentile + feature_dict : planes and respective feature value + remove_direction: remove above or below percentile + percentile : cutoff percentile Returns: set: planes that fit the criteria @@ -22,268 +34,268 @@ def filter_planes(feature_dict, removeDirection, percentile): planes = list(feature_dict.keys()) feat_value = [feature_dict[i] for i in planes] thresh = min(feat_value) + percentile * (max(feat_value) - min(feat_value)) - + # filter planes - if removeDirection == 'Below': + if remove_direction == "Below": keep_planes = [z for z in planes if feature_dict[z] >= thresh] else: keep_planes = [z for z in planes if feature_dict[z] <= thresh] - + return set(keep_planes) -def make_uniform(planes_dict, uniques, padding): - """ Ensure each section has the same number of images + +def make_uniform(planes_dict: dict, uniques: list[int], padding: int) -> dict: + """Ensure each section has the same number of images. This function makes the output collection uniform in - the sense that it preserves same number of planes across + the sense that it preserves same number of planes across sections. It also captures additional planes based on the value of the padding variable Args: - planes_dict (dict): planes to keep in different sections - uniques (list): unique values for the major grouping variable - padding (int): additional images to capture outside cutoff + planes_dict: planes to keep in different sections + uniques : unique values for the major grouping variable + padding : additional images to capture outside cutoff Returns: dictionary: dictionary containing planes to keep """ - - # max no. of planes + # max no. of planes max_len = max([len(i) for i in planes_dict.values()]) # max planes that can be added on each side min_ind = min([min(planes_dict[k]) for k in planes_dict]) max_ind = max([max(planes_dict[k]) for k in planes_dict]) max_add_left = uniques.index(min_ind) - max_add_right = len(uniques) - (uniques.index(max_ind)+1) - + max_add_right = len(uniques) - (uniques.index(max_ind) + 1) + # add planes in each section based on padding and max number of planes for section_id, planes in planes_dict.items(): len_to_add = max_len - len(planes) - len_add_left = min(int(len_to_add)/2+padding, max_add_left) - len_add_right = min(len_to_add - len_add_left+padding, max_add_right) - left_ind = int(uniques.index(min(planes)) - len_add_left) - right_ind = int(uniques.index(max(planes)) + len_add_right)+1 + len_add_left = min(int(len_to_add) / 2 + padding, max_add_left) + len_add_right = min(len_to_add - len_add_left + padding, max_add_right) + left_ind = int(uniques.index(min(planes)) - len_add_left) + right_ind = int(uniques.index(max(planes)) + len_add_right) + 1 planes_dict[section_id] = uniques[left_ind:right_ind] return planes_dict -def main(inpDir,csvDir,outDir,filePattern,groupVar,percentile, - removeDirection,sectionVar,feature,padding,writeOutput): - """Function containing the main login to subset data + +def feature_subset( # noqa : C901 + inp_dir: Path, + tabular_dir: Path, + out_dir: Path, + file_pattern: str, + group_var: str, + percentile: float, + remove_direction: str, + section_var: str, + image_feature: str, + tabular_feature: str, + padding: int, + write_output: bool, +) -> None: + """Subsetting images based on feature values. 
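+
+    A minimal usage sketch (the directory paths are hypothetical; the
+    remaining argument values mirror this plugin's tests):
+
+        feature_subset(
+            inp_dir=Path("/data/images"),
+            tabular_dir=Path("/data/tabular"),
+            out_dir=Path("/data/output"),
+            file_pattern="x{x+}_y{y+}_p{p+}_c{c+}.ome.tif",
+            group_var="p,c",
+            percentile=0.8,
+            remove_direction="Below",
+            section_var=None,
+            image_feature="intensity_image",
+            tabular_feature="MEAN",
+            padding=0,
+            write_output=True,
+        )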
     Args:
-        inpDir (string): path to input image collection
-        csvDir (string): path to csv file containing features
-        outDir (string): path to output collection
-        filePattern (string): input image filepattern
-        groupVar (string): grouping variables
-        percentile (float): cutoff feature percentile
-        removeDirection (string): subset above or below percentile
-        sectionVar (string): sectioning variable
-        feature (string): feature to subset using
-        padding (int): capture additional images outside of cutoff
-        writeOutput (boolean): write output image collection or not
+        inp_dir: Path to the collection of input images.
+        tabular_dir: Path to the tabular data directory.
+        out_dir: Path to the output directory.
+        file_pattern: Pattern used to parse image file names.
+        group_var: Variables to group by in a section.
+        percentile: Feature percentile used as the cutoff.
+        remove_direction: Remove images above or below the percentile.
+        section_var: Variables to divide larger sections.
+        image_feature: Image filenames feature in tabular data.
+        tabular_feature: Tabular feature used to subset data.
+        padding: Additional images to capture outside the cutoff.
+        write_output: Whether to write the output image collection.
     """
+    tabular_dir_files = [
+        f
+        for f in Path(tabular_dir).iterdir()
+        if f.is_file()
+        and "".join(f.suffixes) in [".csv", ".arrow", ".parquet", ".fits"]
+    ]
 
-    # Get all file names in csvDir image collection
-    csvDir_files = [f.name for f in Path(csvDir).iterdir() if f.is_file() and "".join(f.suffixes)=='.csv']
-
-    # Get all file names in inpDir image collection
-    inpDir_files = [f.name for f in Path(inpDir).iterdir() if f.is_file() and "".join(f.suffixes)=='.ome.tif']
+    if len(tabular_dir_files) == 0:
+        msg = f"No tabular files detected. Please check {tabular_dir} again."
+        raise ValueError(msg)
 
-    # read and concat all csv files
-    for ind, file in enumerate(csvDir_files):
-        if ind == 0:
-            feature_df = pd.read_csv(os.path.join(csvDir, file), header=0)
+    # Get the column headers
+    headers = []
+    for in_file in tabular_dir_files:
+        df = vaex.open(in_file)
+        headers.append(list(df.columns))
+    headers = list(set(headers[0]).intersection(*headers))
+    logger.info("Merging the data along rows...")
+
+    featuredf = []
+    for in_file in tqdm(
+        tabular_dir_files,
+        total=len(tabular_dir_files),
+        desc="Vaex loading of file",
+    ):
+        if in_file.suffix == ".csv":
+            df = vaex.from_csv(in_file, chunk_size=100_000, convert=True)
         else:
-            feature_df = pd.concat([feature_df, pd.read_csv(os.path.join(csvDir, file), header=0)])
-
+            df = vaex.open(in_file)
+        df = df[list(headers)]
+        featuredf.append(df)
+
+    feature_df = vaex.concat(featuredf)
+
+    if feature_df.shape[0] == 0:
+        msg = f"Tabular files are empty. Please check {tabular_dir} again."
+        raise ValueError(msg)
+
     # store image name and its feature value
-    feature_dict = {k:v for k,v in zip(feature_df['Image'], feature_df[feature])}
+    feature_dict = dict(
+        zip(
+            list(feature_df[image_feature].to_numpy()),
+            list(feature_df[tabular_feature].to_numpy()),
+        ),
+    )
 
     # separate filepattern variables into different categories
-    _,var = filepattern.get_regex(filePattern)
-    grouping_variables = groupVar.split(',')
-    section_variables = sectionVar.split(',')
-    sub_section_variables = [v for v in var if v not in grouping_variables+section_variables]
+    fps = filepattern.FilePattern(inp_dir, file_pattern)
+    if len(fps) == 0:
+        msg = "No image files were detected. Please check the file pattern again!"
+ raise ValueError(msg) - # initialize filepattern object - fp = filepattern.FilePattern(inpDir, pattern=filePattern) - uniques = fp.uniques + uniques = fps.get_unique_values() + var = fps.get_variables() + grouping_variables = group_var.split(",") + if len(grouping_variables) > 1: + min_grouping_var, maj_grouping_var = ( + grouping_variables[1], + grouping_variables[0], + ) + gp_by = [min_grouping_var, maj_grouping_var] + else: + gp_by = [group_var] - [maj_grouping_var, min_grouping_var] = grouping_variables if len(grouping_variables)>1 else grouping_variables+[None] - keep_planes = {} + if section_var is not None: + section_variables = section_var.split(",") + sub_section_variables = [ + v for v in var if v not in grouping_variables + section_variables + ] + else: + sub_section_variables = [v for v in var if v not in grouping_variables] - logger.info('Iterating over sections...') + logger.info("Iterating over sections...") # single iteration of this loop gives all images in one section - for file in fp(group_by=sub_section_variables+grouping_variables): - - section_feat_dict = {} - section_keep_planes = [] - section_id = tuple([file[0][i] for i in section_variables]) if section_variables[0] else 1 - + + section_feat = [] + section_keep_planes = [] + keep_planes = {} + + for file in fps(group_by=gp_by): + section_feat_dict: dict[Any, Any] = {} + if section_var is not None: + section_id = tuple([file[0][i] for i in section_var.split(",")]) + else: + section_id = 1 + # iterate over files in one section - for f in file: - if min_grouping_var == None: - f[min_grouping_var] = None - - # stote feature values for images - if f[min_grouping_var] not in section_feat_dict: - section_feat_dict[f[min_grouping_var]] = {} - - if f[maj_grouping_var] not in section_feat_dict[f[min_grouping_var]]: - section_feat_dict[f[min_grouping_var]][f[maj_grouping_var]] = [] - - section_feat_dict[f[min_grouping_var]][f[maj_grouping_var]].append(feature_dict[f['file'].name]) - - # average feature value by grouping variable - for key1 in section_feat_dict: - for key2 in section_feat_dict[key1]: - section_feat_dict[key1][key2] = sum(section_feat_dict[key1][key2])/len(section_feat_dict[key1][key2]) - - # find planes to keep based on specified criteria - section_keep_planes.append(filter_planes(section_feat_dict[key1],removeDirection, percentile)) - - # keep same planes within a section, across the minor grouping variable - section_keep_planes = list(section_keep_planes[0].union(*section_keep_planes)) - section_keep_planes = [i for i in range(min(section_keep_planes), max(section_keep_planes)+1) if i in uniques[maj_grouping_var]] - keep_planes[section_id] = section_keep_planes - - # keep same number of planes across different sections - keep_planes = make_uniform(keep_planes, uniques[maj_grouping_var], padding) - + + fm = file[1][0][0] + fname = file[1][0][1][0].name + + if min_grouping_var is None: + fm[min_grouping_var] = None + + if fm[min_grouping_var] not in section_feat_dict: + section_feat_dict[fm[min_grouping_var]] = {} + + if fm[maj_grouping_var] not in section_feat_dict[fm[min_grouping_var]]: + section_feat_dict[fm[min_grouping_var]][fm[maj_grouping_var]] = [] + + section_feat_dict[fm[min_grouping_var]][fm[maj_grouping_var]].append( + feature_dict[fname], + ) + + section_feat.append(section_feat_dict) + + sectionfeat: dict[Any, Any] = {} + for f in section_feat: + for k, v in f.items(): + if k not in sectionfeat: + sectionfeat[k] = {} + sectionfeat[k].update(v) + + # average feature value by grouping variable + 
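+    # sectionfeat maps each minor-group value to a dict of
+    # {major-group value: [feature values]}; the loop below collapses each
+    # list to its mean before the planes are filtered with filter_planes.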
+ for key1 in sectionfeat: + for key2 in sectionfeat[key1]: + sectionfeat[key1][key2] = sum(sectionfeat[key1][key2]) / len( + sectionfeat[key1][key2], + ) + + # find planes to keep based on specified criteria + section_keep_planes.append( + filter_planes(sectionfeat[key1], remove_direction, percentile), + ) + + # keep same planes within a section, across the minor grouping variable + section_keep_planes = list(section_keep_planes[0].union(*section_keep_planes)) + section_keep_planes = [ + i + for i in range( # type: ignore + min(section_keep_planes), + max(section_keep_planes) + 1, # type: ignore + ) + if i in uniques[maj_grouping_var] + ] + keep_planes[section_id] = section_keep_planes + + # # keep same number of planes across different sections + keep_planes = make_uniform(keep_planes, list(uniques[maj_grouping_var]), padding) + # start writing summary.txt - summary = open(os.path.join(outDir, 'metadata_files', 'summary.txt'), 'w') + summary = Path.open(Path(out_dir, "summary.txt"), "w") + + summary.write("\n Files : \n \n") + # update summary.txt with section renaming info - logger.info('renaming subsetted data') + logger.info("renaming subsetted data") - # reinitialize filepattern object - fp = filepattern.FilePattern(inpDir, pattern=filePattern) + for file in fps(group_by=sub_section_variables + grouping_variables): + if section_var is not None: + section_id = tuple([file[0][i] for i in section_var.split(",")]) + else: + section_id = 1 - # rename subsetted data - for file in fp(group_by=sub_section_variables+grouping_variables): - section_id = tuple([file[0][i] for i in section_variables]) if section_variables[0] else 1 section_keep_planes = keep_planes[section_id] - rename_map = {k:v for k,v in zip(keep_planes[section_id], uniques[maj_grouping_var])} - - # update summary.txt with section renaming info - summary.write('------------------------------------------------ \n') - if sectionVar.strip(): - summary.write('Section : {} \n'.format({k:file[0][k] for k in section_variables})) - logger.info('Renaming files from section : {} \n'.format({k:file[0][k] for k in section_variables})) - summary.write('\nThe following values of "{}" variable have been renamed: \n'.format(maj_grouping_var)) - for k,v in rename_map.items(): - summary.write('{} ---> {} \n'.format(k,v)) - summary.write('\n Files : \n \n') - - # rename and write output - for f in file: - if f[maj_grouping_var] not in keep_planes[section_id]: - continue - - # old and new file name - old_file_name = f['file'].name - file_name_dict = {k.upper():v for k,v in f.items() if k!='file'} - file_name_dict[maj_grouping_var.upper()] = rename_map[f[maj_grouping_var]] - new_file_name = fp.get_matching(**file_name_dict)[0]['file'].name - - # if write output collection - if writeOutput: - shutil.copy2(os.path.join(inpDir, old_file_name),os.path.join(outDir, 'images', new_file_name)) - - summary.write('{} -----> {} \n'.format(old_file_name, new_file_name)) - summary.close() - -if __name__=="__main__": - # Initialize the logger - logging.basicConfig(format='%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s', - datefmt='%d-%b-%y %H:%M:%S') - logger = logging.getLogger("main") - logger.setLevel(logging.INFO) - - ''' Argument parsing ''' - logger.info("Parsing arguments...") - parser = argparse.ArgumentParser(prog='main', description='Subset data using a given feature') - - # Input arguments - parser.add_argument('--csvDir', dest='csvDir', type=str, - help='CSV collection containing features', required=True) - parser.add_argument('--padding', 
dest='padding', type=str, - help='Number of images to capture outside the cutoff', required=False) - parser.add_argument('--feature', dest='feature', type=str, - help='Feature to use to subset data', required=True) - parser.add_argument('--filePattern', dest='filePattern', type=str, - help='Filename pattern used to separate data', required=True) - parser.add_argument('--groupVar', dest='groupVar', type=str, - help='variables to group by in a section', required=True) - parser.add_argument('--inpDir', dest='inpDir', type=str, - help='Input image collection to be processed by this plugin', required=True) - parser.add_argument('--percentile', dest='percentile', type=str, - help='Percentile to remove', required=True) - parser.add_argument('--removeDirection', dest='removeDirection', type=str, - help='remove direction above or below percentile', required=True) - parser.add_argument('--sectionVar', dest='sectionVar', type=str, - help='variables to divide larger sections', required=False) - parser.add_argument('--writeOutput', dest='writeOutput', type=str, - help='write output image collection or not', required=False) - # Output arguments - parser.add_argument('--outDir', dest='outDir', type=str, - help='Output collection', required=True) - - # Parse the arguments - args = parser.parse_args() - csvDir = args.csvDir - logger.info('csvDir = {}'.format(csvDir)) - padding = args.padding - padding = 0 if padding==None else int(padding) - logger.info('padding = {}'.format(padding)) - feature = args.feature - logger.info('feature = {}'.format(feature)) - filePattern = args.filePattern - logger.info('filePattern = {}'.format(filePattern)) - groupVar = args.groupVar - logger.info('groupVar = {}'.format(groupVar)) - inpDir = args.inpDir - if (Path.is_dir(Path(args.inpDir).joinpath('images'))): - # switch to images folder if present - fpath = str(Path(args.inpDir).joinpath('images').absolute()) - logger.info('inpDir = {}'.format(inpDir)) - percentile = float(args.percentile) - logger.info('percentile = {}'.format(percentile)) - removeDirection = args.removeDirection - logger.info('removeDirection = {}'.format(removeDirection)) - sectionVar = args.sectionVar - sectionVar = '' if sectionVar is None else sectionVar - logger.info('sectionVar = {}'.format(sectionVar)) - writeOutput = True if args.writeOutput==None else args.writeOutput == 'true' - logger.info('writeOutput = {}'.format(writeOutput)) - outDir = args.outDir - logger.info('outDir = {}'.format(outDir)) - - # create metadata and images folder in outDir - if not os.path.isdir(os.path.join(outDir, 'images')): - os.mkdir(os.path.join(outDir, 'images')) - if not os.path.isdir(os.path.join(outDir, 'metadata_files')): - os.mkdir(os.path.join(outDir, 'metadata_files')) - - # Surround with try/finally for proper error catching - try: - main(inpDir=inpDir, - csvDir=csvDir, - outDir=outDir, - filePattern=filePattern, - groupVar=groupVar, - percentile=percentile, - removeDirection=removeDirection, - sectionVar=sectionVar, - feature=feature, - padding=padding, - writeOutput=writeOutput) - - except Exception: - traceback.print_exc() - - finally: - logger.info('exiting workflow..') - # Exit the program - sys.exit() \ No newline at end of file + rename_map = dict(zip(keep_planes[section_id], uniques[maj_grouping_var])) + + if section_var is not None and section_var.strip(): + summary.write( + f"Section : {({k: file[0][k] for k in section_variables})} \n", + ) + logger.info( + "Renaming files from section : {} \n".format( + {k: file[0][k] for k in section_variables}, 
+ ), + ) + + # for k,v in rename_map.items(): + fm = file[1][0][0] + fname = file[1][0][1][0].name + + if fm[maj_grouping_var] not in keep_planes[section_id]: + continue + + # old and new file name + old_file_name = fname + + file_name_dict = dict(fm.items()) + file_name_dict[maj_grouping_var] = rename_map[fm[maj_grouping_var]] + + new_file_name = fps.get_matching(**file_name_dict)[0][1][0].name + + # if write output collection + if write_output: + shutil.copy2(Path(inp_dir, old_file_name), Path(out_dir, new_file_name)) + + summary.write(f"{old_file_name} -----> {new_file_name} \n") + summary.close() diff --git a/clustering/feature-subsetting-plugin/tests/__init__.py b/clustering/feature-subsetting-plugin/tests/__init__.py index e69de29bb..9ec7171d9 100644 --- a/clustering/feature-subsetting-plugin/tests/__init__.py +++ b/clustering/feature-subsetting-plugin/tests/__init__.py @@ -0,0 +1 @@ +"""Feature Subsetting Plugin.""" diff --git a/clustering/feature-subsetting-plugin/tests/conftest.py b/clustering/feature-subsetting-plugin/tests/conftest.py index e69de29bb..6aee03ccb 100644 --- a/clustering/feature-subsetting-plugin/tests/conftest.py +++ b/clustering/feature-subsetting-plugin/tests/conftest.py @@ -0,0 +1,58 @@ +"""Test fixtures. + +Set up all data used in tests. +""" +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture( + params=[ + (500, ".csv"), + ], +) +def get_params(request: pytest.FixtureRequest) -> tuple[int, str]: + """To get the parameter of the fixture.""" + return request.param + + +@pytest.fixture() +def generate_synthetic_data( + get_params: tuple[int, str], +) -> tuple[Path, Path, Path, str]: + """Generate tabular data.""" + nrows, file_extension = get_params + input_directory = Path(tempfile.mkdtemp(prefix="inpDir_", dir=Path.cwd())) + tabular_directory = Path(tempfile.mkdtemp(prefix="tabularDir_", dir=Path.cwd())) + output_directory = Path(tempfile.mkdtemp(prefix="out_", dir=Path.cwd())) + rng = np.random.default_rng() + channels = 5 + zpos = 4 + nrows = 3 + for c in range(channels): + for z in range(zpos): + file_name = Path(input_directory, f"x00_y01_p0{z}_c{c}.ome.tif") + Path.open(Path(file_name), "a").close() + + tabular_data = { + "intensity_image": [file_name.name] * nrows, + "MEAN": rng.random(nrows).tolist(), + "MEAN_ABSOLUTE_DEVIATION": rng.random(nrows).tolist(), + "MEDIAN": rng.random(nrows).tolist(), + "MODE": rng.random(nrows).tolist(), + } + outname = file_name.stem.split(".")[0] + + df = pd.DataFrame(tabular_data) + if file_extension == ".csv": + outpath = Path(tabular_directory, f"{outname}.csv") + df.to_csv(outpath, index=False) + if file_extension == ".arrow": + outpath = Path(tabular_directory, f"{outname}.arrow") + df.to_feather(outpath) + + return input_directory, tabular_directory, output_directory, file_extension diff --git a/clustering/feature-subsetting-plugin/tests/test_cli.py b/clustering/feature-subsetting-plugin/tests/test_cli.py index e69de29bb..4ea128cd0 100644 --- a/clustering/feature-subsetting-plugin/tests/test_cli.py +++ b/clustering/feature-subsetting-plugin/tests/test_cli.py @@ -0,0 +1,92 @@ +# """Test Command line Tool.""" +# from typer.testing import CliRunner +# from polus.plugins.clustering.feature_subsetting.__main__ import app +# import shutil +# from pathlib import Path + + +# def test_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: +# """Test the command line.""" +# inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data +# 
file_pattern='x{x+}_y{y+}_p{p+}_c{c+}.ome.tif' +# image_feature="intensity_image" +# tabular_feature = "MEAN" +# padding=0 +# group_var="p,c" + + +# runner = CliRunner() +# result = runner.invoke( +# app, +# [ +# "--inpDir", +# inp_dir, +# "--tabularDir", +# tabular_dir, +# "--filePattern", +# file_pattern, +# "--imageFeature", +# image_feature, +# "--tabularFeature", +# tabular_feature, +# "--padding", +# padding, +# "--groupVar", +# group_var, +# "--percentile", +# 0.8, +# "--removeDirection", +# "Below", +# "--writeOutput", +# "--outDir", +# out_dir, +# ], +# ) + +# assert result.exit_code == 0 +# shutil.rmtree(inp_dir) +# shutil.rmtree(out_dir) +# shutil.rmtree(tabular_dir) + + +# def test_short_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: +# """Test short cli command line.""" +# inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data +# file_pattern='x{x+}_y{y+}_p{p+}_c{c+}.ome.tif' +# image_feature="intensity_image" +# tabular_feature = "MEAN" +# padding=0 +# group_var="p,c" + +# runner = CliRunner() +# result = runner.invoke( +# app, +# [ +# "-i", +# inp_dir, +# "-t", +# tabular_dir, +# "-f", +# file_pattern, +# "-if", +# image_feature, +# "-tf", +# tabular_feature, +# "-p", +# padding, +# "-g", +# group_var, +# "-pc", +# 0.8, +# "-r", +# "Below", +# "-w", +# "-o", +# out_dir, +# ], +# ) + +# assert result.exit_code == 0 +# shutil.rmtree(inp_dir) +# shutil.rmtree(out_dir) +# shutil.rmtree(tabular_dir) diff --git a/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py b/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py index e69de29bb..7c9828cc4 100644 --- a/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py +++ b/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py @@ -0,0 +1,41 @@ +"""Test Feature Subsetting Plugin.""" +import shutil +from pathlib import Path + +import polus.plugins.clustering.feature_subsetting.feature_subset as fs + + +def test_feature_subset( + generate_synthetic_data: tuple[Path, Path, Path, str], +) -> None: + """Test images subsetting based on feature values.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + percentile = 0.8 + remove_direction = "Below" + group_var = "p,c" + write_output = True + + fs.feature_subset( + inp_dir=inp_dir, + tabular_dir=tabular_dir, + out_dir=out_dir, + file_pattern=file_pattern, + group_var=group_var, + percentile=percentile, + remove_direction=remove_direction, + section_var=None, + image_feature=image_feature, + tabular_feature=tabular_feature, + padding=padding, + write_output=write_output, + ) + + out_ext = [Path(f.name).suffix for f in out_dir.iterdir()] + assert len(out_ext) != 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) From 0768b5a79f412fae9fd89d235d88db4a01bf8cb0 Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 25 Jan 2024 11:22:24 -0600 Subject: [PATCH 06/19] updating filepattern and vaex package --- .../tests/test_cli.py | 167 +++++++++--------- 1 file changed, 83 insertions(+), 84 deletions(-) diff --git a/clustering/feature-subsetting-plugin/tests/test_cli.py b/clustering/feature-subsetting-plugin/tests/test_cli.py index 4ea128cd0..b41d795c8 100644 --- a/clustering/feature-subsetting-plugin/tests/test_cli.py +++ b/clustering/feature-subsetting-plugin/tests/test_cli.py @@ -1,92 +1,91 @@ -# """Test Command line Tool.""" -# from 
typer.testing import CliRunner -# from polus.plugins.clustering.feature_subsetting.__main__ import app -# import shutil -# from pathlib import Path +"""Test Command line Tool.""" +from typer.testing import CliRunner +from polus.plugins.clustering.feature_subsetting.__main__ import app +import shutil +from pathlib import Path -# def test_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: -# """Test the command line.""" -# inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data -# file_pattern='x{x+}_y{y+}_p{p+}_c{c+}.ome.tif' -# image_feature="intensity_image" -# tabular_feature = "MEAN" -# padding=0 -# group_var="p,c" +def test_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: + """Test the command line.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + group_var = "p,c" + runner = CliRunner() + result = runner.invoke( + app, + [ + "--inpDir", + inp_dir, + "--tabularDir", + tabular_dir, + "--filePattern", + file_pattern, + "--imageFeature", + image_feature, + "--tabularFeature", + tabular_feature, + "--padding", + padding, + "--groupVar", + group_var, + "--percentile", + 0.8, + "--removeDirection", + "Below", + "--writeOutput", + "--outDir", + out_dir, + ], + ) -# runner = CliRunner() -# result = runner.invoke( -# app, -# [ -# "--inpDir", -# inp_dir, -# "--tabularDir", -# tabular_dir, -# "--filePattern", -# file_pattern, -# "--imageFeature", -# image_feature, -# "--tabularFeature", -# tabular_feature, -# "--padding", -# padding, -# "--groupVar", -# group_var, -# "--percentile", -# 0.8, -# "--removeDirection", -# "Below", -# "--writeOutput", -# "--outDir", -# out_dir, -# ], -# ) + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) -# assert result.exit_code == 0 -# shutil.rmtree(inp_dir) -# shutil.rmtree(out_dir) -# shutil.rmtree(tabular_dir) +def test_short_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: + """Test short cli command line.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + group_var = "p,c" -# def test_short_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: -# """Test short cli command line.""" -# inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data -# file_pattern='x{x+}_y{y+}_p{p+}_c{c+}.ome.tif' -# image_feature="intensity_image" -# tabular_feature = "MEAN" -# padding=0 -# group_var="p,c" + runner = CliRunner() + result = runner.invoke( + app, + [ + "-i", + inp_dir, + "-t", + tabular_dir, + "-f", + file_pattern, + "-if", + image_feature, + "-tf", + tabular_feature, + "-p", + padding, + "-g", + group_var, + "-pc", + 0.8, + "-r", + "Below", + "-w", + "-o", + out_dir, + ], + ) -# runner = CliRunner() -# result = runner.invoke( -# app, -# [ -# "-i", -# inp_dir, -# "-t", -# tabular_dir, -# "-f", -# file_pattern, -# "-if", -# image_feature, -# "-tf", -# tabular_feature, -# "-p", -# padding, -# "-g", -# group_var, -# "-pc", -# 0.8, -# "-r", -# "Below", -# "-w", -# "-o", -# out_dir, -# ], -# ) - -# assert result.exit_code == 0 -# shutil.rmtree(inp_dir) -# shutil.rmtree(out_dir) -# shutil.rmtree(tabular_dir) + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) From b139dd3ece44527fbec9931efcb69aa4c7e9859a 
Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 25 Jan 2024 11:56:25 -0600 Subject: [PATCH 07/19] Added two more tests --- .../feature_subsetting/feature_subset.py | 2 -- .../tests/test_feature_subsetting.py | 30 +++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py b/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py index a84455e8e..06578672d 100644 --- a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py +++ b/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py @@ -277,8 +277,6 @@ def feature_subset( # noqa : C901 {k: file[0][k] for k in section_variables}, ), ) - - # for k,v in rename_map.items(): fm = file[1][0][0] fname = file[1][0][1][0].name diff --git a/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py b/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py index 7c9828cc4..b3691f10f 100644 --- a/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py +++ b/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py @@ -39,3 +39,33 @@ def test_feature_subset( shutil.rmtree(inp_dir) shutil.rmtree(out_dir) shutil.rmtree(tabular_dir) + + +def test_filter_planes() -> None: + """Test filter planes.""" + feature_dict = { + 1: 1236.597914951989, + 2: 1153.754875685871, + 3: 1537.3429175240055, + 4: 1626.0415809327849, + } + + percentile = 0.1 + remove_direction = "Below" + fn = fs.filter_planes( + feature_dict=feature_dict, + remove_direction=remove_direction, + percentile=percentile, + ) + + assert type(fn) == set + + +def test_make_uniform() -> None: + """Test each section contain same number of images.""" + planes_dict = {1: [3, 4]} + uniques = [1, 2, 3, 4] + padding = 0 + fn = fs.make_uniform(planes_dict=planes_dict, uniques=uniques, padding=padding) + + assert len(fn) != 0 From db103f03286b82c427fcbfd156b850d76010df24 Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 25 Jan 2024 12:45:45 -0600 Subject: [PATCH 08/19] deleted old repo --- .../Dockerfile | 24 -- .../polus-feature-subsetting-plugin/README.md | 56 ---- .../polus-feature-subsetting-plugin/VERSION | 1 - .../build-docker.sh | 4 - .../plugin.json | 139 --------- .../src/main.py | 288 ------------------ .../src/requirements.txt | 2 - 7 files changed, 514 deletions(-) delete mode 100644 clustering/polus-feature-subsetting-plugin/Dockerfile delete mode 100644 clustering/polus-feature-subsetting-plugin/README.md delete mode 100644 clustering/polus-feature-subsetting-plugin/VERSION delete mode 100644 clustering/polus-feature-subsetting-plugin/build-docker.sh delete mode 100644 clustering/polus-feature-subsetting-plugin/plugin.json delete mode 100644 clustering/polus-feature-subsetting-plugin/src/main.py delete mode 100644 clustering/polus-feature-subsetting-plugin/src/requirements.txt diff --git a/clustering/polus-feature-subsetting-plugin/Dockerfile b/clustering/polus-feature-subsetting-plugin/Dockerfile deleted file mode 100644 index babcd2385..000000000 --- a/clustering/polus-feature-subsetting-plugin/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ - -FROM polusai/bfio:2.1.9 - -# from bfio container -# ENV POLUS_EXT=".ome.tif" -# ENV POLUS_LOG="INFO" -# ENV EXEC_DIR="/opt/executables" -# ENV DATA_DIR="/data" - -COPY VERSION / - -ARG EXEC_DIR="/opt/executables" -ARG DATA_DIR="/data" 
- -RUN mkdir -p ${EXEC_DIR} \ - && mkdir -p ${DATA_DIR}/inputs \ - && mkdir ${DATA_DIR}/outputs - -COPY src ${EXEC_DIR}/ -WORKDIR ${EXEC_DIR} - -RUN pip3 install -r ${EXEC_DIR}/requirements.txt --no-cache-dir - -ENTRYPOINT ["python3", "/opt/executables/main.py"] \ No newline at end of file diff --git a/clustering/polus-feature-subsetting-plugin/README.md b/clustering/polus-feature-subsetting-plugin/README.md deleted file mode 100644 index 24ccba663..000000000 --- a/clustering/polus-feature-subsetting-plugin/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Feature Data Subset - -This WIPP plugin subsets data based on a given feature. It works in conjunction with the `polus-feature-extraction-plugin`, where the feature extraction plugin can be used to extract the features such as the mean intensity of every image in the input image collection. - -# Usage -The details and usage of the plugin inputs is provided in the section below. In addition to the subsetted data, the output directory also consists of a `summary.txt` file which has information as to what images were kept and their new filename if they were renamed. - -### Explanation of inputs -Some of the inputs are pretty straighforward and are used commonly across most WIPP plugins. This section is used to provide some details and examples of the inputs that may be a little complicated. The image collection with the following pattern will be used as an example : `r{r+}_t{t+}_p{p+}_z{z+}_c{c+}.ome.tif`, where r,t,p,z,c stand for replicate, timepoint, positon,z-positon, and channel respectively. Consider we have 5 replicates, 3 timepoints, 50 positions, 10 z-planes and 4 channels. - -1. `inpDir` - This contains the path to the input image collection to subset data from. -2. `filePattern` - Filepattern of the input images -3. `groupVar` - This is a mandatory input across which to subset data. This can take either 1 or 2 variables as input and if 2 variables are provided then the second variable will be treated as the minor grouping variable. In our example, if the `z` is provided as input, then within a subcollection, the mean of the feature value will be taken for all images with the same z. Then the z positions will be filtered out based on the input of `percentile` and `removeDirection` variables. Now if `z,c` are provided as input, then 'c' will be treated as the minor grouping variable which means that the mean will be taken for all images with the same z for each channel. Also, the plugin will ensures that the same values of z positions are filtered out across c. -4. `csvDir` - This contains the path to the csv collection containing the feature values for each image. This can be the output of the feature extraction plugin. -5. `feature` - The column name from the csv file that will be used to filter images -6. `percentile` and `removeDirection` - These two variables denote the critieria with which images are filtered. For example, if percentile is `0.1` and removeDirection is set to `Below` then images with feature value below the 10th percentile will be removed. On the other hand, if removeDirection is set to above then all images with feature value greater than the 10th pecentile will be removed. This enables data subsetting from both `brighfield` and `darkfield` microscopy images. - - **Optional Arguments** - -8. `sectionVar` - This is an optional input to segregate the input image collection into sub-collections. The analysis will be done seperately for each sub-collection. 
In our example, if the user enters `r,t` as the sectionVar, then we will have 15 subcollections (5*3),1 for each combination of timepoint and replicate. If the user enters `r` as sectionVar, then we will have 5 sub collections, 1 for each replicate. If the user wants to consider the whole image collection as a single section, then no input is required. NOTE: As a post processing step, same number of images will be subsetted across different sections. -9. `padding` - This is an optional variable with default value of 0. A delay of 3 means that 3 additional planes will captured on either side of the subsetted data. This can be used as a sanity check to ensure that the subsetted data captures the images we want. For example, in our examples if the following z values were filtered out intitially - 5,6,7 ; then a delay of 3 means that the output dataset will have z positions 2,3,4,5,6,7,8,9,10 if all them exist. -10. `writeOutput` - This is an optional argument with default value `True`. If it is set to true, then both the output image collection and `summary.txt` file will be created. If it is set to false, then the output directory will only consist of summary.txt. This option enables the user to tune the hyperparameters such as percentile, removeDirecton, feature without actually creating the output image collection. - - - -Contact [Gauhar Bains](mailto:gauhar.bains@labshare.org) for more information. - -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). - -## Building - -To build the Docker image for the conversion plugin, run -`./build-docker.sh`. - -## Install WIPP Plugin - -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. 
- -## Options - -This plugin takes one input argument and one output argument: - -| Name | Description | I/O | Type | -| ------------------- | ----------------------------------------------------- | ------ | ------------- | -| `--csvDir` | CSV collection containing features | Input | csvCollection | -| `--padding` | Number of images to capture outside the cutoff | Input | int | -| `--feature` | Feature to use to subset data | Input | string | -| `--filePattern` | Filename pattern used to separate data | Input | string | -| `--groupVar` | variables to group by in a section | Input | string | -| `--inpDir` | Input image collection to be processed by this plugin | Input | collection | -| `--percentile` | Percentile to remove | Input | int | -| `--removeDirection` | remove direction above or below percentile | Input | string | -| `--sectionVar` | variables to divide larger sections | Input | string | -| `--writeOutput` | write output image collection or not | Input | boolean | -| `--outDir` | Output collection | Output | collection | - diff --git a/clustering/polus-feature-subsetting-plugin/VERSION b/clustering/polus-feature-subsetting-plugin/VERSION deleted file mode 100644 index a34eaa5d0..000000000 --- a/clustering/polus-feature-subsetting-plugin/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.1.11 \ No newline at end of file diff --git a/clustering/polus-feature-subsetting-plugin/build-docker.sh b/clustering/polus-feature-subsetting-plugin/build-docker.sh deleted file mode 100644 index d9ad13705..000000000 --- a/clustering/polus-feature-subsetting-plugin/build-docker.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -version=$(= thresh] - else: - keep_planes = [z for z in planes if feature_dict[z] <= thresh] - - return set(keep_planes) - -def make_uniform(planes_dict, uniques, padding): - """ Ensure each section has the same number of images - - This function makes the output collection uniform in - the sense that it preserves same number of planes across - sections. It also captures additional planes based - on the value of the padding variable - - Args: - planes_dict (dict): planes to keep in different sections - uniques (list): unique values for the major grouping variable - padding (int): additional images to capture outside cutoff - - Returns: - dictionary: dictionary containing planes to keep - """ - - # max no. 
of planes - max_len = max([len(i) for i in planes_dict.values()]) - - # max planes that can be added on each side - min_ind = min([min(planes_dict[k]) for k in planes_dict]) - max_ind = max([max(planes_dict[k]) for k in planes_dict]) - max_add_left = uniques.index(min_ind) - max_add_right = len(uniques) - (uniques.index(max_ind)+1) - - # add planes in each section based on padding and max number of planes - for section_id, planes in planes_dict.items(): - len_to_add = max_len - len(planes) - len_add_left = min(int(len_to_add)/2+padding, max_add_left) - len_add_right = min(len_to_add - len_add_left+padding, max_add_right) - left_ind = int(uniques.index(min(planes)) - len_add_left) - right_ind = int(uniques.index(max(planes)) + len_add_right)+1 - planes_dict[section_id] = uniques[left_ind:right_ind] - return planes_dict - -def main(inpDir,csvDir,outDir,filePattern,groupVar,percentile, - removeDirection,sectionVar,feature,padding,writeOutput): - """Function containing the main login to subset data - - Args: - inpDir (string): path to input image collection - csvDir (string): path to csv file containing features - outDir (string): path to output collection - filePattern (string): input image filepattern - groupVar (string): grouping variables - percentile (float): cutoff feature percentile - removeDirection (string): subset above or below percentile - sectionVar (string): sectioning variable - feature (string): feature to subset using - padding (int): capture additional images outside of cutoff - writeOutput (boolean): write output image collection or not - """ - - # Get all file names in csvDir image collection - csvDir_files = [f.name for f in Path(csvDir).iterdir() if f.is_file() and "".join(f.suffixes)=='.csv'] - - # Get all file names in inpDir image collection - inpDir_files = [f.name for f in Path(inpDir).iterdir() if f.is_file() and "".join(f.suffixes)=='.ome.tif'] - - # read and concat all csv files - for ind, file in enumerate(csvDir_files): - if ind == 0: - feature_df = pd.read_csv(os.path.join(csvDir, file), header=0) - else: - feature_df = pd.concat([feature_df, pd.read_csv(os.path.join(csvDir, file), header=0)]) - - # store image name and its feature value - feature_dict = {k:v for k,v in zip(feature_df['Image'], feature_df[feature])} - - # seperate filepattern variables into different categories - _,var = filepattern.get_regex(filePattern) - grouping_variables = groupVar.split(',') - section_variables = sectionVar.split(',') - sub_section_variables = [v for v in var if v not in grouping_variables+section_variables] - - # initialize filepattern object - fp = filepattern.FilePattern(inpDir, pattern=filePattern) - uniques = fp.uniques - - [maj_grouping_var, min_grouping_var] = grouping_variables if len(grouping_variables)>1 else grouping_variables+[None] - keep_planes = {} - - logger.info('Iterating over sections...') - # single iteration of this loop gives all images in one section - for file in fp(group_by=sub_section_variables+grouping_variables): - - section_feat_dict = {} - section_keep_planes = [] - section_id = tuple([file[0][i] for i in section_variables]) if section_variables[0] else 1 - - # iterate over files in one section - for f in file: - if min_grouping_var == None: - f[min_grouping_var] = None - - # stote feature values for images - if f[min_grouping_var] not in section_feat_dict: - section_feat_dict[f[min_grouping_var]] = {} - - if f[maj_grouping_var] not in section_feat_dict[f[min_grouping_var]]: - section_feat_dict[f[min_grouping_var]][f[maj_grouping_var]] = [] - 
- section_feat_dict[f[min_grouping_var]][f[maj_grouping_var]].append(feature_dict[f['file'].name]) - - # average feature value by grouping variable - for key1 in section_feat_dict: - for key2 in section_feat_dict[key1]: - section_feat_dict[key1][key2] = sum(section_feat_dict[key1][key2])/len(section_feat_dict[key1][key2]) - - # find planes to keep based on specified criteria - section_keep_planes.append(filter_planes(section_feat_dict[key1],removeDirection, percentile)) - - # keep same planes within a section, across the minor grouping variable - section_keep_planes = list(section_keep_planes[0].union(*section_keep_planes)) - section_keep_planes = [i for i in range(min(section_keep_planes), max(section_keep_planes)+1) if i in uniques[maj_grouping_var]] - keep_planes[section_id] = section_keep_planes - - # keep same number of planes across different sections - keep_planes = make_uniform(keep_planes, uniques[maj_grouping_var], padding) - - # start writing summary.txt - summary = open(os.path.join(outDir, 'metadata_files', 'summary.txt'), 'w') - - logger.info('renaming subsetted data') - - # reinitialize filepattern object - fp = filepattern.FilePattern(inpDir, pattern=filePattern) - - # rename subsetted data - for file in fp(group_by=sub_section_variables+grouping_variables): - section_id = tuple([file[0][i] for i in section_variables]) if section_variables[0] else 1 - section_keep_planes = keep_planes[section_id] - rename_map = {k:v for k,v in zip(keep_planes[section_id], uniques[maj_grouping_var])} - - # update summary.txt with section renaming info - summary.write('------------------------------------------------ \n') - if sectionVar.strip(): - summary.write('Section : {} \n'.format({k:file[0][k] for k in section_variables})) - logger.info('Renaming files from section : {} \n'.format({k:file[0][k] for k in section_variables})) - summary.write('\nThe following values of "{}" variable have been renamed: \n'.format(maj_grouping_var)) - for k,v in rename_map.items(): - summary.write('{} ---> {} \n'.format(k,v)) - summary.write('\n Files : \n \n') - - # rename and write output - for f in file: - if f[maj_grouping_var] not in keep_planes[section_id]: - continue - - # old and new file name - old_file_name = f['file'].name - file_name_dict = {k.upper():v for k,v in f.items() if k!='file'} - file_name_dict[maj_grouping_var.upper()] = rename_map[f[maj_grouping_var]] - new_file_name = fp.get_matching(**file_name_dict)[0]['file'].name - - # if write output collection - if writeOutput: - shutil.copy2(os.path.join(inpDir, old_file_name),os.path.join(outDir, 'images', new_file_name)) - - summary.write('{} -----> {} \n'.format(old_file_name, new_file_name)) - summary.close() - -if __name__=="__main__": - # Initialize the logger - logging.basicConfig(format='%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s', - datefmt='%d-%b-%y %H:%M:%S') - logger = logging.getLogger("main") - logger.setLevel(logging.INFO) - - ''' Argument parsing ''' - logger.info("Parsing arguments...") - parser = argparse.ArgumentParser(prog='main', description='Subset data using a given feature') - - # Input arguments - parser.add_argument('--csvDir', dest='csvDir', type=str, - help='CSV collection containing features', required=True) - parser.add_argument('--padding', dest='padding', type=str, - help='Number of images to capture outside the cutoff', required=False) - parser.add_argument('--feature', dest='feature', type=str, - help='Feature to use to subset data', required=True) - parser.add_argument('--filePattern', 
dest='filePattern', type=str, - help='Filename pattern used to separate data', required=True) - parser.add_argument('--groupVar', dest='groupVar', type=str, - help='variables to group by in a section', required=True) - parser.add_argument('--inpDir', dest='inpDir', type=str, - help='Input image collection to be processed by this plugin', required=True) - parser.add_argument('--percentile', dest='percentile', type=str, - help='Percentile to remove', required=True) - parser.add_argument('--removeDirection', dest='removeDirection', type=str, - help='remove direction above or below percentile', required=True) - parser.add_argument('--sectionVar', dest='sectionVar', type=str, - help='variables to divide larger sections', required=False) - parser.add_argument('--writeOutput', dest='writeOutput', type=str, - help='write output image collection or not', required=False) - # Output arguments - parser.add_argument('--outDir', dest='outDir', type=str, - help='Output collection', required=True) - - # Parse the arguments - args = parser.parse_args() - csvDir = args.csvDir - logger.info('csvDir = {}'.format(csvDir)) - padding = args.padding - padding = 0 if padding==None else int(padding) - logger.info('padding = {}'.format(padding)) - feature = args.feature - logger.info('feature = {}'.format(feature)) - filePattern = args.filePattern - logger.info('filePattern = {}'.format(filePattern)) - groupVar = args.groupVar - logger.info('groupVar = {}'.format(groupVar)) - inpDir = args.inpDir - if (Path.is_dir(Path(args.inpDir).joinpath('images'))): - # switch to images folder if present - fpath = str(Path(args.inpDir).joinpath('images').absolute()) - logger.info('inpDir = {}'.format(inpDir)) - percentile = float(args.percentile) - logger.info('percentile = {}'.format(percentile)) - removeDirection = args.removeDirection - logger.info('removeDirection = {}'.format(removeDirection)) - sectionVar = args.sectionVar - sectionVar = '' if sectionVar is None else sectionVar - logger.info('sectionVar = {}'.format(sectionVar)) - writeOutput = True if args.writeOutput==None else args.writeOutput == 'true' - logger.info('writeOutput = {}'.format(writeOutput)) - outDir = args.outDir - logger.info('outDir = {}'.format(outDir)) - - # create metadata and images folder in outDir - if not os.path.isdir(os.path.join(outDir, 'images')): - os.mkdir(os.path.join(outDir, 'images')) - if not os.path.isdir(os.path.join(outDir, 'metadata_files')): - os.mkdir(os.path.join(outDir, 'metadata_files')) - - # Surround with try/finally for proper error catching - try: - main(inpDir=inpDir, - csvDir=csvDir, - outDir=outDir, - filePattern=filePattern, - groupVar=groupVar, - percentile=percentile, - removeDirection=removeDirection, - sectionVar=sectionVar, - feature=feature, - padding=padding, - writeOutput=writeOutput) - - except Exception: - traceback.print_exc() - - finally: - logger.info('exiting workflow..') - # Exit the program - sys.exit() \ No newline at end of file diff --git a/clustering/polus-feature-subsetting-plugin/src/requirements.txt b/clustering/polus-feature-subsetting-plugin/src/requirements.txt deleted file mode 100644 index b7e965ece..000000000 --- a/clustering/polus-feature-subsetting-plugin/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -filepattern>=1.4.5 -pandas>=1.1.3 \ No newline at end of file From 91045c8e4bbd1f951f16a526392d1685da02af0a Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 25 Jan 2024 13:09:17 -0600 Subject: [PATCH 09/19] remove unused dependency --- 
clustering/feature-subsetting-plugin/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clustering/feature-subsetting-plugin/pyproject.toml b/clustering/feature-subsetting-plugin/pyproject.toml index 22e810895..f6d0df8b8 100644 --- a/clustering/feature-subsetting-plugin/pyproject.toml +++ b/clustering/feature-subsetting-plugin/pyproject.toml @@ -14,9 +14,9 @@ python = ">=3.9,<3.12" filepattern = "^2.0.4" typer = "^0.7.0" tqdm = "^4.64.1" -preadator="0.4.0.dev2" vaex = "^4.17.0" + [tool.poetry.group.dev.dependencies] pre-commit = "^3.3.3" bump2version = "^1.0.1" @@ -27,4 +27,4 @@ ipykernel = "^6.28.0" [build-system] requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" \ No newline at end of file +build-backend = "poetry.core.masonry.api" From 573901f60e2eae163885140fe449a62d0207fa20 Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Tue, 20 Feb 2024 08:22:44 -0600 Subject: [PATCH 10/19] fix typo error --- clustering/feature-subsetting-plugin/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clustering/feature-subsetting-plugin/README.md b/clustering/feature-subsetting-plugin/README.md index 3ba6ec8d8..bd58f484b 100644 --- a/clustering/feature-subsetting-plugin/README.md +++ b/clustering/feature-subsetting-plugin/README.md @@ -14,7 +14,7 @@ Some of the inputs are pretty straighforward and are used commonly across most W 4. `imageFeature` - Tabular data featuring image filenames 5. `tabularFeature` - Tabular feature that will be used to filter images 6. `groupVar` - This is a mandatory input across which to subset data. This can take either 1 or 2 variables as input and if 2 variables are provided then the second variable will be treated as the minor grouping variable. In our example, if the `z` is provided as input, then within a subcollection, the mean of the feature value will be taken for all images with the same z. Then the z positions will be filtered out based on the input of `percentile` and `removeDirection` variables. Now if `z,c` are provided as input, then 'c' will be treated as the minor grouping variable which means that the mean will be taken for all images with the same z for each channel. Also, the plugin will ensures that the same values of z positions are filtered out across c. -7. `percentile` and `removeDirection` - These two variables denote the critieria with which images are filtered. For example, if percentile is `0.1` and removeDirection is set to `Below` then images with feature value below the 10th percentile will be removed. On the other hand, if removeDirection is set to above then all images with feature value greater than the 10th pecentile will be removed. This enables data subsetting from both `brighfield` and `darkfield` microscopy images. +7. `percentile` and `removeDirection` - These two variables denote the critieria with which images are filtered. For example, if percentile is `0.1` and removeDirection is set to `Below` then images with feature value below the 10th percentile will be removed. On the other hand, if removeDirection is set to above then all images with feature value greater than the 10th pecentile will be removed. This enables data subsetting from both `brightfield` and `darkfield` microscopy images. 
**Optional Arguments** From 0e82f4bce54339c4b62d21adc59452dd630dbaad Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Tue, 20 Feb 2024 08:26:04 -0600 Subject: [PATCH 11/19] fixed no of inputs --- clustering/feature-subsetting-plugin/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clustering/feature-subsetting-plugin/README.md b/clustering/feature-subsetting-plugin/README.md index bd58f484b..a171eb848 100644 --- a/clustering/feature-subsetting-plugin/README.md +++ b/clustering/feature-subsetting-plugin/README.md @@ -39,7 +39,7 @@ If WIPP is running, navigate to the plugins page and add a new plugin. Paste the ## Options -This plugin takes twelve input arguments and one output argument: +This plugin takes eleven input arguments and one output argument: | Name | Description | I/O | Type | | ------------------- | ----------------------------------------------------- | ------ | ------------- | From 3ca976fe89a946f077539dd4acae596f35e1a4b8 Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 4 Apr 2024 10:37:46 -0500 Subject: [PATCH 12/19] renamed plugin name and updated container image --- clustering/feature-subsetting-plugin/VERSION | 1 - clustering/feature-subsetting-plugin/build-docker.sh | 4 ---- .../plugins/clustering/feature_subsetting/__init__.py | 2 -- clustering/feature-subsetting-plugin/tests/__init__.py | 1 - .../Dockerfile | 6 +++--- .../README.md | 2 +- clustering/feature-subsetting-tool/VERSION | 1 + clustering/feature-subsetting-tool/build-docker.sh | 4 ++++ .../bumpversion.cfg | 4 ++-- .../example/summary.txt | 0 .../package-release.sh | 4 ++-- .../plugin.json | 8 ++++---- .../pyproject.toml | 4 ++-- .../run-docker.sh | 2 +- .../images/clustering/feature_subsetting/__init__.py | 3 +++ .../images}/clustering/feature_subsetting/__main__.py | 5 +++-- .../clustering/feature_subsetting/feature_subset.py | 3 ++- clustering/feature-subsetting-tool/tests/__init__.py | 1 + .../tests/conftest.py | 0 .../tests/test_cli.py | 3 ++- .../tests/test_feature_subsetting.py | 3 ++- 21 files changed, 33 insertions(+), 28 deletions(-) delete mode 100644 clustering/feature-subsetting-plugin/VERSION delete mode 100644 clustering/feature-subsetting-plugin/build-docker.sh delete mode 100644 clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__init__.py delete mode 100644 clustering/feature-subsetting-plugin/tests/__init__.py rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/Dockerfile (74%) rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/README.md (99%) create mode 100644 clustering/feature-subsetting-tool/VERSION create mode 100644 clustering/feature-subsetting-tool/build-docker.sh rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/bumpversion.cfg (82%) rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/example/summary.txt (100%) rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/package-release.sh (78%) rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/plugin.json (95%) rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/pyproject.toml (89%) rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/run-docker.sh (93%) create mode 100644 clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__init__.py rename clustering/{feature-subsetting-plugin/src/polus/plugins => 
feature-subsetting-tool/src/polus/images}/clustering/feature_subsetting/__main__.py (97%)
 rename clustering/{feature-subsetting-plugin/src/polus/plugins => feature-subsetting-tool/src/polus/images}/clustering/feature_subsetting/feature_subset.py (99%)
 create mode 100644 clustering/feature-subsetting-tool/tests/__init__.py
 rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/tests/conftest.py (100%)
 rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/tests/test_cli.py (96%)
 rename clustering/{feature-subsetting-plugin => feature-subsetting-tool}/tests/test_feature_subsetting.py (96%)

diff --git a/clustering/feature-subsetting-plugin/VERSION b/clustering/feature-subsetting-plugin/VERSION
deleted file mode 100644
index b4f09dd42..000000000
--- a/clustering/feature-subsetting-plugin/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-0.2.0-dev
\ No newline at end of file

diff --git a/clustering/feature-subsetting-plugin/build-docker.sh b/clustering/feature-subsetting-plugin/build-docker.sh
deleted file mode 100644
index d9ad13705..000000000
--- a/clustering/feature-subsetting-plugin/build-docker.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-version=$(<VERSION)
-docker build . -t polusai/feature-subsetting-plugin:${version}

diff --git a/clustering/feature-subsetting-plugin/bumpversion.cfg b/clustering/feature-subsetting-tool/bumpversion.cfg
similarity index 82%
rename from clustering/feature-subsetting-plugin/bumpversion.cfg
rename to clustering/feature-subsetting-tool/bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.0-dev
+current_version = 0.2.1-dev
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
@@ -24,4 +24,4 @@ replace = version = "{new_version}"

 [bumpversion:file:VERSION]

-[bumpversion:file:src/polus/plugins/clustering/feature_subsetting/__init__.py]
\ No newline at end of file
+[bumpversion:file:src/polus/images/clustering/feature_subsetting/__init__.py]

diff --git a/clustering/feature-subsetting-plugin/example/summary.txt b/clustering/feature-subsetting-tool/example/summary.txt
similarity index 100%
rename from clustering/feature-subsetting-plugin/example/summary.txt
rename to clustering/feature-subsetting-tool/example/summary.txt

diff --git a/clustering/feature-subsetting-plugin/package-release.sh b/clustering/feature-subsetting-tool/package-release.sh
similarity index 78%
rename from clustering/feature-subsetting-plugin/package-release.sh
rename to clustering/feature-subsetting-tool/package-release.sh
index 8e53414b9..1efde1b01 100644
--- a/clustering/feature-subsetting-plugin/package-release.sh
+++ b/clustering/feature-subsetting-tool/package-release.sh
@@ -10,7 +10,7 @@ bump2version --config-file bumpversion.cfg --new-version ${version} --allow-dirty
 ./build-docker.sh

 # Push to dockerhub
-docker push polusai/feature-subsetting-plugin:${version}
+docker push polusai/feature-subsetting-tool:${version}

 # Run pytests
-python -m pytest -s tests
\ No newline at end of file
+python -m pytest -s tests

diff --git a/clustering/feature-subsetting-plugin/plugin.json b/clustering/feature-subsetting-tool/plugin.json
similarity index 95%
rename from clustering/feature-subsetting-plugin/plugin.json
rename to clustering/feature-subsetting-tool/plugin.json
index 68e7c463c..bdaebffc1 100644
--- a/clustering/feature-subsetting-plugin/plugin.json
+++ b/clustering/feature-subsetting-tool/plugin.json
@@ -1,18 +1,18 @@
 {
   "name": "Feature Subsetting",
-  "version": "0.2.0-dev",
+  "version": "0.2.1-dev",
   "title": "Feature Subsetting",
   "description": "Subset data using a given feature.",
   "author": "Gauhar Bains (gauhar.bains@labshare.org) and Hamdah Shafqat Abbasi (hamdahshafqat.abbasi@nih.gov)",
   "institution": "National Center for Advancing Translational Sciences, National Institutes of Health",
-  "repository": "https://github.com/PolusAI/polus-plugins",
+  "repository": "https://github.com/PolusAI/image-tools",
   "website": "https://ncats.nih.gov/preclinical/core/informatics",
"citation": "", - "containerId": "polusai/feature-subsetting-plugin:0.2.0-dev", + "containerId": "polusai/feature-subsetting-plugin:0.2.1-dev", "baseCommand": [ "python3", "-m", - "polus.plugins.clustering.feature_subsetting" + "polus.images.clustering.feature_subsetting" ], "inputs": { "inpDir": { diff --git a/clustering/feature-subsetting-plugin/pyproject.toml b/clustering/feature-subsetting-tool/pyproject.toml similarity index 89% rename from clustering/feature-subsetting-plugin/pyproject.toml rename to clustering/feature-subsetting-tool/pyproject.toml index f6d0df8b8..eb99a1ab5 100644 --- a/clustering/feature-subsetting-plugin/pyproject.toml +++ b/clustering/feature-subsetting-tool/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] -name = "polus-plugins-clustering-feature-subsetting" -version = "0.2.0-dev" +name = "polus-images-clustering-feature-subsetting" +version = "0.2.1-dev" description = "Subset data using a given feature." authors = [ "Gauhar Bains ", diff --git a/clustering/feature-subsetting-plugin/run-docker.sh b/clustering/feature-subsetting-tool/run-docker.sh similarity index 93% rename from clustering/feature-subsetting-plugin/run-docker.sh rename to clustering/feature-subsetting-tool/run-docker.sh index 36cec9de8..0810b5c1e 100644 --- a/clustering/feature-subsetting-plugin/run-docker.sh +++ b/clustering/feature-subsetting-tool/run-docker.sh @@ -17,7 +17,7 @@ writeOutput=true outDir=${datapath}/output docker run -v ${datapath}:${datapath} \ - polusai/feature-subsetting-plugin:${version} \ + polusai/feature-subsetting-tool:${version} \ --inpDir ${inpDir} \ --tabularDir ${tabularDir} \ --filePattern ${filePattern} \ diff --git a/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__init__.py b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__init__.py new file mode 100644 index 000000000..0d63ded24 --- /dev/null +++ b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__init__.py @@ -0,0 +1,3 @@ +"""Feature Subsetting Tool.""" + +__version__ = "0.2.1-dev" diff --git a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__main__.py similarity index 97% rename from clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py rename to clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__main__.py index beab04c5b..d99dee32b 100644 --- a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py +++ b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__main__.py @@ -1,10 +1,11 @@ -"""Feature Subsetting Plugin.""" +"""Feature Subsetting Tool.""" + import logging import shutil from pathlib import Path from typing import Optional -import polus.plugins.clustering.feature_subsetting.feature_subset as fs +import polus.images.clustering.feature_subsetting.feature_subset as fs import typer app = typer.Typer() diff --git a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/feature_subset.py similarity index 99% rename from clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py rename to 
clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/feature_subset.py index 06578672d..15e4b74bc 100644 --- a/clustering/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/feature_subset.py +++ b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/feature_subset.py @@ -1,4 +1,5 @@ -"""Feature Subsetting Plugin.""" +"""Feature Subsetting Tool.""" + import logging import os import shutil diff --git a/clustering/feature-subsetting-tool/tests/__init__.py b/clustering/feature-subsetting-tool/tests/__init__.py new file mode 100644 index 000000000..00b38f20e --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Feature Subsetting Tool.""" diff --git a/clustering/feature-subsetting-plugin/tests/conftest.py b/clustering/feature-subsetting-tool/tests/conftest.py similarity index 100% rename from clustering/feature-subsetting-plugin/tests/conftest.py rename to clustering/feature-subsetting-tool/tests/conftest.py diff --git a/clustering/feature-subsetting-plugin/tests/test_cli.py b/clustering/feature-subsetting-tool/tests/test_cli.py similarity index 96% rename from clustering/feature-subsetting-plugin/tests/test_cli.py rename to clustering/feature-subsetting-tool/tests/test_cli.py index b41d795c8..3cbe68154 100644 --- a/clustering/feature-subsetting-plugin/tests/test_cli.py +++ b/clustering/feature-subsetting-tool/tests/test_cli.py @@ -1,6 +1,7 @@ """Test Command line Tool.""" + from typer.testing import CliRunner -from polus.plugins.clustering.feature_subsetting.__main__ import app +from polus.images.clustering.feature_subsetting.__main__ import app import shutil from pathlib import Path diff --git a/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py b/clustering/feature-subsetting-tool/tests/test_feature_subsetting.py similarity index 96% rename from clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py rename to clustering/feature-subsetting-tool/tests/test_feature_subsetting.py index b3691f10f..91d6163a6 100644 --- a/clustering/feature-subsetting-plugin/tests/test_feature_subsetting.py +++ b/clustering/feature-subsetting-tool/tests/test_feature_subsetting.py @@ -1,8 +1,9 @@ """Test Feature Subsetting Plugin.""" + import shutil from pathlib import Path -import polus.plugins.clustering.feature_subsetting.feature_subset as fs +import polus.images.clustering.feature_subsetting.feature_subset as fs def test_feature_subset( From cb1f60904b58ac500175622b31074953e05983ef Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 4 Apr 2024 10:40:41 -0500 Subject: [PATCH 13/19] renamed bumpversion file --- .../feature-subsetting-tool/{bumpversion.cfg => .bumpversion.cfg} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename clustering/feature-subsetting-tool/{bumpversion.cfg => .bumpversion.cfg} (100%) diff --git a/clustering/feature-subsetting-tool/bumpversion.cfg b/clustering/feature-subsetting-tool/.bumpversion.cfg similarity index 100% rename from clustering/feature-subsetting-tool/bumpversion.cfg rename to clustering/feature-subsetting-tool/.bumpversion.cfg From c63defaf8460d4ff932596539e4a6b0ea8391189 Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 4 Apr 2024 10:58:48 -0500 Subject: [PATCH 14/19] updated base container image --- clustering/feature-subsetting-tool/Dockerfile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/clustering/feature-subsetting-tool/Dockerfile b/clustering/feature-subsetting-tool/Dockerfile 
index f4800eaf4..286992b4d 100644 --- a/clustering/feature-subsetting-tool/Dockerfile +++ b/clustering/feature-subsetting-tool/Dockerfile @@ -1,10 +1,10 @@ -FROM polusai/bfio:2.3.6 +FROM polusai/bfio:2.3.3 # environment variables defined in polusai/bfio ENV EXEC_DIR="/opt/executables" ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".arrow" - +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" # Work directory defined in the base container WORKDIR ${EXEC_DIR} @@ -12,8 +12,9 @@ WORKDIR ${EXEC_DIR} COPY pyproject.toml ${EXEC_DIR} COPY VERSION ${EXEC_DIR} COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src -RUN pip3 install ${EXEC_DIR} --no-cache +RUN pip3 install ${EXEC_DIR} --no-cache-dir ENTRYPOINT ["python3", "-m", "polus.images.clustering.feature_subsetting"] From 71ef6a180625f76af1dfd4a9b4d87d1534c94712 Mon Sep 17 00:00:00 2001 From: hamshkhawar Date: Thu, 4 Apr 2024 11:08:00 -0500 Subject: [PATCH 15/19] updated container image --- clustering/feature-subsetting-tool/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clustering/feature-subsetting-tool/Dockerfile b/clustering/feature-subsetting-tool/Dockerfile index 286992b4d..4a89a995b 100644 --- a/clustering/feature-subsetting-tool/Dockerfile +++ b/clustering/feature-subsetting-tool/Dockerfile @@ -1,4 +1,4 @@ -FROM polusai/bfio:2.3.3 +FROM polusai/bfio:2.3.6 # environment variables defined in polusai/bfio ENV EXEC_DIR="/opt/executables" From 53f15bc23ad92aa4cdd470d443af5d4cb6f7c792 Mon Sep 17 00:00:00 2001 From: Hamdah Shafqat Abbasi <74803092+hamshkhawar@users.noreply.github.com> Date: Tue, 6 Aug 2024 07:13:45 -0400 Subject: [PATCH 16/19] Updating hdbscan-clustering plugin (#498) * fix merge conflicts * fix apply manifest * fix apply manifest * remove file * updated hdbscan-clustering-plugin * fix bug in tests * fixed random generation of floats * fixed docker file and shell script for running docker * fixed docker files * renamed plugin and fixed merged conflicts * fixed docker files --- .../hdbscan-clustering-tool/.bumpversion.cfg | 27 +++ clustering/hdbscan-clustering-tool/.gitignore | 23 +++ clustering/hdbscan-clustering-tool/Dockerfile | 21 +++ clustering/hdbscan-clustering-tool/README.md | 52 ++++++ clustering/hdbscan-clustering-tool/VERSION | 1 + .../hdbscan-clustering-tool/build-docker.sh | 4 + .../package-release.sh | 16 ++ .../hdbscan-clustering-tool/plugin.json | 123 ++++++++++++++ .../hdbscan-clustering-tool/pyproject.toml | 32 ++++ .../hdbscan-clustering-tool/run-docker.sh | 23 +++ .../clustering/hdbscan_clustering/__init__.py | 4 + .../clustering/hdbscan_clustering/__main__.py | 156 ++++++++++++++++++ .../hdbscan_clustering/hdbscan_clustering.py | 150 +++++++++++++++++ .../hdbscan-clustering-tool/tests/__init__.py | 1 + .../hdbscan-clustering-tool/tests/conftest.py | 48 ++++++ .../hdbscan-clustering-tool/tests/test_cli.py | 74 +++++++++ .../tests/test_hdbscan_clustering.py | 49 ++++++ 17 files changed, 804 insertions(+) create mode 100644 clustering/hdbscan-clustering-tool/.bumpversion.cfg create mode 100644 clustering/hdbscan-clustering-tool/.gitignore create mode 100644 clustering/hdbscan-clustering-tool/Dockerfile create mode 100644 clustering/hdbscan-clustering-tool/README.md create mode 100644 clustering/hdbscan-clustering-tool/VERSION create mode 100755 clustering/hdbscan-clustering-tool/build-docker.sh create mode 100644 clustering/hdbscan-clustering-tool/package-release.sh create mode 100644 clustering/hdbscan-clustering-tool/plugin.json create mode 100644 
clustering/hdbscan-clustering-tool/pyproject.toml
 create mode 100755 clustering/hdbscan-clustering-tool/run-docker.sh
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/__init__.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/conftest.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/test_cli.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py

diff --git a/clustering/hdbscan-clustering-tool/.bumpversion.cfg b/clustering/hdbscan-clustering-tool/.bumpversion.cfg
new file mode 100644
index 000000000..230e6c5f9
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/.bumpversion.cfg
@@ -0,0 +1,27 @@
+[bumpversion]
+current_version = 0.4.8-dev0
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+    {major}.{minor}.{patch}-{release}{dev}
+    {major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+    dev
+    _
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:src/polus/images/clustering/hdbscan_clustering/__init__.py]

diff --git a/clustering/hdbscan-clustering-tool/.gitignore b/clustering/hdbscan-clustering-tool/.gitignore
new file mode 100644
index 000000000..9ed1c3775
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/.gitignore
@@ -0,0 +1,23 @@
+# Jupyter Notebook
+.ipynb_checkpoints
+poetry.lock
+../../poetry.lock
+# Environments
+.env
+.myenv
+.venv
+env/
+venv/
+# test data directory
+data
+# yaml file
+.pre-commit-config.yaml
+# hidden files
+.DS_Store
+.ds_store
+# flake8
+.flake8
+../../.flake8
+__pycache__
+.mypy_cache
+requirements.txt

diff --git a/clustering/hdbscan-clustering-tool/Dockerfile b/clustering/hdbscan-clustering-tool/Dockerfile
new file mode 100644
index 000000000..fd4b86f93
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/Dockerfile
@@ -0,0 +1,21 @@
+FROM polusai/bfio:2.3.6
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+ENV POLUS_IMG_EXT=".ome.tif"
+ENV POLUS_TAB_EXT=".csv"
+
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+
+ENTRYPOINT ["python3", "-m", "polus.images.clustering.hdbscan_clustering"]
+CMD ["--help"]

diff --git a/clustering/hdbscan-clustering-tool/README.md b/clustering/hdbscan-clustering-tool/README.md
new file mode 100644
index 000000000..80c37a501
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/README.md
@@ -0,0 +1,52 @@
+# Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) Clustering (v0.4.8-dev0)
+
+The HDBSCAN Clustering plugin clusters the data using the [HDBSCAN](https://pypi.org/project/hdbscan/) library. The input and output for this plugin are CSV files. Each observation (row) in the input CSV file is assigned to one of the clusters. The output CSV file contains the column `cluster` that identifies the cluster to which each observation belongs. A user can supply a regular expression with capture groups if they wish to cluster each group independently, or if they wish to average the numerical features across each group and treat them as a single observation.
+
+## Inputs:
+
+### Input directory:
+This plugin supports all [vaex](https://vaex.readthedocs.io/en/latest/guides/io.html)-supported file formats.
+
+### Filename pattern:
+This plugin uses the [filepattern](https://filepattern2.readthedocs.io/en/latest/Home.html) Python library to parse the file names of the tabular files to be processed.
+
+### Grouping pattern:
+The input for this parameter is a regular expression with a capture group. This input splits the data into groups based on the matched pattern. A new column `group` is created in the output file that holds the group matched by the given pattern. Unless `averageGroups` is set to `true`, providing a grouping pattern will cluster each group independently.
+
+### Average groups:
+Set this parameter to `true`, together with `groupingPattern`, to average the numerical features and produce a single row per group, which is then clustered. The resulting cluster is assigned to all observations belonging to that group.
+
+### Label column:
+This is the name of the column containing the labels to be used with `groupingPattern`.
+
+### Minimum cluster size:
+This parameter defines the smallest number of points that should be considered as a cluster. This is a required parameter. The input should be an integer and the value should be greater than 1.
+
+### Increment outlier ID:
+This parameter sets the ID of the outlier cluster to `1`; otherwise it will be 0. This is useful for visualization purposes if the resulting cluster IDs are turned into image annotations.
+
+## Output:
+The output is a tabular file containing the clustered data.
+
+## Building
+To build the Docker image for this plugin, run
+`./build-docker.sh`.
+
+## Install WIPP Plugin
+If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.
+For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp).
+
+## Options
+
+This plugin takes six input arguments and one output argument:
+
+| Name                   | Description                                                                                     | I/O    | Type        |
+| ---------------------- | ----------------------------------------------------------------------------------------------- | ------ | ----------- |
+| `--inpDir`             | Input tabular data files.                                                                       | Input  | genericData |
+| `--groupingPattern`    | Regular expression to group rows. Clustering will be applied across capture groups by default.  | Input  | string      |
+| `--averageGroups`      | Average data across groups. Requires capture groups.                                            | Input  | boolean     |
+| `--labelCol`           | Name of the column containing labels for grouping pattern.                                      | Input  | string      |
+| `--minClusterSize`     | Minimum cluster size.                                                                           | Input  | number      |
+| `--incrementOutlierId` | Increments outlier ID to 1.
| Input | boolean | +| `--outDir` | Output collection | Output | genericData | +| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/clustering/hdbscan-clustering-tool/VERSION b/clustering/hdbscan-clustering-tool/VERSION new file mode 100644 index 000000000..316ad8d55 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/VERSION @@ -0,0 +1 @@ +0.4.8-dev0 diff --git a/clustering/hdbscan-clustering-tool/build-docker.sh b/clustering/hdbscan-clustering-tool/build-docker.sh new file mode 100755 index 000000000..2e7dd1861 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Hythem Sidky ", + "Hamdah Shafqat abbasi " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.4" +typer = "^0.7.0" +tqdm = "^4.64.1" +preadator="0.4.0.dev2" +vaex = "^4.17.0" +hdbscan = "^0.8.34rc1" + + +[tool.poetry.group.dev.dependencies] +pre-commit = "^3.3.3" +bump2version = "^1.0.1" +pytest = "^7.3.2" +pytest-xdist = "^3.3.1" +pytest-sugar = "^0.9.7" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clustering/hdbscan-clustering-tool/run-docker.sh b/clustering/hdbscan-clustering-tool/run-docker.sh new file mode 100755 index 000000000..931115198 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/run-docker.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +version=$( None: + """Cluster data using HDBSCAN.""" + logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--filePattern = {file_pattern}") + # Regular expression for grouping. + logger.info(f"--groupingPattern = {grouping_pattern}") + # Whether to average data for each group. + logger.info(f"--averageGroups = {average_groups}") + # Name of column to use for grouping. + logger.info(f"--labelCol = {label_col}") + # Minimum cluster size for clustering using HDBSCAN. + logger.info(f"--minClusterSize = {min_cluster_size}") + # Set outlier cluster id as 1. + logger.info(f"--incrementOutlierId = {increment_outlier_id}") + logger.info(f"--outDir = {out_dir}") + + inp_dir = inp_dir.resolve() + out_dir = out_dir.resolve() + + assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again" + assert ( + out_dir.exists() + ), f"{out_dir} does not exist!! Please check output path again" + + num_workers = max([cpu_count(), 2]) + + files = fp.FilePattern(inp_dir, file_pattern) + + if files is None: + msg = f"No tabular files found. 
Please check {file_pattern} again"
+        raise ValueError(msg)
+
+    if preview:
+        with Path.open(Path(out_dir, "preview.json"), "w") as jfile:
+            out_json: dict[str, Any] = {
+                "filepattern": file_pattern,
+                "outDir": [],
+            }
+            for file in files():
+                out_name = file[1][0].name.replace(
+                    "".join(file[1][0].suffixes),
+                    f"_hdbscan{hd.POLUS_TAB_EXT}",
+                )
+                out_json["outDir"].append(out_name)
+            json.dump(out_json, jfile, indent=2)
+    else:
+        with preadator.ProcessManager(
+            name="Cluster data using HDBSCAN",
+            num_processes=num_workers,
+            threads_per_process=2,
+        ) as pm:
+            for file in tqdm(
+                files(),
+                total=len(files()),
+                desc="Clustering data",
+                mininterval=5,
+                initial=0,
+                unit_scale=True,
+                colour="cyan",
+            ):
+                pm.submit_process(
+                    hd.hdbscan_clustering,
+                    file[1][0],
+                    min_cluster_size,
+                    out_dir,
+                    grouping_pattern,
+                    label_col,
+                    average_groups,
+                    increment_outlier_id,
+                )
+            pm.join_processes()
+
+
+if __name__ == "__main__":
+    app()

diff --git a/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
new file mode 100644
index 000000000..3940c2861
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
@@ -0,0 +1,150 @@
+"""Hdbscan Clustering Plugin."""
+import logging
+import os
+import re
+from itertools import chain
+from pathlib import Path
+
+import hdbscan
+import numpy as np
+import vaex
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv")
+CHUNK_SIZE = 10000
+
+
+def hdbscan_model(
+    data: np.ndarray,
+    min_cluster_size: int,
+    increment_outlier_id: bool,
+) -> np.ndarray:
+    """Cluster data using HDBSCAN.
+
+    Args:
+        data: Data that need to be clustered.
+        min_cluster_size: Minimum cluster size.
+        increment_outlier_id: Increment outlier ID to unity.
+
+    Returns:
+        Cluster labels for each row of data.
+    """
+    clusters = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit(data)
+    # Shift labels so that outliers (-1) map to 0 before the unsigned cast.
+    labels = (clusters.labels_.flatten() + 1).astype(np.uint16)
+    return labels + 1 if increment_outlier_id else labels
+
+
+def hdbscan_clustering(  # noqa: PLR0913
+    file: Path,
+    min_cluster_size: int,
+    out_dir: Path,
+    grouping_pattern: str,
+    label_col: str,
+    average_groups: bool,
+    increment_outlier_id: bool,
+) -> None:
+    """Cluster data using HDBSCAN.
+
+    Args:
+        file: Path of a tabular file.
+        min_cluster_size: Smallest size grouping that should be considered as a cluster.
+        out_dir: Path to output directory.
+        grouping_pattern: Regular expression to capture groups in a label_col.
+        label_col: Name of column containing labels.
+        average_groups: Whether to average data across groups.
+        increment_outlier_id: Increment outlier ID to unity.
+    """
+    if Path(file.name).suffix == ".csv":
+        df = vaex.from_csv(file, convert=True, chunk_size=CHUNK_SIZE)
+    else:
+        df = vaex.open(file)
+    # If user provided a regular expression.
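+    # (Illustrative note: with the grouping_pattern r"\w+$" and the
+    # label_col "species" used in this tool's tests, a label such as
+    # "Iris-setosa" matches the trailing word "setosa", which becomes
+    # that row's group.)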
+ if grouping_pattern: + if label_col == "None": + msg = f"Please define label column to capture groups {label_col}" + raise ValueError(msg) + + # Create a column group with matching string + group = np.array( + [ + re.search(grouping_pattern, x).group(0) # type: ignore + for x in df[label_col].tolist() + if len(re.search(grouping_pattern, x).group(0)) != 0 # type: ignore + ], + ) + if len(group) == 0: + msg = f"Could not find group with pattern {grouping_pattern}" + raise ValueError(msg) + + # Create a column group with matching string + df["group"] = group + int_columns = [ + feature + for feature in df.get_column_names() + if df.data_type(feature) == int or df.data_type(feature) == float + ] + + # If we want to average features for each group. + if average_groups: + df_grouped = df.groupby( + "group", + agg=[vaex.agg.mean(x) for x in int_columns], + ) + # Cluster data using HDBSCAN clustering. + logger.info("Clustering the data") + cluster_ids = hdbscan_model( + df_grouped.values, + min_cluster_size, + increment_outlier_id, + ) + df_grouped["cluster"] = cluster_ids + df = df.join( + df_grouped["group", "cluster"], + left_on="group", + right_on="group", + ) + + else: + dfs = [] + for group, df_ss in df.groupby("group"): + # Cluster data using HDBSCAN clustering. + logger.info(f"Clustering data in group {group}") + + cluster_ids = hdbscan_model( + df_ss.values, + min_cluster_size, + increment_outlier_id, + ) + + dfs.append(cluster_ids) + cluster_ids = np.array(list(chain.from_iterable(dfs))) + df["cluster"] = cluster_ids + + # No grouping. Vanilla clustering. + else: + int_columns = [ + feature + for feature in df.get_column_names() + if df.data_type(feature) == int or df.data_type(feature) == float + ] + + # Cluster data using HDBSCAN clustering + logger.info("Clustering the data") + cluster_ids = hdbscan_model( + df[int_columns].values, + min_cluster_size, + increment_outlier_id, + ) + df["cluster"] = cluster_ids + + outname = Path(out_dir, f"{Path(file.name).stem}_hdbscan{POLUS_TAB_EXT}") + + if POLUS_TAB_EXT == ".arrow": + df.export_feather(outname) + logger.info(f"Saving outputs: {outname}") + else: + df.export_csv(path=outname, chunk_size=CHUNK_SIZE) + + logger.info("Finished all processes!") diff --git a/clustering/hdbscan-clustering-tool/tests/__init__.py b/clustering/hdbscan-clustering-tool/tests/__init__.py new file mode 100644 index 000000000..2f89ec82b --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Hdbscan Clustering Plugin.""" diff --git a/clustering/hdbscan-clustering-tool/tests/conftest.py b/clustering/hdbscan-clustering-tool/tests/conftest.py new file mode 100644 index 000000000..a609d5b80 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/conftest.py @@ -0,0 +1,48 @@ +"""Test fixtures. + +Set up all data used in tests. 
+""" +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture( + params=[(50000, ".csv"), (100000, ".arrow")], +) +def get_params(request: pytest.FixtureRequest) -> tuple[int, str]: + """To get the parameter of the fixture.""" + return request.param + + +@pytest.fixture() +def generate_synthetic_data(get_params: tuple[int, str]) -> tuple[Path, Path, str]: + """Generate tabular data.""" + nrows, file_extension = get_params + + input_directory = Path(tempfile.mkdtemp(prefix="inputs_")) + output_directory = Path(tempfile.mkdtemp(prefix="out_")) + rng = np.random.default_rng() + tabular_data = { + "sepal_length": rng.random(nrows).tolist(), + "sepal_width": rng.random(nrows).tolist(), + "petal_length": rng.random(nrows).tolist(), + "petal_width": rng.random(nrows).tolist(), + "species": rng.choice( + ["Iris-setosa", "Iris-versicolor", "Iris-virginica"], + nrows, + ).tolist(), + } + + df = pd.DataFrame(tabular_data) + if file_extension == ".csv": + outpath = Path(input_directory, "data.csv") + df.to_csv(outpath, index=False) + if file_extension == ".arrow": + outpath = Path(input_directory, "data.arrow") + df.to_feather(outpath) + + return input_directory, output_directory, file_extension diff --git a/clustering/hdbscan-clustering-tool/tests/test_cli.py b/clustering/hdbscan-clustering-tool/tests/test_cli.py new file mode 100644 index 000000000..b087215e8 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/test_cli.py @@ -0,0 +1,74 @@ +"""Test Command line Tool.""" + +from typer.testing import CliRunner +from polus.images.clustering.hdbscan_clustering.__main__ import app +import shutil +from pathlib import Path + + +def test_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test the command line.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + label = "species" + clustersize = 3 + + runner = CliRunner() + result = runner.invoke( + app, + [ + "--inpDir", + inp_dir, + "--filePattern", + file_pattern, + "--groupingPattern", + pattern, + "--averageGroups", + "--labelCol", + label, + "--minClusterSize", + clustersize, + "--incrementOutlierId", + "--outDir", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + + +def test_short_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test short command line.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + label = "species" + clustersize = 3 + + runner = CliRunner() + result = runner.invoke( + app, + [ + "-i", + inp_dir, + "-f", + file_pattern, + "-g", + pattern, + "-a", + "-l", + label, + "-m", + clustersize, + "-io", + "-o", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) diff --git a/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py new file mode 100644 index 000000000..83debf273 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py @@ -0,0 +1,49 @@ +"""Test Hdbscan Clustering Plugin.""" + +import shutil +from pathlib import Path + +import filepattern as fp +import polus.images.clustering.hdbscan_clustering.hdbscan_clustering as hd +import vaex + + +def test_hdbscan_clustering(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test hdbscan clustering of tabular 
data.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + files = fp.FilePattern(inp_dir, file_pattern) + for file in files(): + hd.hdbscan_clustering( + file=file[1][0], + min_cluster_size=3, + grouping_pattern=pattern, + label_col="species", + average_groups=True, + increment_outlier_id=True, + out_dir=out_dir, + ) + + out_ext = [Path(f.name).suffix for f in out_dir.iterdir()] + assert all(out_ext) is True + for f in out_dir.iterdir(): + df = vaex.open(f) + assert "cluster" in df.column_names + assert df["cluster"].values != 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + + +def test_hdbscan_model(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test hdbscan model.""" + inp_dir, _, file_extension = generate_synthetic_data + file_pattern = f".*{file_extension}" + files = fp.FilePattern(inp_dir, file_pattern) + for file in files(): + df = vaex.open(file[1][0]) + data = df[df.column_names[:-1]].values + min_cluster_size = 3 + label = hd.hdbscan_model(data, min_cluster_size, True) + assert len(label) != 0 + shutil.rmtree(inp_dir) From a4b6e90a73cad5e793092a066ff8636d4266566d Mon Sep 17 00:00:00 2001 From: Continuous Integration Date: Wed, 17 Jan 2024 15:05:04 +0000 Subject: [PATCH 17/19] build: Bumped version for apply-flatfield-plugin from 2.0.0-dev9 to 2.0.0 --- transforms/images/apply-flatfield-plugin/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/images/apply-flatfield-plugin/VERSION b/transforms/images/apply-flatfield-plugin/VERSION index b2484da91..38f77a65b 100644 --- a/transforms/images/apply-flatfield-plugin/VERSION +++ b/transforms/images/apply-flatfield-plugin/VERSION @@ -1 +1 @@ -2.0.0-dev9 +2.0.1 From 17304ae486957850bcd75cb2cfd50fb97d21cf4b Mon Sep 17 00:00:00 2001 From: Hamdah Shafqat Abbasi Date: Fri, 16 Aug 2024 17:18:56 -0400 Subject: [PATCH 18/19] =?UTF-8?q?Bump=20version:=200.2.1-dev1=20=E2=86=92?= =?UTF-8?q?=200.2.1-dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../feature-subsetting-tool/.bumpversion.cfg | 8 +- clustering/feature-subsetting-tool/README.md | 2 +- clustering/feature-subsetting-tool/VERSION | 2 +- .../feature-subsetting-tool/plugin.json | 203 +++++++++--------- .../feature-subsetting-tool/pyproject.toml | 2 +- .../clustering/feature_subsetting/__init__.py | 2 +- 6 files changed, 107 insertions(+), 112 deletions(-) diff --git a/clustering/feature-subsetting-tool/.bumpversion.cfg b/clustering/feature-subsetting-tool/.bumpversion.cfg index 13187cad0..f70906c30 100644 --- a/clustering/feature-subsetting-tool/.bumpversion.cfg +++ b/clustering/feature-subsetting-tool/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.1-dev +current_version = 0.2.1-dev0 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
@@ -22,6 +22,12 @@ replace = version = "{new_version}" [bumpversion:file:plugin.json] +[bumpversion:file:README.md] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:FeatureSubsetting.cwl] + [bumpversion:file:VERSION] [bumpversion:file:src/polus/images/clustering/feature_subsetting/__init__.py] diff --git a/clustering/feature-subsetting-tool/README.md b/clustering/feature-subsetting-tool/README.md index ee57810da..84e2a96ba 100644 --- a/clustering/feature-subsetting-tool/README.md +++ b/clustering/feature-subsetting-tool/README.md @@ -1,4 +1,4 @@ -# Feature Data Subset(v0.2.1-dev) +# Feature Data Subset(0.2.1-dev0) This WIPP plugin subsets data based on a given feature. It works in conjunction with the `polus-feature-extraction-plugin`, where the feature extraction plugin can be used to extract the features such as the mean intensity of every image in the input image collection. diff --git a/clustering/feature-subsetting-tool/VERSION b/clustering/feature-subsetting-tool/VERSION index a5fea600d..6c0f6f401 100644 --- a/clustering/feature-subsetting-tool/VERSION +++ b/clustering/feature-subsetting-tool/VERSION @@ -1 +1 @@ -0.2.1-dev +0.2.1-dev0 diff --git a/clustering/feature-subsetting-tool/plugin.json b/clustering/feature-subsetting-tool/plugin.json index bdaebffc1..99968a764 100644 --- a/clustering/feature-subsetting-tool/plugin.json +++ b/clustering/feature-subsetting-tool/plugin.json @@ -1,6 +1,6 @@ { "name": "Feature Subsetting", - "version": "0.2.1-dev", + "version": "0.2.1-dev0", "title": "Feature Subsetting", "description": "Subset data using a given feature.", "author": "Gauhar Bains (gauhar.bains@labshare.org) and Hamdah Shafqat Abbasi (hamdahshafqat.abbasi@nih.gov)", @@ -8,166 +8,155 @@ "repository": "https://github.com/PolusAI/image-tools", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", - "containerId": "polusai/feature-subsetting-plugin:0.2.1-dev", + "containerId": "polusai/feature-subsetting-tool:0.2.1-dev0", "baseCommand": [ "python3", "-m", "polus.images.clustering.feature_subsetting" ], - "inputs": { - "inpDir": { + "inputs": [ + { + "name": "inpDir", "type": "collection", - "title": "Input image directory", - "description": "Input image directory.", - "required": "True" + "description": "Input image directory", + "required": true }, - "tabularDir": { + { + "name": "tabularDir", "type": "genericData", - "title": "Input tabular directory", - "description": "Path to directory containing tabular data.", - "required": "True" + "description": "Path to directory containing tabular data", + "required": true }, - "filePattern": { + { + "name": "filePattern", "type": "string", - "title": "Filename pattern", "description": "Filename pattern used to separate data.", - "required": "True" + "required": true }, - "imageFeature": { + { + "name": "imageFeature", "type": "string", - "title": "imageFeature", "description": "Feature in tabular data containing image filenames.", - "required": "True" + "required": true }, - "tabularFeature": { + { + "name": "tabularFeature", "type": "string", - "title": "tabularFeature", - "description": "Feature in tabular data to subset image data.", - "required": "True" + "description": "Feature in tabular data to subset image data", + "required": true }, - "padding": { + { + "name": "padding", "type": "integer", - "title": "padding", "description": "Number of images to capture outside the cutoff.", - "required": "False" + "required": false }, - "groupVar": { + { + "name": "groupVar", "type": "string", - "title": 
"groupVar", "description": "variables to group by in a section.", - "required": "True" + "required": true }, - "percentile": { - "type": "float", - "title": "percentile", + { + "name": "percentile", + "type": "number", "description": "Percentile to remove.", - "required": "True" + "required": true }, - "removeDirection": { + { + "name": "removeDirection", "type": "string", - "title": "removeDirection", - "description": "Remove direction above or below percentile.", - "required": "False", + "description": "Remove direction above or below percentile", + "required": false, "default": "Below" }, - "sectionVar": { + { + "name": "sectionVar", "type": "string", - "title": "sectionVar", "description": "Variables to divide larger sections.", - "required": "False" + "required": false }, - "writeOutput": { + { + "name": "writeOutput", "type": "boolean", - "title": "writeOutput", "description": "Write output image collection or not.", - "required": "False" + "required": false }, - "preview": { + { + "name": "preview", "type": "boolean", - "title": "Preview", - "description": "Generate an output preview.", - "required": "False" + "description": "Generate an output preview", + "required": false } - }, - "outputs": { - "outDir": { + ], + "outputs": [ + { + "name": "outDir", "type": "genericData", - "description": "Output collection." + "description": "Output collection" } - }, - "ui": { - "inpDir": { - "type": "collection", - "title": "Input image directory", - "description": "Input image directory.", - "required": "True" - }, - "tabularDir": { - "type": "genericData", - "title": "Input tabular directory", - "description": "Path to directory containing tabular data.", - "required": "True" - }, - "filePattern": { - "type": "string", - "title": "Filename pattern", - "description": "Filename pattern used to separate data.", - "required": "True" - }, - "imageFeature": { - "type": "string", + ], + "ui": [ + { + "key": "inputs.inpDir", + "title": "inpDir", + "description": "Path to Input image directory" + }, + { + "key": "inputs.tabularDir", + "title": "tabularDir", + "description": "Input tabular directory" + }, + { + "key": "inputs.filePattern", + "title": "filepattern", + "description": "A filepattern, used to select data for conversion" + }, + { + "key": "inputs.imageFeature", "title": "imageFeature", - "description": "Feature in tabular data containing image filenames.", - "required": "True" + "description": "Feature in tabular data containing image filenames" }, - "tabularFeature": { - "type": "string", + { + "key": "inputs.tabularFeature", "title": "tabularFeature", - "description": "Feature in tabular data to subset image data.", - "required": "True" + "description": "Feature in tabular data to subset image data." }, - "padding": { - "type": "integer", + { + "key": "inputs.padding", "title": "padding", - "description": "Number of images to capture outside the cutoff.", - "required": "False" + "description": "Number of images to capture outside the cutoff." }, - "groupVar": { - "type": "string", + { + "key": "inputs.groupVar", "title": "groupVar", - "description": "variables to group by in a section.", - "required": "True" + "description": "Variables to group by in a section." }, - "percentile": { - "type": "float", + { + "key": "inputs.percentile", "title": "percentile", - "description": "Percentile to remove.", - "required": "True" + "description": "Percentile to remove." 
     },
-    "removeDirection": {
-      "type": "string",
+    {
+      "key": "inputs.removeDirection",
       "title": "removeDirection",
-      "description": "Remove direction above or below percentile.",
-      "required": "False",
-      "default": "Below"
+      "description": "Remove direction above or below percentile."
     },
-    "sectionVar": {
-      "type": "string",
+    {
+      "key": "inputs.sectionVar",
       "title": "sectionVar",
-      "description": "Variables to divide larger sections.",
-      "required": "False"
+      "description": "Variables to divide larger sections."
     },
-    "writeOutput": {
-      "type": "boolean",
+    {
+      "key": "inputs.writeOutput",
       "title": "writeOutput",
-      "description": "Write output image collection or not.",
-      "required": "False"
+      "description": "Write output image collection or not."
     },
-    "preview": {
-      "type": "boolean",
-      "title": "Preview",
+    {
+      "key": "inputs.preview",
+      "title": "preview",
       "description": "Generate an output preview.",
-      "required": "False"
+      "default": false
     }
-  }
+  ]
 }

diff --git a/clustering/feature-subsetting-tool/pyproject.toml b/clustering/feature-subsetting-tool/pyproject.toml
index eb99a1ab5..7f3767e5b 100644
--- a/clustering/feature-subsetting-tool/pyproject.toml
+++ b/clustering/feature-subsetting-tool/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "polus-images-clustering-feature-subsetting"
-version = "0.2.1-dev"
+version = "0.2.1-dev0"
 description = "Subset data using a given feature."
 authors = [
     "Gauhar Bains <gauhar.bains@labshare.org>",

diff --git a/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__init__.py b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__init__.py
index 0d63ded24..5138eeedd 100644
--- a/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__init__.py
+++ b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/__init__.py
@@ -1,3 +1,3 @@
 """Feature Subsetting Tool."""

-__version__ = "0.2.1-dev"
+__version__ = "0.2.1-dev0"

From fa9d41903a6f88f520f7105eee7a684a374054a6 Mon Sep 17 00:00:00 2001
From: Hamdah Shafqat Abbasi
Date: Fri, 16 Aug 2024 17:20:33 -0400
Subject: [PATCH 19/19] added clt and ict file

---
 .../FeatureSubsetting.cwl                     |  68 +++++++
 clustering/feature-subsetting-tool/ict.yaml   | 141 ++++++++++++++
 2 files changed, 209 insertions(+)
 create mode 100644 clustering/feature-subsetting-tool/FeatureSubsetting.cwl
 create mode 100644 clustering/feature-subsetting-tool/ict.yaml

diff --git a/clustering/feature-subsetting-tool/FeatureSubsetting.cwl b/clustering/feature-subsetting-tool/FeatureSubsetting.cwl
new file mode 100644
index 000000000..681cf88bc
--- /dev/null
+++ b/clustering/feature-subsetting-tool/FeatureSubsetting.cwl
@@ -0,0 +1,68 @@
+class: CommandLineTool
+cwlVersion: v1.2
+inputs:
+  filePattern:
+    inputBinding:
+      prefix: --filePattern
+    type: string
+  groupVar:
+    inputBinding:
+      prefix: --groupVar
+    type: string
+  imageFeature:
+    inputBinding:
+      prefix: --imageFeature
+    type: string
+  inpDir:
+    inputBinding:
+      prefix: --inpDir
+    type: Directory
+  outDir:
+    inputBinding:
+      prefix: --outDir
+    type: Directory
+  padding:
+    inputBinding:
+      prefix: --padding
+    type: string?
+  percentile:
+    inputBinding:
+      prefix: --percentile
+    type: double
+  preview:
+    inputBinding:
+      prefix: --preview
+    type: boolean?
+  removeDirection:
+    inputBinding:
+      prefix: --removeDirection
+    type: string?
+  sectionVar:
+    inputBinding:
+      prefix: --sectionVar
+    type: string?
+ tabularDir: + inputBinding: + prefix: --tabularDir + type: Directory + tabularFeature: + inputBinding: + prefix: --tabularFeature + type: string + writeOutput: + inputBinding: + prefix: --writeOutput + type: boolean? +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/feature-subsetting-tool:0.2.1-dev0 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/clustering/feature-subsetting-tool/ict.yaml b/clustering/feature-subsetting-tool/ict.yaml new file mode 100644 index 000000000..d91c85c97 --- /dev/null +++ b/clustering/feature-subsetting-tool/ict.yaml @@ -0,0 +1,141 @@ +author: + - Gauhar Bains +contact: gauhar.bains@labshare.org +container: polusai/feature-subsetting-tool:0.2.1-dev0 +description: Subset data using a given feature. +entrypoint: python3 -m polus.images.clustering.feature_subsetting +inputs: + - description: Input image directory + format: + - collection + name: inpDir + required: true + type: path + - description: Path to directory containing tabular data + format: + - genericData + name: tabularDir + required: true + type: path + - description: Filename pattern used to separate data. + format: + - string + name: filePattern + required: true + type: string + - description: Feature in tabular data containing image filenames. + format: + - string + name: imageFeature + required: true + type: string + - description: Feature in tabular data to subset image data + format: + - string + name: tabularFeature + required: true + type: string + - description: Number of images to capture outside the cutoff. + format: + - integer + name: padding + required: false + type: number + - description: variables to group by in a section. + format: + - string + name: groupVar + required: true + type: string + - description: Percentile to remove. + format: + - number + name: percentile + required: true + type: number + - description: Remove direction above or below percentile + format: + - string + name: removeDirection + required: false + type: string + - description: Variables to divide larger sections. + format: + - string + name: sectionVar + required: false + type: string + - description: Write output image collection or not. + format: + - boolean + name: writeOutput + required: false + type: boolean + - description: Generate an output preview + format: + - boolean + name: preview + required: false + type: boolean +name: polusai/FeatureSubsetting +outputs: + - description: Output collection + format: + - genericData + name: outDir + required: true + type: path +repository: https://github.com/PolusAI/image-tools +specVersion: 1.0.0 +title: Feature Subsetting +ui: + - description: Path to Input image directory + key: inputs.inpDir + title: inpDir + type: path + - description: Input tabular directory + key: inputs.tabularDir + title: tabularDir + type: path + - description: A filepattern, used to select data for conversion + key: inputs.filePattern + title: filepattern + type: text + - description: Feature in tabular data containing image filenames + key: inputs.imageFeature + title: imageFeature + type: text + - description: Feature in tabular data to subset image data. + key: inputs.tabularFeature + title: tabularFeature + type: text + - description: Number of images to capture outside the cutoff. + key: inputs.padding + title: padding + type: number + - description: Variables to group by in a section. 
+ key: inputs.groupVar + title: groupVar + type: text + - description: Percentile to remove. + key: inputs.percentile + title: percentile + type: number + - description: Remove direction above or below percentile. + key: inputs.removeDirection + title: removeDirection + type: text + - description: Variables to divide larger sections. + key: inputs.sectionVar + title: sectionVar + type: text + - description: Write output image collection or not. + key: inputs.writeOutput + title: writeOutput + type: checkbox + - default: false + description: Generate an output preview. + key: inputs.preview + title: preview + type: checkbox +version: 0.2.1-dev0
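
A minimal standalone sketch of the clustering-and-relabeling step that `hdbscan_model` in the hdbscan-clustering-tool wraps (illustrative only: the toy data are invented here, `min_cluster_size=3` mirrors the synthetic fixtures in the tool's `tests/conftest.py`, and only the `hdbscan` and `numpy` packages pinned in its `pyproject.toml` are assumed):

```python
import hdbscan
import numpy as np

# Two tight groups of three points each, plus one stray point.
data = np.array(
    [
        [0.00, 0.10], [0.10, 0.00], [0.05, 0.05],
        [5.00, 5.10], [5.10, 5.00], [5.05, 5.05],
        [20.0, 20.0],
    ],
)

# HDBSCAN labels outliers -1 and clusters 0..n-1.
clusters = hdbscan.HDBSCAN(min_cluster_size=3).fit(data)

# Shift as hdbscan_model does: outliers map to 0 and clusters start at 1
# (one more +1 is applied when incrementOutlierId is set, so outliers become 1).
labels = (clusters.labels_.flatten() + 1).astype(np.uint16)
print(labels)  # expected: two cluster IDs plus a 0-labeled outlier
```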