diff --git a/clustering/feature-subsetting-tool/.bumpversion.cfg b/clustering/feature-subsetting-tool/.bumpversion.cfg new file mode 100644 index 000000000..f70906c30 --- /dev/null +++ b/clustering/feature-subsetting-tool/.bumpversion.cfg @@ -0,0 +1,33 @@ +[bumpversion] +current_version = 0.2.1-dev0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:README.md] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:FeatureSubsetting.cwl] + +[bumpversion:file:VERSION] + +[bumpversion:file:src/polus/images/clustering/feature_subsetting/__init__.py] diff --git a/clustering/feature-subsetting-tool/Dockerfile b/clustering/feature-subsetting-tool/Dockerfile new file mode 100644 index 000000000..4a89a995b --- /dev/null +++ b/clustering/feature-subsetting-tool/Dockerfile @@ -0,0 +1,21 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + + +ENTRYPOINT ["python3", "-m", "polus.images.clustering.feature_subsetting"] +CMD ["--help"] diff --git a/clustering/feature-subsetting-tool/FeatureSubsetting.cwl b/clustering/feature-subsetting-tool/FeatureSubsetting.cwl new file mode 100644 index 000000000..681cf88bc --- /dev/null +++ b/clustering/feature-subsetting-tool/FeatureSubsetting.cwl @@ -0,0 +1,68 @@ +class: CommandLineTool 
+cwlVersion: v1.2 +inputs: + filePattern: + inputBinding: + prefix: --filePattern + type: string + groupVa: + inputBinding: + prefix: --groupVa + type: string + imageFeature: + inputBinding: + prefix: --imageFeature + type: string + inpDir: + inputBinding: + prefix: --inpDir + type: Directory + outDir: + inputBinding: + prefix: --outDir + type: Directory + padding: + inputBinding: + prefix: --padding + type: string? + percentile: + inputBinding: + prefix: --percentile + type: double + preview: + inputBinding: + prefix: --preview + type: boolean? + removeDirection: + inputBinding: + prefix: --removeDirection + type: string? + sectionVar: + inputBinding: + prefix: --sectionVar + type: string? + tabularDir: + inputBinding: + prefix: --tabularDir + type: Directory + tabularFeature: + inputBinding: + prefix: --tabularFeature + type: string + writeOutput: + inputBinding: + prefix: --writeOutput + type: boolean? +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/feature-subsetting-tool:0.2.1-dev0 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/clustering/feature-subsetting-tool/README.md b/clustering/feature-subsetting-tool/README.md new file mode 100644 index 000000000..84e2a96ba --- /dev/null +++ b/clustering/feature-subsetting-tool/README.md @@ -0,0 +1,58 @@ +# Feature Data Subset(0.2.1-dev0) + +This WIPP plugin subsets data based on a given feature. It works in conjunction with the `polus-feature-extraction-plugin`, where the feature extraction plugin can be used to extract the features such as the mean intensity of every image in the input image collection. + +# Usage +The details and usage of the plugin inputs is provided in the section below. 
In addition to the subsetted data, the output directory also consists of a `summary.txt` file which has information as to what images were kept and their new filename if they were renamed. + +### Explanation of inputs +Some of the inputs are pretty straightforward and are used commonly across most WIPP plugins. This section is used to provide some details and examples of the inputs that may be a little complicated. The image collection with the following pattern will be used as an example : `r{r+}_t{t+}_p{p+}_z{z+}_c{c+}.ome.tif`, where r,t,p,z,c stand for replicate, timepoint, position, z-position, and channel respectively. Consider we have 5 replicates, 3 timepoints, 50 positions, 10 z-planes and 4 channels. + +1. `inpDir` - This contains the path to the input image collection to subset data from. +2. `tabularDir` - This contains the path to the tabular files with file formats (`.csv`, `.arrow`, `.parquet`) containing the feature values for each image. This can be the output of the feature extraction or nyxus plugin. +3. `filePattern` - Filepattern of the input images. +4. `imageFeature` - Feature in the tabular data containing image filenames. +5. `tabularFeature` - Tabular feature that will be used to filter images. +6. `groupVar` - This is a mandatory input across which to subset data. This can take either 1 or 2 variables as input and if 2 variables are provided then the second variable will be treated as the minor grouping variable. In our example, if `z` is provided as input, then within a subcollection, the mean of the feature value will be taken for all images with the same z. Then the z positions will be filtered out based on the input of the `percentile` and `removeDirection` variables. Now if `z,c` are provided as input, then `c` will be treated as the minor grouping variable, which means that the mean will be taken for all images with the same z for each channel. Also, the plugin ensures that the same values of z positions are filtered out across c. +7. 
`percentile` and `removeDirection` - These two variables denote the criteria with which images are filtered. For example, if percentile is `0.1` and removeDirection is set to `Below`, then images with feature value below the 10th percentile will be removed. On the other hand, if removeDirection is set to `Above`, then all images with feature value greater than the 10th percentile will be removed. This enables data subsetting from both `brightfield` and `darkfield` microscopy images. + + **Optional Arguments** + +8. `sectionVar` - This is an optional input to segregate the input image collection into sub-collections. The analysis will be done separately for each sub-collection. In our example, if the user enters `r,t` as the sectionVar, then we will have 15 sub-collections (5*3), 1 for each combination of timepoint and replicate. If the user enters `r` as sectionVar, then we will have 5 sub-collections, 1 for each replicate. If the user wants to consider the whole image collection as a single section, then no input is required. NOTE: As a post-processing step, the same number of images will be subsetted across different sections. +9. `padding` - This is an optional variable with a default value of 0. A padding of 3 means that 3 additional planes will be captured on either side of the subsetted data. This can be used as a sanity check to ensure that the subsetted data captures the images we want. For example, if the following z values were filtered out initially - 5,6,7 - then a padding of 3 means that the output dataset will have z positions 2,3,4,5,6,7,8,9,10 if all of them exist. +10. `writeOutput` - This is an optional argument with default value `True`. If it is set to true, then both the output image collection and `summary.txt` file will be created. If it is set to false, then the output directory will only consist of summary.txt. 
This option enables the user to tune the hyperparameters such as percentile, removeDirection, and feature without actually creating the output image collection. + + + +Contact [Gauhar Bains](mailto:gauhar.bains@labshare.org) for more information. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes eleven input arguments and one output argument: + +| Name | Description | I/O | Type | +| ------------------- | ----------------------------------------------------- | ------ | ------------- | +| `--inpDir` | Input image collection to be processed by this plugin | Input | collection | +| `--tabularDir` | Path to tabular data | Input | genericData | +| `--filePattern` | Filename pattern used to separate data | Input | string | +| `--imageFeature` | Feature in tabular data with image filenames | Input | string | +| `--tabularFeature` | Tabular feature to filter image files | Input | string | +| `--padding` | Number of images to capture outside the cutoff | Input | integer | +| `--groupVar` | variables to group by in a section | Input | string | +| `--percentile` | Percentile to remove | Input | float | +| `--removeDirection` | remove direction above or below percentile | Input | string | +| `--sectionVar` | variables to divide larger sections | Input | string | +| `--writeOutput` | write output image collection or not | Input | boolean | +| `--outDir` | Output collection | Output | genericData | +| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/clustering/feature-subsetting-tool/VERSION b/clustering/feature-subsetting-tool/VERSION new file mode 100644 index 000000000..6c0f6f401 --- 
/dev/null +++ b/clustering/feature-subsetting-tool/VERSION @@ -0,0 +1 @@ +0.2.1-dev0 diff --git a/clustering/feature-subsetting-tool/build-docker.sh b/clustering/feature-subsetting-tool/build-docker.sh new file mode 100644 index 000000000..d82557ec8 --- /dev/null +++ b/clustering/feature-subsetting-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$( x00_y01_p01_c1.ome.tif +x00_y01_p03_c2.ome.tif -----> x00_y01_p01_c2.ome.tif +x00_y01_p03_c3.ome.tif -----> x00_y01_p01_c3.ome.tif +x00_y01_p03_c4.ome.tif -----> x00_y01_p01_c4.ome.tif +x00_y01_p03_c5.ome.tif -----> x00_y01_p01_c5.ome.tif +x00_y01_p04_c1.ome.tif -----> x00_y01_p02_c1.ome.tif +x00_y01_p04_c2.ome.tif -----> x00_y01_p02_c2.ome.tif +x00_y01_p04_c3.ome.tif -----> x00_y01_p02_c3.ome.tif +x00_y01_p04_c4.ome.tif -----> x00_y01_p02_c4.ome.tif +x00_y01_p04_c5.ome.tif -----> x00_y01_p02_c5.ome.tif diff --git a/clustering/feature-subsetting-tool/ict.yaml b/clustering/feature-subsetting-tool/ict.yaml new file mode 100644 index 000000000..d91c85c97 --- /dev/null +++ b/clustering/feature-subsetting-tool/ict.yaml @@ -0,0 +1,141 @@ +author: + - Gauhar Bains +contact: gauhar.bains@labshare.org +container: polusai/feature-subsetting-tool:0.2.1-dev0 +description: Subset data using a given feature. +entrypoint: python3 -m polus.images.clustering.feature_subsetting +inputs: + - description: Input image directory + format: + - collection + name: inpDir + required: true + type: path + - description: Path to directory containing tabular data + format: + - genericData + name: tabularDir + required: true + type: path + - description: Filename pattern used to separate data. + format: + - string + name: filePattern + required: true + type: string + - description: Feature in tabular data containing image filenames. 
+ format: + - string + name: imageFeature + required: true + type: string + - description: Feature in tabular data to subset image data + format: + - string + name: tabularFeature + required: true + type: string + - description: Number of images to capture outside the cutoff. + format: + - integer + name: padding + required: false + type: number + - description: variables to group by in a section. + format: + - string + name: groupVar + required: true + type: string + - description: Percentile to remove. + format: + - number + name: percentile + required: true + type: number + - description: Remove direction above or below percentile + format: + - string + name: removeDirection + required: false + type: string + - description: Variables to divide larger sections. + format: + - string + name: sectionVar + required: false + type: string + - description: Write output image collection or not. + format: + - boolean + name: writeOutput + required: false + type: boolean + - description: Generate an output preview + format: + - boolean + name: preview + required: false + type: boolean +name: polusai/FeatureSubsetting +outputs: + - description: Output collection + format: + - genericData + name: outDir + required: true + type: path +repository: https://github.com/PolusAI/image-tools +specVersion: 1.0.0 +title: Feature Subsetting +ui: + - description: Path to Input image directory + key: inputs.inpDir + title: inpDir + type: path + - description: Input tabular directory + key: inputs.tabularDir + title: tabularDir + type: path + - description: A filepattern, used to select data for conversion + key: inputs.filePattern + title: filepattern + type: text + - description: Feature in tabular data containing image filenames + key: inputs.imageFeature + title: imageFeature + type: text + - description: Feature in tabular data to subset image data. + key: inputs.tabularFeature + title: tabularFeature + type: text + - description: Number of images to capture outside the cutoff. 
+ key: inputs.padding + title: padding + type: number + - description: Variables to group by in a section. + key: inputs.groupVar + title: groupVar + type: text + - description: Percentile to remove. + key: inputs.percentile + title: percentile + type: number + - description: Remove direction above or below percentile. + key: inputs.removeDirection + title: removeDirection + type: text + - description: Variables to divide larger sections. + key: inputs.sectionVar + title: sectionVar + type: text + - description: Write output image collection or not. + key: inputs.writeOutput + title: writeOutput + type: checkbox + - default: false + description: Generate an output preview. + key: inputs.preview + title: preview + type: checkbox +version: 0.2.1-dev0 diff --git a/clustering/feature-subsetting-tool/package-release.sh b/clustering/feature-subsetting-tool/package-release.sh new file mode 100644 index 000000000..1efde1b01 --- /dev/null +++ b/clustering/feature-subsetting-tool/package-release.sh @@ -0,0 +1,16 @@ +# This script is designed to help package a new version of a plugin + +# Get the new version +version=$(", + "Hamdah Shafqat abbasi " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.4" +typer = "^0.7.0" +tqdm = "^4.64.1" +vaex = "^4.17.0" + + +[tool.poetry.group.dev.dependencies] +pre-commit = "^3.3.3" +bump2version = "^1.0.1" +pytest = "^7.3.2" +pytest-xdist = "^3.3.1" +pytest-sugar = "^0.9.7" +ipykernel = "^6.28.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clustering/feature-subsetting-tool/run-docker.sh b/clustering/feature-subsetting-tool/run-docker.sh new file mode 100644 index 000000000..0810b5c1e --- /dev/null +++ b/clustering/feature-subsetting-tool/run-docker.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +version=$( None: + """Generate preview of the plugin outputs.""" + shutil.copy( + 
Path(__file__).parents[4].joinpath("example/summary.txt"), + out_dir, + ) + + +@app.command() +def main( # noqa: PLR0913 + inp_dir: Path = typer.Option( + ..., + "--inpDir", + "-i", + help="Path to the collection of input images.", + ), + tabular_dir: Path = typer.Option( + ..., + "--tabularDir", + "-t", + help="Path to the collection of tabular files containing features.", + ), + file_pattern: Optional[str] = typer.Option( + ".*", + "--filePattern", + "-f", + help="Pattern use to parse filenames", + ), + image_feature: str = typer.Option( + None, + "--imageFeature", + "-if", + help="Image filenames feature in tabular data.", + ), + tabular_feature: str = typer.Option( + None, + "--tabularFeature", + "-tf", + help="Select tabular feature to subset data.", + ), + padding: Optional[int] = typer.Option( + 0, + "--padding", + "-p", + help="Number of images to capture outside the cutoff.", + ), + group_var: str = typer.Option( + ..., + "--groupVar", + "-g", + help="variables to group by in a section.", + ), + percentile: float = typer.Option( + None, + "--percentile", + "-pc", + help="Percentile to remove.", + ), + remove_direction: Optional[str] = typer.Option( + "Below", + "--removeDirection", + "-r", + help="Remove direction above or below percentile.", + ), + section_var: Optional[str] = typer.Option( + None, + "--sectionVar", + "-s", + help="Variables to divide larger sections.", + ), + write_output: Optional[bool] = typer.Option( + False, + "--writeOutput", + "-w", + help="Write output image collection or not.", + ), + out_dir: Path = typer.Option( + ..., + "--outDir", + "-o", + help="Output directory", + ), + preview: Optional[bool] = typer.Option( + False, + "--preview", + help="Output a JSON preview of files", + ), +) -> None: + """Subset data using a given feature.""" + logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--tabularDir = {tabular_dir}") + logger.info(f"--imageFeature = {image_feature}") + logger.info(f"--tabularFeature = {tabular_feature}") + 
logger.info(f"--filePattern = {file_pattern}") + logger.info(f"--padding = {padding}") + logger.info(f"--groupVar = {group_var}") + logger.info(f"--percentile = {percentile}") + logger.info(f"--removeDirection = {remove_direction}") + logger.info(f"--sectionVar = {section_var}") + logger.info(f"--writeOutput = {write_output}") + logger.info(f"--outDir = {out_dir}") + + inp_dir = inp_dir.resolve() + out_dir = out_dir.resolve() + + assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again" + assert ( + out_dir.exists() + ), f"{out_dir} does not exist!! Please check output path again" + + if preview: + generate_preview(out_dir) + + else: + fs.feature_subset( + inp_dir, + tabular_dir, + out_dir, + file_pattern, + group_var, + percentile, + remove_direction, + section_var, + image_feature, + tabular_feature, + padding, + write_output, + ) + + +if __name__ == "__main__": + app() diff --git a/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/feature_subset.py b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/feature_subset.py new file mode 100644 index 000000000..15e4b74bc --- /dev/null +++ b/clustering/feature-subsetting-tool/src/polus/images/clustering/feature_subsetting/feature_subset.py @@ -0,0 +1,300 @@ +"""Feature Subsetting Tool.""" + +import logging +import os +import shutil +from pathlib import Path +from typing import Any + +import filepattern +import vaex +from tqdm import tqdm + +CHUNK_SIZE = 10000 + +logger = logging.getLogger(__name__) +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") + + +def filter_planes( + feature_dict: dict, + remove_direction: str, + percentile: float, +) -> set[Any]: + """Filter planes by the criteria specified by remove_direction and percentile. 
+ + Args: + feature_dict : planes and respective feature value + remove_direction: remove above or below percentile + percentile : cutoff percentile + + Returns: + set: planes that fit the criteria + """ + planes = list(feature_dict.keys()) + feat_value = [feature_dict[i] for i in planes] + thresh = min(feat_value) + percentile * (max(feat_value) - min(feat_value)) + + # filter planes + if remove_direction == "Below": + keep_planes = [z for z in planes if feature_dict[z] >= thresh] + else: + keep_planes = [z for z in planes if feature_dict[z] <= thresh] + + return set(keep_planes) + + +def make_uniform(planes_dict: dict, uniques: list[int], padding: int) -> dict: + """Ensure each section has the same number of images. + + This function makes the output collection uniform in + the sense that it preserves same number of planes across + sections. It also captures additional planes based + on the value of the padding variable + + Args: + planes_dict: planes to keep in different sections + uniques : unique values for the major grouping variable + padding : additional images to capture outside cutoff + + Returns: + dictionary: dictionary containing planes to keep + """ + # max no. 
of planes + max_len = max([len(i) for i in planes_dict.values()]) + + # max planes that can be added on each side + min_ind = min([min(planes_dict[k]) for k in planes_dict]) + max_ind = max([max(planes_dict[k]) for k in planes_dict]) + max_add_left = uniques.index(min_ind) + max_add_right = len(uniques) - (uniques.index(max_ind) + 1) + + # add planes in each section based on padding and max number of planes + for section_id, planes in planes_dict.items(): + len_to_add = max_len - len(planes) + len_add_left = min(int(len_to_add) / 2 + padding, max_add_left) + len_add_right = min(len_to_add - len_add_left + padding, max_add_right) + left_ind = int(uniques.index(min(planes)) - len_add_left) + right_ind = int(uniques.index(max(planes)) + len_add_right) + 1 + planes_dict[section_id] = uniques[left_ind:right_ind] + return planes_dict + + +def feature_subset( # noqa : C901 + inp_dir: Path, + tabular_dir: Path, + out_dir: Path, + file_pattern: str, + group_var: str, + percentile: float, + remove_direction: str, + section_var: str, + image_feature: str, + tabular_feature: str, + padding: int, + write_output: bool, +) -> None: + """Subsetting images based on feature values. + + Args: + inp_dir: Path to the collection of input images + tabular_dir : Path to the tabular data directory + out_dir : Path to output directory + file_pattern : Pattern to parse image file names + group_var : variables to group by in a section + percentile : Percentile to remove + remove_direction : Remove direction above or below percentile + section_var : Variables to divide larger sections + image_feature: Image filenames feature in tabular data + tabular_feature : Select tabular feature to subset data + padding : additional images to capture outside cutoff + write_output : Write output image collection or not. 
+ """ + tabular_dir_files = [ + f + for f in Path(tabular_dir).iterdir() + if f.is_file() + and "".join(f.suffixes) in [".csv", ".arrow", ".parquet", ".fits"] + ] + + if len(tabular_dir_files) == 0: + msg = f"No tabular files detected Please check {tabular_dir} again" + raise ValueError(msg) + + # Get the column headers + headers = [] + for in_file in tabular_dir_files: + df = vaex.open(in_file) + headers.append(list(df.columns)) + headers = list(set(headers[0]).intersection(*headers)) + logger.info("Merging the data along rows...") + + featuredf = [] + for in_file in tqdm( + tabular_dir_files, + total=len(tabular_dir_files), + desc="Vaex loading of file", + ): + if in_file.suffix == ".csv": + df = vaex.from_csv(in_file, chunk_size=100_000, convert=True) + else: + df = vaex.open(in_file) + df = df[list(headers)] + featuredf.append(df) + + feature_df = vaex.concat(featuredf) + + if feature_df.shape[0] == 0: + msg = f"tabular files are empty Please check {tabular_dir} again" + raise ValueError(msg) + + # store image name and its feature value + feature_dict = dict( + zip( + list(feature_df[image_feature].to_numpy()), + list(feature_df[tabular_feature].to_numpy()), + ), + ) + + # seperate filepattern variables into different categories + fps = filepattern.FilePattern(inp_dir, file_pattern) + if not len(fps) > 0: + msg = "No image files are detected. Please check filepattern again!" 
+ raise ValueError(msg) + + uniques = fps.get_unique_values() + var = fps.get_variables() + grouping_variables = group_var.split(",") + if len(grouping_variables) > 1: + min_grouping_var, maj_grouping_var = ( + grouping_variables[1], + grouping_variables[0], + ) + gp_by = [min_grouping_var, maj_grouping_var] + else: + gp_by = [group_var] + + if section_var is not None: + section_variables = section_var.split(",") + sub_section_variables = [ + v for v in var if v not in grouping_variables + section_variables + ] + else: + sub_section_variables = [v for v in var if v not in grouping_variables] + + logger.info("Iterating over sections...") + # single iteration of this loop gives all images in one section + + section_feat = [] + section_keep_planes = [] + keep_planes = {} + + for file in fps(group_by=gp_by): + section_feat_dict: dict[Any, Any] = {} + if section_var is not None: + section_id = tuple([file[0][i] for i in section_var.split(",")]) + else: + section_id = 1 + + # iterate over files in one section + + fm = file[1][0][0] + fname = file[1][0][1][0].name + + if min_grouping_var is None: + fm[min_grouping_var] = None + + if fm[min_grouping_var] not in section_feat_dict: + section_feat_dict[fm[min_grouping_var]] = {} + + if fm[maj_grouping_var] not in section_feat_dict[fm[min_grouping_var]]: + section_feat_dict[fm[min_grouping_var]][fm[maj_grouping_var]] = [] + + section_feat_dict[fm[min_grouping_var]][fm[maj_grouping_var]].append( + feature_dict[fname], + ) + + section_feat.append(section_feat_dict) + + sectionfeat: dict[Any, Any] = {} + for f in section_feat: + for k, v in f.items(): + if k not in sectionfeat: + sectionfeat[k] = {} + sectionfeat[k].update(v) + + # average feature value by grouping variable + + for key1 in sectionfeat: + for key2 in sectionfeat[key1]: + sectionfeat[key1][key2] = sum(sectionfeat[key1][key2]) / len( + sectionfeat[key1][key2], + ) + + # find planes to keep based on specified criteria + section_keep_planes.append( + 
filter_planes(sectionfeat[key1], remove_direction, percentile), + ) + + # keep same planes within a section, across the minor grouping variable + section_keep_planes = list(section_keep_planes[0].union(*section_keep_planes)) + section_keep_planes = [ + i + for i in range( # type: ignore + min(section_keep_planes), + max(section_keep_planes) + 1, # type: ignore + ) + if i in uniques[maj_grouping_var] + ] + keep_planes[section_id] = section_keep_planes + + # # keep same number of planes across different sections + keep_planes = make_uniform(keep_planes, list(uniques[maj_grouping_var]), padding) + + # start writing summary.txt + summary = Path.open(Path(out_dir, "summary.txt"), "w") + + summary.write("\n Files : \n \n") + # update summary.txt with section renaming info + + logger.info("renaming subsetted data") + + for file in fps(group_by=sub_section_variables + grouping_variables): + if section_var is not None: + section_id = tuple([file[0][i] for i in section_var.split(",")]) + else: + section_id = 1 + + section_keep_planes = keep_planes[section_id] + rename_map = dict(zip(keep_planes[section_id], uniques[maj_grouping_var])) + + if section_var is not None and section_var.strip(): + summary.write( + f"Section : {({k: file[0][k] for k in section_variables})} \n", + ) + logger.info( + "Renaming files from section : {} \n".format( + {k: file[0][k] for k in section_variables}, + ), + ) + fm = file[1][0][0] + fname = file[1][0][1][0].name + + if fm[maj_grouping_var] not in keep_planes[section_id]: + continue + + # old and new file name + old_file_name = fname + + file_name_dict = dict(fm.items()) + file_name_dict[maj_grouping_var] = rename_map[fm[maj_grouping_var]] + + new_file_name = fps.get_matching(**file_name_dict)[0][1][0].name + + # if write output collection + if write_output: + shutil.copy2(Path(inp_dir, old_file_name), Path(out_dir, new_file_name)) + + summary.write(f"{old_file_name} -----> {new_file_name} \n") + summary.close() diff --git 
a/clustering/feature-subsetting-tool/tests/__init__.py b/clustering/feature-subsetting-tool/tests/__init__.py new file mode 100644 index 000000000..00b38f20e --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Feature Subsetting Tool.""" diff --git a/clustering/feature-subsetting-tool/tests/conftest.py b/clustering/feature-subsetting-tool/tests/conftest.py new file mode 100644 index 000000000..6aee03ccb --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/conftest.py @@ -0,0 +1,58 @@ +"""Test fixtures. + +Set up all data used in tests. +""" +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture( + params=[ + (500, ".csv"), + ], +) +def get_params(request: pytest.FixtureRequest) -> tuple[int, str]: + """To get the parameter of the fixture.""" + return request.param + + +@pytest.fixture() +def generate_synthetic_data( + get_params: tuple[int, str], +) -> tuple[Path, Path, Path, str]: + """Generate tabular data.""" + nrows, file_extension = get_params + input_directory = Path(tempfile.mkdtemp(prefix="inpDir_", dir=Path.cwd())) + tabular_directory = Path(tempfile.mkdtemp(prefix="tabularDir_", dir=Path.cwd())) + output_directory = Path(tempfile.mkdtemp(prefix="out_", dir=Path.cwd())) + rng = np.random.default_rng() + channels = 5 + zpos = 4 + nrows = 3 + for c in range(channels): + for z in range(zpos): + file_name = Path(input_directory, f"x00_y01_p0{z}_c{c}.ome.tif") + Path.open(Path(file_name), "a").close() + + tabular_data = { + "intensity_image": [file_name.name] * nrows, + "MEAN": rng.random(nrows).tolist(), + "MEAN_ABSOLUTE_DEVIATION": rng.random(nrows).tolist(), + "MEDIAN": rng.random(nrows).tolist(), + "MODE": rng.random(nrows).tolist(), + } + outname = file_name.stem.split(".")[0] + + df = pd.DataFrame(tabular_data) + if file_extension == ".csv": + outpath = Path(tabular_directory, f"{outname}.csv") + df.to_csv(outpath, index=False) + if 
file_extension == ".arrow": + outpath = Path(tabular_directory, f"{outname}.arrow") + df.to_feather(outpath) + + return input_directory, tabular_directory, output_directory, file_extension diff --git a/clustering/feature-subsetting-tool/tests/test_cli.py b/clustering/feature-subsetting-tool/tests/test_cli.py new file mode 100644 index 000000000..3cbe68154 --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/test_cli.py @@ -0,0 +1,92 @@ +"""Test Command line Tool.""" + +from typer.testing import CliRunner +from polus.images.clustering.feature_subsetting.__main__ import app +import shutil +from pathlib import Path + + +def test_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: + """Test the command line.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + group_var = "p,c" + + runner = CliRunner() + result = runner.invoke( + app, + [ + "--inpDir", + inp_dir, + "--tabularDir", + tabular_dir, + "--filePattern", + file_pattern, + "--imageFeature", + image_feature, + "--tabularFeature", + tabular_feature, + "--padding", + padding, + "--groupVar", + group_var, + "--percentile", + 0.8, + "--removeDirection", + "Below", + "--writeOutput", + "--outDir", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) + + +def test_short_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: + """Test short cli command line.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + group_var = "p,c" + + runner = CliRunner() + result = runner.invoke( + app, + [ + "-i", + inp_dir, + "-t", + tabular_dir, + "-f", + file_pattern, + "-if", + image_feature, + "-tf", + tabular_feature, + "-p", + padding, + 
"-g", + group_var, + "-pc", + 0.8, + "-r", + "Below", + "-w", + "-o", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) diff --git a/clustering/feature-subsetting-tool/tests/test_feature_subsetting.py b/clustering/feature-subsetting-tool/tests/test_feature_subsetting.py new file mode 100644 index 000000000..91d6163a6 --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/test_feature_subsetting.py @@ -0,0 +1,72 @@ +"""Test Feature Subsetting Plugin.""" + +import shutil +from pathlib import Path + +import polus.images.clustering.feature_subsetting.feature_subset as fs + + +def test_feature_subset( + generate_synthetic_data: tuple[Path, Path, Path, str], +) -> None: + """Test images subsetting based on feature values.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + percentile = 0.8 + remove_direction = "Below" + group_var = "p,c" + write_output = True + + fs.feature_subset( + inp_dir=inp_dir, + tabular_dir=tabular_dir, + out_dir=out_dir, + file_pattern=file_pattern, + group_var=group_var, + percentile=percentile, + remove_direction=remove_direction, + section_var=None, + image_feature=image_feature, + tabular_feature=tabular_feature, + padding=padding, + write_output=write_output, + ) + + out_ext = [Path(f.name).suffix for f in out_dir.iterdir()] + assert len(out_ext) != 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) + + +def test_filter_planes() -> None: + """Test filter planes.""" + feature_dict = { + 1: 1236.597914951989, + 2: 1153.754875685871, + 3: 1537.3429175240055, + 4: 1626.0415809327849, + } + + percentile = 0.1 + remove_direction = "Below" + fn = fs.filter_planes( + feature_dict=feature_dict, + remove_direction=remove_direction, + percentile=percentile, + ) + + assert type(fn) == set + + +def 
test_make_uniform() -> None: + """Test each section contain same number of images.""" + planes_dict = {1: [3, 4]} + uniques = [1, 2, 3, 4] + padding = 0 + fn = fs.make_uniform(planes_dict=planes_dict, uniques=uniques, padding=padding) + + assert len(fn) != 0 diff --git a/clustering/hdbscan-clustering-tool/.bumpversion.cfg b/clustering/hdbscan-clustering-tool/.bumpversion.cfg new file mode 100644 index 000000000..230e6c5f9 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/.bumpversion.cfg @@ -0,0 +1,27 @@ +[bumpversion] +current_version = 0.4.8-dev0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:VERSION] + +[bumpversion:file:src/polus/images/clustering/hdbscan_clustering/__init__.py] diff --git a/clustering/hdbscan-clustering-tool/.gitignore b/clustering/hdbscan-clustering-tool/.gitignore new file mode 100644 index 000000000..9ed1c3775 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/.gitignore @@ -0,0 +1,23 @@ +# Jupyter Notebook +.ipynb_checkpoints +poetry.lock +../../poetry.lock +# Environments +.env +.myenv +.venv +env/ +venv/ +# test data directory +data +# yaml file +.pre-commit-config.yaml +# hidden files +.DS_Store +.ds_store +# flake8 +.flake8 +../../.flake8 +__pycache__ +.mypy_cache +requirements.txt diff --git a/clustering/hdbscan-clustering-tool/Dockerfile b/clustering/hdbscan-clustering-tool/Dockerfile new file mode 100644 index 000000000..fd4b86f93 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/Dockerfile @@ -0,0 +1,21 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV 
POLUS_LOG="INFO" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + + +ENTRYPOINT ["python3", "-m", "polus.images.clustering.hdbscan_clustering"] +CMD ["--help"] diff --git a/clustering/hdbscan-clustering-tool/README.md b/clustering/hdbscan-clustering-tool/README.md new file mode 100644 index 000000000..80c37a501 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/README.md @@ -0,0 +1,52 @@ +# Hierarchical Density-Based Spatial Clustering of Applications with Noise(HDBSCAN) Clustering (v0.4.8-dev0) + +The HDBSCAN Clustering plugin clusters the data using [HDBSCAN clustering](https://pypi.org/project/hdbscan/) library. The input and output for this plugin is a CSV file. Each observation (row) in the input CSV file is assigned to one of the clusters. The output CSV file contains the column `cluster` that identifies the cluster to which each observation belongs. A user can supply a regular expression with capture groups if they wish to cluster each group independently, or if they wish to average the numerical features across each group and treat them as a single observation. + +## Inputs: + +### Input directory: +This plugin supports the all [vaex](https://vaex.readthedocs.io/en/latest/guides/io.html) supported file formats. + +### Filename pattern: +This plugin uses [filepattern](https://filepattern2.readthedocs.io/en/latest/Home.html) python library to parse file names of tabular files to be processed by this plugin. + +### Grouping pattern: +The input for this parameter is a regular expression with capture group. This input splits the data into groups based on the matched pattern. A new column `group` is created in the output file that has the group based on the given pattern. 
Unless `averageGroups` is set to `true`, providing a grouping pattern will cluster each group independently. + +### Average groups: +Set this to `true` along with a `groupingPattern` to average the numerical features and produce a single row per group, which is then clustered. The resulting cluster is assigned to all observations belonging to that group. + +### Label column: +This is the name of the column containing the labels to be used with `groupingPattern`. + +### Minimum cluster size: +This parameter defines the smallest number of points that should be considered as a cluster. This is a required parameter. The input should be an integer and the value should be greater than 1. + +### Increment outlier ID: +This parameter sets the ID of the outlier cluster to `1`, otherwise it will be 0. This is useful for visualization purposes if the resulting cluster IDs are turned into image annotations. + +## Output: +The output is a tabular file containing the clustered data. + +## Building +To build the Docker image for this plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Options + +This plugin takes six input arguments and one output argument: + +| Name | Description | I/O | Type | +| ---------------------- | ---------------------------------------------------------------------------------------------- | ------ | ------------- | +| `--inpDir` | Input tabular data files. | Input | genericData | +| `--groupingPattern` | Regular expression to group rows. Clustering will be applied across capture groups by default. | Input | string | +| `--averageGroups` | Average data across groups. Requires capture groups | Input | boolean | +| `--labelCol` | Name of the column containing labels for grouping pattern. 
| Input | string | +| `--minClusterSize` | Minimum cluster size. | Input | number | +| `--incrementOutlierId` | Increments outlier ID to 1. | Input | boolean | +| `--outDir` | Output collection | Output | genericData | +| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/clustering/hdbscan-clustering-tool/VERSION b/clustering/hdbscan-clustering-tool/VERSION new file mode 100644 index 000000000..316ad8d55 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/VERSION @@ -0,0 +1 @@ +0.4.8-dev0 diff --git a/clustering/hdbscan-clustering-tool/build-docker.sh b/clustering/hdbscan-clustering-tool/build-docker.sh new file mode 100755 index 000000000..2e7dd1861 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Hythem Sidky ", + "Hamdah Shafqat abbasi " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.4" +typer = "^0.7.0" +tqdm = "^4.64.1" +preadator="0.4.0.dev2" +vaex = "^4.17.0" +hdbscan = "^0.8.34rc1" + + +[tool.poetry.group.dev.dependencies] +pre-commit = "^3.3.3" +bump2version = "^1.0.1" +pytest = "^7.3.2" +pytest-xdist = "^3.3.1" +pytest-sugar = "^0.9.7" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clustering/hdbscan-clustering-tool/run-docker.sh b/clustering/hdbscan-clustering-tool/run-docker.sh new file mode 100755 index 000000000..931115198 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/run-docker.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +version=$( None: + """Cluster data using HDBSCAN.""" + logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--filePattern = {file_pattern}") + # Regular expression for grouping. + logger.info(f"--groupingPattern = {grouping_pattern}") + # Whether to average data for each group. + logger.info(f"--averageGroups = {average_groups}") + # Name of column to use for grouping. 
+ logger.info(f"--labelCol = {label_col}") + # Minimum cluster size for clustering using HDBSCAN. + logger.info(f"--minClusterSize = {min_cluster_size}") + # Set outlier cluster id as 1. + logger.info(f"--incrementOutlierId = {increment_outlier_id}") + logger.info(f"--outDir = {out_dir}") + + inp_dir = inp_dir.resolve() + out_dir = out_dir.resolve() + + assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again" + assert ( + out_dir.exists() + ), f"{out_dir} does not exist!! Please check output path again" + + num_workers = max([cpu_count(), 2]) + + files = fp.FilePattern(inp_dir, file_pattern) + + if files is None: + msg = f"No tabular files found. Please check {file_pattern} again" + raise ValueError(msg) + + if preview: + with Path.open(Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[str, Any] = { + "filepattern": file_pattern, + "outDir": [], + } + for file in files(): + out_name = file[1][0].name.replace( + "".join(file[1][0].suffixes), + f"_hdbscan{hd.POLUS_TAB_EXT}", + ) + out_json["outDir"].append(out_name) + json.dump(out_json, jfile, indent=2) + else: + with preadator.ProcessManager( + name="Cluster data using HDBSCAN", + num_processes=num_workers, + threads_per_process=2, + ) as pm: + for file in tqdm( + files(), + total=len(files()), + desc="Clustering data", + mininterval=5, + initial=0, + unit_scale=True, + colour="cyan", + ): + pm.submit_process( + hd.hdbscan_clustering, + file[1][0], + min_cluster_size, + out_dir, + grouping_pattern, + label_col, + average_groups, + increment_outlier_id, + ) + pm.join_processes() + + +if __name__ == "__main__": + app() diff --git a/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py new file mode 100644 index 000000000..3940c2861 --- /dev/null +++ 
b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py @@ -0,0 +1,150 @@ +"""Hdbscan Clustering Plugin.""" +import logging +import os +import re +from itertools import chain +from pathlib import Path + +import hdbscan +import numpy as np +import vaex + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") +CHUNK_SIZE = 10000 + + +def hdbscan_model( + data: np.ndarray, + min_cluster_size: int, + increment_outlier_id: bool, +) -> np.ndarray: + """Cluster data using HDBSCAN. + + Args: + data: Data that needs to be clustered. + min_cluster_size: Minimum cluster size. + increment_outlier_id: Increment outlier ID to unity. + + Returns: + Cluster labels for each row of data. + """ + clusters = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit(data) + labels = clusters.labels_.flatten().astype(np.uint16) + 1 + return labels + 1 if increment_outlier_id else labels + + +def hdbscan_clustering( # noqa: PLR0913 + file: Path, + min_cluster_size: int, + out_dir: Path, + grouping_pattern: str, + label_col: str, + average_groups: bool, + increment_outlier_id: bool, +) -> None: + """Cluster data using HDBSCAN. + + Args: + file: Path of a tabular file. + min_cluster_size: Smallest size grouping that should be considered as a cluster. + out_dir: Path to output directory. + grouping_pattern: Regular expression to capture groups in a label_col. + label_col: Name of column containing labels. + average_groups: Whether to average data across groups. + increment_outlier_id: Increment outlier ID to unity. + """ + if Path(file.name).suffix == ".csv": + df = vaex.from_csv(file, convert=True, chunk_size=CHUNK_SIZE) + else: + df = vaex.open(file) + # If user provided a regular expression. 
+ if grouping_pattern: + if label_col == "None": + msg = f"Please define label column to capture groups {label_col}" + raise ValueError(msg) + + # Create a column group with matching string + group = np.array( + [ + re.search(grouping_pattern, x).group(0) # type: ignore + for x in df[label_col].tolist() + if len(re.search(grouping_pattern, x).group(0)) != 0 # type: ignore + ], + ) + if len(group) == 0: + msg = f"Could not find group with pattern {grouping_pattern}" + raise ValueError(msg) + + # Create a column group with matching string + df["group"] = group + int_columns = [ + feature + for feature in df.get_column_names() + if df.data_type(feature) == int or df.data_type(feature) == float + ] + + # If we want to average features for each group. + if average_groups: + df_grouped = df.groupby( + "group", + agg=[vaex.agg.mean(x) for x in int_columns], + ) + # Cluster data using HDBSCAN clustering. + logger.info("Clustering the data") + cluster_ids = hdbscan_model( + df_grouped.values, + min_cluster_size, + increment_outlier_id, + ) + df_grouped["cluster"] = cluster_ids + df = df.join( + df_grouped["group", "cluster"], + left_on="group", + right_on="group", + ) + + else: + dfs = [] + for group, df_ss in df.groupby("group"): + # Cluster data using HDBSCAN clustering. + logger.info(f"Clustering data in group {group}") + + cluster_ids = hdbscan_model( + df_ss.values, + min_cluster_size, + increment_outlier_id, + ) + + dfs.append(cluster_ids) + cluster_ids = np.array(list(chain.from_iterable(dfs))) + df["cluster"] = cluster_ids + + # No grouping. Vanilla clustering. 
+ else: + int_columns = [ + feature + for feature in df.get_column_names() + if df.data_type(feature) == int or df.data_type(feature) == float + ] + + # Cluster data using HDBSCAN clustering + logger.info("Clustering the data") + cluster_ids = hdbscan_model( + df[int_columns].values, + min_cluster_size, + increment_outlier_id, + ) + df["cluster"] = cluster_ids + + outname = Path(out_dir, f"{Path(file.name).stem}_hdbscan{POLUS_TAB_EXT}") + + if POLUS_TAB_EXT == ".arrow": + df.export_feather(outname) + logger.info(f"Saving outputs: {outname}") + else: + df.export_csv(path=outname, chunk_size=CHUNK_SIZE) + + logger.info("Finished all processes!") diff --git a/clustering/hdbscan-clustering-tool/tests/__init__.py b/clustering/hdbscan-clustering-tool/tests/__init__.py new file mode 100644 index 000000000..2f89ec82b --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Hdbscan Clustering Plugin.""" diff --git a/clustering/hdbscan-clustering-tool/tests/conftest.py b/clustering/hdbscan-clustering-tool/tests/conftest.py new file mode 100644 index 000000000..a609d5b80 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/conftest.py @@ -0,0 +1,48 @@ +"""Test fixtures. + +Set up all data used in tests. 
+""" +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture( + params=[(50000, ".csv"), (100000, ".arrow")], +) +def get_params(request: pytest.FixtureRequest) -> tuple[int, str]: + """To get the parameter of the fixture.""" + return request.param + + +@pytest.fixture() +def generate_synthetic_data(get_params: tuple[int, str]) -> tuple[Path, Path, str]: + """Generate tabular data.""" + nrows, file_extension = get_params + + input_directory = Path(tempfile.mkdtemp(prefix="inputs_")) + output_directory = Path(tempfile.mkdtemp(prefix="out_")) + rng = np.random.default_rng() + tabular_data = { + "sepal_length": rng.random(nrows).tolist(), + "sepal_width": rng.random(nrows).tolist(), + "petal_length": rng.random(nrows).tolist(), + "petal_width": rng.random(nrows).tolist(), + "species": rng.choice( + ["Iris-setosa", "Iris-versicolor", "Iris-virginica"], + nrows, + ).tolist(), + } + + df = pd.DataFrame(tabular_data) + if file_extension == ".csv": + outpath = Path(input_directory, "data.csv") + df.to_csv(outpath, index=False) + if file_extension == ".arrow": + outpath = Path(input_directory, "data.arrow") + df.to_feather(outpath) + + return input_directory, output_directory, file_extension diff --git a/clustering/hdbscan-clustering-tool/tests/test_cli.py b/clustering/hdbscan-clustering-tool/tests/test_cli.py new file mode 100644 index 000000000..b087215e8 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/test_cli.py @@ -0,0 +1,74 @@ +"""Test Command line Tool.""" + +from typer.testing import CliRunner +from polus.images.clustering.hdbscan_clustering.__main__ import app +import shutil +from pathlib import Path + + +def test_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test the command line.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + label = "species" + clustersize = 3 + + runner = CliRunner() + 
result = runner.invoke( + app, + [ + "--inpDir", + inp_dir, + "--filePattern", + file_pattern, + "--groupingPattern", + pattern, + "--averageGroups", + "--labelCol", + label, + "--minClusterSize", + clustersize, + "--incrementOutlierId", + "--outDir", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + + +def test_short_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test short command line.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + label = "species" + clustersize = 3 + + runner = CliRunner() + result = runner.invoke( + app, + [ + "-i", + inp_dir, + "-f", + file_pattern, + "-g", + pattern, + "-a", + "-l", + label, + "-m", + clustersize, + "-io", + "-o", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) diff --git a/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py new file mode 100644 index 000000000..83debf273 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py @@ -0,0 +1,49 @@ +"""Test Hdbscan Clustering Plugin.""" + +import shutil +from pathlib import Path + +import filepattern as fp +import polus.images.clustering.hdbscan_clustering.hdbscan_clustering as hd +import vaex + + +def test_hdbscan_clustering(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test hdbscan clustering of tabular data.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + files = fp.FilePattern(inp_dir, file_pattern) + for file in files(): + hd.hdbscan_clustering( + file=file[1][0], + min_cluster_size=3, + grouping_pattern=pattern, + label_col="species", + average_groups=True, + increment_outlier_id=True, + out_dir=out_dir, + ) + + out_ext = [Path(f.name).suffix for f in 
out_dir.iterdir()] + assert all(out_ext) is True + for f in out_dir.iterdir(): + df = vaex.open(f) + assert "cluster" in df.column_names + assert df["cluster"].values != 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + + +def test_hdbscan_model(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test hdbscan model.""" + inp_dir, _, file_extension = generate_synthetic_data + file_pattern = f".*{file_extension}" + files = fp.FilePattern(inp_dir, file_pattern) + for file in files(): + df = vaex.open(file[1][0]) + data = df[df.column_names[:-1]].values + min_cluster_size = 3 + label = hd.hdbscan_model(data, min_cluster_size, True) + assert len(label) != 0 + shutil.rmtree(inp_dir) diff --git a/transforms/images/apply-flatfield-plugin/VERSION b/transforms/images/apply-flatfield-plugin/VERSION new file mode 100644 index 000000000..38f77a65b --- /dev/null +++ b/transforms/images/apply-flatfield-plugin/VERSION @@ -0,0 +1 @@ +2.0.1 diff --git a/transforms/images/apply-flatfield-tool/README.md b/transforms/images/apply-flatfield-tool/README.md index c2ce5d8b9..aa08fb55e 100644 --- a/transforms/images/apply-flatfield-tool/README.md +++ b/transforms/images/apply-flatfield-tool/README.md @@ -1,4 +1,4 @@ -# Apply Flatfield Plugin (v2.0.1) +# Apply Flatfield Plugin (v2.0.1-dev0) This WIPP plugin applies a flatfield operation on every image in a collection. The algorithm used to apply the flatfield is as follows: diff --git a/transforms/images/apply-flatfield-tool/plugin.json b/transforms/images/apply-flatfield-tool/plugin.json index 60a1bafda..d6430c7be 100644 --- a/transforms/images/apply-flatfield-tool/plugin.json +++ b/transforms/images/apply-flatfield-tool/plugin.json @@ -91,4 +91,4 @@ "description": "Preview the output images' names without actually running computation" } ] -} +} \ No newline at end of file