Updates
charles-cowart committed Nov 4, 2024
1 parent 933e78f commit 09a1cc2
Showing 12 changed files with 1,921 additions and 1,500 deletions.
164 changes: 147 additions & 17 deletions qp_klp/Instruments.py → qp_klp/SequencingTech.py
@@ -3,25 +3,26 @@
from sequence_processing_pipeline.TRNormCountsJob import TRNormCountsJob
from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob
from sequence_processing_pipeline.PipelineError import PipelineError
from os.path import join


INSTRUMENT_NAME_NONE = "Instrument"
INSTRUMENT_NAME_ILLUMINA = "Illumina"
INSTRUMENT_NAME_TELLSEQ = "TellSeq"
SEQTECH_NAME_NONE = "None"
SEQTECH_NAME_ILLUMINA = "Illumina"
SEQTECH_NAME_TELLSEQ = "TellSeq"


class Instrument():
class SequencingTech():
"""
Instruments encapsulate Job()s and other functionality that vary on the
nature of the Instrument used to create the raw data. All Instruments are
mixins for Workflow() classes and shouldn't define their own
SequencingTechs encapsulate Job()s and other functionality that vary on
the nature of the Instrument used to create the raw data. All Instruments
are mixins for Workflow() classes and shouldn't define their own
initialization.
"""
instrument_type = INSTRUMENT_NAME_NONE
seqtech_type = SEQTECH_NAME_NONE
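
# Illustrative sketch (not part of this commit): a concrete workflow is
# expected to combine a Workflow base class with one SequencingTech mixin and
# one Assay mixin, e.g. the TellSeqMetagenomicWorkflow registered with
# WorkflowFactory in Workflows.py; the base-class order below is an assumption.
#
#   class TellSeqMetagenomicWorkflow(Workflow, TellSeq, Metagenomic):
#       # convert_raw_to_fastq() comes from TellSeq, assay-specific behavior
#       # from Metagenomic; initialization is defined only on Workflow.
#       pass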


class Illumina(Instrument):
instrument_type = INSTRUMENT_NAME_ILLUMINA
class Illumina(SequencingTech):
seqtech_type = SEQTECH_NAME_ILLUMINA

def _get_configuration(self, command):
# NB: This helper method is to change the behavior of Pipeline.
@@ -83,26 +84,45 @@ def convert_raw_to_fastq(self):
return failed_samples


class TellSeq(Instrument):
instrument_type = INSTRUMENT_NAME_TELLSEQ
class TellSeq(SequencingTech):
seqtech_type = SEQTECH_NAME_TELLSEQ

def convert_raw_to_fastq(self):
config = self.pipeline.get_software_configuration('tell-read')

print("RUN DIR: %s" % self.pipeline.run_dir)
print("OUTPUT PATH: %s" % self.pipeline.output_path)
print("INPUT FILE PATH: %s" % self.pipeline.input_file_path)

tr_job = TellReadJob(self.pipeline.run_dir,
self.pipeline.output_path,
self.pipeline.input_file_path,
config['queue'],
config['nodes'],
config['nprocs'],
config['wallclock_time_in_minutes'],
config['per_process_memory_limit'],
config['executable_path'],
config['modules_to_load'],
self.master_qiita_job_id)
self.master_qiita_job_id,
config['label'],
config['reference_base'],
config['reference_map'],
config['tmp1_path'],
config['sing_script_path'],
config['cores_per_task'])

tr_job.run(callback=self.status_update_callback)

'''
Testing note: run() first generates a job script, which can be inspected
before submission. Tests therefore need to fake the sbatch command and the
results directory; the only assertion is that the generated job script
matches what we expect. (See the test sketch at the end of this file's diff.)
'''

"""
# TODO: determine these appropriately
max_array_length = "foo"
label = "bar"
Expand Down Expand Up @@ -145,9 +165,10 @@ def convert_raw_to_fastq(self):
i_job.run(callback=self.status_update_callback)
# TODO: after i_job is completed, there are two optional jobs that
# NB: after i_job is completed, there are two optional jobs that
# can be performed in parallel using the new functionality in Job()
# class.
# class. However we are not using the output from this step right now
# so we will leave it unimplemented temporarily.
# TODO: take post-processing code from Version 1 impl and tack on
# the cleanup line from the original cleanup script in order to
@@ -164,3 +185,112 @@ def convert_raw_to_fastq(self):
# Potentially needed for tell-seq jobs but perhaps not.
# return job.audit(self.pipeline.get_sample_ids())
# post-process working directory to make it appear like results
# generated by ConvertJob
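# NB: besides join(), this block uses walk, makedirs, listdir and rename
# (from os), exists and split (from os.path), move and rmtree (from shutil),
# and match (from re); make sure these are imported before re-enabling it.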
integrated_files_path = join(self.pipeline.output_path, 'output', "integrated")
if not exists(integrated_files_path):
raise ValueError(f"{integrated_files_path} does not exist")
# move integrated directory to TRConvertJob directory, co-level with
# output directory. This makes it easier to delete the rest of the
# output that we don't need.
# move err and out logs into logs subdirectory.
for root, dirs, files in walk(self.output_path):
for _file in files:
_path = join(root, _file)
if _path.endswith('.err'):
move(_path, join(self.output_path, 'logs'))
elif _path.endswith('.out'):
move(_path, join(self.output_path, 'logs'))
# don't go below one level.
break
# save two logs and move them into standard Job logs directory.
move(join(self.output_path, 'output', 'log'),
join(self.output_path, 'logs'))
move(join(self.output_path, 'output', 'output.log'),
join(self.output_path, 'logs'))
# rename the files and move them into project directories.
for root, dirs, files in walk(integrated_files_path):
for _file in files:
fastq_file = join(root, _file)
self._post_process_file(fastq_file, self.mapping)
# move project folders from integrated directory to working_dir.
contents = listdir(integrated_files_path)
for name in contents:
move(join(integrated_files_path, name),
self.output_path)
# delete the original output directory.
rmtree(join(self.output_path, 'output'))
def _post_process_file(self, fastq_file, mapping):
# generate names of the form generated by bcl-convert/bcl2fastq:
# <Sample_ID>_S#_L00#_<R# or I#>_001.fastq.gz
# see:
# https://help.basespace.illumina.com/files-used-by-basespace/
# fastq-files
_dir, _file = split(fastq_file)
# ex: integrated/C544.R2.fastq.gz
m = match(r"(C5\d\d)\.([RI]\d)\.fastq\.gz", _file)
if m is None:
raise ValueError(f"The filename '{_file}' is not of a "
"recognizable form")
adapter_id = m[1]
read_type = m[2]
if adapter_id not in mapping:
raise ValueError(f"{adapter_id} is not present in mapping")
sample_name, sample_index, project_name = mapping[adapter_id]
# generate the new filename for the fastq file, and reorganize the
# files by project.
new_name = "%s_S%d_%s_%s_001.fastq.gz" % (sample_name,
sample_index,
self.lane,
read_type)
# ensure that the project directory exists before we rename and move
# the file to that location.
makedirs(join(_dir, project_name), exist_ok=True)
# if there's an error renaming and moving the file, let it pass up to
# the user.
final_path = join(_dir, project_name, new_name)
rename(fastq_file, final_path)
return final_path
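
# Worked example (illustrative): with the mapping generated by
# _generate_sample_mapping() below, 'C544.R2.fastq.gz' maps to
# ('MySample44', 44, 'Project3') since 44 % 3 == 2. Assuming self.lane is
# already formatted as 'L001', the file is renamed and moved to
# 'Project3/MySample44_S44_L001_R2_001.fastq.gz'.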
def _generate_sample_mapping(self):
# this generates a sample mapping for the C501-C596 adapters used by
# the vendor to a sample-name and project. In production use this
# mapping would need to be created from the future sample-sheet.
project_names = ['Project1', 'Project2', 'Project3']
sample_mapping = {}
for sample_index in range(1, 97):
adapter_id = "C%s" % str(sample_index + 500)
sample_name = "MySample%d" % sample_index
project_name = project_names[sample_index % 3]
sample_mapping[adapter_id] = (sample_name, sample_index,
project_name)
return sample_mapping
"""


15 changes: 8 additions & 7 deletions qp_klp/Workflows.py
@@ -1,4 +1,4 @@
from .Instruments import Illumina, TellSeq
from .SequencingTech import Illumina, TellSeq
from os.path import join, abspath, exists, split
from os import walk, makedirs, listdir
import pandas as pd
Expand All @@ -12,7 +12,7 @@
from .Assays import Amplicon, Metagenomic, Metatranscriptomic
from .Assays import (METAOMIC_ASSAY_NAMES, ASSAY_NAME_AMPLICON,
ASSAY_NAME_METAGENOMIC, ASSAY_NAME_METATRANSCRIPTOMIC)
from .Instruments import INSTRUMENT_NAME_ILLUMINA, INSTRUMENT_NAME_TELLSEQ
from .SequencingTech import SEQTECH_NAME_ILLUMINA, SEQTECH_NAME_TELLSEQ
from .FailedSamplesRecord import FailedSamplesRecord


@@ -69,7 +69,7 @@ def what_am_i(self):
Returns text description of Workflow's Instrument & Assay mixins.
:return:
"""
return (f"Instrument: {self.instrument_type}" + "\t" +
return (f"Instrument: {self.seqtech_type}" + "\t" +
f"Assay: {self.assay_type}")

def generate_special_map(self):
@@ -369,7 +369,6 @@ def _project_metadata_check(self):

for qiita_id in qiita_ids:
url = f"/api/v1/study/{qiita_id}/samples/info"
# CHARLIE
logging.debug(f"URL: {url}")
categories = self.qclient.get(url)["categories"]

@@ -1247,11 +1246,11 @@ class WorkflowFactory():
StandardAmpliconWorkflow,
TellSeqMetagenomicWorkflow]

ST_TO_IN_MAP = {INSTRUMENT_NAME_ILLUMINA: ['standard_metag',
ST_TO_IN_MAP = {SEQTECH_NAME_ILLUMINA: ['standard_metag',
'standard_metat',
'absquant_metag',
'absquant_metat'],
INSTRUMENT_NAME_TELLSEQ: ['tellseq_metag',
SEQTECH_NAME_TELLSEQ: ['tellseq_metag',
'tellseq_absquant']}

@classmethod
@@ -1279,7 +1278,9 @@ def generate_workflow(cls, **kwargs):
# SheetVersion will raise a ValueError() here, w/the message
# "'{sheet}' doesn't appear to be a valid sample-sheet."


sheet = load_sample_sheet(kwargs['uif_path'])

# if we do not validate the sample-sheet now, it will be validated
# downstream when we attempt to instantiate a Workflow(), which in
# turn will attempt to instantiate a Pipeline(), which will load
@@ -1314,7 +1315,7 @@ def generate_workflow(cls, **kwargs):

for workflow in WorkflowFactory.WORKFLOWS:
if workflow.assay_type == assay_type:
if workflow.instrument_type == instrument_type:
if workflow.seqtech_type == instrument_type:
# return instantiated workflow object
return workflow(**kwargs)

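For orientation, here is a minimal usage sketch of the factory above. Only uif_path (the user input file, i.e. sample-sheet path) is visible in this diff, so treating it as the lone keyword argument is an assumption; in production generate_workflow() receives additional kwargs not shown here.

from qp_klp.Workflows import WorkflowFactory

# uif_path is the sample-sheet (user input file) path validated by the
# factory; any other required kwargs are omitted here as an assumption.
workflow = WorkflowFactory.generate_workflow(uif_path='/path/to/sample-sheet.csv')

# what_am_i() reports the mixin combination, e.g.
# "Instrument: TellSeq    Assay: Metagenomic"
print(workflow.what_am_i())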
16 changes: 16 additions & 0 deletions qp_klp/tests/data/configuration_profiles/miseq_metagenomic.json
@@ -3,6 +3,22 @@
"instrument_type": "MiSeq",
"assay_type": "Metagenomic",
"configuration": {
"tell-read": {
"label": "my_label",
"reference_base": "/my/reference/base/path",
"reference_map": "/my/reference/map/path",
"tmp1_path": "/my/tmp1/path",
"sing_script_path": "/my/sing/script/path",
"cores_per_task": "999",
"nodes": 1,
"lane": 1,
"queue": "qiita",
"wallclock_time_in_minutes": 216,
"modules_to_load": [
"foo_module"
],
"per_process_memory_limit": "10"
},
"bcl-convert": {
"nodes": 1,
"nprocs": 16,
@@ -47,6 +47,7 @@
"cores_per_task": 2,
"movi_executable_path": "/home/user/user_dir/Movi/build/movi-default",
"gres_value": 4,
"additional_fastq_tags": ["BX"],
"pmls_path": "/home/user/user_dir/human_host_filtration/scripts/qiita_filter_pmls.py"
},
"seqpro": {
