Updates
charles-cowart committed Nov 4, 2024
1 parent 933e78f commit 09a1cc2
Showing 12 changed files with 1,921 additions and 1,500 deletions.
164 changes: 147 additions & 17 deletions qp_klp/Instruments.py → qp_klp/SequencingTech.py
@@ -3,25 +3,26 @@
from sequence_processing_pipeline.TRNormCountsJob import TRNormCountsJob
from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob
from sequence_processing_pipeline.PipelineError import PipelineError
from os.path import join


INSTRUMENT_NAME_NONE = "Instrument"
INSTRUMENT_NAME_ILLUMINA = "Illumina"
INSTRUMENT_NAME_TELLSEQ = "TellSeq"
SEQTECH_NAME_NONE = "None"
SEQTECH_NAME_ILLUMINA = "Illumina"
SEQTECH_NAME_TELLSEQ = "TellSeq"


class Instrument():
class SequencingTech():
"""
Instruments encapsulate Job()s and other functionality that vary on the
nature of the Instrument used to create the raw data. All Instruments are
mixins for Workflow() classes and shouldn't define their own
SequencingTechs encapsulate Job()s and other functionality that vary on
the nature of the Instrument used to create the raw data. All Instruments
are mixins for Workflow() classes and shouldn't define their own
initialization.
"""
instrument_type = INSTRUMENT_NAME_NONE
seqtech_type = SEQTECH_NAME_NONE
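
# Illustrative sketch (not part of this commit): a concrete workflow is
# expected to combine a Workflow base class with one SequencingTech mixin and
# one Assay mixin, e.g. the TellSeqMetagenomicWorkflow registered with
# WorkflowFactory in Workflows.py; the base-class order below is an assumption.
#
#   class TellSeqMetagenomicWorkflow(Workflow, TellSeq, Metagenomic):
#       # convert_raw_to_fastq() comes from TellSeq, assay-specific behavior
#       # from Metagenomic; initialization is defined only on Workflow.
#       pass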


class Illumina(Instrument):
instrument_type = INSTRUMENT_NAME_ILLUMINA
class Illumina(SequencingTech):
seqtech_type = SEQTECH_NAME_ILLUMINA

def _get_configuration(self, command):
# NB: This helper method is to change the behavior of Pipeline.
@@ -83,26 +84,45 @@ def convert_raw_to_fastq(self):
return failed_samples


class TellSeq(Instrument):
instrument_type = INSTRUMENT_NAME_TELLSEQ
class TellSeq(SequencingTech):
seqtech_type = SEQTECH_NAME_TELLSEQ

def convert_raw_to_fastq(self):
config = self.pipeline.get_software_configuration('tell-read')

print("RUN DIR: %s" % self.pipeline.run_dir)
print("OUTPUT PATH: %s" % self.pipeline.output_path)
print("INPUT FILE PATH: %s" % self.pipeline.input_file_path)

tr_job = TellReadJob(self.pipeline.run_dir,
self.pipeline.output_path,
self.pipeline.input_file_path,
config['queue'],
config['nodes'],
config['nprocs'],
config['wallclock_time_in_minutes'],
config['per_process_memory_limit'],
config['executable_path'],
config['modules_to_load'],
self.master_qiita_job_id)
self.master_qiita_job_id,
config['label'],
config['reference_base'],
config['reference_map'],
config['tmp1_path'],
config['sing_script_path'],
config['cores_per_task'])

tr_job.run(callback=self.status_update_callback)

'''
Testing note: run() first generates a job script, which can be inspected
before submission. Tests therefore need to fake the sbatch command and the
results directory; the only assertion is that the generated job script
matches what we expect. (See the test sketch at the end of this file's diff.)
'''

"""
# TODO: determine these appropriately
max_array_length = "foo"
label = "bar"
Expand Down Expand Up @@ -145,9 +165,10 @@ def convert_raw_to_fastq(self):
i_job.run(callback=self.status_update_callback)
# TODO: after i_job is completed, there are two optional jobs that
# NB: after i_job is completed, there are two optional jobs that
# can be performed in parallel using the new functionality in Job()
# class.
# class. However we are not using the output from this step right now
# so we will leave it unimplemented temporarily.
# TODO: take post-processing code from Version 1 impl and tack on
# the cleanup line from the original cleanup script in order to
@@ -164,3 +185,112 @@ def convert_raw_to_fastq(self):
# Potentially needed for tell-seq jobs but perhaps not.
# return job.audit(self.pipeline.get_sample_ids())
# post-process working directory to make it appear like results
# generated by ConvertJob
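# NB: besides join(), this block uses walk, makedirs, listdir and rename
# (from os), exists and split (from os.path), move and rmtree (from shutil),
# and match (from re); make sure these are imported before re-enabling it.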
integrated_files_path = join(self.pipeline.output_path, 'output', "integrated")
if not exists(integrated_files_path):
raise ValueError(f"{integrated_files_path} does not exist")
# move integrated directory to TRConvertJob directory, co-level with
# output directory. This makes it easier to delete the rest of the
# output that we don't need.
# move err and out logs into logs subdirectory.
for root, dirs, files in walk(self.output_path):
for _file in files:
_path = join(root, _file)
if _path.endswith('.err'):
move(_path, join(self.output_path, 'logs'))
elif _path.endswith('.out'):
move(_path, join(self.output_path, 'logs'))
# don't go below one level.
break
# save two logs and move them into standard Job logs directory.
move(join(self.output_path, 'output', 'log'),
join(self.output_path, 'logs'))
move(join(self.output_path, 'output', 'output.log'),
join(self.output_path, 'logs'))
# rename the files and move them into project directories.
for root, dirs, files in walk(integrated_files_path):
for _file in files:
fastq_file = join(root, _file)
self._post_process_file(fastq_file, self.mapping)
# move project folders from integrated directory to working_dir.
contents = listdir(integrated_files_path)
for name in contents:
move(join(integrated_files_path, name),
self.output_path)
# delete the original output directory.
rmtree(join(self.output_path, 'output'))
def _post_process_file(self, fastq_file, mapping):
# generate names of the form generated by bcl-convert/bcl2fastq:
# <Sample_ID>_S#_L00#_<R# or I#>_001.fastq.gz
# see:
# https://help.basespace.illumina.com/files-used-by-basespace/
# fastq-files
_dir, _file = split(fastq_file)
# ex: integrated/C544.R2.fastq.gz
m = match(r"(C5\d\d)\.([RI]\d)\.fastq\.gz", _file)
if m is None:
raise ValueError(f"The filename '{_file}' is not of a "
"recognizable form")
adapter_id = m[1]
read_type = m[2]
if adapter_id not in mapping:
raise ValueError(f"{adapter_id} is not present in mapping")
sample_name, sample_index, project_name = mapping[adapter_id]
# generate the new filename for the fastq file, and reorganize the
# files by project.
new_name = "%s_S%d_%s_%s_001.fastq.gz" % (sample_name,
sample_index,
self.lane,
read_type)
# ensure that the project directory exists before we rename and move
# the file to that location.
makedirs(join(_dir, project_name), exist_ok=True)
# if there's an error renaming and moving the file, let it pass up to
# the user.
final_path = join(_dir, project_name, new_name)
rename(fastq_file, final_path)
return final_path
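
# Worked example (illustrative): with the mapping generated by
# _generate_sample_mapping() below, 'C544.R2.fastq.gz' maps to
# ('MySample44', 44, 'Project3') since 44 % 3 == 2. Assuming self.lane is
# already formatted as 'L001', the file is renamed and moved to
# 'Project3/MySample44_S44_L001_R2_001.fastq.gz'.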
def _generate_sample_mapping(self):
# this generates a sample mapping for the C501-C596 adapters used by
# the vendor to a sample-name and project. In production use this
# mapping would need to be created from the future sample-sheet.
project_names = ['Project1', 'Project2', 'Project3']
sample_mapping = {}
for sample_index in range(1, 97):
adapter_id = "C%s" % str(sample_index + 500)
sample_name = "MySample%d" % sample_index
project_name = project_names[sample_index % 3]
sample_mapping[adapter_id] = (sample_name, sample_index,
project_name)
return sample_mapping
"""


15 changes: 8 additions & 7 deletions qp_klp/Workflows.py
@@ -1,4 +1,4 @@
from .Instruments import Illumina, TellSeq
from .SequencingTech import Illumina, TellSeq
from os.path import join, abspath, exists, split
from os import walk, makedirs, listdir
import pandas as pd
Expand All @@ -12,7 +12,7 @@
from .Assays import Amplicon, Metagenomic, Metatranscriptomic
from .Assays import (METAOMIC_ASSAY_NAMES, ASSAY_NAME_AMPLICON,
ASSAY_NAME_METAGENOMIC, ASSAY_NAME_METATRANSCRIPTOMIC)
from .Instruments import INSTRUMENT_NAME_ILLUMINA, INSTRUMENT_NAME_TELLSEQ
from .SequencingTech import SEQTECH_NAME_ILLUMINA, SEQTECH_NAME_TELLSEQ
from .FailedSamplesRecord import FailedSamplesRecord


@@ -69,7 +69,7 @@ def what_am_i(self):
Returns text description of Workflow's Instrument & Assay mixins.
:return:
"""
return (f"Instrument: {self.instrument_type}" + "\t" +
return (f"Instrument: {self.seqtech_type}" + "\t" +
f"Assay: {self.assay_type}")

def generate_special_map(self):
@@ -369,7 +369,6 @@ def _project_metadata_check(self):

for qiita_id in qiita_ids:
url = f"/api/v1/study/{qiita_id}/samples/info"
# CHARLIE
logging.debug(f"URL: {url}")
categories = self.qclient.get(url)["categories"]

@@ -1247,11 +1246,11 @@ class WorkflowFactory():
StandardAmpliconWorkflow,
TellSeqMetagenomicWorkflow]

ST_TO_IN_MAP = {INSTRUMENT_NAME_ILLUMINA: ['standard_metag',
ST_TO_IN_MAP = {SEQTECH_NAME_ILLUMINA: ['standard_metag',
'standard_metat',
'absquant_metag',
'absquant_metat'],
INSTRUMENT_NAME_TELLSEQ: ['tellseq_metag',
SEQTECH_NAME_TELLSEQ: ['tellseq_metag',
'tellseq_absquant']}

@classmethod
@@ -1279,7 +1278,9 @@ def generate_workflow(cls, **kwargs):
# SheetVersion will raise a ValueError() here, w/the message
# "'{sheet}' doesn't appear to be a valid sample-sheet."


sheet = load_sample_sheet(kwargs['uif_path'])

# if we do not validate the sample-sheet now, it will be validated
# downstream when we attempt to instantiate a Workflow(), which in
# turn will attempt to instantiate a Pipeline(), which will load
@@ -1314,7 +1315,7 @@ def generate_workflow(cls, **kwargs):

for workflow in WorkflowFactory.WORKFLOWS:
if workflow.assay_type == assay_type:
if workflow.instrument_type == instrument_type:
if workflow.seqtech_type == instrument_type:
# return instantiated workflow object
return workflow(**kwargs)

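For orientation, here is a minimal usage sketch of the factory above. Only uif_path (the user input file, i.e. sample-sheet path) is visible in this diff, so treating it as the lone keyword argument is an assumption; in production generate_workflow() receives additional kwargs not shown here.

from qp_klp.Workflows import WorkflowFactory

# uif_path is the sample-sheet (user input file) path validated by the
# factory; any other required kwargs are omitted here as an assumption.
workflow = WorkflowFactory.generate_workflow(uif_path='/path/to/sample-sheet.csv')

# what_am_i() reports the mixin combination, e.g.
# "Instrument: TellSeq    Assay: Metagenomic"
print(workflow.what_am_i())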
16 changes: 16 additions & 0 deletions qp_klp/tests/data/configuration_profiles/miseq_metagenomic.json
@@ -3,6 +3,22 @@
"instrument_type": "MiSeq",
"assay_type": "Metagenomic",
"configuration": {
"tell-read": {
"label": "my_label",
"reference_base": "/my/reference/base/path",
"reference_map": "/my/reference/map/path",
"tmp1_path": "/my/tmp1/path",
"sing_script_path": "/my/sing/script/path",
"cores_per_task": "999",
"nodes": 1,
"lane": 1,
"queue": "qiita",
"wallclock_time_in_minutes": 216,
"modules_to_load": [
"foo_module"
],
"per_process_memory_limit": "10"
},
"bcl-convert": {
"nodes": 1,
"nprocs": 16,
@@ -47,6 +47,7 @@
"cores_per_task": 2,
"movi_executable_path": "/home/user/user_dir/Movi/build/movi-default",
"gres_value": 4,
"additional_fastq_tags": ["BX"],
"pmls_path": "/home/user/user_dir/human_host_filtration/scripts/qiita_filter_pmls.py"
},
"seqpro": {
