Skip to content

Commit

Permalink
Merge branch 'smalton/DOR-989-demux-sample-sheets' into 'master'
Browse files: browse the repository at this point in the history
[DOR-989] Demux sample sheets

Closes DOR-989

See merge request machine-learning/dorado!1294
  • Loading branch information
malton-ont committed Dec 6, 2024
2 parents 105f2e1 + 98e6942 commit cbcdf38
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 9 deletions.
3 changes: 2 additions & 1 deletion dorado/cli/demux.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -257,12 +257,13 @@ int demuxer(int argc, char* argv[]) {
auto client_info = std::make_shared<DefaultClientInfo>();
reader.set_client_info(client_info);

auto barcoding_info = get_barcoding_info(parser, sample_sheet.get());

PipelineDescriptor pipeline_desc;
auto demux_writer = pipeline_desc.add_node<BarcodeDemuxerNode>(
{}, output_dir, demux_writer_threads, parser.visible.get<bool>("--emit-fastq"),
std::move(sample_sheet), sort_bam);

auto barcoding_info = get_barcoding_info(parser, sample_sheet.get());
if (barcoding_info) {
std::optional<std::string> custom_seqs =
parser.visible.present<std::string>("--barcode-sequences");
Expand Down
35 changes: 27 additions & 8 deletions tests/test_simple_basecaller_execution.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -348,13 +348,23 @@ test_barcoding_read_groups() (
done
sample_sheet=$1
output_name=read_group_test${sample_sheet:+_sample_sheet}
$dorado_bin basecaller -b ${batch} --kit-name SQK-RBK114-96 ${sample_sheet:+--sample-sheet ${sample_sheet}} ${model_5k} $data_dir/barcode_demux/read_group_test > $output_dir/${output_name}.bam
$dorado_bin basecaller -b ${batch} --kit-name SQK-RBK114-96 ${sample_sheet:+--sample-sheet ${sample_sheet}} ${model_5k} $data_dir/barcode_demux/read_group_test --no-trim > $output_dir/${output_name}.bam

samtools quickcheck -u $output_dir/${output_name}.bam
split_dir=$output_dir/${output_name}
mkdir $split_dir
samtools split -u $split_dir/unknown.bam -f "$split_dir/rg_%!.bam" $output_dir/${output_name}.bam

for bam in $split_dir/rg_*.bam; do
# There shouldn't be any unknown groups.
num_read_groups=$(samtools view -c $split_dir/unknown.bam)
if [[ $num_read_groups -ne "0" ]]; then
echo "Reads with unknown read groups found."
exit 1
fi

check_barcodes() (
bam=$1
echo "Checking file: $bam"
if [[ $bam =~ "_SQK-RBK114-96_" ]]; then
# Arrangement is |<kit>_<barcode>|, so trim the kit from the prefix and the .bam from the suffix.
barcode=${bam#*_SQK-RBK114-96_}
Expand All @@ -363,6 +373,10 @@ test_barcoding_read_groups() (
# Arrangement is |<barcode_alias>|, so trim the model from the prefix and the .bam from the suffix.
barcode=${bam#*_${model_name_5k}_}
barcode=${barcode%.bam*}
elif [[ $bam =~ "/9bf5b3eb10d3b031970acc022aecad4ecc918865_" ]]; then
# Demuxed file, so trim the run_id from the prefix and the .bam from the suffix.
barcode=${bam#*9bf5b3eb10d3b031970acc022aecad4ecc918865_}
barcode=${barcode%.bam*}
else
barcode="unclassified"
fi
Expand All @@ -374,13 +388,18 @@ test_barcoding_read_groups() (
echo "Barcoding read group has incorrect number of reads. '${bam}': ${num_read_groups} != ${expected}"
exit 1
fi
exit 0
)
for bam in $split_dir/rg_*.bam; do
check_barcodes $bam
done

$dorado_bin basecaller -b ${batch} ${model_5k} $data_dir/barcode_demux/read_group_test --no-trim > $output_dir/${output_name}-demux.bam
$dorado_bin demux --no-trim --kit-name SQK-RBK114-96 ${sample_sheet:+--sample-sheet ${sample_sheet}} --output-dir $output_dir/${output_name}-demux $output_dir/${output_name}-demux.bam

for bam in $output_dir/${output_name}-demux/*.bam; do
check_barcodes $bam
done
# There shouldn't be any unknown groups.
num_read_groups=$(samtools view -c $split_dir/unknown.bam)
if [[ $num_read_groups -ne "0" ]]; then
echo "Reads with unknown read groups found."
exit 1
fi
)

# There should be 4 reads with BC01, 3 with BC04, and 2 unclassified groups.
Expand Down

0 comments on commit cbcdf38

Please sign in to comment.