Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ksw #24

Merged
merged 18 commits into from
Mar 10, 2022
Merged

Ksw #24

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 87 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ project(leviosam CXX C)
set(PROJECT_URL "https://github.com/alshai/leviosam")
set(PROJECT_DESCRIPTION "leviosam: lifting over alignments between genomes")
set(CMAKE_CXX_STANDARD 14)
# set(CMAKE_CXX_STANDARD 11)

find_library(HTS_LIB hts)
if(NOT HTS_LIB)
Expand Down Expand Up @@ -70,12 +69,97 @@ endif()

include_directories(${CMAKE_INCLUDE_PATH})

# Check if SSE instructions are available on the machine where
# the project is compiled.

# From https://gist.github.com/hideo55/5642892
MACRO (FindSSE)

IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO)

STRING(FIND ${CPUINFO} "sse2" SSE2_TRUE)
IF (SSE2_TRUE)
set(SSE2_FOUND ON)
ELSE (SSE2_TRUE)
set(SSE2_FOUND OFF)
ENDIF (SSE2_TRUE)

STRING(FIND ${CPUINFO} "sse4_1" SSE4_1_TRUE)
IF (SSE4_1_TRUE)
set(SSE4_1_FOUND ON)
ELSE (SSE4_1_TRUE)
set(SSE4_1_FOUND OFF)
ENDIF (SSE4_1_TRUE)

ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE
CPUINFO)

STRING(FIND ${CPUINFO} "SSE2" SSE2_TRUE)
IF (SSE2_TRUE)
set(SSE2_FOUND ON)
ELSE (SSE2_TRUE)
set(SSE2_FOUND OFF)
ENDIF (SSE2_TRUE)

STRING(FIND ${CPUINFO} "SSE4.1" SSE4_1_TRUE)
IF (SSE4_1_TRUE)
set(SSE4_1_FOUND ON)
ELSE (SSE4_1_TRUE)
set(SSE4_1_FOUND OFF)
ENDIF (SSE4_1_TRUE)

# ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows")
# # TODO
# set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
# set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
# set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
# set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
# set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
# ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux")
# set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
# set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
# set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
# set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
# set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux")

# IF(CMAKE_COMPILER_IS_GNUCXX)
# EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
# IF(GCC_VERSION VERSION_LESS 4.2)
# set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host" FORCE)
# set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host" FORCE)
# ENDIF()
# ENDIF(CMAKE_COMPILER_IS_GNUCXX)

if(SSE4_1_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
# message("CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
elseif(SSE2_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
# message("CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
endif(SSE4_1_FOUND)

mark_as_advanced(SSE2_FOUND SSE3_FOUND SSSE3_FOUND SSE4_1_FOUND SSE4_2_FOUND)

ENDMACRO(FindSSE)

FindSSE()


file(GLOB LIB_SOURCES src/reconcile.cpp src/lift_bed.cpp src/collate.cpp src/chain.cpp
src/leviosam_utils.cpp src/bed.cpp src/lift_bed.cpp
src/leviosam_utils.cpp src/bed.cpp src/lift_bed.cpp src/aln.cpp
src/yaml.cpp
src/ksw2_extd2_sse.c src/ksw2_extz2_sse.c src/ksw2_extz.c
src/bam_md.c src/bam_aux.c src/gzstream.C)
file(GLOB LIB_HEADERS src/leviosam.hpp
src/reconcile.hpp src/lift_bed.hpp src/collate.hpp src/chain.hpp
src/leviosam_utils.hpp src/bed.hpp src/lift_bed.hpp
src/leviosam_utils.hpp src/bed.hpp src/lift_bed.hpp src/aln.hpp
src/yaml.hpp src/rapidyaml.hpp
src/ksw.h
src/bam.h src/IITree.h src/gzstream.h)
add_library(lvsam ${LIB_SOURCES} ${LIB_HEADERS})

Expand Down
41 changes: 0 additions & 41 deletions Makefile

This file was deleted.

11 changes: 11 additions & 0 deletions configs/bowtie2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
alignment:
engine: "ksw_extz2_sse"
nm_threshold: 5
flag: 0x08 # KSW_EZ_APPROX_MAX
a: 0
b: 2
q: 5
e: 3
w: 601 # w = bw * 1.5 + 1
zdrop: 400
end_bonus: -1
13 changes: 13 additions & 0 deletions configs/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
alignment:
engine: "ksw_extz2_sse"
nm_threshold: 15
flag: 0x08 # KSW_EZ_APPROX_MAX
a: 2
b: 4
q: 4
e: 2
q2: 24
e2: 1
w: 601 # w = bw * 1.5 + 1
zdrop: 400
end_bonus: -1
13 changes: 13 additions & 0 deletions configs/pacbio.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
alignment:
engine: "ksw_extd2_sse"
nm_threshold: 200
flag: 0x08 # KSW_EZ_APPROX_MAX
a: 0
b: 4
q: 6
q2: 26
e: 2
e2: 1
w: 601 # w = bw * 1.5 + 1
zdrop: 400
end_bonus: -1
2 changes: 1 addition & 1 deletion scripts/vcf_to_bed.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# From https://www.biostars.org/p/106249/
# Usage sh vcf_to_bed.sh intput.vcf > output.bed
bcftools norm -m- $1 | grep -v '^#' | awk -v OFS='\t' '{if(length($4) > length($5)) print $1,$2,$2+length($4)-1; else print $1,$2-1,$2+length($5)-1}' | bedtools merge -i -
bcftools norm -m- $1 | grep -v '^#' | awk -v OFS='\t' '{if(length($4) > length($5)) print $1,$2,$2+length($4)-1; else print $1,$2-1,$2+length($5)-1}' | bedtools sort -i - | bedtools merge -i -
30 changes: 12 additions & 18 deletions selective_liftover/leviosam_longreads.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,41 +14,33 @@ TIME=time # GNU time
MEASURE_TIME=1 # Set to a >0 value to measure time for each step
DEFER_DEST_BED=""
COMMIT_SOURCE_BED=""
ALLOWED_GAPS=10
ALLOWED_GAPS=1000
HDIST="-S hdist:100"
ALN=minimap2
REALN_CONFIG=""


while getopts a:C:D:g:F:i:L:M:o:r:R:t:T: flag
while getopts a:C:D:g:f:H:i:L:M:o:r:R:t:T:x: flag
do
case "${flag}" in
a) ALN=${OPTARG};;
C) CLFT=${OPTARG};;
D) DEFER_DEST_BED=" -D ${OPTARG}";;
R) COMMIT_SOURCE_BED=" -r ${OPTARG}";;
F) REF=${OPTARG};;
f) REF=${OPTARG};;
H) HDIST=" -S hdist:${OPTARG}";;
g) ALLOWED_GAPS=${OPTARG};;
i) INPUT=${OPTARG};;
L) LEVIOSAM=${OPTARG};;
M) MEASURE_TIME=${OPTARG};;
o) PFX=${OPTARG};;
r) ALN_RG=${OPTARG};;
R) COMMIT_SOURCE_BED=" -r ${OPTARG}";;
t) THR=${OPTARG};;
T) TIME=${OPTARG};;
x) REALN_CONFIG=" -x ${OPTARG}";;
esac
done

echo "Input BAM: ${INPUT}";
echo "Output prefix: ${PFX}";
echo "Aligner: ${ALN}";
echo "Reference: ${REF}";
echo "Aligner read group: ${ALN_RG}";
echo "LevioSAM software: ${LEVIOSAM}";
echo "LevioSAM index: ${CLFT}";
echo "Allowed gaps: ${ALLOWED_GAPS}";
echo "BED where reads get deferred: ${DEFER_DEST_BED}";
echo "BED where reads get discarded: ${COMMIT_SOURCE_BED}";
echo "Num. threads: ${THR}";

if [[ ! ${ALN} =~ ^(minimap2|winnowmap2)$ ]]; then
echo "Invalid ${ALN}. Accepted input: minimap2, winnowmap2"
exit
Expand All @@ -62,7 +54,9 @@ fi
# Lifting over using leviosam
if [ ! -s ${PFX}-committed.bam ]; then
${TT} ${LEVIOSAM} lift -C ${CLFT} -a ${INPUT} -t ${THR} -p ${PFX} -O bam \
-S lifted -G ${ALLOWED_GAPS} \
-S lifted ${HDIST} -G ${ALLOWED_GAPS} \
${REALN_CONFIG} \
-m -f ${REF} \
${DEFER_DEST_BED} ${COMMIT_SOURCE_BED}
fi

Expand All @@ -74,7 +68,7 @@ fi

# Re-align deferred reads
if [ ! -s ${PFX}-realigned.bam ]; then
${TT} ${ALN} -ax map-hifi -t ${THR} ${REF} ${PFX}-deferred.fq.gz | \
${TT} ${ALN} -ax map-hifi --MD -t ${THR} -R ${ALN_RG} ${REF} ${PFX}-deferred.fq.gz | \
${TT} samtools view -hbo ${PFX}-realigned.bam
fi

Expand Down
46 changes: 27 additions & 19 deletions selective_liftover/leviosam_shortreads_pe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ THR=$(nproc)
LEVIOSAM=leviosam
TIME=time # GNU time
MEASURE_TIME=1 # Set to a >0 value to measure time for each step
ALLOWED_GAPS=0
FRAC_CLIPPED=0.95
ISIZE=1000
HDIST=5
DEFER_DEST_BED=""
COMMIT_SOURCE_BED=""
REALN_CONFIG=""

# Default parameters for Bowtie 2 (local)
# ALN=bowtie2
Expand All @@ -29,7 +31,7 @@ MAPQ=30
ALN_SCORE=100


while getopts a:A:b:C:D:f:H:i:L:M:o:q:r:R:t:T: flag
while getopts a:A:b:C:D:f:H:g:G:i:L:M:o:q:r:R:t:T:x: flag
do
case "${flag}" in
a) ALN=${OPTARG};;
Expand All @@ -39,6 +41,8 @@ do
D) DEFER_DEST_BED=" -D ${OPTARG}";;
f) REF=${OPTARG};;
H) HDIST=${OPTARG};;
g) ALLOWED_GAPS=${OPTARG};;
G) ALLOWED_GAPS=${OPTARG};; # to be deprecated
i) INPUT=${OPTARG};;
L) LEVIOSAM=${OPTARG};;
M) MEASURE_TIME=${OPTARG};;
Expand All @@ -48,22 +52,25 @@ do
R) COMMIT_SOURCE_BED=" -r ${OPTARG}";;
t) THR=${OPTARG};;
T) TIME=${OPTARG};;
x) REALN_CONFIG=" -x ${OPTARG}";;
esac
done

echo "Input BAM: ${INPUT}";
echo "Output prefix: ${PFX}";
echo "Aligner: ${ALN}";
echo "Aligner indexes prefix: ${ALN_IDX}";
echo "Aligner read group: ${ALN_RG}";
echo "Targer reference: ${REF}";
echo "LevioSAM software: ${LEVIOSAM}";
echo "LevioSAM index: ${CLFT}";
echo "LevioSAM min MAPQ: ${MAPQ}";
echo "LevioSAM min AS: ${ALN_SCORE}";
echo "BED where reads get deferred: ${DEFER_DEST_BED}";
echo "BED where reads get discarded: ${COMMIT_SOURCE_BED}";
echo "Num. threads: ${THR}";
# echo "Input BAM: ${INPUT}";
# echo "Output prefix: ${PFX}";
# echo "Aligner: ${ALN}";
# echo "Aligner indexes prefix: ${ALN_IDX}";
# echo "Aligner read group: ${ALN_RG}";
# echo "Targer reference: ${REF}";
# echo "LevioSAM software: ${LEVIOSAM}";
# echo "LevioSAM index: ${CLFT}";
# echo "Allowed gaps: ${ALLOWED_GAPS}";
# echo "Re-alignment config: ${REALN_CONFIG}";
# echo "LevioSAM min MAPQ: ${MAPQ}";
# echo "LevioSAM min AS: ${ALN_SCORE}";
# echo "BED where reads get deferred: ${DEFER_DEST_BED}";
# echo "BED where reads get discarded: ${COMMIT_SOURCE_BED}";
# echo "Num. threads: ${THR}";

if [[ ${INPUT} == "" ]]; then
echo "Input is not set properly"
Expand Down Expand Up @@ -92,7 +99,8 @@ fi
if [ ! -s ${PFX}-committed.bam ]; then
${TT} ${LEVIOSAM} lift -C ${CLFT} -a ${INPUT} -t ${THR} -p ${PFX} -O bam \
-S mapq:${MAPQ} -S isize:${ISIZE} -S aln_score:${ALN_SCORE} \
-S clipped_frac:${FRAC_CLIPPED} -S hdist:${HDIST} -G 0 \
-S clipped_frac:${FRAC_CLIPPED} -S hdist:${HDIST} -G ${ALLOWED_GAPS} \
${REALN_CONFIG} \
-m -f ${REF} ${DEFER_DEST_BED} ${COMMIT_SOURCE_BED}
fi

Expand Down Expand Up @@ -130,8 +138,8 @@ fi
if [ ! -s ${PFX}-final.bam ]; then
${TT} samtools cat ${PFX}-paired-committed.bam ${PFX}-paired-deferred-reconciled.bam | \
${TT} samtools sort -@ ${THR} -o ${PFX}-final.bam
rm ${PFX}-paired-deferred.bam ${PFX}-paired-deferred-sorted_n.bam
rm ${PFX}-paired-realigned.bam ${PFX}-paired-realigned-sorted_n.bam
rm ${PFX}-paired-deferred-reconciled.bam
rm ${PFX}-paired-committed.bam ${PFX}-deferred.bam
# rm ${PFX}-paired-deferred.bam ${PFX}-paired-deferred-sorted_n.bam
# rm ${PFX}-paired-realigned.bam ${PFX}-paired-realigned-sorted_n.bam
# rm ${PFX}-paired-deferred-reconciled.bam
# rm ${PFX}-paired-committed.bam ${PFX}-deferred.bam
fi
Loading