From 1d0bfe30d42c1d970f003c09cbf2b43ce9d3871e Mon Sep 17 00:00:00 2001 From: Timo Lassmann Date: Fri, 16 Apr 2021 09:35:54 +0800 Subject: [PATCH] Kalign version 3.3.1 --- ChangeLog | 22 ++++++ README | 136 ++++++++++++++++++++++++++----------- configure.ac | 2 +- dev/run_io_test.sh | 1 - src/alignment_parameters.c | 2 +- src/alignment_parameters.h | 2 +- src/aln_run.c | 1 + src/run_kalign.c | 8 +-- src/rwalign.c | 130 ++++++++++++++--------------------- src/weave_alignment.c | 26 ------- src/weave_alignment.h | 2 +- 11 files changed, 180 insertions(+), 152 deletions(-) diff --git a/ChangeLog b/ChangeLog index d0458b5..0880cbe 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +2021-04-16 Timo Lassmann + + * version 3.3.1 - Bug Fix + The previous version of kalign checked only the first 50 sequences in the input to determine + whether the sequences are aligned or not. If the first 50 sequences are not aligned, + but the following sequences contain gaps (or other characters!), kalign could crash. In this + version (3.3.1) kalign checks all sequences, thereby avoiding this issue. + To alert users to the situation described above and to warn users about the presence of + odd characters, kalign now produces a warning message like this: + + [Date Time] : LOG : Start io tests. + [Date Time] : LOG : reading: dev/data/a2m.good.1 + [Date Time] : LOG : Detected protein sequences. + [Date Time] : WARNING : -------------------------------------------- (rwalign.c line 505) + [Date Time] : WARNING : The input sequences contain gap characters: (rwalign.c line 506) + [Date Time] : WARNING : "-" : 36 found (rwalign.c line 510) + [Date Time] : WARNING : BUT the sequences do not seem to be aligned! (rwalign.c line 514) + [Date Time] : WARNING : (rwalign.c line 515) + [Date Time] : WARNING : Kalign will remove the gap characters and (rwalign.c line 516) + [Date Time] : WARNING : align the sequences. 
(rwalign.c line 517) + [Date Time] : WARNING : -------------------------------------------- (rwalign.c line 518) + 2020-11-06 Timo Lassmann * version 3.3 - Threading and more diff --git a/README b/README index 740af36..976ccee 100644 --- a/README +++ b/README @@ -1,48 +1,106 @@ ----------------------------------------------------------------------- - Kalign version 2.03, Copyright (C) 2006 Timo Lassmann - - http://msa.cgb.ki.se/ - timolassmann@gmail.com - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - A copy of this license is in the COPYING file. + Kalign - a multiple sequence alignment program + + Copyright 2006, 2019, 2020, 2021 Timo Lassmann + + This file is part of kalign. + + Kalign is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------- -Installation: -% ./configure -% make +Kalign is a fast multiple sequence alignment program for biological sequences. + +1) Installation + +1.1) Release Tarball + +Download tarball from [releases](https://github.com/TimoLassmann/kalign/releases). Then: + +tar -zxvf kalign-<version>.tar.gz +cd kalign-<version> +./autogen.sh +./configure +make +make check +make install + + +1.2) Homebrew + +brew install brewsci/bio/kalign + +1.3) Developer version + +git clone https://github.com/TimoLassmann/kalign.git +cd kalign +./autogen.sh +./configure +make +make check +make install + + +1.4) on macOS, install [brew](https://brew.sh/) then: + +brew install libtool +brew install automake +git clone https://github.com/TimoLassmann/kalign.git +cd kalign +./autogen.sh +./configure +make +make check +make install + + +2) Usage + +Usage: kalign -i <seq file> -o <out aln> + +Options: + + --format : Output format. [Fasta] + --reformat : Reformat existing alignment. [NA] + --version : Print version and exit + +Kalign expects the input to be a set of unaligned sequences in fasta format or aligned sequences in aligned fasta, MSF or clustal format. Kalign automatically detects whether the input sequences are protein, RNA or DNA. + +Since version 3.2.0 kalign supports passing sequences in via stdin and supports alignment of sequences from multiple files. 
+ +3) Examples + +Passing sequences via stdin: + + cat input.fa | kalign -f fasta > out.afa + +Combining multiple input files: + + kalign seqsA.fa seqsB.fa seqsC.fa -f fasta > combined.afa + +Align sequences and output the alignment in MSF format: + + kalign -i BB11001.tfa -f msf -o out.msf -and as root: +Align sequences and output the alignment in clustal format: -% make install + kalign -i BB11001.tfa -f clu -o out.clu +Re-align sequences in an existing alignment: -Usage: + kalign -i BB11001.msf -o out.afa - kalign [Options] infile.fasta outfile.fasta - - or: - - kalign [Options] -i infile.fasta -o outfile.fasta - - or: - - kalign [Options] < infile.fasta > outfile.fasta +Reformat existing alignment: - Options: - - type: kalign -h - \ No newline at end of file + kalign -i BB11001.msf -r afa -o out.afa diff --git a/configure.ac b/configure.ac index 5e78570..38f19d8 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT(kalign, 3.3) +AC_INIT(kalign, 3.3.1) #AC_CONFIG_AUX_DIR([.]) diff --git a/dev/run_io_test.sh b/dev/run_io_test.sh index 5ebc944..8d4c91b 100755 --- a/dev/run_io_test.sh +++ b/dev/run_io_test.sh @@ -18,5 +18,4 @@ do printf "with ERROR $status and Message:\n\n$error\n\n"; exit 1; fi - done diff --git a/src/alignment_parameters.c b/src/alignment_parameters.c index 0385db9..875ebe0 100644 --- a/src/alignment_parameters.c +++ b/src/alignment_parameters.c @@ -30,7 +30,7 @@ int set_param_number(struct aln_param* ap,int L, int sel); int new_aln_matrices(struct aln_param* ap); -int init_ap(struct aln_param** aln_param, struct parameters* param, int numseq,int L) +int init_ap(struct aln_param** aln_param, struct parameters* param,int L) { struct aln_param* ap = NULL; int i,j; diff --git a/src/alignment_parameters.h b/src/alignment_parameters.h index 7896c0d..0a5fc97 100644 --- a/src/alignment_parameters.h +++ b/src/alignment_parameters.h @@ -55,6 +55,6 @@ struct aln_param{ }; -extern int init_ap(struct aln_param** aln_param, struct parameters* 
param, int numseq,int L); +extern int init_ap(struct aln_param** aln_param, struct parameters* param,int L); extern void free_ap(struct aln_param* ap); #endif diff --git a/src/aln_run.c b/src/aln_run.c index 91fdab1..715f432 100644 --- a/src/aln_run.c +++ b/src/aln_run.c @@ -868,6 +868,7 @@ int do_align_serial(struct msa* msa,struct aln_tasks* t,struct aln_mem* m, int t MFREE(t->profile[b]); t->profile[c] = tmp; + RUN(make_seq(msa,a,b,m->path)); msa->plen[c] = m->path[0]; diff --git a/src/run_kalign.c b/src/run_kalign.c index eb0c363..85e850a 100644 --- a/src/run_kalign.c +++ b/src/run_kalign.c @@ -1,7 +1,7 @@ /* Kalign - a multiple sequence alignment program - Copyright 2006, 2019, 2020 Timo Lassmann + Copyright 2006, 2019, 2020, 2021 Timo Lassmann This file is part of kalign. @@ -132,7 +132,7 @@ int print_kalign_header(void) fprintf(stdout,"\n"); fprintf(stdout,"Kalign (%s)\n", PACKAGE_VERSION); fprintf(stdout,"\n"); - fprintf(stdout,"Copyright (C) 2006,2019,2020 Timo Lassmann\n"); + fprintf(stdout,"Copyright (C) 2006,2019,2020,2021 Timo Lassmann\n"); fprintf(stdout,"\n"); fprintf(stdout,"This program comes with ABSOLUTELY NO WARRANTY; for details type:\n"); fprintf(stdout,"`kalign -showw'.\n"); @@ -520,7 +520,7 @@ int run_kalign(struct parameters* param) } /* allocate aln parameters */ - RUN(init_ap(&ap,param,msa->numseq,msa->L )); + RUN(init_ap(&ap,param,msa->L )); if(param->dump_internal){ double* s; @@ -555,7 +555,7 @@ int run_kalign(struct parameters* param) RUN(convert_msa_to_internal(msa, ALPHA_ambigiousPROTEIN)); } /* allocate aln parameters */ - RUN(init_ap(&ap,param,msa->numseq,msa->L )); + RUN(init_ap(&ap,param,msa->L )); /* Start alignment stuff */ DECLARE_TIMER(t1); diff --git a/src/rwalign.c b/src/rwalign.c index 44ecfd5..bd6ec40 100644 --- a/src/rwalign.c +++ b/src/rwalign.c @@ -65,29 +65,29 @@ struct out_line{ +static int aln_unknown_warning_message(struct msa* msa); +static int read_fasta(struct in_buffer* b, struct msa** msa); +static int 
read_msf(struct in_buffer* b, struct msa** msa); +static int read_clu(struct in_buffer* b, struct msa** msa); -int read_fasta(struct in_buffer* b, struct msa** msa); -int read_msf(struct in_buffer* b, struct msa** msa); -int read_clu(struct in_buffer* b, struct msa** msa); - -int write_msa_fasta(struct msa* msa,char* outfile); -int write_msa_clustal(struct msa* msa,char* outfile); -int write_msa_msf(struct msa* msa,char* outfile); +static int write_msa_fasta(struct msa* msa,char* outfile); +static int write_msa_clustal(struct msa* msa,char* outfile); +static int write_msa_msf(struct msa* msa,char* outfile); /* memory functions */ -struct msa* alloc_msa(void); -int resize_msa(struct msa* msa); +static struct msa* alloc_msa(void); +static int resize_msa(struct msa* msa); -struct msa_seq* alloc_msa_seq(void); -int resize_msa_seq(struct msa_seq* seq); -void free_msa_seq(struct msa_seq* seq); +static struct msa_seq* alloc_msa_seq(void); +static int resize_msa_seq(struct msa_seq* seq); +static void free_msa_seq(struct msa_seq* seq); -struct line_buffer* alloc_line_buffer(int max_line_len); -int resize_line_buffer(struct line_buffer* lb); -void free_line_buffer(struct line_buffer* lb); +static struct line_buffer* alloc_line_buffer(int max_line_len); +static int resize_line_buffer(struct line_buffer* lb); +static void free_line_buffer(struct line_buffer* lb); static int read_file_stdin(struct in_buffer** buffer,char* infile); static int alloc_in_buffer(struct in_buffer** buffer, int n); @@ -106,8 +106,6 @@ static int GCGMultchecksum(struct msa* msa); /* Taken from squid library by Sean Eddy */ static int GCGchecksum(char *seq, int len); - - static int sort_by_name(const void *a, const void *b); static int sort_by_chksum(const void *a, const void *b); @@ -248,7 +246,6 @@ int read_input(char* infile,struct msa** msa) STOP_TIMER(timer); GET_TIMING(timer); DESTROY_TIMER(timer); - //LOG_MSG("Done reading input sequences in %f seconds.", GET_TIMING(timer)); *msa = m; return OK; 
ERROR: @@ -465,7 +462,8 @@ int detect_aligned(struct msa* msa) min_len = INT32_MAX; max_len = 0; gaps = 0; - n = MACRO_MIN(50, msa->numseq); + /* n = MACRO_MIN(50, msa->numseq); */ + n = msa->numseq; for(i = 0; i < n;i++){ l = 0; for (j = 0; j <= msa->sequences[i]->len;j++){ @@ -480,12 +478,17 @@ int detect_aligned(struct msa* msa) if(min_len == max_len){ /* sequences have gaps and total length is identical - clearly aligned */ msa->aligned = ALN_STATUS_ALIGNED; }else{ /* odd there are gaps but total length differs - unknown status */ + aln_unknown_warning_message(msa); + msa->aligned = ALN_STATUS_UNKNOWN; } }else{ if(min_len == max_len){ /* no gaps and sequences have same length. Can' tell if they are aligned */ + aln_unknown_warning_message(msa); msa->aligned = ALN_STATUS_UNKNOWN; }else{ /* No gaps and sequences have different lengths - unaligned */ + + msa->aligned = ALN_STATUS_UNALIGNED; } } @@ -493,15 +496,36 @@ int detect_aligned(struct msa* msa) return OK; } +static int aln_unknown_warning_message(struct msa* msa) +{ + int i; + WARNING_MSG("--------------------------------------------"); + WARNING_MSG("The input sequences contain gap characters: "); + + for(i = 0; i < 128;i++){ + if(msa->letter_freq[i] && ispunct(i)){ + WARNING_MSG("\"%c\" : %4d found ", (char)i,msa->letter_freq[i] ); + } + } + + WARNING_MSG("BUT the sequences do not seem to be aligned!"); + WARNING_MSG(" "); + WARNING_MSG("Kalign will remove the gap characters and "); + WARNING_MSG("align the sequences. 
"); + WARNING_MSG("--------------------------------------------"); + return OK; +} + + /* Checks if sequence names are duplicated */ /* Checks if sequences are duplicated */ int run_extra_checks_on_msa(struct msa* msa) { char* tmp_name = NULL; - char* tmp_ptr; + /* char* tmp_ptr; */ struct sort_struct_name_chksum** a = NULL; int i; - int j; + /* int j; */ int c; int l; @@ -1174,50 +1198,30 @@ int read_clu(struct in_buffer* b , struct msa** m) { struct msa* msa = NULL; struct msa_seq* seq_ptr = NULL; - //FILE* f_ptr = NULL; + char* line = NULL; - //size_t b_len = 0; - //ssize_t nread; int i,j; char* p; int active_seq = 0; int line_len; int nl,ni; - /* sanity checks */ - //if(!my_file_exists(infile)){ - //ERROR_MSG("File: %s does not exist.",infile); - //} + if(msa == NULL){ msa = alloc_msa(); } - //RUNP(f_ptr = fopen(infile, "r")); - //LOG_MSG("GAGA"); - /* scan through first line header */ - //while(fgets(line, BUFFER_LEN, f_ptr)){ - //while ((nread = getline(&line, &b_len, f_ptr)) != -1){ - //fprintf(stdout,"LINE: %s", line); - //line_len = strnlen(line, BUFFER_LEN); ni =0; for(nl = 0; nl < b->n_lines;nl++){ line = b->l[nl]->line; line_len = b->l[nl]->len; ni++; - //line_len = nread; - //line[line_len-1] = 0; - /* line_len--; */ break; } active_seq =0; for(nl = ni; nl < b->n_lines;nl++){ line = b->l[nl]->line; line_len = b->l[nl]->len; - //while ((nread = getline(&line, &b_len, f_ptr)) != -1){ - //while(fgets(line, BUFFER_LEN, f_ptr)){ - //line_len = strnlen(line, BUFFER_LEN); - //line_len = nread; - //line[line_len-1] = 0; - /* line_len--; /\* last character is newline *\/ */ + if(!line_len){ active_seq = 0; }else{ @@ -1226,9 +1230,6 @@ int read_clu(struct in_buffer* b , struct msa** m) RUN(resize_msa(msa)); } seq_ptr = msa->sequences[active_seq]; - //p = strstr(line,seq_ptr->name); - //if(p){ - //LOG_MSG("Found bitsof seq %s", seq_ptr->name); p = line; j = 0; @@ -1254,11 +1255,8 @@ int read_clu(struct in_buffer* b , struct msa** m) } active_seq++; msa->numseq = 
MACRO_MAX(msa->numseq, active_seq); - } - } - //fprintf(stdout,"%d \"%s\"\n",line_len,line); } RUN(null_terminate_sequences(msa)); @@ -1372,35 +1370,19 @@ int read_fasta( struct in_buffer* b,struct msa** m) { struct msa* msa = NULL; struct msa_seq* seq_ptr = NULL; - //FILE* f_ptr = NULL; - char* line = NULL; - //size_t b_len = 0; - //ssize_t nread; - //char line[BUFFER_LEN]; + char* line = NULL; int line_len; int i; int nl; - /* sanity checks */ - //if(!my_file_exists(infile)){ - //ERROR_MSG("File: %s does not exist.",infile); - //} if(msa == NULL){ msa = alloc_msa(); } - for(nl = 0; nl < b->n_lines;nl++){ line = b->l[nl]->line; line_len = b->l[nl]->len; - //RUNP(f_ptr = fopen(infile, "r")); - - //while ((nread = getline(&line, &b_len, f_ptr)) != -1){ - //while(fgets(line, BUFFER_LEN, f_ptr)){ - //line_len = nread; - - //fprintf(stdout,"%d %s\n",line_len,line); if(line[0] == '>'){ /* alloc seq if buffer is full */ if(msa->alloc_numseq == msa->numseq){ @@ -1424,12 +1406,11 @@ int read_fasta( struct in_buffer* b,struct msa** m) if(!seq_ptr){ ERROR_MSG("Encountered a sequence before encountering it's name"); } + seq_ptr->seq[seq_ptr->len] = line[i]; + seq_ptr->len++; if(seq_ptr->alloc_len == seq_ptr->len){ resize_msa_seq(seq_ptr); } - - seq_ptr->seq[seq_ptr->len] = line[i]; - seq_ptr->len++; }else if(ispunct((int)line[i])){ seq_ptr->gaps[seq_ptr->len]++; } @@ -1440,17 +1421,10 @@ int read_fasta( struct in_buffer* b,struct msa** m) *m = msa; - //fclose(f_ptr); - //MFREE(line); + return OK; ERROR: free_msa(msa); - //if(line){ - //MFREE(line); - //} - //if(f_ptr){ - //fclose(f_ptr); - //} return FAIL; } diff --git a/src/weave_alignment.c b/src/weave_alignment.c index dafb016..6e8f67f 100644 --- a/src/weave_alignment.c +++ b/src/weave_alignment.c @@ -28,32 +28,6 @@ //int update_gaps(int old_len,int*gis,int new_len,int *newgaps); int update_gaps(int old_len,int*gis,int *newgaps); -int weave(struct msa* msa,struct aln_tasks*t) -{ - int i; - int a,b,c; - - 
//RUN(clean_aln(aln) - - for(i = 0; i < t->n_tasks;i++){ - a = t->list[i]->a; - b = t->list[i]->b; - c = t->list[i]->c; - /* fprintf(stdout,"%3d %3d -> %3d (p: %d)\n", t->list[i]->a, t->list[i]->b, t->list[i]->c, t->list[i]->p); */ - /* RUN(make_seq(msa,a,b,t->map[c])); */ - } - - /*for (i = 0; i < (msa->numseq-1)*3;i +=3){ - a = tree[i]; - b = tree[i+1]; - RUN(make_seq(msa,a,b,map[tree[i+2]])); - }*/ - - return OK; -ERROR: - return FAIL; -} - int clean_aln(struct msa* msa) { int i,j; diff --git a/src/weave_alignment.h b/src/weave_alignment.h index 3e6302c..48e9be4 100644 --- a/src/weave_alignment.h +++ b/src/weave_alignment.h @@ -29,7 +29,7 @@ //extern int weave(struct msa* msa, int** map, int* tree); -extern int weave(struct msa* msa,struct aln_tasks*t); +/* extern int weave(struct aln_tasks* t); */ extern int make_seq(struct msa* msa,int a,int b,int* path); extern int clean_aln(struct msa* msa);