Merge pull request #6 from CRAGENOMICA/dev

Version 1.1.0
CRAGENOMICA · Dec 4, 2024 · c97ad9e · c97ad9e
2 parents ed2b889 + ac8c7e9
commit c97ad9e
Show file tree

Hide file tree

Showing 15 changed files with 1,078 additions and 74 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -121,7 +121,9 @@
             // Resolved by CMake Tools:
             //"program": "${command:cmake.launchTargetPath}",
             "program": "${command:cmake.buildDirectory}/tfa_index",
-             "args": ["../Examples/100Kchr10.tfa.gz" , "-f" ],
+            // "args": ["../Examples/100Kchr10.tfa.gz" , "-f" ],
+            "args": ["/home/data/git/spiga/fastaconvtr/Examples/100Kallchr.tfa", "-f", "-o","/home/data/git/spiga/fastaconvtr/Examples/"  ],
+
             // "args": ["100Kchr10_t.tfa"],
             // 
             // "args": ["100Kchr10_t.tfa" , "-o" , "xxx"],
@@ -168,6 +170,27 @@
             ]
         },
 
+        {
+            "name": "tfa_merge",
+            "type": "cppdbg",
+            "request": "launch",
+            // Resolved by CMake Tools:
+            //"program": "${command:cmake.launchTargetPath}",
+            "program": "${command:cmake.buildDirectory}/tfa_merge",
+            // ./build/tfa_merge -i ./Examples/tfa_merge/example_1.tfa.gz -i ./Examples/tfa_merge/pileup_1.tfa.bgz -o ./Examples/tfa_merge/merged.tfa.gz
+            "args": ["-i", "example_1.tfa.gz", "-i", "pileup_1.tfa.bgz", "-o", "merged.tfa.gz"],
+            "stopAtEntry": false,
+            "cwd": "${workspaceFolder}/Examples/tfa_merge",
+            "externalConsole": false,
+            "MIMode": "gdb",
+            "setupCommands": [
+                {
+                    "description": "Enable pretty-printing for gdb",
+                    "text": "-enable-pretty-printing",
+                    "ignoreFailures": true
+                }
+            ]
+        },
         {
             "name": "tfa_tabix",
             "type": "cppdbg",

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,19 @@
+# Change log Version 1.1.0
+
+## New Features
+- tfa_merge: merge two tfasta files into one.
+### tfa_merge command line usage
+```bash
+tfa_merge -i file1.tfa.gz -i file2.tfa.gz -o merged.tfa.gz
+```
+### Options:
+- `-i, --input FILE`: Input file (specify twice for two files)
+- `-o, --output FILE`: Output file name
+- `-f, --force`: Force overwrite of output file
+- `-h, --help`: Show help message
+
+
+
 # Change log Version 1.0.0
 
 mstatspop now accepts tfasta version 2 files compressed with htslib (bgzip) and indexed in tabix format (.tbi).

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,10 @@
 cmake_minimum_required(VERSION 3.5.0)
-project(mstatspop VERSION 1.0.1 LANGUAGES C CXX)
+project(mstatspop VERSION 1.1.0 LANGUAGES C CXX)
+
+# Set default build type if not specified
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build (Debug or Release)" FORCE)
+endif()
 
 # include(CTest)
 # enable_testing()
@@ -46,6 +51,7 @@ ENDIF(CMAKE_BUILD_TYPE MATCHES Debug)
 file (GLOB_RECURSE SOURCE_FILE CONFIGURE_DEPENDS  "mstatspop/*.c")
 ## exclude mstatspop/tfa_index.c, mstatspop/tfa_merge.c and mstatspop/tfa_convert.c
 list(REMOVE_ITEM SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/mstatspop/tfa_index.c")
+list(REMOVE_ITEM SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/mstatspop/tfa_merge.c")
 list(REMOVE_ITEM SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/mstatspop/tfa_convert.c")
 
 # log library
@@ -131,6 +137,11 @@ target_include_directories(tfa_index PRIVATE "lib")
 target_include_directories(tfa_index PRIVATE ${HTSLIB_INCLUDE_DIR})
 target_link_libraries(tfa_index PRIVATE ${HTSLIB_LIBRARY} logging)
 
+## tfa_merge
+add_executable(tfa_merge mstatspop/tfa_merge.c mstatspop/tfasta.c mstatspop/files_util.c)
+target_include_directories(tfa_merge PRIVATE "lib")
+target_include_directories(tfa_merge PRIVATE ${HTSLIB_INCLUDE_DIR})
+target_link_libraries(tfa_merge PRIVATE ${HTSLIB_LIBRARY} logging)
 
 ## ms
 add_executable(ms sources_msHudson/ms.c  sources_msHudson/streec.c sources_msHudson/rand2.c)
@@ -198,5 +209,6 @@ set(CPACK_PROJECT_VERSION ${PROJECT_VERSION})
 install(TARGETS mstatspop DESTINATION bin)
 install(TARGETS ms DESTINATION bin)
 install(TARGETS tfa_index DESTINATION bin)
+install(TARGETS tfa_merge DESTINATION bin)
 install(FILES README.md DESTINATION .)
 include(CPack)
diff --git a/Examples/tfa_merge/example_1.tfa.gz b/Examples/tfa_merge/example_1.tfa.gz
diff --git a/Examples/tfa_merge/example_1.tfa.gz.tbi b/Examples/tfa_merge/example_1.tfa.gz.tbi
diff --git a/Examples/tfa_merge/pileup_1.tfa.bgz b/Examples/tfa_merge/pileup_1.tfa.bgz
diff --git a/Examples/tfa_merge/pileup_1.tfa.bgz.tbi b/Examples/tfa_merge/pileup_1.tfa.bgz.tbi
diff --git a/docs/tfa_merge.md b/docs/tfa_merge.md
@@ -0,0 +1,130 @@
+# TFA File Merge Algorithm
+
+This document describes the algorithm for merging two TFA (Tabix-indexed TFASTA) files.
+
+## Overview
+
+The TFA merge algorithm combines two TFA files by concatenating their DNA sequences while maintaining sample information and sequence structure. The algorithm ensures that both files have compatible sequences and record counts before merging.
+
+
+## Command Line Usage
+```bash
+tfa_merge -i file1.tfa.gz -i file2.tfa.gz -o merged.tfa.gz
+```
+
+```
+Options:
+- `-i, --input FILE`: Input file (specify twice for two files)
+- `-o, --output FILE`: Output file name
+- `-f, --force`: Force overwrite of output file
+- `-h, --help`: Show help message
+```
+
+## Algorithm Steps
+```
+ALGORITHM MergeTFAFiles(file1, file2, outputFile):
+
+1. VALIDATION PHASE:
+   IF file1.numberOfSequences != file2.numberOfSequences THEN
+       RETURN error "Different number of sequences"
+   
+   // Create sequence mapping
+   sequenceMap = new Map()
+   FOR EACH seq1 IN file1.sequences:
+       found = false
+       FOR EACH seq2 IN file2.sequences:
+           IF seq1.name == seq2.name THEN
+               IF seq1.recordCount != seq2.recordCount THEN
+                   RETURN error "Record count mismatch"
+               sequenceMap[seq1] = seq2
+               found = true
+               BREAK
+       IF NOT found THEN
+           RETURN error "Sequence not found in second file"
+
+2. HEADER WRITING PHASE:
+   Write "##fileformat=TFAv2.0" to outputFile
+   Write command line info to outputFile
+   
+   // Merge and write sample names
+   Write "#NAMES: "
+   FOR EACH sample IN file1.samples:
+       Write sample.name + " "
+   FOR EACH sample IN file2.samples:
+       Write sample.name + " "
+   Write newline
+
+3. SEQUENCE MERGING PHASE:
+   FOR EACH sequence IN file1.sequences:
+       
+       
+       iterator1 = createIterator(file1, sequence)
+       iterator2 = createIterator(file2, sequence)
+       
+       WHILE hasNext(iterator1) AND hasNext(iterator2):
+           record1 = next(iterator1)
+           record2 = next(iterator2)
+           
+           // Verify positions match
+           IF record1.position != record2.position THEN
+               RETURN error "Position mismatch"
+           
+           // Merge DNA sequences
+           mergedSequence = record1.sequence + record2.sequence
+           
+           // Write merged record
+           Write sequence.name + "\t" + record1.position + "\t" + mergedSequence + "\n"
+
+4. CLEANUP PHASE:
+   Close all files
+   Free memory
+   Create index for output file
+```
+
+
+## File Format
+
+### Input Files
+- Must be in TFAv2.0 format
+- Must be tabix-indexed
+- Must contain compatible sequences and positions
+
+### Output Format
+Each line in the output file is tab-separated and follows this format:
+`sequence_name<TAB>position<TAB>merged_DNA_sequences`
+
+Where:
+- `sequence_name`: Name of the chromosome or scaffold
+- `position`: Position in the sequence (1-based)
+- `merged_DNA_sequences`: Concatenated DNA sequences from both input files
+
+## Error Conditions
+
+The algorithm will fail if:
+1. Input files have a different number of sequences
+2. A sequence exists in one file but not in the other
+3. Matching sequences have different record counts
+4. Position values don't match during merging
+5. Input files are not in TFAv2.0 format
+6. Input files are not properly indexed
+
+## Implementation Notes
+
+1. Sequence Order
+   - Files may have sequences in different orders
+   - The sequence mapping handles this difference
+   - Output follows the sequence order of the first file
+
+2. Sample Names
+   - Sample names from both files are preserved
+   - Order is maintained: file1 samples followed by file2 samples
+
+3. Memory Usage
+   - Files are processed in streaming fashion
+   - Only one record from each file is held in memory at a time
+
+4. Performance
+   - Uses tabix indexing for efficient random access
+   - Parallel processing of both input files
+
+
diff --git a/mstatspop/files_util.c b/mstatspop/files_util.c
@@ -1,5 +1,3 @@
-
-
 #include "files_util.h"
 
 
@@ -74,4 +72,10 @@ bool ends_with(const char *str, const char *suffix) {
         return false;
     }
     return strncmp(str + str_len - suffix_len, suffix, suffix_len) == 0;
+}
+
+
+int file_exists(const char *filename) {
+    struct stat buffer;
+    return (stat(filename, &buffer) == 0);
 }
diff --git a/mstatspop/files_util.h b/mstatspop/files_util.h
@@ -6,7 +6,7 @@
 #include <stdio.h>
 #include <stdbool.h>
 #include <string.h>
-
+#include <sys/stat.h>
 #ifdef __cplusplus
 extern "C"
 {
@@ -25,6 +25,13 @@ extern "C"
     char *get_extension(const char *filepath);
     bool ends_with(const char *str, const char *suffix);
 
+    /**
+     * @brief Check if a file exists
+     * @param filename The path to the file to check
+     * @return 1 if the file exists, 0 otherwise
+     */
+    int file_exists(const char *filename);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/mstatspop/get_tfadata.c b/mstatspop/get_tfadata.c
@@ -1215,7 +1215,7 @@ int read_weights_positions_file(
 	long int end_site = init_site + *window_size - 1;
 
   sprintf(region, "%s:%ld-%ld", chr_name, init_site, end_site);
-	 hts_itr_t *iter = tbx_itr_querys(wtfasta->tbx, region);
+	hts_itr_t *iter = tbx_itr_querys(wtfasta->tbx, region);
   if (iter == NULL)
   {
     // it is possible that the region is not found in the index