Skip to content

Commit

Permalink
Merge pull request #6 from CRAGENOMICA/dev
Browse files Browse the repository at this point in the history
Version 1.1.0
  • Loading branch information
ahmedihafez authored Dec 4, 2024
2 parents ed2b889 + ac8c7e9 commit c97ad9e
Show file tree
Hide file tree
Showing 15 changed files with 1,078 additions and 74 deletions.
25 changes: 24 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@
// Resolved by CMake Tools:
//"program": "${command:cmake.launchTargetPath}",
"program": "${command:cmake.buildDirectory}/tfa_index",
"args": ["../Examples/100Kchr10.tfa.gz" , "-f" ],
// "args": ["../Examples/100Kchr10.tfa.gz" , "-f" ],
"args": ["/home/data/git/spiga/fastaconvtr/Examples/100Kallchr.tfa", "-f", "-o","/home/data/git/spiga/fastaconvtr/Examples/" ],

// "args": ["100Kchr10_t.tfa"],
//
// "args": ["100Kchr10_t.tfa" , "-o" , "xxx"],
Expand Down Expand Up @@ -168,6 +170,27 @@
]
},

{
// Debug configuration that launches the tfa_merge executable under gdb.
"name": "tfa_merge",
"type": "cppdbg",
"request": "launch",
// Resolved by CMake Tools:
//"program": "${command:cmake.launchTargetPath}",
// The binary is looked up by name in the CMake Tools build directory instead of
// using the currently selected launch target (alternative kept above for reference).
"program": "${command:cmake.buildDirectory}/tfa_merge",
// Roughly the same run from a shell at the repository root:
// ./build/tfa_merge -i ./Examples/tfa_merge/example_1.tfa.gz -i ./Examples/tfa_merge/pileup_1.tfa.bgz -o ./Examples/tfa_merge/merged.tfa.gz
// The file names below are relative to "cwd" (Examples/tfa_merge).
"args": ["-i", "example_1.tfa.gz", "-i", "pileup_1.tfa.bgz", "-o", "merged.tfa.gz"],
"stopAtEntry": false,
"cwd": "${workspaceFolder}/Examples/tfa_merge",
"externalConsole": false,
"MIMode": "gdb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "-enable-pretty-printing",
"ignoreFailures": true
}
]
},
{
"name": "tfa_tabix",
"type": "cppdbg",
Expand Down
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Change log Version 1.1.0

## New Features
- tfa_merge: merge two tfasta files into one.
### tfa_merge command line usage
```bash
tfa_merge -i file1.tfa.gz -i file2.tfa.gz -o merged.tfa.gz
```

### Options:
- `-i, --input FILE`: Input file (specify twice for two files)
- `-o, --output FILE`: Output file name
- `-f, --force`: Force overwrite of output file
- `-h, --help`: Show help message


# Change log Version 1.0.0

mstatspop now accepts tfasta files in version 2 format, compressed with htslib (bgzip) and indexed (.tbi).
Expand Down
14 changes: 13 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
cmake_minimum_required(VERSION 3.5.0)
project(mstatspop VERSION 1.0.1 LANGUAGES C CXX)
project(mstatspop VERSION 1.1.0 LANGUAGES C CXX)

# Default to a Debug build when the user did not choose a build type.
# CMAKE_BUILD_TYPE is only meaningful for single-config generators (Makefiles,
# Ninja); multi-config generators (Visual Studio, Xcode, Ninja Multi-Config)
# define CMAKE_CONFIGURATION_TYPES and select the configuration at build time,
# so the default is not forced on them.
# FORCE is required because CMake pre-creates an empty CMAKE_BUILD_TYPE cache
# entry; the surrounding guard ensures an explicit user choice is never overwritten.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build (Debug or Release)" FORCE)
  # Offer the usual choices in cmake-gui / ccmake.
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel)
endif()

# include(CTest)
# enable_testing()
Expand Down Expand Up @@ -46,6 +51,7 @@ ENDIF(CMAKE_BUILD_TYPE MATCHES Debug)
# Gather every C source below mstatspop/ (re-checked at build time through
# CONFIGURE_DEPENDS).
file(
  GLOB_RECURSE SOURCE_FILE
  CONFIGURE_DEPENDS
  "mstatspop/*.c"
)
# Exclude mstatspop/tfa_index.c, mstatspop/tfa_merge.c and
# mstatspop/tfa_convert.c from the globbed list.
list(REMOVE_ITEM SOURCE_FILE
  "${CMAKE_CURRENT_SOURCE_DIR}/mstatspop/tfa_index.c"
  "${CMAKE_CURRENT_SOURCE_DIR}/mstatspop/tfa_merge.c"
  "${CMAKE_CURRENT_SOURCE_DIR}/mstatspop/tfa_convert.c"
)

# log library
Expand Down Expand Up @@ -131,6 +137,11 @@ target_include_directories(tfa_index PRIVATE "lib")
target_include_directories(tfa_index PRIVATE ${HTSLIB_INCLUDE_DIR})
target_link_libraries(tfa_index PRIVATE ${HTSLIB_LIBRARY} logging)

## tfa_merge: executable built from its own entry point plus the shared
## tfasta and file-utility sources; uses htslib and the logging library.
add_executable(tfa_merge
  mstatspop/tfa_merge.c
  mstatspop/tfasta.c
  mstatspop/files_util.c
)
target_include_directories(tfa_merge PRIVATE "lib" ${HTSLIB_INCLUDE_DIR})
target_link_libraries(tfa_merge PRIVATE ${HTSLIB_LIBRARY} logging)

## ms
add_executable(ms sources_msHudson/ms.c sources_msHudson/streec.c sources_msHudson/rand2.c)
Expand Down Expand Up @@ -198,5 +209,6 @@ set(CPACK_PROJECT_VERSION ${PROJECT_VERSION})
# Install all command-line tools into <prefix>/bin.
install(
  TARGETS
    mstatspop
    ms
    tfa_index
    tfa_merge
  DESTINATION bin
)
install(FILES README.md DESTINATION .)
include(CPack)
Binary file added Examples/tfa_merge/example_1.tfa.gz
Binary file not shown.
Binary file added Examples/tfa_merge/example_1.tfa.gz.tbi
Binary file not shown.
Binary file added Examples/tfa_merge/pileup_1.tfa.bgz
Binary file not shown.
Binary file added Examples/tfa_merge/pileup_1.tfa.bgz.tbi
Binary file not shown.
130 changes: 130 additions & 0 deletions docs/tfa_merge.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# TFA File Merge Algorithm

This document describes the algorithm for merging two TFA (Tabix-indexed TFASTA) files.

## Overview

The TFA merge algorithm combines two TFA files by concatenating their DNA sequences while maintaining sample information and sequence structure. The algorithm ensures that both files have compatible sequences and record counts before merging.


## Command Line Usage
```bash
tfa_merge -i file1.tfa.gz -i file2.tfa.gz -o merged.tfa.gz
```

Options:

- `-i, --input FILE`: Input file (specify twice for two files)
- `-o, --output FILE`: Output file name
- `-f, --force`: Force overwrite of output file
- `-h, --help`: Show help message

## Algorithm Steps
```
ALGORITHM MergeTFAFiles(file1, file2, outputFile):
1. VALIDATION PHASE:
IF file1.numberOfSequences != file2.numberOfSequences THEN
RETURN error "Different number of sequences"
// Create sequence mapping
sequenceMap = new Map()
FOR EACH seq1 IN file1.sequences:
found = false
FOR EACH seq2 IN file2.sequences:
IF seq1.name == seq2.name THEN
IF seq1.recordCount != seq2.recordCount THEN
RETURN error "Record count mismatch"
sequenceMap[seq1] = seq2
found = true
BREAK
IF NOT found THEN
RETURN error "Sequence not found in second file"
2. HEADER WRITING PHASE:
Write "##fileformat=TFAv2.0" to outputFile
Write command line info to outputFile
// Merge and write sample names
Write "#NAMES: "
FOR EACH sample IN file1.samples:
Write sample.name + " "
FOR EACH sample IN file2.samples:
Write sample.name + " "
Write newline
3. SEQUENCE MERGING PHASE:
FOR EACH sequence IN file1.sequences:
iterator1 = createIterator(file1, sequence)
iterator2 = createIterator(file2, sequenceMap[sequence])
WHILE hasNext(iterator1) AND hasNext(iterator2):
record1 = next(iterator1)
record2 = next(iterator2)
// Verify positions match
IF record1.position != record2.position THEN
RETURN error "Position mismatch"
// Merge DNA sequences
mergedSequence = record1.sequence + record2.sequence
// Write merged record
Write sequence.name + "\t" + record1.position + "\t" + mergedSequence + "\n"
4. CLEANUP PHASE:
Close all files
Free memory
Create index for output file
```


## File Format

### Input Files
- Must be in TFAv2.0 format
- Must be tabix-indexed
- Must contain compatible sequences and positions

### Output Format
Each line in the output file follows this format:
```
sequence_name<TAB>position<TAB>merged_DNA_sequences
```

Where:
- `sequence_name`: Name of the chromosome or scaffold
- `position`: Position in the sequence (1-based)
- `merged_DNA_sequences`: Concatenated DNA sequences from both input files

## Error Conditions

The algorithm will fail if:
1. Input files have different number of sequences
2. A sequence exists in one file but not in the other
3. Matching sequences have different record counts
4. Position values don't match during merging
5. Input files are not in TFAv2.0 format
6. Input files are not properly indexed

## Implementation Notes

1. Sequence Order
- Files may have sequences in different orders
- The sequence mapping handles this difference
- Output follows the sequence order of the first file

2. Sample Names
- Sample names from both files are preserved
- Order is maintained: file1 samples followed by file2 samples

3. Memory Usage
- Files are processed in streaming fashion
- Only one record from each file is held in memory at a time

4. Performance
- Uses tabix indexing for efficient random access
- Both input files are traversed simultaneously (in lockstep), one record pair at a time


8 changes: 6 additions & 2 deletions mstatspop/files_util.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@


#include "files_util.h"


Expand Down Expand Up @@ -74,4 +72,10 @@ bool ends_with(const char *str, const char *suffix) {
return false;
}
return strncmp(str + str_len - suffix_len, suffix, suffix_len) == 0;
}


/**
 * Check whether a path exists, as reported by stat().
 *
 * Any path on which stat() succeeds counts as existing (this includes
 * directories). Any stat() failure is reported as "does not exist".
 *
 * @param filename Path to check; NULL is treated as a non-existent path.
 * @return 1 if stat() succeeds on the path, 0 otherwise.
 */
int file_exists(const char *filename) {
    struct stat buffer;

    /* A NULL path cannot name a file; do not hand it to stat(). */
    if (filename == NULL) {
        return 0;
    }

    return (stat(filename, &buffer) == 0);
}
9 changes: 8 additions & 1 deletion mstatspop/files_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include <stdio.h>
#include <stdbool.h>
#include <string.h>

#include <sys/stat.h>
#ifdef __cplusplus
extern "C"
{
Expand All @@ -25,6 +25,13 @@ extern "C"
char *get_extension(const char *filepath);
bool ends_with(const char *str, const char *suffix);

/**
 * @brief Check if a file exists
 * @details The check is performed with stat(): any path on which stat()
 *          succeeds is reported as existing (directories included), and any
 *          stat() failure is reported as not existing.
 * @param filename The path to the file to check
 * @return 1 if the file exists, 0 otherwise
 */
int file_exists(const char *filename);

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 1 addition & 1 deletion mstatspop/get_tfadata.c
Original file line number Diff line number Diff line change
Expand Up @@ -1215,7 +1215,7 @@ int read_weights_positions_file(
long int end_site = init_site + *window_size - 1;

sprintf(region, "%s:%ld-%ld", chr_name, init_site, end_site);
hts_itr_t *iter = tbx_itr_querys(wtfasta->tbx, region);
hts_itr_t *iter = tbx_itr_querys(wtfasta->tbx, region);
if (iter == NULL)
{
// it is possible that the region is not found in the index
Expand Down
Loading

0 comments on commit c97ad9e

Please sign in to comment.