From d901750fa567b72fdce7656a11aca29cc6079d9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karatu=C4=9F=20Ozan=20Bircan?=
Date: Thu, 25 Jul 2024 17:16:56 +0100
Subject: [PATCH] feat: Split .tsv.gz files into two

---
 misc/split_files.sh | 70 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 misc/split_files.sh

diff --git a/misc/split_files.sh b/misc/split_files.sh
new file mode 100644
index 0000000..cd7579c
--- /dev/null
+++ b/misc/split_files.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+#SBATCH --job-name=split
+#SBATCH --time=01:00:00
+#SBATCH --mem=4G
+#SBATCH --cpus-per-task=1
+#SBATCH --partition=datamover
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=karatugo@ebi.ac.uk
+
+# Split a gzip-compressed TSV into two gzip files of roughly equal size.
+# Both outputs start with the input's header line; with n data rows the
+# first output receives floor(n / 2) of them and the second the rest.
+#
+# Usage: sbatch split_files.sh INPUT.tsv.gz OUTPUT_1.tsv.gz OUTPUT_2.tsv.gz
+# The paths can instead be hard-coded as the fallback values below.
+
+set -euo pipefail
+
+# Print an error message to stderr and abort the job.
+die() {
+  printf 'split_files: %s\n' "$*" >&2
+  exit 1
+}
+
+INPUT_FILE="${1:-}"
+OUTPUT_FILE_1="${2:-}"
+OUTPUT_FILE_2="${3:-}"
+
+# Fail fast instead of running zcat/gzip on empty or clashing paths.
+[[ -n "$INPUT_FILE" && -n "$OUTPUT_FILE_1" && -n "$OUTPUT_FILE_2" ]] \
+  || die "usage: $0 INPUT.tsv.gz OUTPUT_1.tsv.gz OUTPUT_2.tsv.gz"
+[[ -r "$INPUT_FILE" ]] || die "cannot read input file: $INPUT_FILE"
+[[ "$OUTPUT_FILE_1" != "$OUTPUT_FILE_2" ]] \
+  || die "the two output paths must differ"
+[[ "$INPUT_FILE" != "$OUTPUT_FILE_1" && "$INPUT_FILE" != "$OUTPUT_FILE_2" ]] \
+  || die "an output path must not be the input file"
+
+# Count all lines, header included. awk reads the stream to the end, so
+# with pipefail a corrupt archive fails this pipeline and stops the job.
+TOTAL_LINES=$(zcat -- "$INPUT_FILE" | LC_ALL=C awk 'END { print NR }')
+(( TOTAL_LINES > 0 )) || die "input file is empty: $INPUT_FILE"
+
+# Number of data rows (header excluded) written to the first part.
+MIDPOINT=$(( (TOTAL_LINES - 1) / 2 ))
+
+# Split in a single decompression pass. awk sends the header to both
+# gzip writers, lines 2..MIDPOINT+1 to the first, the rest to the second.
+# The output paths travel through the environment and are expanded by
+# the shell that awk spawns, so unusual characters in them stay intact.
+zcat -- "$INPUT_FILE" \
+  | LC_ALL=C PART_1="$OUTPUT_FILE_1" PART_2="$OUTPUT_FILE_2" \
+    awk -v last="$((MIDPOINT + 1))" '
+      BEGIN {
+        first = "gzip > \"$PART_1\""
+        second = "gzip > \"$PART_2\""
+      }
+      NR == 1    { print | first; print | second; next }
+      NR <= last { print | first; next }
+                 { print | second }
+      END {
+        # close() waits for each gzip; a non-zero status means it failed.
+        first_status = close(first)
+        second_status = close(second)
+        if (first_status != 0 || second_status != 0) exit 1
+      }
+    '
+
+echo "File has been split into two parts:"
+echo "First part: $OUTPUT_FILE_1"
+echo "Second part: $OUTPUT_FILE_2"