-
Notifications
You must be signed in to change notification settings - Fork 108
/
Copy pathbuild.sh
executable file
·80 lines (56 loc) · 2.25 KB
/
build.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/bin/bash
# Build script: computes summary statistics from the sample metadata and then
# runs a per-location sequence analysis. All results are written under $outdir.

# Sane error handling settings:
#   -e           exit as soon as a command fails
#   -u           treat expansion of an unset variable as an error
#   -f           disable pathname expansion (globbing)
#   -o pipefail  a pipeline fails if any of its stages fails
set -euf -o pipefail

# Specify output directory, and name input data
outdir="output"
metadata="data/sfv.csv"
inseqs="data/sfv.fasta"

# Make sure the output directory exists before anything redirects into it;
# a redirection into a missing directory fails and would abort the script.
mkdir -p -- "$outdir"
# Basic metadata stats
# --------------------
# Each step below runs `csvuniq -zc` on one or more columns and writes the
# result to its own CSV file under $outdir. Every expansion is quoted so that
# a path containing whitespace is still passed as a single argument.

# Compute number of sequences per species
seqs_per_species="$outdir/seqs_per_species.csv"
csvuniq -zc species "$metadata" > "$seqs_per_species"

# Compute number of sequences per specimen
seqs_per_specimen="$outdir/seqs_per_specimen.csv"
csvuniq -zc specimen,species,location "$metadata" > "$seqs_per_specimen"

# Use those results to compute number of specimens per species
# (the input here is the per-specimen file written above, not the raw metadata)
specs_per_species="$outdir/specs_per_species.csv"
csvuniq -zc species "$seqs_per_specimen" > "$specs_per_species"

# Also use them to compute number of specimens per species and location
specs_per_species_location="$outdir/specs_per_species_location.csv"
csvuniq -zc species,location "$seqs_per_specimen" > "$specs_per_species_location"
# Sequence analysis
# -----------------

# Where the multiple sequence alignment lives
alignment="${outdir}/alignment.fasta"
# The command that would produce it is currently disabled:
#muscle -maxiters 2 -in $inseqs -out $alignment

# Where the phylogenetic tree lives
tree="${outdir}/tree.nw"
# The command that would produce it is currently disabled:
#FastTree -seed 1234 -nt $alignment > $tree
# Location comparison
# -------------------
# We'd like to compare the viruses from each of these locations, so we're going to split the data for each one
# and run analyses separately, then recombine later.

# Capture the distinct location values, one per line, skipping the first line
# of csvuniq's output (presumably the CSV header row -- confirm).
# Assigning to a plain variable first means a failure anywhere in the pipeline
# is the status of this assignment, so `set -e` still aborts the script.
location_list=$(csvuniq -c location "$metadata" | tail -n +2)

# Get array of locations.
# Read line by line rather than relying on word splitting, so that a location
# containing spaces stays a single array element. Blank lines are skipped.
locations=()
while IFS= read -r loc_line; do
  if [[ -n "$loc_line" ]]; then
    locations+=("$loc_line")
  fi
done <<<"$location_list"

# For each location...
# (the ${arr[@]+...} form expands to nothing when the array is empty, which
# avoids an "unbound variable" error from `set -u` on older bash versions)
for location in ${locations[@]+"${locations[@]}"}; do
  # Create a location outdir, if it doesn't already exist
  loc_outdir="$outdir/$location"
  mkdir -p -- "$loc_outdir"

  # Create a subset of the metadata for just that location
  # NOTE(review): csvgrep -m looks like a substring match, so a location whose
  # name is contained in another location's name would also select those rows
  # -- confirm against the csvgrep documentation.
  loc_metadata="$loc_outdir/metadata.csv"
  csvgrep -c location -m "$location" "$metadata" > "$loc_metadata"

  # Create a list of sequences sampled from that location
  loc_sequences="$loc_outdir/sequences"
  csvcut -c sequence "$loc_metadata" > "$loc_sequences"

  # Subset our alignment to just that location
  # ($alignment must already exist: the muscle command that would create it is
  # commented out earlier in this script)
  loc_alignment="$loc_outdir/alignment.fasta"
  seqmagick convert --include-from-file "$loc_sequences" "$alignment" "$loc_alignment"

  # Build a location tree
  loc_tree="$loc_outdir/tree.nw"
  FastTree -seed 1234 -nt "$loc_alignment" > "$loc_tree"
done
# Do something interesting with the things done for each location
# ...