-
Notifications
You must be signed in to change notification settings - Fork 2
/
cleanLists.R
106 lines (93 loc) · 4.23 KB
/
cleanLists.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
library(seqinr)
library(stringr)
library(dplyr)
# ---- Script-level defaults ----
# Regular expression; any source-file entry matching it is dropped from the lists.
EXCLUSION.PATTERN <- "env"
# Root of the results tree (used as-is as a path prefix, hence the trailing "/").
SEQUENCES.DIR <- "/fast/bakeoff_merged_analysis_sequences_unfiltered/results/"
#SEQUENCES.DIR <- "/fast/bakeoff_merged_analysis_sequences_results/results/";
#SEQUENCES.DIR <- "/fast/bakeoff_merged_analysis_sequences_results_2019/results/";
# Subdirectory of SEQUENCES.DIR that holds the per-region / per-time directories.
RESULTS.DIRNAME <- "raw_fixed"
# Alias used as the default value of the functions' own SEQUENCES.DIR argument:
# a default of SEQUENCES.DIR = SEQUENCES.DIR would refer to itself and raise
# "promise already under evaluation".
THE.SEQUENCES.DIR <- SEQUENCES.DIR
## NOTE: if use.processed.lists is TRUE but the directory contains NO PROCESSED LISTS, the unprocessed lists are cleaned instead, and symlinks to them are created with a corresponding "processed_" prefix.
## Run cleanLists.in.dir once for every combination of region and time.
##
## Combinations are visited region by region, and within a region in the order
## given by `times`.  The remaining arguments are forwarded unchanged to
## cleanLists.in.dir.
cleanLists <- function (
    exclusion.pattern = EXCLUSION.PATTERN,
    SEQUENCES.DIR = THE.SEQUENCES.DIR,
    results.dirname = RESULTS.DIRNAME,
    use.processed.lists = TRUE,
    regions = c( "nflg", "v3", "rv217_v3" ),
    times = c( "1m", "6m", "1m6m" )
)
{
    ## Flatten the region x time grid into two parallel vectors, with the time
    ## varying fastest so the visiting order matches a region-outer loop.
    region.per.run <- rep( regions, each = length( times ) );
    time.per.run <- rep( times, length( regions ) );

    for( .run in seq_along( region.per.run ) ) {
        cleanLists.in.dir(
            region.per.run[ .run ],
            time.per.run[ .run ],
            exclusion.pattern = exclusion.pattern,
            SEQUENCES.DIR = SEQUENCES.DIR,
            results.dirname = results.dirname,
            use.processed.lists = use.processed.lists
        );
    } # End foreach ( region, time ) pair
} # cleanLists (..)
## Clean the per-participant list files found in one results directory.
##
## The directory is built as
##   <SEQUENCES.DIR><results.dirname>/<the.region>/<the.time>/
## (SEQUENCES.DIR is pasted in as-is, so it must end in "/").  Each list file
## names one source file per line.  Entries that match exclusion.pattern, or
## that do not exist on disk, are dropped; a list file is rewritten in place
## only when at least one of its entries was dropped.
##
## Arguments:
##   the.region, the.time  path components selecting the directory.
##   exclusion.pattern     regular expression; matching entries are dropped.
##   SEQUENCES.DIR         root path prefix (with trailing "/").
##   results.dirname       subdirectory of SEQUENCES.DIR.
##   use.processed.lists   if TRUE, work on "processed_<id>.list" files; when
##                         none exist, symlinks with that prefix are created
##                         to the "<id>.list" files and used instead.  If
##                         FALSE, work on the "<id>.list" files directly.
##
## Returns NULL invisibly.  A missing directory is skipped silently.  Stops if
## the directory exists but holds no list files, or if cleaning would leave a
## list with no entries at all.
cleanLists.in.dir <- function (
    the.region,
    the.time,
    exclusion.pattern = EXCLUSION.PATTERN,
    SEQUENCES.DIR = THE.SEQUENCES.DIR,
    results.dirname = RESULTS.DIRNAME,
    use.processed.lists = TRUE
)
{
    the.dir <-
        paste0( SEQUENCES.DIR, results.dirname, "/", the.region, "/", the.time, "/" );
    if( !file.exists( the.dir ) ) {
        ## NB: a bare `return;` (no parentheses) only evaluates the `return`
        ## function object and does NOT leave the function, so the call form
        ## is required here to actually skip a missing directory.
        return( invisible( NULL ) );
    }
    print("-------------------------")
    print( the.region )
    print( the.time )

    # List files, one per participant, each contains the source fasta files for that participant, one per line.
    processed.pattern <- "^processed_[0-9]+\\.list$";
    unprocessed.pattern <- "^[0-9]+\\.list$";
    if( use.processed.lists ) {
        list.files <- dir( the.dir, pattern = processed.pattern );
        if( length( list.files ) == 0 ) {
            warning( paste( "There are no processed lists in directory", the.dir, "so we are cleaning the unprocessed lists and creating symlinks to them as processed lists.", sep = " " ) );
            for( .file in dir( the.dir, pattern = unprocessed.pattern ) ) {
                .link <- paste0( the.dir, "processed_", .file );
                print( paste( "Creating symbolic link for", .link ) );
                file.symlink( paste0( the.dir, .file ), .link );
            }
            ## Re-scan so that only links that now exist are processed.  The
            ## lists are later rewritten via these link paths (presumably
            ## updating the underlying unprocessed list -- OS dependent).
            list.files <- dir( the.dir, pattern = processed.pattern );
        }
    } else {
        list.files <- dir( the.dir, pattern = unprocessed.pattern );
    }
    stopifnot( length( list.files ) > 0 );
    names( list.files ) <- gsub( "^(processed_)?([0-9]+)\\.list$", "\\2", list.files );

    for( .ppt in names( list.files ) ) {
        print(.ppt)
        list.path <- paste0( the.dir, list.files[ .ppt ] );
        source.files.for.ppt <-
            read.delim( list.path, header = FALSE, sep = "\t", stringsAsFactors = FALSE )[, 1];

        ## Classify every entry at once.  The exclusion pattern takes
        ## precedence: existence is only reported for entries that did not
        ## already match the pattern.
        matches.exclusion <- grepl( exclusion.pattern, source.files.for.ppt );
        is.missing <- !matches.exclusion & !file.exists( source.files.for.ppt );
        is.dropped <- matches.exclusion | is.missing;

        ## Report each dropped entry, in list order.
        drop.reason <-
            ifelse( matches.exclusion, "because it matches the exclusion pattern.", "because FILE NOT FOUND." );
        for( .i in which( is.dropped ) ) {
            cat( paste( "Excluding", source.files.for.ppt[ .i ], drop.reason[ .i ] ), fill = TRUE );
        }

        ## Only touch the list file when something was actually removed.
        if( any( is.dropped ) ) {
            filtered.source.files <- source.files.for.ppt[ !is.dropped ];
            stopifnot( length( filtered.source.files ) > 0 );
            writeLines( as.character( filtered.source.files ), list.path );
        }
    } # End foreach .ppt
    invisible( NULL );
} # cleanLists.in.dir (..)
## Here is where the action is: sourcing this file immediately runs
## cleanLists() with all of its default arguments.
cleanLists()