-
Notifications
You must be signed in to change notification settings - Fork 0
/
MM_Preprocessing.R
30 lines (23 loc) · 1.36 KB
/
MM_Preprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#' Load a marked-sequence CSV file and strip navigation/boundary symbols
#'
#' Reads a semicolon-separated CSV file that has a header row and removes
#' every row whose `sample` value is one of "Navigate Back to HubPage",
#' "Navigate to HubPage", "Start" or "End". Rows whose `sample` is `NA` are
#' kept. Two progress messages are printed to standard output.
#'
#' @param input_dataset File path (or connection) handed to `read.csv()`.
#'   The file must contain a `sample` column.
#' @return An unnamed list of length two:
#'   \enumerate{
#'     \item the filtered data frame (original row names are preserved);
#'     \item a 2 x 1 list-matrix with row names "sample" and "SequenceID"
#'       whose cells are filled, in order, from the columns of the filtered
#'       data frame.
#'   }
load_marked_sequences_mm <- function(input_dataset) {
  print(paste("Loading Mairegger's dataset:", input_dataset))
  loaded_dataset <- read.csv(input_dataset, sep = ";", header = TRUE)

  # Everything below keys on the `sample` column; fail loudly instead of
  # silently returning an empty result when it is absent.
  if (!("sample" %in% names(loaded_dataset))) {
    stop("Input dataset has no 'sample' column: ", input_dataset,
         call. = FALSE)
  }

  print("Removing irrelevant symbols ")
  symbols_to_remove <- c(
    "Navigate Back to HubPage", "Navigate to HubPage", "Start", "End"
  )

  # A logical mask is used instead of `-which(...)`: negative indexing with
  # an empty `which()` result selects zero rows, so a symbol that does not
  # occur in the data would otherwise discard the whole dataset.
  keep_row <- !(loaded_dataset$sample %in% symbols_to_remove)
  filtered_marked_dataset <- loaded_dataset[keep_row, , drop = FALSE]

  # Wrap the columns in a 2 x 1 list-matrix. The cells are taken from the
  # data frame's columns by position, so the row labels below are only
  # accurate if the columns are exactly `sample` then `SequenceID`.
  # NOTE(review): confirm the input files always have that layout.
  matrix_sequences_marked <- matrix(filtered_marked_dataset, nrow = 2, ncol = 1)
  rownames(matrix_sequences_marked) <- c("sample", "SequenceID")

  list(filtered_marked_dataset, matrix_sequences_marked)
}