-
Notifications
You must be signed in to change notification settings - Fork 1
/
Markov_Chain_TEXT.R
89 lines (65 loc) · 2.86 KB
/
Markov_Chain_TEXT.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#-----------------------------------#
# -Markov Chain Text generator- #
# created by: Ethan Edens #
# Date: 8/4/2017 #
#-----------------------------------#
# NOTE: the workspace is deliberately NOT cleared here. `rm(list = ls())`
# deletes every object in the caller's global environment when this file is
# source()d, yet it does not detach packages or reset options, so it provides
# no real isolation. Run the script in a fresh R session instead; every
# variable used below is assigned before it is read.
# Library
library(readtext)
require(stringi)
require(random)
# Read the source text and normalise it so that splitting on a single space
# later yields clean word tokens: punctuation is removed, every run of
# whitespace (newlines, tabs, repeated spaces) becomes one space, and leading /
# trailing whitespace is trimmed.
# NOTE: the path below is specific to the original author's machine.
TEXT <- readtext("C:\\Users\\Ethan Edens\\Documents\\bookofmormon.txt")
# readtext() returns a data frame with `doc_id` and `text` columns; keep only
# the document text so the doc_id is not carried into the cleaning steps.
TEXT <- paste(TEXT$text, collapse = " ")
TEXT <- gsub("[[:punct:]]", "", TEXT)
# Collapse whitespace AFTER removing punctuation, because deleting a
# free-standing punctuation mark leaves two adjacent spaces behind.
TEXT <- gsub("[[:space:]]+", " ", TEXT)
TEXT <- trimws(TEXT)
# Split the cleaned text into a character vector of words, in the order they
# appear.
WORDS <- unlist(strsplit(TEXT, split = " ", fixed = TRUE))
# Drop empty tokens and tokens made only of digits.
# A logical mask is used instead of negative indexing on purpose: when nothing
# matches, `WORDS[-integer(0)]` selects zero elements and would silently throw
# away every word, whereas an all-TRUE mask keeps the vector intact.
WORDS <- WORDS[nzchar(WORDS) & !grepl("^[0-9]+$", WORDS)]
# Vocabulary: every distinct word exactly once, in order of first appearance.
# It is used later to translate a word back into its position in WORDS_VECTORS.
WORDS_VECTORS_DUMMY_VARIABLE <- unique(WORDS)
# One list element per vocabulary word, each starting out as just the word
# itself; the words that follow it in the text are added to each element later.
WORDS_VECTORS <- as.list(WORDS_VECTORS_DUMMY_VARIABLE)
# For every vocabulary word, collect the words that immediately follow it in
# the text. Repeats are kept, so a successor that occurs more often is
# proportionally more likely to be drawn later.
# Afterwards each WORDS_VECTORS element is c(word, follower_1, follower_2, ...).
#
# Each word is paired with its successor in a single pass. The pairing stops at
# the second-to-last word, so the final word of the text does not pick up a
# spurious NA "follower" from indexing one past the end of WORDS.
PREVIOUS_WORDS <- factor(head(WORDS, -1), levels = WORDS_VECTORS_DUMMY_VARIABLE)
# split() keeps one group per factor level (empty if the word is never followed
# by anything) and returns the groups in vocabulary order.
FOLLOWERS_BY_WORD <- split(tail(WORDS, -1), PREVIOUS_WORDS)
WORDS_VECTORS <- unname(Map(c, WORDS_VECTORS_DUMMY_VARIABLE, FOLLOWERS_BY_WORD))
#---------------------------#
# The Markov Chain Function #
#---------------------------#
# Generates one random sentence by walking the word chain.
#
# Arguments:
#   CHAIN      - list with one element per vocabulary word; each element is a
#                character vector c(word, follower_1, follower_2, ...).
#                Defaults to the global WORDS_VECTORS.
#   VOCABULARY - character vector of the distinct words, in the same order as
#                CHAIN. Defaults to the global WORDS_VECTORS_DUMMY_VARIABLE.
#
# Returns:
#   A single string of space-separated words. The target length is drawn
#   uniformly from 15 to 20 words; the sentence is shorter only when the walk
#   reaches a word that has no recorded follower.
MARKOV <- function(CHAIN = WORDS_VECTORS,
                   VOCABULARY = WORDS_VECTORS_DUMMY_VARIABLE) {
  if (length(CHAIN) == 0) {
    stop("MARKOV: the word chain is empty, nothing to generate.", call. = FALSE)
  }

  # Random target length of the sentence.
  LENGTH_OF_SENTENCE <- sample(15:20, 1)

  # Preallocate the sentence and start from a randomly chosen vocabulary word.
  SENTENCE_DATA <- character(LENGTH_OF_SENTENCE)
  CURRENT_WORD <- CHAIN[[sample.int(length(CHAIN), 1)]][1]
  SENTENCE_DATA[1] <- CURRENT_WORD
  WORDS_PLACED <- 1

  # Repeatedly draw one of the words that followed the current word in the
  # text, then make the drawn word the current word.
  while (WORDS_PLACED < LENGTH_OF_SENTENCE) {
    CURRENT_INDEX <- match(CURRENT_WORD, VOCABULARY)
    if (is.na(CURRENT_INDEX)) {
      break
    }

    # Element 1 is the word itself; everything after it is a follower. NA
    # entries are discarded so they can never be emitted as a word.
    FOLLOWERS <- CHAIN[[CURRENT_INDEX]][-1]
    FOLLOWERS <- FOLLOWERS[!is.na(FOLLOWERS)]
    if (length(FOLLOWERS) == 0) {
      # Dead end: nothing ever follows this word, so end the sentence here.
      break
    }

    # Draw a position with sample.int() rather than sample(2:n, 1): when n is 2
    # the latter collapses to sample(2, 1), which draws from 1:2 and could
    # return the current word instead of its only follower.
    CURRENT_WORD <- FOLLOWERS[sample.int(length(FOLLOWERS), 1)]
    WORDS_PLACED <- WORDS_PLACED + 1
    SENTENCE_DATA[WORDS_PLACED] <- CURRENT_WORD
  }

  paste(SENTENCE_DATA[seq_len(WORDS_PLACED)], collapse = " ")
}
# Generate one sentence from the word chain built above.
MARKOV()