-
Notifications
You must be signed in to change notification settings - Fork 0
/
strings_stringr_regex.R
221 lines (165 loc) Β· 6.38 KB
/
strings_stringr_regex.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
################################### Strings ####################################
library(tidyverse)
astring <- "This is a string"
length(astring) #gives length of vector
str_length(astring) #gives length of string
"The president's approval rating is low."
# for the above, you need to use "" on the outside because of the apostrophe
# Escaping - tell R to use the literal character using \
"this is a quote mark (\")"
# combine strings
paste('Hello', 'my name is', 'Annie') # puts in white space for you
paste0('Hello', 'my name is', 'Annie') # does not put in white space for you
########################## stringr Package Functions ###########################
# every stringr function starts with str_, and string is always the first arg
my_string = "welcome to stringr!"
str_length(my_string)
str_to_upper(my_string)
str_to_lower(my_string)
str_to_title(my_string)
# stringr functions work on vectors
my_string_vector = c('hello world', 'my name is Annie')
str_to_title(my_string_vector)
# extract something from the text
welc = str_extract(my_string, "welc")
welc
some_string = 'hello, my, name, is, Annie'
str_extract(some_string, ',') # returns a vector of just one item
str_extract_all(some_string, ',') # returns a list of all items - can always use unlist()
# remove elements from the string
str_remove(some_string, ',')
str_remove_all(some_string, ',')
# these function aren't really changing the original object
# so, you need to assign to a new variable
# detect a pattern in a string
some_string = 'hello, my, name, is, Annie'
str_detect(some_string, "Annie")
str_detect(some_string, "annie")
str_detect(some_string, "Robert")
str_detect(my_string_vector, "world") # gives a true/false value for each string
# detect and subset strings with a pattern
str_subset(my_string_vector, 'world') # gives back the strings that are TRUE
# count the number of occurences in a string
some_string = 'hello, my, name, is, Annie'
hello_world = 'Hello, world!'
str_count(c(some_string, hello_world), ',')
# split a string by a pattern (and optionally unlist)
some_string = 'hello, my, name, is, Annie'
str_split(some_string, ',')
str_split(some_string, 'm')
unlist(str_split(some_string, ','))
# stripping white space
x = " my name is annie "
str_trim(x, 'left')
str_trim(x, 'right')
str_trim(x, 'both')
# replace something in a string
str_replace(x, 'annie', "Annie" )
str_replace(some_string, ',', "-----" )
str_replace_all(some_string, ',', "-----" )
############################## Refactoring and Pipes ###########################
# see video 6.4 for details on this section
shopping_list <- paste0(
' steak, steak sauce, salt, pepepr, water, mashed potatoes, string beans,',
'icecream, lettuce, tomatoes '
)
# we can use pipes to manipulate strings more effectively
new_shopping_list <- str_trim(shopping_list, 'both') %>%
str_to_upper(.) # you can add the data as a . or not
# you can chain together pipes
# remember , pipes always pass the innermost function out
new_shopping_list <- str_trim(shopping_list, 'both') %>%
str_to_upper(.) %>%
str_split(., ',') %>%
unlist(.) %>%
str_trim(., 'left') %>%
str_to_title(.)
########################### Regular Expressions (RegEx) ########################
# a RegEx is a pattern that exists in a string that you might want to locate
contacts = "Annie - phone: 410-818-7152
Kyle - phone: 847-899-2690
Mom - phone: 410-818-9939
Dad - phone: 443-677-9614
Trevor - phone: 410-818-99345"
cat(contacts)
# extracting the phone numbers - they follow the same pattern
# ddd-ddd-dddd
pattern = '\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d'
str_extract_all(contacts, pattern) %>% unlist(.)
# but the above picks up Trevor's phone number which is wrong and we don't want
# find something that isn't a number
pattern = '\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d\\D' # uppercase D requests not a number
str_extract_all(contacts, pattern) %>% unlist(.)
# find multiple numbers using {}
pattern = '\\d{3}-\\d{3}-\\d{4}'
str_extract_all(contacts, pattern) %>% unlist(.)
# find between a range of numbers
pattern = '\\d{3}-\\d{3}-\\d{4,5}' #no spaces
str_extract_all(contacts, pattern) %>% unlist(.)
# find word characters (letters, numbers, _)
pattern = '\\w{3,7}'
str_extract(contacts, pattern)
# not a word character
pattern = '\\W'
str_extract(contacts, pattern)
# white space
pattern = '\\s'
str_extract(contacts, pattern)
# not white space
string = ' Hello'
pattern = '\\S'
str_extract(string, pattern)
# lowercase letters
string = 'Hello'
pattern = '[[:lower:]]{1,}'
str_extract(string, pattern)
# uppercase letters
string = 'heLlo'
pattern = '[[:upper:]]{1,}'
str_extract(string, pattern)
# any letter in the alphabet
string = "Annie Britton - phone: 410-818-7152"
pattern = '[[:alpha:]]{1,}\\s[[:alpha:]]{1,}'
str_extract(string, pattern)
# any punctuation
string = "Annie!! - #410-818-7152"
pattern = '[[:punct:]]{1,}'
str_extract(string, pattern)
############### RegEx quantifiers and other special symbols ####################
# the anything character (.) and the '0 or more times' character (*)
string = "today's date is 10/11/2022 and it is 4pm"
pattern = '\\d.*22'
str_extract(string, pattern)
# the one or more times symbol (+)
string = 'James Bond is 007'
pattern = '\\d+'
str_extract(string, pattern)
# the optional character (?)
string = "the president's approval rating is low"
string_2 = "the presidents approval rating is low"
pattern = "president'?s" # saying that the (') is optional
str_extract(string, pattern)
str_extract(string_2, pattern)
# the start of a string (^)
string = "Annie is my name"
string_2 = "my name is Annie"
pattern = "^Annie"
str_extract(string_2, pattern)
# end of a string ($)
string = "Annie is my name"
string_2 = "my name is Annie"
pattern = "Annie$"
str_extract(string, pattern)
str_extract(string_2, pattern)
# using brackets
string = "My SSN is 123457890"
string_2 = "My SSN is 123-45-7890"
pattern = "[0-9-]+"
str_extract(string, pattern)
str_extract(string_2, pattern)
# escaping special characters
string = "use the carat symbol (^) for exponents in R"
pattern = '\\S^\\S' #this won't work because it is taking the special meaning of ^
pattern_2 = '\\S\\^\\S' #add in \\ before
str_extract(string, pattern)
str_extract(string, pattern_2)