-
Notifications
You must be signed in to change notification settings - Fork 14
/
spellcheck_readme.txt
225 lines (212 loc) · 7.21 KB
/
spellcheck_readme.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# to check spellings, please git clone both IdleLands Maps & Custom-Assets
# if you've cloned IdleLands/IdleLands, and run npm install in the root, it will auto load both repos
# into an assets subdir, under the IdleLands root
#
# before you start, put ok_words & ok_fragments into the assets dir (they're in /Maps/test/)
#
# CHECKING THE SPELLING OF CONTENT
#
# 1. ok_fragments and ok_words must be in the assets dir (or at least, the dir above Maps or
# Custom-Assets repo root). If you move them elsewhere, it's up to you to adjust the path in
# the commands below.
# 2. These commands should be run on Linux ONLY (sorry, spell won't work on mac. dunno about win)
# 3. You must have the 'spell' app installed (apt-get install spell, on debian)
# 4. Run these from the repo root (ie assets/maps and assets/content)
# 5. These commands will only work for Custom-Assets/content (which are txt).
# There is a separate set of commands for Maps (content, which are .json files)
# 1. to find mis-spellings
#    (concatenate every .txt file under the current dir, turn punctuation into spaces and
#     split into one word per line, drop blank lines and whitelisted words/fragments,
#     spell-check what's left, then print each unique mis-spelling with a count,
#     least common first — full walkthrough in section 1 below)
find . -type f -name '*.txt' -exec cat {} \; | tr ",.{}:\"[]$%=" " " | tr " \t" "\n" | grep -v -e "^$" | grep -vEf ../ok_fragments | grep -viwf ../ok_words | spell | sort | uniq -ic | sort -n
# 2. to dump them all out
#    (same pipeline as command 1, but uniq -i without the count, and the unique
#     mis-spellings are redirected into the file check_words — overwriting it if present.
#     check_words is then consumed by command 3)
find . -type f -name '*.txt' -exec cat {} \; | tr ",.{}:\"[]$%=" " " | tr " \t" "\n" | grep -v -e "^$" | grep -vEf ../ok_fragments | grep -viwf ../ok_words | spell | sort | uniq -i > check_words
# 3. to display specifics
#    (for each word in check_words: grep it recursively, case-insensitive, whole-word,
#     with line numbers and colour, so each mis-spelling is shown in context; then drop
#     hits from check_words itself and keep only lines mentioning .txt)
# NOTE(review): the trailing grep '.txt' uses an unescaped dot (a regex wildcard), and
#     grep -v check_words excludes any line containing that name anywhere — both are
#     "good enough" filters here, but '\.txt' would be stricter
xargs -a check_words -d "\n" -I{} grep --color=always -irwn {} * | grep -v check_words | grep '.txt'
# MISC OTHER NOT EXPLAINED (just derivatives of above)
# how many mis-spellings
#    (identical pipeline to command 2, but pipes the unique mis-spellings into wc -l
#     to count them instead of saving them to a file)
find . -type f -name '*.txt' -exec cat {} \; | tr ",.{}:\"[]$%=" " " | tr " \t" "\n" | grep -v -e "^$" | grep -vEf ../ok_fragments | grep -viwf ../ok_words | spell | sort | uniq -i | wc -l
# check a specific word
#    (replace BOTH occurrences of foobar with the word you want to check: the first grep
#     finds candidate lines, tr -d strips the same punctuation set used above so words
#     stand alone, and the final grep shows just the matching words)
grep -ir foobar * | tr -d ",.{}:\"[]$%=" | tr " \t" "\n" | grep -i foobar
# EXPLANATIONS (Long. If you know Linux CLI, you can ignore the below)
#
#==========================================================
# 1. DISPLAY ALL MIS-SPELLINGS
#
#
# find . -type f -name '*.txt' -exec cat {} \; | tr ",.{}:\"[]$%=" " " | tr " \t" "\n" | grep -v -e "^$" |
# grep -vEf ../ok_fragments | grep -viwf ../ok_words | spell | sort | uniq -ic | sort -n
#
#
# a) find . -type f -name '*.txt' -exec cat {} \;
#
# find
# finds all files in the tree
# .
# starting in the current dir
# -type f
# files only (not directories)
# -name '*.txt'
# named *.txt
# -exec
# for each file, execute the following command
# cat {} \;
# print the file out
# {} is replaced with each file path
# \; just means end of the find command
#
#
# b) tr ",.{}:\"[]$%=" " "
#
# tr
# translates each character of the first string into the corresponding character of the second (basically, a per-character search and replace)
# ",.{}:\"[]$%="
# replace those chars (note the escaped " char)...
# " "
# ... with a space
#
# NOTES: this works on everything passed through from the previous command. ie the contents of all
# the .txt files under the current dir. Thus, it strips out all
# those punctuation marks and replaces them with spaces. In short: this is what breaks up words for
# us, so we can spell check them
#
#
# c) tr " \t" "\n"
#
# translates all the tabs & spaces into newlines. So now we have one word on each line
#
#
# d) grep -v -e "^$"
#
# grep -v
# pass through everything that -doesn't- match this
#
# -e
# treat the next argument as the search pattern (grep patterns are regexes by default;
# -e is mainly useful when a pattern could begin with a dash)
#
# "^$"
# ^ means start of string. $ means end of string. ie, all empty lines
#
# NOTES: this grep just strips out any blank lines (no point in spell checking them, right?)
#
#
# d) grep -vEf ../ok_fragments
#
# grep -v
# pass through everything that -doesn't- match this
#
# -E
# use extended regex (basic regex — grep's default — would likely work here too, since
# the fragment patterns only need simple ^ and $ anchors)
# -f
# do the grep based on the contents of the file passed in (ok_fragments)
#
# ok_fragments
# this includes regex matches that we don't need to spellcheck
#
# NOTES: because we're using regex matches, we can do things like ^gid - which means any word
# starting with gid we'll ignore. Thus gid1, gid2, gid3 etc. We can also do things like Percent$, which
# means we'll ignore dexPercent, agiPercent etc. You get the idea
#
#
# e) grep -viwf ../ok_words
#
# grep -v
# pass through everything that -doesn't- match this
# -i
# case insensitive match
#
# -w
# match whole words only. So Frank, but not the incorrect Frankenstien
#
# -f
# do the grep based on the contents of the file passed in (ok_words)
#
# ok_words
# the list of all the checked & acceptable non-English-but-ok-in-idle-lands words. eg yarrr
#
#
# f) spell
#
# NOTES: spell checks each line (one word per line). This app only outputs mis-spellings
#
# g) sort
#
# NOTES: sorts the output alphabetically. We do this so we can get only unique instances (using uniq), which doesn't work unless they're ordered
#
# h) uniq -ic
#
# -i
# case insensitive
# -c
# output a count (this will be first on the line. eg 27 hellp)
#
# i) sort -n
#
# NOTES: sorts the output numerically. Since the first thing on the line is the count, this means we
# will output the mis-spellings from least to most commonly occurring (ending with the most common)
#
#
#==========================================================
# 2. DUMP OUT MIS-SPELLINGS TO FILE check_words
#
#
# find . -type f -name '*.txt' -exec cat {} \; | tr ",.{}:\"[]$%=" " " | tr " \t" "\n" | grep -v -e "^$" |
# grep -vEf ../ok_fragments | grep -viwf ../ok_words | spell | sort | uniq -i > check_words
#
#
# same as before, up to g)
# h) uniq -i
#
# -i
# case insensitive
#
# i) > check_words
# redirects all the previous output (ie, all mis-spelled words) into a file called check_words
# creates the file if it doesn't exist. Overwrites it if it does
#
#
#==========================================================
# 3. DISPLAY SPECIFIC INSTANCES OF MIS-SPELLED WORDS
#
#
# xargs -a check_words -d "\n" -I{} grep --color=always -irwn {} * | grep -v check_words | grep '.txt'
#
#
# Once commands 1 and 2 have produced check_words (the list of remaining mis-spellings),
# this displays each occurrence of those words in context
#
# a) xargs
# Simply, xargs constructs lists of arguments & executes apps on those arguments
#
# b) -a check_words
# passes the check_words file into xargs
#
# c) -d "\n"
# use newline as a separator (one word per line, remember)
# without this, xargs will puke on any names with apostrophes in them
#
# d) -I{}
# normally, xargs will put arguments at the end, but we want to put them in the middle
# so, later on, where you see {}, that's where all the words from check_words have been put
#
# e) grep
# grep is the app that we want to fire all the check_words into
#
# f) --color=always
# highlight the word that we've found (so we can see the context of the mis-spelled word)
#
# g) -i
# case insensitive
#
# h) -r
# recurse into all subdirectories below the current one
#
# i) -w
# match whole words only. So Frank, but not the incorrect Frankenstien
#
# j) -n
# show the line number (so we know WHERE to fix the problem)
#
# k) grep -v check_words
# since we're showing the filename (grep does that) exclude the file check_words
# we could maybe have used some combo of find in here, but this works well enough
#
# l) grep '.txt'
# make sure we're only looking in the .txt files (since this is the content section)
# (note: the dot here is a regex wildcard, so strictly this also matches e.g. "atxt";
# use '\.txt' for an exact match)