-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathargs.py
executable file
·400 lines (350 loc) · 14.3 KB
/
args.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
#!/usr/bin/python3
import os
import sys
import platform
from letters import eprint
from sd.easy_args import ArgMaster
LANGCODES = {'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali', 'br': 'Breton', 'ca': 'Catalan', 'cs': 'Czech', 'da': 'Danish', 'de': 'German', 'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish', 'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French', 'gl': 'Galician', 'he': 'Hebrew', 'hi': 'Hindi', 'hu': 'Hungarian', 'hy': 'Armenian', 'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese', 'ka': 'Georgian', 'kk': 'Kazakh', 'ko': 'Korean', 'lv': 'Latvian', 'mk': 'Macedonian', 'ml': 'Malayalam', 'nl': 'Dutch', 'no': 'Norwegian', 'pl': 'Polish', 'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'sh': 'Serbo-Croatian', 'si': 'Sinhalese', 'sk': 'Slovak', 'sl': 'Slovenian', 'sq': 'Albanian', 'sv': 'Swedish', 'ta': 'Tamil', 'te': 'Telugu', 'th': 'Thai', 'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian', 'ur': 'Urdu', 'vi': 'Vietnamese', 'zh': 'Chinese'} # pylint: disable=line-too-long
def parse_args():
"Parse arguments"
positionals = [\
["filename", '', str],
'''.txt, .csv or kindle My Clippings.txt file to read list of words from.
Put a # before any lines you want to ignore.
Right now only single words are supported.
If multiple words are given on a line only the first word will be processed unless the file is My Clippings.txt'''
]
language = [\
['lang', '', list, ('en', 'English')],
'''Language code or name.
For example: --lang es or --lang Spanish will show you Spanish words.
For more control, you can type the full set of language code + name like this:
--lang <2 digit code> <language name>
If you are typing the full set, then the language code and name must EXACTLY match the words used in Wiktionary or it won't detect the entries. For example, --lang sk Slovak will give you the Slovak language, but --lang sk Slovakian won't find anything. Go to https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes to look up your language code and name.
''',
]
frequencies = [\
['book', '', str],
'''Location of a book file to use for additional frequency information.
Must be in .txt format, NOT an e-reader format like .mobi - You can convert books to .txt format using the open-source Calibre program.
The more a word appears in the boot, the more it will receive a boost in the adjusted frequency rankings.''',
['freq', '', str, ''],
'''Change the location of the default frequency list. (Must match language with --lang)
This is useful for specifying the Taiwanese or Brazilian version of the language.
''',
['wikifreq', '', bool, False],
'''Use frequency lists gathered from Wikipedia instead of subtitles. This is useful if you want to focus more on the written word instead of the spoken word. However, Wikipedia covers academic topics more than you may see in real life depending on what books you read. For example: yacimiento (Mineral Deposit) comes in at 0.87 fpm, but in Wikipedia it's at a staggering 34 fpm.''',
]
input_list = [\
['rankbook', '', str],
"Rank all the words in a book by number of occurences and analyze them. Must be in .txt format.",
['ignore', '', str],
"Ignore words on this list.\nEach word on the ignore list cancels out one word on the word list.",
"Debugging mode.",
['wikiroots', '', bool, False],
"Why supply your own word list when you can rank every single root word in the dictionary?",
['wikiwords', '', bool, False],
"Show every word in the dictionary.\n(This will take awhile unless you combine it with --csv)",
['skiplines', '', int, 0],
"Skip lines at start of words list. (Helpful if you want to resume a session later)",
['multiline', '', str, 'autodetect'],
'''
True = Read one word per line. (A vertical list)
False = Read multiple words per line (A book)
'''
]
display = [\
['csv', '', str],
"Output data as csv filename instead of displaying. Example: --csv output.csv",
['min', '', float, 0],
"Minimum total fpm to show a word candidate (not adjusted)",
['max', '', float, 0],
"Maximum total fpm to show a word. (useful for filtering out super common words like: the)",
['length', '', int, 2],
"Skip words under given length.",
['stars', '', bool, False],
"Include abnormally high star words in the total when sorting or displaying",
['starval', '', int, 5],
"Ratio of a conjugated word's fpm to the baseline to mark it with a star.",
['noentry', '', bool, False],
'''Don't display the Wiktionary entry for each word (radically increases processing speed)''',
['threshold', '', float, 0.05],
"Lowest fpm to still display a conjugation.\n--threshold 0.05 = (1 in 20 million words)",
# At 250 words per minute for 4 hours a day, it would take around 333 days to come across a word at threshold
# For a 1 fpm word, it would take 66 hours or 16+ days on average
# For a .2 fpm word, it would take 83+ days
# Words I look up in my native language are usually at around 0 to 0.3 fpm
]
sorting = [\
['reverse', '', bool],
"Reverse the word list",
['sort', '', bool],
'''Loosely sorts the words from highest fpm to lowest
applying a formula based on the original word's fpm and it's total derived fpm.
''',
['sortfactor', '', float, 0.8],
'''
A higher number biases the algorithm to sorting towards a word's total fpm.
A lower number biases toward the original (input) word's fpm.
Range: 0-1''',
]
anki = [\
['anki', '', str, 'USER_DID_NOT_INPUT'],
'''Location of your colection.anki2 or .apkg file to check to see if the search word already has a note.
Only searches the first few words of the front card to avoid any example sentences.
If you get a 'Database is Locked' error, then you need to close Anki first or simply export the cards you wish to search to a .apkg file (recommended)
''',
['skipanki', '', bool],
"Skip any cards that are already in anki database instead of just noting them.",
['decks', '', list],
'''Which anki decks to pull cards from.
For example if you have subdecks named words and examples under the deck A
then type --decks A::words A::examples
''',
['ankilimit', '', int, 9],
"Number of words to search in the anki question field.",
]
# debug level 3 will rebuild caches
hidden = [\
['debug', '', int, 0],
]
am = ArgMaster(\
usage='<txt/csv word list>, --options...',
description="Scan Wiktionary to determine a word's root and frequency of conjugations in fpm (Frequency per million words)")
am.update(language, title="\nLanguage:")
am.update(display, title="\nWhich words to display:")
am.update(input_list, title="\nInput word list:")
am.update(sorting, title="\nSorting the list:")
am.update(anki, title="\nConnect with Anki")
am.update(frequencies, title="\nFrequency lists:")
am.update(positionals, title="Positional Arguments", positionals=True, hidden=True)
am.update(hidden, "Used for testing purposes:", hidden=True)
args = am.parse()
# Language codes
args.lang = [arg.lower().strip() for arg in args.lang]
if not args.lang:
print("Please enter a language. For example: --lang English")
sys.exit(1)
if args.lang[0] in ('bs', 'sr', 'hr'):
print('''
At last check, Wiktionary uses the code sh to refer to Serbo-Croatian languages.
If you think this is a mistake, they have discussion pages here:
https://en.wiktionary.org/wiki/Wiktionary:About_Serbo-Croatian
https://en.wikipedia.org/wiki/Talk:Serbo-Croatian
In the meantime, you can try using the following language code:
--lang sh Serbo-Croatian --freq freq/sr.xz
''')
input()
if len(args.lang) == 1:
args.lang = guess_lang(args.lang[0])
if not args.lang:
sys.exit(1)
print("Set language to:", ' '.join(args.lang))
# args.multiline
choice = args.multiline.lower().strip()[:1]
if choice == 'a':
args.multiline = -1
elif choice in 't1':
args.multiline = 1
elif choice in 'f0':
args.multiline = 0
else:
print("Unknown args.multiline parameter")
sys.exit(1)
# Guess anki location
args.anki = guess_anki(args.anki)
# Checking
status = False
if args.filename and not os.path.exists(args.filename):
print("Can't find file named:", args.filename)
elif args.freq and not os.path.exists(args.freq):
print("Can't find file:", args.freq)
elif len(args.lang) != 2:
print("Please enter the language in the format:", '--lang <code> <Name>')
print("For example: --lang es spanish")
elif len(args.lang[0]) != 2:
print("Language codes must be 2 digits long")
elif not 0 <= args.sortfactor <= 1:
print("Sort factor must be between 0 and 1")
elif args.csv and not args.csv.endswith('.csv'):
print("csv output file must end with .csv to prevent accidentally overwriting the wrong file.")
else:
status = True
if not status:
sys.exit(1)
if not args.freq:
# Determine frequency file if not given
if not args.wikifreq:
lang = args.lang[0]
if lang == 'sh':
lang = 'sr'
print("Defaulting to serbian frequency file.")
args.freq = os.path.join('freq', lang + '.xz')
else:
args.freq = os.path.join('wikifreq', args.lang[0] + '.xz')
eprint("Using frequency file:", args.freq)
if not os.path.exists(args.freq):
print("No frequency file found for language:", args.lang[0])
status = False
if not status:
sys.exit(1)
return args
def guess_anki(path):
if path == 'USER_DID_NOT_INPUT':
return None
if path:
return path # user defined location
if platform.system() == 'Windows':
path = os.path.join(os.getenv('APPDATA'), 'Anki2')
elif platform.system() == 'Linux':
path = "~/.local/share/Anki2"
elif platform.system() == 'Darwin':
path = "~/Library/Application Support/Anki2"
else:
print("Unexpected system platform", platform.system())
print("Please enter the .anki2 location manually using --anki <location>")
sys.exit(1)
path = os.path.expanduser(path)
if not os.path.exists(path):
print("Could not find directory:", path)
print("Please enter the .anki2 location manually using --anki <location>")
sys.exit(1)
path = os.path.join(path, 'User 1/collection.anki2')
if not os.path.exists(path):
print("Could not find .anki2 file :", path)
print("Please enter the .anki2 location manually using --anki <location>")
sys.exit(1)
print("Using default anki location:", path)
return path
def guess_lang(code):
"Look at language code or language name and attempt to guess the pair"
langnames = {val: key for key, val in LANGCODES.items()}
# Spelling corrections
spelling = dict(\
Albanain="Albanian",\
Aribic="Arabic",\
Armenain="Armenian",\
Basc="Basque",\
Bengali="Bengali",\
Bretonn="Breton",\
Bulgarain="Bulgarian",\
Catalon="Catalan",\
Chineese="Chinese",\
Czeck="Czech",\
Dansih="Danish",\
Duth="Dutch",\
Englsh="English",\
Espernato="Esperanto",\
Eestonian="Estonian",\
Finsih="Finnish",\
Franch="French",\
Gallacian="Galician",\
Georgain="Georgian",\
Greman="German",\
Greeek="Greek",\
Hebru="Hebrew",\
Hindhi="Hindi",\
Hungarain="Hungarian",\
Indonision="Indonesian",\
Italien="Italian",\
Japaneese="Japanese",\
Kazack="Kazakh",\
Koren="Korean",\
Latvion="Latvian",\
Macedonain="Macedonian",\
Malyalam="Malayalam",\
Norwiegian="Norwegian",\
Persain="Persian",\
Polnish="Polish",\
Portugeese="Portuguese",\
Roumanian="Romanian",\
Rusian="Russian",\
SerboCroation="Serbo-Croatian",\
Sinhalaese="Sinhalese",\
Slovack="Slovak",\
Slovenean="Slovenian",\
Spannish="Spanish",\
Sweedish="Swedish",\
Tgalog="Tagalog",\
Tammil="Tamil",\
Telagu="Telugu",\
Thia="Thai",\
Turkesh="Turkish",\
Ukranian="Ukrainian",\
Urdoo="Urdu",\
Veitnamese="Vietnamese",\
Albenian="Albanian",\
Aribian="Arabic",\
Arminian="Armenian",\
Basq="Basque",\
Bangali="Bengali",\
Bretton="Breton",\
Bulgerian="Bulgarian",\
Catelan="Catalan",\
Chinnese="Chinese",\
Check="Czech",\
Dainsih="Danish",\
Duch="Dutch",\
Englesh="English",\
Esparanto="Esperanto",\
Estonian="Estonian",\
Finesh="Finnish",\
Frensh="French",\
Galeecian="Galician",\
Georgean="Georgian",\
Germaan="German",\
Griek="Greek",\
Hebrow="Hebrew",\
Hindie="Hindi",\
Hungarian="Hungarian",\
Indonessian="Indonesian",\
Itallian="Italian",\
Japaneze="Japanese",\
Kazah="Kazakh",\
Korian="Korean",\
Latvian="Latvian",\
Macedonian="Macedonian",\
Malyayalam="Malayalam",\
Norveigan="Norwegian",\
Persian="Persian",\
Polisch="Polish",\
Portugues="Portuguese",\
Rumainian="Romanian",\
Russan="Russian",\
SerboKroatian="Serbo-Croatian",\
Sinhallese="Sinhalese",\
Slovac="Slovak",\
Slovinian="Slovenian",\
Spannisch="Spanish",\
Sweadish="Swedish",\
Taggalog="Tagalog",\
Tamill="Tamil",\
Telgugu="Telugu",\
Thay="Thai",\
Turqish="Turkish",\
Ukrainien="Ukrainian",\
Ordu="Urdu",\
Vietnamse="Vietnamese",\
)
# code -> language
code = code.lower().strip()
if code in LANGCODES:
return [code, LANGCODES[code]]
lang = code.title()
# Fix spelling
if lang in spelling:
lang = spelling[lang]
if lang in langnames:
return [langnames[lang], lang]
print("Unknown language:", code)
print("If this is a real language that's not in my list you can try typing --lang <code> <name>")
print("For example: --lang en English")
print("\nThese are the current languages I know about:")
print(' '.join(sorted(LANGCODES.values())))
return False
def tester():
os.chdir(sys.path[0]) # change to local dir
args = parse_args()
print(args)
if __name__ == "__main__":
tester()