forked from UMassCDS/IHOP-Reddit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dvc.lock
2123 lines (2123 loc) · 102 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
schema: '2.0'
stages:
download_comments@2021-04:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-04.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-04.bz2
outs:
- path: data/raw_data/comments/RC_2021-04.bz2
md5: cb556dd19a80d28312b30e112098148b
size: 29620494972
prep_community2vec_data@2021-04:
cmd: mkdir -p data/community2vec/RC_2021-04 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-04/subreddit_counts.csv
data/community2vec/RC_2021-04/user_contexts data/raw_data/comments/RC_2021-04.bz2
&& rm data/community2vec/RC_2021-04/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-04.bz2
md5: cb556dd19a80d28312b30e112098148b
size: 29620494972
outs:
- path: data/community2vec/RC_2021-04/subreddit_counts.csv
md5: 881813271bcd6a5b6c5b00f82a423600
size: 171395
- path: data/community2vec/RC_2021-04/user_contexts
md5: dd38ff62d1d4f13a967d47d732906233.dir
size: 119874050
nfiles: 2
download_comments@2021-05:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-05.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-05.bz2
outs:
- path: data/raw_data/comments/RC_2021-05.bz2
md5: 97c8b1bd3e13b747ef8d97bcbb934957
size: 30956683935
prep_community2vec_data@2021-05:
cmd: mkdir -p data/community2vec/RC_2021-05 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-05/subreddit_counts.csv
data/community2vec/RC_2021-05/user_contexts data/raw_data/comments/RC_2021-05.bz2
&& rm data/community2vec/RC_2021-05/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-05.bz2
md5: 97c8b1bd3e13b747ef8d97bcbb934957
size: 30956683935
outs:
- path: data/community2vec/RC_2021-05/subreddit_counts.csv
md5: d366e2a8b95ddb1530884a78349ce016
size: 171796
- path: data/community2vec/RC_2021-05/user_contexts
md5: 95be155cdbbef0f993d06ff030f5d52a.dir
size: 123624233
nfiles: 2
download_comments@2021-06:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-06.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-06.bz2
outs:
- path: data/raw_data/comments/RC_2021-06.bz2
md5: d71768aef221928ab802c290cf010cc6
size: 29759838678
download_submissions@2021-06:
cmd: curl https://files.pushshift.io/reddit/submissions/RS_2021-06.zst | unzstd
--long=31 | bzip2 > data/raw_data/submissions/RS_2021-06.bz2
outs:
- path: data/raw_data/submissions/RS_2021-06.bz2
md5: 182ac21bc9dd472f9b6675c2e7737e4b
size: 6129995031
community2vec_models@2021-04:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-04/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-04/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-04\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-04/subreddit_counts.csv
md5: 881813271bcd6a5b6c5b00f82a423600
size: 171395
- path: data/community2vec/RC_2021-04/user_contexts
md5: dd38ff62d1d4f13a967d47d732906233.dir
size: 119874050
nfiles: 2
outs:
- path: data/community2vec/RC_2021-04/analogy_accuracy_results.csv
md5: 0169327163244d8abfd5f2407a927543
size: 4455
- path: data/community2vec/RC_2021-04/best_model/keyedVectors
md5: 2437517f963327f4118cc8916deb6a22
size: 4378125
- path: data/community2vec/RC_2021-04/best_model/metrics.json
md5: 699883bed4c1ced0bc3397f8c425d0da
size: 628
- path: data/community2vec/RC_2021-04/best_model/parameters.json
md5: 8e8382d4d66500f2a8eae6a74c9152a8
size: 305
- path: data/community2vec/RC_2021-04/best_model/word2vec.pickle
md5: a2ed6d65d9edabb04270519a338eb82d
size: 8382862
community2vec_models@2021-05:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-05/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-05/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-05\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-05/subreddit_counts.csv
md5: d366e2a8b95ddb1530884a78349ce016
size: 171796
- path: data/community2vec/RC_2021-05/user_contexts
md5: 95be155cdbbef0f993d06ff030f5d52a.dir
size: 123624233
nfiles: 2
outs:
- path: data/community2vec/RC_2021-05/analogy_accuracy_results.csv
md5: 050e7dc2562a3d086954a5506ec692ec
size: 4458
- path: data/community2vec/RC_2021-05/best_model/keyedVectors
md5: 97de61be6c9a590d9b7862a3fa1e5c21
size: 4378438
- path: data/community2vec/RC_2021-05/best_model/metrics.json
md5: 9fb0d96647e63776e932d937b9660c66
size: 620
- path: data/community2vec/RC_2021-05/best_model/parameters.json
md5: 98212e23b2f19d4db7a521b5d1b3819f
size: 301
- path: data/community2vec/RC_2021-05/best_model/word2vec.pickle
md5: 472476f21647d2a98ad28a35d2e58ec9
size: 8383156
prep_community2vec_data@2021-06:
cmd: mkdir -p data/community2vec/RC_2021-06 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-06/subreddit_counts.csv
data/community2vec/RC_2021-06/user_contexts data/raw_data/comments/RC_2021-06.bz2
&& rm data/community2vec/RC_2021-06/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-06.bz2
md5: d71768aef221928ab802c290cf010cc6
size: 29759838678
outs:
- path: data/community2vec/RC_2021-06/subreddit_counts.csv
md5: 03720916d8a23f5c1cc6e53bc1aed6b0
size: 172093
- path: data/community2vec/RC_2021-06/user_contexts
md5: 36082a5957db5c161df0da17306a9ce5.dir
size: 121029143
nfiles: 2
community2vec_models@2021-06:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-06/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-06/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-06\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-06/subreddit_counts.csv
md5: 03720916d8a23f5c1cc6e53bc1aed6b0
size: 172093
- path: data/community2vec/RC_2021-06/user_contexts
md5: 36082a5957db5c161df0da17306a9ce5.dir
size: 121029143
nfiles: 2
outs:
- path: data/community2vec/RC_2021-06/analogy_accuracy_results.csv
md5: a9c7270ff663246f411cf9f464afb674
size: 4447
- path: data/community2vec/RC_2021-06/best_model/keyedVectors
md5: 574577113291a1d823cf6bc3ae0813a1
size: 4378886
- path: data/community2vec/RC_2021-06/best_model/metrics.json
md5: 95fa42135964664ae2b72365ba7d2d71
size: 627
- path: data/community2vec/RC_2021-06/best_model/parameters.json
md5: 12cc9c9381adc4fb8809244d85f23289
size: 305
- path: data/community2vec/RC_2021-06/best_model/word2vec.pickle
md5: d05343a7769ffce6a6ff901e101e97c6
size: 8383624
download_submissions@2021-04:
cmd: curl https://files.pushshift.io/reddit/submissions/RS_2021-04.zst | unzstd
--long=31 | bzip2 > data/raw_data/submissions/RS_2021-04.bz2
outs:
- path: data/raw_data/submissions/RS_2021-04.bz2
md5: c1cca0b80dfff3bbe466bba082a77670
size: 7495667061
download_submissions@2021-05:
cmd: curl https://files.pushshift.io/reddit/submissions/RS_2021-05.zst | unzstd
--long=31 | bzip2 > data/raw_data/submissions/RS_2021-05.bz2
outs:
- path: data/raw_data/submissions/RS_2021-05.bz2
md5: 6c751e1488a16f7f0003b0fa342ad0eb
size: 4677172479
download_comments@2021-12:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-12.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-12.bz2
outs:
- path: data/raw_data/comments/RC_2021-12.bz2
md5: dd1d4d216f5dd4e3c311ba2a0827c040
size: 34569221614
prep_community2vec_data@2021-12:
cmd: mkdir -p data/community2vec/RC_2021-12 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-12/subreddit_counts.csv
data/community2vec/RC_2021-12/user_contexts data/raw_data/comments/RC_2021-12.bz2
&& rm data/community2vec/RC_2021-12/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-12.bz2
md5: dd1d4d216f5dd4e3c311ba2a0827c040
size: 34569221614
outs:
- path: data/community2vec/RC_2021-12/subreddit_counts.csv
md5: 79d839e5967baa9a05fe469cf354af2d
size: 172200
- path: data/community2vec/RC_2021-12/user_contexts
md5: 59ec103e62a2b100f4b75829ebbef130.dir
size: 139899666
nfiles: 2
community2vec_models@2021-12:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-12/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-12/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-12\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-12/subreddit_counts.csv
md5: 79d839e5967baa9a05fe469cf354af2d
size: 172200
- path: data/community2vec/RC_2021-12/user_contexts
md5: 59ec103e62a2b100f4b75829ebbef130.dir
size: 139899666
nfiles: 2
outs:
- path: data/community2vec/RC_2021-12/analogy_accuracy_results.csv
md5: a9de33beb960061adcf6ffe962238c21
size: 4446
- path: data/community2vec/RC_2021-12/best_model/keyedVectors
md5: 2a79e6fdab92135423d892b3a9706da8
size: 4378516
- path: data/community2vec/RC_2021-12/best_model/metrics.json
md5: c5f8dc3652cc2d44d3a8941a84d14f63
size: 627
- path: data/community2vec/RC_2021-12/best_model/parameters.json
md5: e5d1ca3246da3d4c5b9ce2f0290345b2
size: 305
- path: data/community2vec/RC_2021-12/best_model/word2vec.pickle
md5: e8a5374b95c0dc18dab0aaa5942daf23
size: 8383253
download_comments@2021-10:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-10.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-10.bz2
outs:
- path: data/raw_data/comments/RC_2021-10.bz2
md5: 63e4594147bbd8530f451138f51501f4
size: 32870949756
prep_community2vec_data@2021-10:
cmd: mkdir -p data/community2vec/RC_2021-10 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-10/subreddit_counts.csv
data/community2vec/RC_2021-10/user_contexts data/raw_data/comments/RC_2021-10.bz2
&& rm data/community2vec/RC_2021-10/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-10.bz2
md5: 63e4594147bbd8530f451138f51501f4
size: 32870949756
outs:
- path: data/community2vec/RC_2021-10/subreddit_counts.csv
md5: fdf986d7c61065120b61334d65bdbf33
size: 172049
- path: data/community2vec/RC_2021-10/user_contexts
md5: f8bac1751eb0915da69430555e168717.dir
size: 133928185
nfiles: 2
community2vec_models@2021-10:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-10/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-10/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-10\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-10/subreddit_counts.csv
md5: fdf986d7c61065120b61334d65bdbf33
size: 172049
- path: data/community2vec/RC_2021-10/user_contexts
md5: f8bac1751eb0915da69430555e168717.dir
size: 133928185
nfiles: 2
outs:
- path: data/community2vec/RC_2021-10/analogy_accuracy_results.csv
md5: 19b76c68dd12c01c687a1baf6b3faa2c
size: 4459
- path: data/community2vec/RC_2021-10/best_model/keyedVectors
md5: 87903464d51b2ff1827f9cd07f473e80
size: 4378504
- path: data/community2vec/RC_2021-10/best_model/metrics.json
md5: 082dcd5efadac4f5e7b084ef85014b37
size: 620
- path: data/community2vec/RC_2021-10/best_model/parameters.json
md5: 7f5812e17c364f8fae6391362573ffaf
size: 301
- path: data/community2vec/RC_2021-10/best_model/word2vec.pickle
md5: 54ce7541543089ba946c8110aa7376ed
size: 8383224
download_comments@2021-07:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-07.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-07.bz2
outs:
- path: data/raw_data/comments/RC_2021-07.bz2
md5: 47cbcb5b1d435db3d166382b0eca09b7
size: 30911364258
prep_community2vec_data@2021-07:
cmd: mkdir -p data/community2vec/RC_2021-07 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-07/subreddit_counts.csv
data/community2vec/RC_2021-07/user_contexts data/raw_data/comments/RC_2021-07.bz2
&& rm data/community2vec/RC_2021-07/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-07.bz2
md5: 47cbcb5b1d435db3d166382b0eca09b7
size: 30911364258
outs:
- path: data/community2vec/RC_2021-07/subreddit_counts.csv
md5: 9703089de3cf78139889d04b554b2905
size: 171589
- path: data/community2vec/RC_2021-07/user_contexts
md5: 35d08167bb2a0e63a3feb91287efc7b2.dir
size: 120797885
nfiles: 2
download_comments@2021-11:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-11.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-11.bz2
outs:
- path: data/raw_data/comments/RC_2021-11.bz2
md5: 0d4dd6fc206327e5d7984e7101cd6b1a
size: 33307050193
prep_community2vec_data@2021-11:
cmd: mkdir -p data/community2vec/RC_2021-11 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-11/subreddit_counts.csv
data/community2vec/RC_2021-11/user_contexts data/raw_data/comments/RC_2021-11.bz2
&& rm data/community2vec/RC_2021-11/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-11.bz2
md5: 0d4dd6fc206327e5d7984e7101cd6b1a
size: 33307050193
outs:
- path: data/community2vec/RC_2021-11/subreddit_counts.csv
md5: e2824367f028a76c39afe8148b4c538d
size: 171974
- path: data/community2vec/RC_2021-11/user_contexts
md5: 07ea707d535352f1cc988dc30eba2530.dir
size: 134622094
nfiles: 2
community2vec_models@2021-11:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-11/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-11/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-11\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-11/subreddit_counts.csv
md5: e2824367f028a76c39afe8148b4c538d
size: 171974
- path: data/community2vec/RC_2021-11/user_contexts
md5: 07ea707d535352f1cc988dc30eba2530.dir
size: 134622094
nfiles: 2
outs:
- path: data/community2vec/RC_2021-11/analogy_accuracy_results.csv
md5: 396e09ecfb8b87396168beaff6eed5a4
size: 4459
- path: data/community2vec/RC_2021-11/best_model/keyedVectors
md5: 597f36adbf0a42f06571de35a052c5c2
size: 4378410
- path: data/community2vec/RC_2021-11/best_model/metrics.json
md5: beafd1c43b85d52f831f8e956f432373
size: 628
- path: data/community2vec/RC_2021-11/best_model/parameters.json
md5: 40ef4919f8dfcf946f35bf760197b050
size: 305
- path: data/community2vec/RC_2021-11/best_model/word2vec.pickle
md5: ada2a7653e8711d5d9db0b977655cdc1
size: 8383147
download_comments@2021-09:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-09.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-09.bz2
outs:
- path: data/raw_data/comments/RC_2021-09.bz2
md5: 9b3665fc25907ecbdc7808ae4043b6bd
size: 31996196383
download_comments@2022-01:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2022-01.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2022-01.bz2
outs:
- path: data/raw_data/comments/RC_2022-01.bz2
md5: fd25ba68de752140547aa2a0c2080320
size: 37953945294
download_comments@2022-03:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2022-03.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2022-03.bz2
outs:
- path: data/raw_data/comments/RC_2022-03.bz2
md5: 4daa2988038a4168cc70e21b7cea96ea
size: 35139378132
download_comments@2021-08:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2021-08.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2021-08.bz2
outs:
- path: data/raw_data/comments/RC_2021-08.bz2
md5: 2495184724809cbf0838592eac5c1459
size: 33087814591
download_comments@2022-02:
cmd: curl https://files.pushshift.io/reddit/comments/RC_2022-02.zst | unzstd
--long=31 | bzip2 > data/raw_data/comments/RC_2022-02.bz2
outs:
- path: data/raw_data/comments/RC_2022-02.bz2
md5: 693b93d28cfeaa2fd54986c785072a0d
size: 32237946461
prep_community2vec_data@2022-02:
cmd: mkdir -p data/community2vec/RC_2022-02 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2022-02/subreddit_counts.csv
data/community2vec/RC_2022-02/user_contexts data/raw_data/comments/RC_2022-02.bz2
&& rm data/community2vec/RC_2022-02/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2022-02.bz2
md5: 693b93d28cfeaa2fd54986c785072a0d
size: 32237946461
outs:
- path: data/community2vec/RC_2022-02/subreddit_counts.csv
md5: 17d619a1d41b0c935773b6649c0f23d8
size: 172065
- path: data/community2vec/RC_2022-02/user_contexts
md5: 384b7ca2d09127824921ba9eb1fb589d.dir
size: 129614420
nfiles: 2
community2vec_models@2022-02:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2022-02/user_contexts\
\ --vocab_csv data/community2vec/RC_2022-02/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2022-02\
\ --workers 12"
deps:
- path: data/community2vec/RC_2022-02/subreddit_counts.csv
md5: 17d619a1d41b0c935773b6649c0f23d8
size: 172065
- path: data/community2vec/RC_2022-02/user_contexts
md5: 384b7ca2d09127824921ba9eb1fb589d.dir
size: 129614420
nfiles: 2
outs:
- path: data/community2vec/RC_2022-02/analogy_accuracy_results.csv
md5: c9519ff751f32021d8f1389db914c071
size: 4432
- path: data/community2vec/RC_2022-02/best_model/keyedVectors
md5: 35b97ad3b82fcbb246da53ae73865651
size: 4378579
- path: data/community2vec/RC_2022-02/best_model/metrics.json
md5: f34ee1906facf3a1dd007504e729948e
size: 618
- path: data/community2vec/RC_2022-02/best_model/parameters.json
md5: ddef902cf9a76f41c92b32582965ed1c
size: 301
- path: data/community2vec/RC_2022-02/best_model/word2vec.pickle
md5: 18708dbc87680b5791709de034f581a4
size: 8383299
tsne_visualizations@2022-02:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2022-02/best_model
data/community2vec/RC_2022-02/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2022-02/best_model/keyedVectors
md5: 35b97ad3b82fcbb246da53ae73865651
size: 4378579
- path: data/community2vec/RC_2022-02/best_model/parameters.json
md5: ddef902cf9a76f41c92b32582965ed1c
size: 301
- path: data/community2vec/RC_2022-02/best_model/word2vec.pickle
md5: 18708dbc87680b5791709de034f581a4
size: 8383299
outs:
- path: data/community2vec/RC_2022-02/best_model/tsne.csv
md5: 9722db4bdd7f66bc843cbb97e26ef632
size: 320914
tsne_visualizations@2021-10:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-10/best_model
data/community2vec/RC_2021-10/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-10/best_model/keyedVectors
md5: 87903464d51b2ff1827f9cd07f473e80
size: 4378504
- path: data/community2vec/RC_2021-10/best_model/parameters.json
md5: 7f5812e17c364f8fae6391362573ffaf
size: 301
- path: data/community2vec/RC_2021-10/best_model/word2vec.pickle
md5: 54ce7541543089ba946c8110aa7376ed
size: 8383224
outs:
- path: data/community2vec/RC_2021-10/best_model/tsne.csv
md5: 807246644ab90b7ffc0c9fe7b10d591b
size: 320627
tsne_visualizations@2021-06:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-06/best_model
data/community2vec/RC_2021-06/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-06/best_model/keyedVectors
md5: 574577113291a1d823cf6bc3ae0813a1
size: 4378886
- path: data/community2vec/RC_2021-06/best_model/parameters.json
md5: 12cc9c9381adc4fb8809244d85f23289
size: 305
- path: data/community2vec/RC_2021-06/best_model/word2vec.pickle
md5: d05343a7769ffce6a6ff901e101e97c6
size: 8383624
outs:
- path: data/community2vec/RC_2021-06/best_model/tsne.csv
md5: 71a3eb5f7810a440aadce83bed9180f0
size: 321009
prep_community2vec_data@2021-09:
cmd: mkdir -p data/community2vec/RC_2021-09 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-09/subreddit_counts.csv
data/community2vec/RC_2021-09/user_contexts data/raw_data/comments/RC_2021-09.bz2
&& rm data/community2vec/RC_2021-09/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-09.bz2
md5: 9b3665fc25907ecbdc7808ae4043b6bd
size: 31996196383
outs:
- path: data/community2vec/RC_2021-09/subreddit_counts.csv
md5: 2c480c1f245d786128c9ca6e867e6246
size: 171481
- path: data/community2vec/RC_2021-09/user_contexts
md5: cfac4ea396e0349066c368b5e6c6a942.dir
size: 129329242
nfiles: 2
community2vec_models@2021-09:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-09/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-09/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-09\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-09/subreddit_counts.csv
md5: 2c480c1f245d786128c9ca6e867e6246
size: 171481
- path: data/community2vec/RC_2021-09/user_contexts
md5: cfac4ea396e0349066c368b5e6c6a942.dir
size: 129329242
nfiles: 2
outs:
- path: data/community2vec/RC_2021-09/analogy_accuracy_results.csv
md5: 37b3b2317d096fcaaa68ab87e619e8fc
size: 4458
- path: data/community2vec/RC_2021-09/best_model/keyedVectors
md5: a63ae0d4b17e9acf197ba08d8f34c964
size: 4378025
- path: data/community2vec/RC_2021-09/best_model/metrics.json
md5: 6eaa96e3a1fc6ae79c07127e72c466a0
size: 620
- path: data/community2vec/RC_2021-09/best_model/parameters.json
md5: ea81712048924d2eda5bae2678a2c448
size: 301
- path: data/community2vec/RC_2021-09/best_model/word2vec.pickle
md5: 1f0d2124a8f354949313849cb067d497
size: 8382745
tsne_visualizations@2021-09:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-09/best_model
data/community2vec/RC_2021-09/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-09/best_model/keyedVectors
md5: a63ae0d4b17e9acf197ba08d8f34c964
size: 4378025
- path: data/community2vec/RC_2021-09/best_model/parameters.json
md5: ea81712048924d2eda5bae2678a2c448
size: 301
- path: data/community2vec/RC_2021-09/best_model/word2vec.pickle
md5: 1f0d2124a8f354949313849cb067d497
size: 8382745
outs:
- path: data/community2vec/RC_2021-09/best_model/tsne.csv
md5: 6e1c7b8bde07d162693bcb8e381639d5
size: 320304
prep_community2vec_data@2022-03:
cmd: mkdir -p data/community2vec/RC_2022-03 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2022-03/subreddit_counts.csv
data/community2vec/RC_2022-03/user_contexts data/raw_data/comments/RC_2022-03.bz2
&& rm data/community2vec/RC_2022-03/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2022-03.bz2
md5: 4daa2988038a4168cc70e21b7cea96ea
size: 35139378132
outs:
- path: data/community2vec/RC_2022-03/subreddit_counts.csv
md5: 1a16c234b3b7c8c18198ba8643549cc2
size: 172252
- path: data/community2vec/RC_2022-03/user_contexts
md5: 8943a261905e40c174ca2072f707c826.dir
size: 139952576
nfiles: 2
community2vec_models@2022-03:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2022-03/user_contexts\
\ --vocab_csv data/community2vec/RC_2022-03/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2022-03\
\ --workers 12"
deps:
- path: data/community2vec/RC_2022-03/subreddit_counts.csv
md5: 1a16c234b3b7c8c18198ba8643549cc2
size: 172252
- path: data/community2vec/RC_2022-03/user_contexts
md5: 8943a261905e40c174ca2072f707c826.dir
size: 139952576
nfiles: 2
outs:
- path: data/community2vec/RC_2022-03/analogy_accuracy_results.csv
md5: 1aee27120afec057cb9878b3c70fddcd
size: 4458
- path: data/community2vec/RC_2022-03/best_model/keyedVectors
md5: a2c307ee24ad6f775733d378346f4b45
size: 4378530
- path: data/community2vec/RC_2022-03/best_model/metrics.json
md5: 4cab5f5ca0ee8a6046bff4a569bf92bd
size: 628
- path: data/community2vec/RC_2022-03/best_model/parameters.json
md5: 3e983e5a9cf9766f8125d0698013d94f
size: 305
- path: data/community2vec/RC_2022-03/best_model/word2vec.pickle
md5: 73339c9fe05bc164ef502612f55c6ca8
size: 8383268
tsne_visualizations@2022-03:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2022-03/best_model
data/community2vec/RC_2022-03/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2022-03/best_model/keyedVectors
md5: a2c307ee24ad6f775733d378346f4b45
size: 4378530
- path: data/community2vec/RC_2022-03/best_model/parameters.json
md5: 3e983e5a9cf9766f8125d0698013d94f
size: 305
- path: data/community2vec/RC_2022-03/best_model/word2vec.pickle
md5: 73339c9fe05bc164ef502612f55c6ca8
size: 8383268
outs:
- path: data/community2vec/RC_2022-03/best_model/tsne.csv
md5: 78ea13946f25541fd44fa60a1b1c1175
size: 320664
tsne_visualizations@2021-05:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-05/best_model
data/community2vec/RC_2021-05/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-05/best_model/keyedVectors
md5: 97de61be6c9a590d9b7862a3fa1e5c21
size: 4378438
- path: data/community2vec/RC_2021-05/best_model/parameters.json
md5: 98212e23b2f19d4db7a521b5d1b3819f
size: 301
- path: data/community2vec/RC_2021-05/best_model/word2vec.pickle
md5: 472476f21647d2a98ad28a35d2e58ec9
size: 8383156
outs:
- path: data/community2vec/RC_2021-05/best_model/tsne.csv
md5: c0edc557bc430b437bab584fd64b5708
size: 320599
tsne_visualizations@2021-04:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-04/best_model
data/community2vec/RC_2021-04/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-04/best_model/keyedVectors
md5: 2437517f963327f4118cc8916deb6a22
size: 4378125
- path: data/community2vec/RC_2021-04/best_model/parameters.json
md5: 8e8382d4d66500f2a8eae6a74c9152a8
size: 305
- path: data/community2vec/RC_2021-04/best_model/word2vec.pickle
md5: a2ed6d65d9edabb04270519a338eb82d
size: 8382862
outs:
- path: data/community2vec/RC_2021-04/best_model/tsne.csv
md5: 614b937ba3dbe28a153868135c81e556
size: 320453
community2vec_models@2021-07:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-07/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-07/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-07\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-07/subreddit_counts.csv
md5: 9703089de3cf78139889d04b554b2905
size: 171589
- path: data/community2vec/RC_2021-07/user_contexts
md5: 35d08167bb2a0e63a3feb91287efc7b2.dir
size: 120797885
nfiles: 2
outs:
- path: data/community2vec/RC_2021-07/analogy_accuracy_results.csv
md5: bd465300dfe6e0b3ae4c7118eed3ff31
size: 4456
- path: data/community2vec/RC_2021-07/best_model/keyedVectors
md5: 07cf3c3233c2edd09ca04905b3d7ea87
size: 4378249
- path: data/community2vec/RC_2021-07/best_model/metrics.json
md5: 050742d2799e0865e53e076b18d18da7
size: 627
- path: data/community2vec/RC_2021-07/best_model/parameters.json
md5: 5c0fcc5a8ab212c1aed264c945b536cc
size: 305
- path: data/community2vec/RC_2021-07/best_model/word2vec.pickle
md5: 33de6ad49a48b8691ccfabffae214fac
size: 8382987
tsne_visualizations@2021-07:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-07/best_model
data/community2vec/RC_2021-07/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-07/best_model/keyedVectors
md5: 07cf3c3233c2edd09ca04905b3d7ea87
size: 4378249
- path: data/community2vec/RC_2021-07/best_model/parameters.json
md5: 5c0fcc5a8ab212c1aed264c945b536cc
size: 305
- path: data/community2vec/RC_2021-07/best_model/word2vec.pickle
md5: 33de6ad49a48b8691ccfabffae214fac
size: 8382987
outs:
- path: data/community2vec/RC_2021-07/best_model/tsne.csv
md5: 6d8b58e3dc1b5cebc4a1a15896bfd284
size: 320446
tsne_visualizations@2021-12:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-12/best_model
data/community2vec/RC_2021-12/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-12/best_model/keyedVectors
md5: 2a79e6fdab92135423d892b3a9706da8
size: 4378516
- path: data/community2vec/RC_2021-12/best_model/parameters.json
md5: e5d1ca3246da3d4c5b9ce2f0290345b2
size: 305
- path: data/community2vec/RC_2021-12/best_model/word2vec.pickle
md5: e8a5374b95c0dc18dab0aaa5942daf23
size: 8383253
outs:
- path: data/community2vec/RC_2021-12/best_model/tsne.csv
md5: 3e1943c1bc66bee6a1d74e061abd4423
size: 320766
prep_community2vec_data@2022-01:
cmd: mkdir -p data/community2vec/RC_2022-01 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2022-01/subreddit_counts.csv
data/community2vec/RC_2022-01/user_contexts data/raw_data/comments/RC_2022-01.bz2
&& rm data/community2vec/RC_2022-01/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2022-01.bz2
md5: fd25ba68de752140547aa2a0c2080320
size: 37953945294
outs:
- path: data/community2vec/RC_2022-01/subreddit_counts.csv
md5: 9244d14e6127ace15538cfe1751980fc
size: 172470
- path: data/community2vec/RC_2022-01/user_contexts
md5: 255185bbe8738950c8b25b10a70c9b6c.dir
size: 153067875
nfiles: 2
community2vec_models@2022-01:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2022-01/user_contexts\
\ --vocab_csv data/community2vec/RC_2022-01/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2022-01\
\ --workers 12"
deps:
- path: data/community2vec/RC_2022-01/subreddit_counts.csv
md5: 9244d14e6127ace15538cfe1751980fc
size: 172470
- path: data/community2vec/RC_2022-01/user_contexts
md5: 255185bbe8738950c8b25b10a70c9b6c.dir
size: 153067875
nfiles: 2
outs:
- path: data/community2vec/RC_2022-01/analogy_accuracy_results.csv
md5: 1c059f4d1a6526a0a6472e66ad5ea7d0
size: 4433
- path: data/community2vec/RC_2022-01/best_model/keyedVectors
md5: 9c28977afd93942e4a8b5d7e8e403001
size: 4378466
- path: data/community2vec/RC_2022-01/best_model/metrics.json
md5: e535481e04410e8d80aab8c7c42ade8a
size: 626
- path: data/community2vec/RC_2022-01/best_model/parameters.json
md5: 47471a61e5583f49dd80673ee197fbf3
size: 305
- path: data/community2vec/RC_2022-01/best_model/word2vec.pickle
md5: bf56be0fb7690f2eb8e8eb962b12db6d
size: 8383205
tsne_visualizations@2022-01:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2022-01/best_model
data/community2vec/RC_2022-01/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2022-01/best_model/keyedVectors
md5: 9c28977afd93942e4a8b5d7e8e403001
size: 4378466
- path: data/community2vec/RC_2022-01/best_model/parameters.json
md5: 47471a61e5583f49dd80673ee197fbf3
size: 305
- path: data/community2vec/RC_2022-01/best_model/word2vec.pickle
md5: bf56be0fb7690f2eb8e8eb962b12db6d
size: 8383205
outs:
- path: data/community2vec/RC_2022-01/best_model/tsne.csv
md5: 185c41b1a3ea170da36443e1593a1d12
size: 320524
prep_community2vec_data@2021-08:
cmd: mkdir -p data/community2vec/RC_2021-08 && python -m ihop.import_data --config
config.json c2v --top_n 10000 --exclude_top_user_perc 0.05 data/community2vec/RC_2021-08/subreddit_counts.csv
data/community2vec/RC_2021-08/user_contexts data/raw_data/comments/RC_2021-08.bz2
&& rm data/community2vec/RC_2021-08/user_contexts/.*.crc
deps:
- path: data/raw_data/comments/RC_2021-08.bz2
md5: 2495184724809cbf0838592eac5c1459
size: 33087814591
outs:
- path: data/community2vec/RC_2021-08/subreddit_counts.csv
md5: 53204aaa3ea41be9275a37ecb70c5e63
size: 171606
- path: data/community2vec/RC_2021-08/user_contexts
md5: a3cd217c2a714ade0b061370f2a262f2.dir
size: 131137056
nfiles: 2
community2vec_models@2021-08:
cmd: "python -m ihop.community2vec --config config.json --contexts data/community2vec/RC_2021-08/user_contexts\
\ --vocab_csv data/community2vec/RC_2021-08/subreddit_counts.csv --param_grid\
\ '{\"alpha\": [0.08, 0.05], \"vector_size\":[100], \"sample\":[0, 0.001, 0.005],\
\ \"negative\":[10,20]}' --epochs 5 --output_dir data/community2vec/RC_2021-08\
\ --workers 12"
deps:
- path: data/community2vec/RC_2021-08/subreddit_counts.csv
md5: 53204aaa3ea41be9275a37ecb70c5e63
size: 171606
- path: data/community2vec/RC_2021-08/user_contexts
md5: a3cd217c2a714ade0b061370f2a262f2.dir
size: 131137056
nfiles: 2
outs:
- path: data/community2vec/RC_2021-08/analogy_accuracy_results.csv
md5: f617f4abcba646cb30b234ef300b3789
size: 4459
- path: data/community2vec/RC_2021-08/best_model/keyedVectors
md5: d736cbc567a35d5b766478d474658c84
size: 4378009
- path: data/community2vec/RC_2021-08/best_model/metrics.json
md5: 8141a40292ff173eedfbea8f080bcea7
size: 629
- path: data/community2vec/RC_2021-08/best_model/parameters.json
md5: 0ba6bdb00d4e02f6900eec147b97893d
size: 305
- path: data/community2vec/RC_2021-08/best_model/word2vec.pickle
md5: 880b1ed980351f8a2b349c5e3d81dec1
size: 8382743
tsne_visualizations@2021-08:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-08/best_model
data/community2vec/RC_2021-08/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-08/best_model/keyedVectors
md5: d736cbc567a35d5b766478d474658c84
size: 4378009
- path: data/community2vec/RC_2021-08/best_model/parameters.json
md5: 0ba6bdb00d4e02f6900eec147b97893d
size: 305
- path: data/community2vec/RC_2021-08/best_model/word2vec.pickle
md5: 880b1ed980351f8a2b349c5e3d81dec1
size: 8382743
outs:
- path: data/community2vec/RC_2021-08/best_model/tsne.csv
md5: 54c9a49b5fe741d161ee9d5cf28632a1
size: 320266
tsne_visualizations@2021-11:
cmd: python -m ihop.visualizations --config config.json data/community2vec/RC_2021-11/best_model
data/community2vec/RC_2021-11/best_model/tsne.csv
deps:
- path: data/community2vec/RC_2021-11/best_model/keyedVectors
md5: 597f36adbf0a42f06571de35a052c5c2
size: 4378410
- path: data/community2vec/RC_2021-11/best_model/parameters.json
md5: 40ef4919f8dfcf946f35bf760197b050
size: 305
- path: data/community2vec/RC_2021-11/best_model/word2vec.pickle
md5: ada2a7653e8711d5d9db0b977655cdc1
size: 8383147
outs:
- path: data/community2vec/RC_2021-11/best_model/tsne.csv
md5: 90bf72080475cc771bdac541834e739b
size: 320491
kmeans_cluster_models@2021-04:
cmd: mkdir -p data/annotation_data/RC_2021-04/kmeans_model && python -m ihop.clustering
data/community2vec/RC_2021-04/best_model/keyedVectors --output_dir data/annotation_data/RC_2021-04/kmeans_model
--cluster_params '{"n_clusters":100}' --model-name 2021-04_kmeans_clusters
deps:
- path: data/community2vec/RC_2021-04/best_model/keyedVectors
md5: 2437517f963327f4118cc8916deb6a22
size: 4378125
outs:
- path: data/annotation_data/RC_2021-04/kmeans_model/clusters.csv
md5: ec3a86487c7dc3a17127e2ad81dd1a14
size: 147142
- path: data/annotation_data/RC_2021-04/kmeans_model/metrics.json
md5: b836dcf8348fe2933d25d8493e3249fc
size: 113
- path: data/annotation_data/RC_2021-04/kmeans_model/parameters.json
md5: c202e98a404d67088dcc945289eb46d6
size: 199
- path: data/annotation_data/RC_2021-04/kmeans_model/sklearn_cluster_model.joblib
md5: 3df8f5c22e1e4155477441639c72e928
size: 80694
kmeans_cluster_models@2021-05:
cmd: mkdir -p data/annotation_data/RC_2021-05/kmeans_model && python -m ihop.clustering
data/community2vec/RC_2021-05/best_model/keyedVectors --output_dir data/annotation_data/RC_2021-05/kmeans_model
--cluster_params '{"n_clusters":100}' --model-name 2021-05_kmeans_clusters
deps:
- path: data/community2vec/RC_2021-05/best_model/keyedVectors
md5: 97de61be6c9a590d9b7862a3fa1e5c21
size: 4378438
outs:
- path: data/annotation_data/RC_2021-05/kmeans_model/clusters.csv
md5: a0ca653486ed993f36984d920a3108de
size: 147139
- path: data/annotation_data/RC_2021-05/kmeans_model/metrics.json
md5: d724d75f6cf4e111ee98e2ca039cb0c5
size: 112
- path: data/annotation_data/RC_2021-05/kmeans_model/parameters.json
md5: 4ac9a25e8a6a4e03e799cba742d8b47a
size: 199
- path: data/annotation_data/RC_2021-05/kmeans_model/sklearn_cluster_model.joblib
md5: 4c9e6a018028975ac6de3ddc18c8b996
size: 80694
kmeans_cluster_models@2021-06:
cmd: mkdir -p data/annotation_data/RC_2021-06/kmeans_model && python -m ihop.clustering
data/community2vec/RC_2021-06/best_model/keyedVectors --output_dir data/annotation_data/RC_2021-06/kmeans_model
--cluster_params '{"n_clusters":100}' --model-name 2021-06_kmeans_clusters
deps:
- path: data/community2vec/RC_2021-06/best_model/keyedVectors
md5: 574577113291a1d823cf6bc3ae0813a1
size: 4378886
outs:
- path: data/annotation_data/RC_2021-06/kmeans_model/clusters.csv
md5: df9a44a53cbcc4775233af668661c3de
size: 147748
- path: data/annotation_data/RC_2021-06/kmeans_model/metrics.json
md5: a5545a9ad5a89b52a539e118c436e3bd
size: 114
- path: data/annotation_data/RC_2021-06/kmeans_model/parameters.json
md5: 262955a3c94ee5e3d303ca45722803b4
size: 199
- path: data/annotation_data/RC_2021-06/kmeans_model/sklearn_cluster_model.joblib
md5: 9007aa6f335cb9f2d497bdce0240798e
size: 80694
kmeans_cluster_models@2021-07:
cmd: mkdir -p data/annotation_data/RC_2021-07/kmeans_model && python -m ihop.clustering
data/community2vec/RC_2021-07/best_model/keyedVectors --output_dir data/annotation_data/RC_2021-07/kmeans_model
--cluster_params '{"n_clusters":100}' --model-name 2021-07_kmeans_clusters
deps:
- path: data/community2vec/RC_2021-07/best_model/keyedVectors
md5: 07cf3c3233c2edd09ca04905b3d7ea87
size: 4378249
outs:
- path: data/annotation_data/RC_2021-07/kmeans_model/clusters.csv
md5: b18b2bbbc9c1f8e759c1ef10bd5f1cda
size: 146835
- path: data/annotation_data/RC_2021-07/kmeans_model/metrics.json
md5: 0ebf7908819dcd8c9095063c6a508655
size: 113
- path: data/annotation_data/RC_2021-07/kmeans_model/parameters.json
md5: 41e038b81970f846dc5e1223921a91e8
size: 199
- path: data/annotation_data/RC_2021-07/kmeans_model/sklearn_cluster_model.joblib
md5: 987c9075b43d61905665bbbe041d3c29
size: 80694
kmeans_cluster_models@2021-08:
cmd: mkdir -p data/annotation_data/RC_2021-08/kmeans_model && python -m ihop.clustering
data/community2vec/RC_2021-08/best_model/keyedVectors --output_dir data/annotation_data/RC_2021-08/kmeans_model
--cluster_params '{"n_clusters":100}' --model-name 2021-08_kmeans_clusters
deps:
- path: data/community2vec/RC_2021-08/best_model/keyedVectors
md5: d736cbc567a35d5b766478d474658c84
size: 4378009
outs:
- path: data/annotation_data/RC_2021-08/kmeans_model/clusters.csv
md5: e8dd0da9dd0e0a9eef54ccf39264c2c6
size: 146961
- path: data/annotation_data/RC_2021-08/kmeans_model/metrics.json
md5: aea66c2ed11b9d875256429af4fc5f70
size: 111
- path: data/annotation_data/RC_2021-08/kmeans_model/parameters.json
md5: bd8eac8960bafde2395cfcb666e3393a
size: 199
- path: data/annotation_data/RC_2021-08/kmeans_model/sklearn_cluster_model.joblib
md5: 0f73106883162648221f5c3be1efd89e
size: 80694
kmeans_cluster_models@2021-09:
cmd: mkdir -p data/annotation_data/RC_2021-09/kmeans_model && python -m ihop.clustering
data/community2vec/RC_2021-09/best_model/keyedVectors --output_dir data/annotation_data/RC_2021-09/kmeans_model
--cluster_params '{"n_clusters":100}' --model-name 2021-09_kmeans_clusters
deps:
- path: data/community2vec/RC_2021-09/best_model/keyedVectors
md5: a63ae0d4b17e9acf197ba08d8f34c964
size: 4378025
outs:
- path: data/annotation_data/RC_2021-09/kmeans_model/clusters.csv
md5: c84c045f386c9de90a4978979cf0a174
size: 147066
- path: data/annotation_data/RC_2021-09/kmeans_model/metrics.json
md5: bc84fb52bf73d016066a79504aec22cf
size: 114
- path: data/annotation_data/RC_2021-09/kmeans_model/parameters.json
md5: d9ab5169de9cbfb83dd6d9b81e861b05
size: 199
- path: data/annotation_data/RC_2021-09/kmeans_model/sklearn_cluster_model.joblib
md5: 7a92ac5f11960d612070f16f6fe7ec5f