stages:
# Download Reddit comments data. Note that curl sometimes hangs up early, so double-check that all the data was actually pulled.
download_comments:
foreach: ${months}
do:
desc: Download Reddit comments data for the month ${item}.
cmd: curl https://files.pushshift.io/reddit/comments/RC_${item}.zst | unzstd --long=31 | bzip2 > ${comments_dir}/RC_${item}.bz2
outs:
- ${comments_dir}/RC_${item}.bz2
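# To re-run and sanity-check a single month (useful given curl's occasional early hang-ups), one
# item of this foreach stage can be targeted directly. A minimal sketch, assuming a hypothetical
# month key "2021-05" in the months list of the DVC params file:
#   dvc repro 'download_comments@2021-05'
#   bzip2 -t <comments_dir>/RC_2021-05.bz2   # non-zero exit status means the archive is truncated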
# Download Reddit submissions data
#download_submissions:
# foreach: ${months}
# do:
# desc: Download Reddit submissions (posts) data for the month ${item}.
# cmd: curl https://files.pushshift.io/reddit/submissions/RS_${item}.zst | unzstd --long=31 | bzip2 > ${submissions_dir}/RS_${item}.bz2
# outs:
# - ${submissions_dir}/RS_${item}.bz2
# Turn the comments into 'user documents' for training community2vec models
prep_community2vec_data:
foreach: ${months}
do:
desc: Prepare data for training community2vec models over a single month's worth of data from ${item} and write it to the specified output directory. Also removes the hidden .crc checksum files Spark leaves in the output directory.
cmd: mkdir -p ${community2vec_dir}/RC_${item} && python -m ihop.import_data --config config.json c2v --top_n ${community2vec_data_prep.top_n} --exclude_top_user_perc ${community2vec_data_prep.exclude_top_users} ${community2vec_dir}/RC_${item}/subreddit_counts.csv ${community2vec_dir}/RC_${item}/user_contexts ${comments_dir}/RC_${item}.bz2 && rm ${community2vec_dir}/RC_${item}/user_contexts/.*.crc
deps:
- ${comments_dir}/RC_${item}.bz2
outs:
- ${community2vec_dir}/RC_${item}/subreddit_counts.csv
- ${community2vec_dir}/RC_${item}/user_contexts
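# The ${community2vec_data_prep.*} values above are read from the DVC params file. A hedged sketch
# of that section (the key names are the ones used in this stage; the values are hypothetical):
#   community2vec_data_prep:
#     top_n: 10000              # hypothetical value, passed to --top_n
#     exclude_top_users: 0.05   # hypothetical value, passed to --exclude_top_user_perc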
# Use grid search to train community2vec models on a month's worth of Reddit data
community2vec_models:
foreach: ${months}
do:
desc: Train community2vec models on ${item} month of Reddit data, keeping the model that achieves the highest accuracy on the default analogy set. Grid search is done within the python script.
cmd: python -m ihop.community2vec --config config.json --contexts ${community2vec_dir}/RC_${item}/user_contexts --vocab_csv ${community2vec_dir}/RC_${item}/subreddit_counts.csv --param_grid ${community2vec_params.param_grid} --epochs ${community2vec_params.epochs} --output_dir ${community2vec_dir}/RC_${item} --workers ${community2vec_params.workers}
deps:
- ${community2vec_dir}/RC_${item}/subreddit_counts.csv
- ${community2vec_dir}/RC_${item}/user_contexts
outs:
- ${community2vec_dir}/RC_${item}/best_model/parameters.json
- ${community2vec_dir}/RC_${item}/best_model/word2vec.pickle
- ${community2vec_dir}/RC_${item}/best_model/keyedVectors
- ${community2vec_dir}/RC_${item}/analogy_accuracy_results.csv # Stores results for all trained models
metrics:
- ${community2vec_dir}/RC_${item}/best_model/metrics.json
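# A hedged sketch of the ${community2vec_params.*} section of the DVC params file (key names come
# from this stage; the values are hypothetical, and param_grid is passed straight through to
# ihop.community2vec's --param_grid option, so its exact format depends on that CLI):
#   community2vec_params:
#     epochs: 5
#     workers: 8
#     param_grid: "..."   # placeholder only
# After training, `dvc metrics show` or `dvc metrics diff` can compare the metrics.json files
# declared above across months.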
# Generate the t-SNE visualizations that will be used in the Dash application
tsne_visualizations:
foreach: ${months}
do:
desc: Produce the two-dimensional t-SNE visualization of the community2vec model for the ${item} Reddit data, to be used in the Dash app.
cmd: python -m ihop.visualizations --config config.json ${community2vec_dir}/RC_${item}/best_model ${community2vec_dir}/RC_${item}/best_model/tsne.csv
deps:
- ${community2vec_dir}/RC_${item}/best_model/parameters.json
- ${community2vec_dir}/RC_${item}/best_model/word2vec.pickle
- ${community2vec_dir}/RC_${item}/best_model/keyedVectors
outs:
- ${community2vec_dir}/RC_${item}/best_model/tsne.csv
# Note: for simplicity, this just has explicit stages for the kmeans and hierarchical agglomerative models,
# but it could easily be generalized to any clustering model supported in ihop.clustering
kmeans_cluster_models:
foreach: ${months}
do:
desc: Train kmeans clusters from the community2vec models.
cmd: >-
mkdir -p ${annotation_data_dir}/RC_${item}/kmeans_model
&& python -m ihop.clustering ${community2vec_dir}/RC_${item}/best_model/keyedVectors --output_dir ${annotation_data_dir}/RC_${item}/kmeans_model --cluster_params ${kmeans_cluster_params.model_params} --model-name ${item}_kmeans_clusters
deps:
- ${community2vec_dir}/RC_${item}/best_model/keyedVectors
outs:
- ${annotation_data_dir}/RC_${item}/kmeans_model/clusters.csv
- ${annotation_data_dir}/RC_${item}/kmeans_model/sklearn_cluster_model.joblib
- ${annotation_data_dir}/RC_${item}/kmeans_model/parameters.json
metrics:
- ${annotation_data_dir}/RC_${item}/kmeans_model/metrics.json
agglomerative_cluster_models:
foreach: ${months}
do:
desc: Train hierarchical agglomerative clusters from the community2vec models.
cmd: >-
mkdir -p ${annotation_data_dir}/RC_${item}/agglomerative_model
&& python -m ihop.clustering ${community2vec_dir}/RC_${item}/best_model/keyedVectors --output_dir ${annotation_data_dir}/RC_${item}/agglomerative_model --cluster_params ${agglomerative_cluster_params.model_params} --model-name ${item}_agglomerative_clusters --cluster_type agglomerative
deps:
- ${community2vec_dir}/RC_${item}/best_model/keyedVectors
outs:
- ${annotation_data_dir}/RC_${item}/agglomerative_model/clusters.csv
- ${annotation_data_dir}/RC_${item}/agglomerative_model/sklearn_cluster_model.joblib
- ${annotation_data_dir}/RC_${item}/agglomerative_model/parameters.json
metrics:
- ${annotation_data_dir}/RC_${item}/agglomerative_model/metrics.json
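# A hedged sketch of the clustering sections of the DVC params file (key names come from the two
# stages above; the values are hypothetical, and model_params is passed straight through to
# ihop.clustering's --cluster_params option, so its exact format depends on that CLI):
#   kmeans_cluster_params:
#     model_params: '{"n_clusters": 250}'
#   agglomerative_cluster_params:
#     model_params: '{"n_clusters": 250}'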
prep_annotation_data:
foreach: ${months}
do:
desc: Prepare cluster coherence and subreddit intruder annotation tasks from the kmeans and agglomerative cluster models and the subreddit counts.
cmd: |
python -m ihop.annotation_task_export ${annotation_data_dir}/RC_${item}/kmeans_model/clusters.csv --config config.json ${community2vec_dir}/RC_${item}/subreddit_counts.csv --cluster-labeling-csv ${annotation_data_dir}/RC_${item}/kmeans_model/RC_${item}_coherence_task.csv --intruder-csv ${annotation_data_dir}/RC_${item}/kmeans_model/RC_${item}_subreddit_intruder_task.csv
python -m ihop.annotation_task_export ${annotation_data_dir}/RC_${item}/agglomerative_model/clusters.csv --config config.json ${community2vec_dir}/RC_${item}/subreddit_counts.csv --cluster-labeling-csv ${annotation_data_dir}/RC_${item}/agglomerative_model/RC_${item}_coherence_task.csv --intruder-csv ${annotation_data_dir}/RC_${item}/agglomerative_model/RC_${item}_subreddit_intruder_task.csv
deps:
- ${community2vec_dir}/RC_${item}/subreddit_counts.csv
- ${annotation_data_dir}/RC_${item}/kmeans_model/clusters.csv
- ${annotation_data_dir}/RC_${item}/agglomerative_model/clusters.csv
outs:
- ${annotation_data_dir}/RC_${item}/kmeans_model/RC_${item}_coherence_task.csv
- ${annotation_data_dir}/RC_${item}/kmeans_model/RC_${item}_subreddit_intruder_task.csv
- ${annotation_data_dir}/RC_${item}/kmeans_model/RC_${item}_subreddit_intruder_task_answers.csv
- ${annotation_data_dir}/RC_${item}/agglomerative_model/RC_${item}_coherence_task.csv
- ${annotation_data_dir}/RC_${item}/agglomerative_model/RC_${item}_subreddit_intruder_task.csv
- ${annotation_data_dir}/RC_${item}/agglomerative_model/RC_${item}_subreddit_intruder_task_answers.csv
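# The remaining templated variables are top-level entries in the DVC params file (params.yaml by
# default). A hedged sketch with hypothetical paths and month keys; only the key names are taken
# from this file:
#   months: ["2021-04", "2021-05"]
#   comments_dir: data/raw/comments
#   submissions_dir: data/raw/submissions
#   community2vec_dir: data/community2vec
#   annotation_data_dir: data/annotation_tasks
# With those in place, `dvc repro` runs the whole pipeline, and `dvc dag` prints the stage graph
# for a quick sanity check before kicking off the long-running downloads and training.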