Directories updating, add some comments
ichalkiad committed Jul 25, 2024
1 parent 29de5dd commit b37b204
Showing 7 changed files with 64 additions and 77 deletions.
1 change: 0 additions & 1 deletion pymstdncollect/src/db.py
@@ -1,5 +1,4 @@
 import pandas as pd
-import ipdb
 import pathlib
 import pytz
 from bs4 import BeautifulSoup
17 changes: 5 additions & 12 deletions pymstdncollect/src/toots.py
@@ -1,5 +1,4 @@
 import pandas as pd
-import ipdb
 from datetime import datetime
 import time
 import pathlib
@@ -98,10 +97,6 @@ def daily_collection_hashtags_users(dbconn=None, toot_dir=None, hashtag_lists_di
     contains words that are in the topic lists. Most popular hashtags
     (95th percentile of hashtag distribution) are then added to the hashtags lists.
     NOTE: currently for the topics of Climate, COVID-19 and Immigration. For extra topics, it would need to be extended.
-    TODO: modify to work on database instead of toots.json directory
-    Args:
-        toot_dir (_type_): _description_
-        hashtag_lists_dir (bool): _description_
@@ -161,14 +156,14 @@ def daily_collection_hashtags_users(dbconn=None, toot_dir=None, hashtag_lists_di
         tophst = np.percentile(hashtagscntsval, 95, method="higher")
         newtopichashtags = [i for i in hashtagscnts.keys() if hashtagscnts[i] >= tophst]
         print(newtopichashtags)
-        if pathlib.Path("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, hashtagkey)).exists():
-            hash_tmp = pd.read_csv("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, hashtagkey))
+        if pathlib.Path("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey)).exists():
+            hash_tmp = pd.read_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey))
             hash_tmp = hash_tmp.tags.tolist()
             hash_tmp.extend(newtopichashtags)
             hash_tmp = np.unique(hash_tmp).tolist()
-            pd.DataFrame.from_dict({"tags": hash_tmp}).to_csv("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, hashtagkey), index=False)
+            pd.DataFrame.from_dict({"tags": hash_tmp}).to_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey), index=False)
         else:
-            pd.DataFrame.from_dict({"tags": newtopichashtags}).to_csv("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, hashtagkey), index=False)
+            pd.DataFrame.from_dict({"tags": newtopichashtags}).to_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey), index=False)
 
 ###################################
 # data collection for user posts
@@ -484,9 +479,7 @@ def collect_toots_and_tooters_apidirect(dbconn, res, keywords, textprocessor, in
             toottext = toottext.lower()
             if len(keywords) > 0:
                 def contains_kw(x): return x.lower() in toottext
-                found_kws_iterator = filter(contains_kw, keywords)
-                # Note: consider more finegrained, e.g. separation of output directory depending - TODO later?
-                # on which keywords were spotted
+                found_kws_iterator = filter(contains_kw, keywords)
                 if len(list(found_kws_iterator)) > 0:
                     filtered_toots.append(i)
                 else:
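
The `_upd` suffix removal above means each topic's hashtag list is now read from and written back to a single `<topic>_hashtags.csv`. A minimal standalone sketch of the 95th-percentile update rule used here, assuming `hashtagscnts` maps hashtags to their observed counts; the helper name `update_topic_hashtags` is illustrative, not part of the package:

import pathlib
import numpy as np
import pandas as pd

def update_topic_hashtags(hashtagscnts, hashtagkey, hashtag_lists_dir):
    # method="higher" (NumPy >= 1.22) picks an actually observed count as the threshold
    tophst = np.percentile(list(hashtagscnts.values()), 95, method="higher")
    newtopichashtags = [h for h, c in hashtagscnts.items() if c >= tophst]
    csvpath = pathlib.Path("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey))
    if csvpath.exists():
        # merge with the existing list and deduplicate before overwriting
        tags = pd.read_csv(csvpath).tags.tolist()
        tags.extend(newtopichashtags)
        tags = np.unique(tags).tolist()
    else:
        tags = newtopichashtags
    pd.DataFrame.from_dict({"tags": tags}).to_csv(csvpath, index=False)

# e.g. update_topic_hashtags({"#flu": 40, "#vaccine": 3, "#cold": 1}, "epidemics", "/tmp")
# writes /tmp/epidemics_hashtags.csv containing only "#flu" (the 95th-percentile threshold is 40)
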
6 changes: 1 addition & 5 deletions pymstdncollect/src/utils.py
@@ -1,17 +1,13 @@
 import json
 import pandas as pd
-import ipdb
 from datetime import datetime
 import time
 import pathlib
 import pytz
 import jsonlines
 import os
 import numpy as np
 from flashtext import KeywordProcessor
 import logging
 import requests
 import sys
 import pickle
 
 ###################################
@@ -223,7 +219,7 @@ def get_user_id_from_username(username, url, instance_name, auth_dict):
 # keyword lists
 ###################################
 
-def load_keywords_topic_lists(topics=["climatechange", "epidemics", "immigration"], topic_lists_dir="./topiclists_iscpif/"):
+def load_keywords_topic_lists(topics=["epidemics"], topic_lists_dir="./topiclists/"):
 
     keywordsearchers = []
     for topicname in topics:
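
The narrowed defaults above mean only the epidemics keyword list is loaded unless callers pass more topics. A minimal sketch of how such a loader can build one flashtext matcher per topic; the single-column `<topic>.csv` layout is an assumption for illustration, not the package's documented format:

import pandas as pd
from flashtext import KeywordProcessor

def load_topic_keyword_matchers(topics=["epidemics"], topic_lists_dir="./topiclists/"):
    keywordsearchers = []
    for topicname in topics:
        # assumed layout: one keyword per row in <topic>.csv
        kws = pd.read_csv("{}/{}.csv".format(topic_lists_dir, topicname))
        kp = KeywordProcessor()
        kp.add_keywords_from_list(kws.iloc[:, 0].astype(str).tolist())
        keywordsearchers.append(kp)
    return keywordsearchers

# matcher = load_topic_keyword_matchers()[0]
# matcher.extract_keywords("new influenza outbreak reported")  # -> list of matched keywords
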
13 changes: 6 additions & 7 deletions pymstdncollect/user_scripts/daily_collection.py
@@ -14,19 +14,18 @@
 if __name__ == "__main__":
 
     parallel = False
-    with open("/home/ubuntu/mstdncollect/authorisations/auth_dict.json", "r") as f:
-        auth_dict = json.load(f)
+    authdict_fullpath = "./authorisations/auth_dict.json"
+    with open(authdict_fullpath, "r") as f:
+        auth_dict = json.load(f)
 
-    # timestamp = datetime.now(timezone.utc) - timedelta(days=1)
     upperend = datetime.now(timezone.utc)
-    # upperend = upperend - timedelta(days=2) # as per David: collect past 72h-48h intervals so that we have "favorited post" information
     max_id_snowflake = datetime2snowflake(upperend)
     timestamp = upperend - timedelta(days=1)
     min_id_snowflake = datetime2snowflake(timestamp)
     print(max_id_snowflake, min_id_snowflake)
-    # DIR_out = "./mastodon.social_allpublic_apidirect_nofilter/"
-
-    DIR_out = "/mnt2/dailycollects_pymstdn/"
-
+    # Provide full path to output directory
+    DIR_out = "/tmp/"
     pathlib.Path(DIR_out).mkdir(parents=True, exist_ok=True)
     pathlib.Path("{}/logging/".format(DIR_out)).mkdir(parents=True, exist_ok=True)
     logging.basicConfig(filename="{}/logging/logging_{}.txt".format(DIR_out,
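
The `max_id_snowflake`/`min_id_snowflake` bounds convert the collection window into Mastodon's snowflake ID space, where the upper 48 bits of a 64-bit ID encode a millisecond UNIX timestamp. A plausible sketch of `datetime2snowflake` under that assumption; the package's own implementation may differ:

from datetime import datetime, timezone, timedelta

def datetime2snowflake(dt):
    # upper 48 bits: millisecond UNIX timestamp; lower 16 bits: sequence data,
    # zeroed here to produce a timeline boundary ID
    return int(dt.timestamp() * 1000) << 16

upperend = datetime.now(timezone.utc)
print(datetime2snowflake(upperend), datetime2snowflake(upperend - timedelta(days=1)))
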
21 changes: 11 additions & 10 deletions pymstdncollect/user_scripts/daily_postcollection.py
@@ -1,15 +1,20 @@
-import ipdb
 from datetime import datetime, timezone, timedelta
 from pymstdncollect.src.toots import daily_collection_hashtags_users
 from pymstdncollect.src.db import connectTo_weekly_toots_db, execute_create_sql
 
 
 if __name__ == "__main__":
 
 
+    # Data collection period
     mindate = datetime.now(timezone.utc)
-    maxdate = mindate - timedelta(days=7)
-    database = "/mnt2/dailycollects_pymstdn/toots_db_{}_{}.db".format(mindate.strftime("%Y-%m-%d"), maxdate.strftime("%Y-%m-%d"))
+    maxdate = mindate - timedelta(days=1)
+    # Provide full file path of the SQLite database
+    database = "/tmp/toots_db_{}_{}.db".format(mindate.strftime("%Y-%m-%d"), maxdate.strftime("%Y-%m-%d"))
+    # Provide path for toot output in JSON format, else set to None to utilise the database
+    toot_dir = None
+    # Provide paths of directories that contain the hashtags lists that will be used, as well as the topic specific dictionaries
+    hashtag_lists_dir = "/home/ubuntu/PyMstdnCollect/collection_hashtags/"
+    topic_lists_dir = "/home/ubuntu/PyMstdnCollect/topiclists/"
 
     sql_create_toots_table = """ CREATE TABLE IF NOT EXISTS toots (
         globalID text PRIMARY KEY,
@@ -45,11 +50,7 @@
         UNIQUE(globalID, accountglobalID)
         ); """
 
-
     dbconn = connectTo_weekly_toots_db(database)
-    execute_create_sql(dbconn, sql_create_toots_table)
-    toot_dir = "/home/ubuntu/mstdncollect/"
-    hashtag_lists_dir = "/home/ubuntu/mstdncollect/collection_hashtags/"
-    topic_lists_dir = "/home/ubuntu/mstdncollect/"
-    daily_collection_hashtags_users(dbconn=dbconn, toot_dir=None, hashtag_lists_dir=hashtag_lists_dir,
+    execute_create_sql(dbconn, sql_create_toots_table)
+    daily_collection_hashtags_users(dbconn=dbconn, toot_dir=toot_dir, hashtag_lists_dir=hashtag_lists_dir,
                                     topic_lists_dir=topic_lists_dir, dbtablename="toots")
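
`connectTo_weekly_toots_db` and `execute_create_sql` (from pymstdncollect/src/db.py) are used here as thin sqlite3 wrappers. A minimal sketch consistent with how this script calls them, assuming the actual implementations behave similarly:

import sqlite3

def connectTo_weekly_toots_db(db_file):
    # open (or create) the SQLite database file and return a connection, None on failure
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as err:
        print(err)
        return None

def execute_create_sql(conn, create_table_sql):
    # run a single CREATE TABLE IF NOT EXISTS statement
    try:
        conn.cursor().execute(create_table_sql)
    except sqlite3.Error as err:
        print(err)
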
50 changes: 24 additions & 26 deletions pymstdncollect/user_scripts/hashtags_contexts_collection.py
@@ -1,7 +1,6 @@
 import json
 import requests
 import pandas as pd
-import ipdb
 import time
 import pathlib
 import pytz
@@ -21,7 +20,8 @@
 def collect_timeline_hashtag_apidirect(hashtag=None, url=None, local=False, remote=False, only_media=False,
                                        max_id=None, since_id=None, min_id=None, limit=40,
                                        keywords=[], textprocessor=None, savedir="/tmp/",
-                                       instance_name=None, allcollectedhashtags=[], print_tree=False, dbconn=None, auth_dict=None, cutoff_date="2023-12-02"):
+                                       instance_name=None, allcollectedhashtags=[], print_tree=False, dbconn=None,
+                                       auth_dict=None, cutoff_date="2023-12-02"):
     """collect_timeline_hashtag_apidirect
     Collects timelines and conversation data based on
@@ -204,20 +204,23 @@ def collect_timeline_hashtag_apidirect(hashtag=None, url=None, local=False, remo

 if __name__ == "__main__":
 
-    # DEVISE STOPPING RULE FOR HASHTAGS, INFEASIBLE TO CHECK ALL OF THEM
-
     parallel = False
-    with open("/home/ubuntu/mstdncollect/authorisations/auth_dict.json", "r") as f:
+    authdict_fullpath = "./authorisations/auth_dict.json"
+    with open(authdict_fullpath, "r") as f:
         auth_dict = json.load(f)
-    topics = ["climatechange", "epidemics", "immigration"]
-
+    # Topics for which the data collection will be run - the corresponding hashtags and dictionaries should be
+    # in topiclists/ and collection_hashtags/
+    topics = ["epidemics"]
+    # Provide paths of directories that contain the hashtags lists that will be used
+    hashtag_lists_dir = "/home/ubuntu/PyMstdnCollect/collection_hashtags/"
     upperend = datetime.now(timezone.utc)
-    upperend = upperend - timedelta(days=15) # as per David: collect past 72h-48h intervals so that we have "favorited post" information
+    upperend = upperend - timedelta(days=15)
     max_id_snowflake = datetime2snowflake(upperend)
     timestamp = upperend - timedelta(days=7)
     min_id_snowflake = datetime2snowflake(timestamp)
     print(max_id_snowflake, min_id_snowflake)
-    database = "/mnt2/dailycollects_pymstdn/toots_hashtags_{}_{}.db".format(timestamp.strftime("%Y-%m-%d"), upperend.strftime("%Y-%m-%d"))
+    # Provide full path for the output database
+    database = "/tmp/toots_hashtags_{}_{}.db".format(timestamp.strftime("%Y-%m-%d"), upperend.strftime("%Y-%m-%d"))
     sql_create_toots_table = """ CREATE TABLE IF NOT EXISTS toots (
         globalID text PRIMARY KEY,
         id text NOT NULL,
@@ -255,24 +258,18 @@ def collect_timeline_hashtag_apidirect(hashtag=None, url=None, local=False, remo
); """
dbconn = connectTo_weekly_toots_db(database)
execute_create_sql(dbconn, sql_create_toots_table)
##################################################


# database = "/mnt2/toots_hashtags_{}_{}.db".format(timestamp.strftime("%Y-%m-%d"), upperend.strftime("%Y-%m-%d"))
# dbconn = connectTo_weekly_toots_db(database)
hashtag_lists_dir = "/home/ubuntu/mstdncollect/collection_hashtags/"
tree = False
climate_hashtags = pd.read_csv("{}/climate_hashtags_upd.csv".format(hashtag_lists_dir), header=None) # for subsequent runs change to _upd
covid_hashtags = pd.read_csv("{}/epidemics_hashtags_upd.csv".format(hashtag_lists_dir), header=None)
immigration_hashtags = pd.read_csv("{}/immigration_hashtags_upd.csv".format(hashtag_lists_dir), header=None)
climate_hashtags_list = climate_hashtags.values.flatten().tolist()
covid_hashtags_list = covid_hashtags.values.flatten().tolist()
immigration_hashtags_list = immigration_hashtags.values.flatten().tolist()

hashtag_list_all = [climate_hashtags_list, covid_hashtags_list, immigration_hashtags_list]

# Flag to determine printing of conversation tree - useful for debugging and understanding
tree = False
hashtag_list_all = []
hashtag_list_names = topics
for t in topics:
hashtags = pd.read_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, t), header=None)
hashtags_list = hashtags.values.flatten().tolist()
hashtag_list_all.append(hashtags_list)

allcollectedhashtags = []
for hashtaglistidx in range(3):
for hashtaglistidx in range(len(hashtag_list_names)):
hashtaglist = hashtag_list_all[hashtaglistidx]
name = hashtag_list_names[hashtaglistidx]
for hashtag in hashtaglist:
@@ -290,4 +287,5 @@ def collect_timeline_hashtag_apidirect(hashtag=None, url=None, local=False, remo
             if hashdict[i] >= ninthdec and i not in hashtaglist:
                 hashtaglist.append(i)
         print(hashtaglist)
-        pd.DataFrame.from_dict({"hashtags": list(set(hashtaglist))}).to_csv("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, name), index=False, header=False)
+        # Note that hashtag list will be overwritten to include new entries
+        pd.DataFrame.from_dict({"hashtags": list(set(hashtaglist))}).to_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, name), index=False, header=False)
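
`collect_timeline_hashtag_apidirect`'s parameters (local, only_media, max_id, since_id, min_id, limit) mirror Mastodon's public hashtag timeline endpoint, GET /api/v1/timelines/tag/:hashtag. A minimal sketch of fetching one page of that endpoint; the helper name `fetch_hashtag_page` and the omission of authentication are simplifications, not the package's code:

import requests

def fetch_hashtag_page(base_url, hashtag, max_id=None, min_id=None, limit=40, local=False):
    # one page of toots; callers page backwards through the timeline by passing
    # the last toot's id back in as max_id on the next call
    params = {"limit": limit, "local": str(local).lower()}
    if max_id is not None:
        params["max_id"] = max_id
    if min_id is not None:
        params["min_id"] = min_id
    resp = requests.get("{}/api/v1/timelines/tag/{}".format(base_url, hashtag),
                        params=params, timeout=30)
    resp.raise_for_status()
    return resp.json()

# toots = fetch_hashtag_page("https://mastodon.social", "epidemics")
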
33 changes: 17 additions & 16 deletions pymstdncollect/user_scripts/weekly_postcollection.py
@@ -1,4 +1,3 @@
-import ipdb
 import os
 from datetime import datetime, timezone, timedelta
 import pathlib
@@ -33,8 +32,7 @@ def weekly_users_postcollection(sourcedir=None, mindate=None, maxdate=None, dbco
         dbconn (_type_, optional): _description_. Defaults to None.
         outdir (str, optional): _description_. Defaults to "/tmp/".
     """
-    # change head directory structure to toots/year/month/toots.jsonl
 
     if sourcedir is not None:
         years = [f.name for f in os.scandir("{}/toots/".format(sourcedir)) if f.is_dir()]
         months = [f.name for m in years for f in os.scandir("{}/toots/{}/".format(sourcedir, m)) if f.is_dir()]
@@ -50,10 +48,11 @@
         if len(topusers) > 10:
             topactivity = np.percentile(topusers.followers.values, 98, interpolation="higher")
             topusers = topusers.loc[topusers.followers >= topactivity].reset_index(drop=True)
+        if outdir is None:
+            outdir = "/tmp/"
         pathlib.Path("{}/criticalusers/".format(outdir)).mkdir(parents=True, exist_ok=True)
         topusers.to_csv("{}/criticalusers/users_{}_{}.csv".format(outdir, mindate.strftime("%d%m%Y"), maxdate.strftime("%d%m%Y")))
         print(topusers)
 
-
         doneusers = []
         for i, row in topusers.iterrows():
             try:
@@ -100,7 +99,6 @@
         except:
             print(row)
 
-
 def update_relevant_toots(dbconn, data, keywordsearchers, extra_keywords, auth_dict, mindate, maxdate):
 
     tootwords = data["toottext"].split()
Expand Down Expand Up @@ -213,15 +211,18 @@ def weekly_toots_postcollection(sourcedir=None, mindate=None, maxdate=None, dbco

if __name__ == "__main__":

with open("/home/ubuntu/mstdncollect/authorisations/auth_dict.json", "r") as f:
auth_dict = json.load(f)

database = "/mnt2/dailycollects_pymstdn/tootsweekly_db.db"

authdict_fullpath = "./authorisations/auth_dict.json"
with open(authdict_fullpath, "r") as f:
auth_dict = json.load(f)
# Provide full path for the output database
database = "/tmp/toots_weekly_db_{}.db".format(datetime.now().astimezone(pytz.utc).strftime("%Y-%m-%d"))

dbconn = connectTo_weekly_toots_db(database)
toot_dir = None #"/mnt2/mstdndata/"
hashtag_lists_dir = "/home/ubuntu/mstdncollect/collection_hashtags/"
topic_lists_dir = "/home/ubuntu/mstdncollect/topiclists_iscpif/"
# Provide path for toot output in JSON format, else set to None to utilise the database
toot_dir = None
# Provide paths of directories that contain the hashtags lists that will be used, as well as the topic specific dictionaries
hashtag_lists_dir = "/home/ubuntu/PyMstdnCollect/collection_hashtags/"
topic_lists_dir = "/home/ubuntu/PyMstdnCollect/topiclists/"

"""
NOTE: the dictionaries should be stored in topic_lists_dir is csv format
Expand All @@ -230,9 +231,9 @@ def weekly_toots_postcollection(sourcedir=None, mindate=None, maxdate=None, dbco
Special keywords can be stored in extra_kw.csv in topic_lists_dir.
"""
topics = ["climatechange", "epidemics", "immigration"]
topics = ["epidemics"]
maxdate = datetime.now(timezone.utc)
mindate = maxdate - timedelta(days=10)
mindate = maxdate - timedelta(days=7)
daily_collection_hashtags_users(dbconn=dbconn, toot_dir=toot_dir, hashtag_lists_dir=hashtag_lists_dir, topics=topics,
topic_lists_dir=topic_lists_dir, dbtablename="toots")
weekly_users_postcollection(sourcedir=None, mindate=mindate, maxdate=maxdate, dbconn=dbconn,
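
The `if outdir is None` guard added in weekly_users_postcollection sits in front of a 98th-percentile follower filter. A standalone sketch of that selection rule; the helper name `select_top_users` is illustrative, and note that NumPy's interpolation= keyword, used in the original line, was renamed method= in NumPy 1.22:

import numpy as np
import pandas as pd

def select_top_users(topusers):
    # keep only accounts at or above the 98th percentile of follower counts;
    # the >10 guard avoids thresholding tiny samples
    if len(topusers) > 10:
        topactivity = np.percentile(topusers.followers.values, 98, method="higher")
        topusers = topusers.loc[topusers.followers >= topactivity].reset_index(drop=True)
    return topusers

# users = pd.DataFrame({"followers": [5, 12, 7, 40000, 90, 3, 61, 8, 15, 2, 33]})
# select_top_users(users)  # -> only the 40000-follower account remains
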
