Directories updating, add some comments
ichalkiad committed Jul 25, 2024
1 parent 29de5dd commit b37b204
Showing 7 changed files with 64 additions and 77 deletions.
1 change: 0 additions & 1 deletion pymstdncollect/src/db.py
@@ -1,5 +1,4 @@
 import pandas as pd
-import ipdb
 import pathlib
 import pytz
 from bs4 import BeautifulSoup
17 changes: 5 additions & 12 deletions pymstdncollect/src/toots.py
@@ -1,5 +1,4 @@
 import pandas as pd
-import ipdb
 from datetime import datetime
 import time
 import pathlib
@@ -98,10 +97,6 @@ def daily_collection_hashtags_users(dbconn=None, toot_dir=None, hashtag_lists_di
     contains words that are in the topic lists. Most popular hashtags
     (95th percentile of hashtag distribution) are then added to the hashtags lists.
     NOTE: currently for the topics of Climate, COVID-19 and Immigration. For extra topics, it would need to be extended.
-    TODO: modify to work on database instead of toots.json directory
-    Args:
-        toot_dir (_type_): _description_
-        hashtag_lists_dir (bool): _description_
@@ -161,14 +156,14 @@ def daily_collection_hashtags_users(dbconn=None, toot_dir=None, hashtag_lists_di
         tophst = np.percentile(hashtagscntsval, 95, method="higher")
         newtopichashtags = [i for i in hashtagscnts.keys() if hashtagscnts[i] >= tophst]
         print(newtopichashtags)
-        if pathlib.Path("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, hashtagkey)).exists():
-            hash_tmp = pd.read_csv("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, hashtagkey))
+        if pathlib.Path("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey)).exists():
+            hash_tmp = pd.read_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey))
             hash_tmp = hash_tmp.tags.tolist()
             hash_tmp.extend(newtopichashtags)
             hash_tmp = np.unique(hash_tmp).tolist()
-            pd.DataFrame.from_dict({"tags": hash_tmp}).to_csv("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, hashtagkey), index=False)
+            pd.DataFrame.from_dict({"tags": hash_tmp}).to_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey), index=False)
         else:
-            pd.DataFrame.from_dict({"tags": newtopichashtags}).to_csv("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, hashtagkey), index=False)
+            pd.DataFrame.from_dict({"tags": newtopichashtags}).to_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey), index=False)
 
 ###################################
 # data collection for user posts
@@ -484,9 +479,7 @@ def collect_toots_and_tooters_apidirect(dbconn, res, keywords, textprocessor, in
             toottext = toottext.lower()
             if len(keywords) > 0:
                 def contains_kw(x): return x.lower() in toottext
-                found_kws_iterator = filter(contains_kw, keywords)
-                # Note: consider more finegrained, e.g. separation of output directory depending - TODO later?
-                # on which keywords were spotted
+                found_kws_iterator = filter(contains_kw, keywords)
                 if len(list(found_kws_iterator)) > 0:
                     filtered_toots.append(i)
                 else:
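
The `_upd` suffix removal above means each topic's hashtag list is now read from and written back to a single `<topic>_hashtags.csv`. A minimal standalone sketch of the 95th-percentile update rule used here, assuming `hashtagscnts` maps hashtags to their observed counts; the helper name `update_topic_hashtags` is illustrative, not part of the package:

import pathlib
import numpy as np
import pandas as pd

def update_topic_hashtags(hashtagscnts, hashtagkey, hashtag_lists_dir):
    # method="higher" (NumPy >= 1.22) picks an actually observed count as the threshold
    tophst = np.percentile(list(hashtagscnts.values()), 95, method="higher")
    newtopichashtags = [h for h, c in hashtagscnts.items() if c >= tophst]
    csvpath = pathlib.Path("{}/{}_hashtags.csv".format(hashtag_lists_dir, hashtagkey))
    if csvpath.exists():
        # merge with the existing list and deduplicate before overwriting
        tags = pd.read_csv(csvpath).tags.tolist()
        tags.extend(newtopichashtags)
        tags = np.unique(tags).tolist()
    else:
        tags = newtopichashtags
    pd.DataFrame.from_dict({"tags": tags}).to_csv(csvpath, index=False)

# e.g. update_topic_hashtags({"#flu": 40, "#vaccine": 3, "#cold": 1}, "epidemics", "/tmp")
# writes /tmp/epidemics_hashtags.csv containing only "#flu" (the 95th-percentile threshold is 40)
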
6 changes: 1 addition & 5 deletions pymstdncollect/src/utils.py
@@ -1,17 +1,13 @@
 import json
 import pandas as pd
-import ipdb
 from datetime import datetime
 import time
 import pathlib
 import pytz
 import jsonlines
 import os
 import numpy as np
 from flashtext import KeywordProcessor
 import logging
 import requests
 import sys
 import pickle
 
 ###################################
@@ -223,7 +219,7 @@ def get_user_id_from_username(username, url, instance_name, auth_dict):
 # keyword lists
 ###################################
 
-def load_keywords_topic_lists(topics=["climatechange", "epidemics", "immigration"], topic_lists_dir="./topiclists_iscpif/"):
+def load_keywords_topic_lists(topics=["epidemics"], topic_lists_dir="./topiclists/"):
 
     keywordsearchers = []
     for topicname in topics:
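
The narrowed defaults above mean only the epidemics keyword list is loaded unless callers pass more topics. A minimal sketch of how such a loader can build one flashtext matcher per topic; the single-column `<topic>.csv` layout is an assumption for illustration, not the package's documented format:

import pandas as pd
from flashtext import KeywordProcessor

def load_topic_keyword_matchers(topics=["epidemics"], topic_lists_dir="./topiclists/"):
    keywordsearchers = []
    for topicname in topics:
        # assumed layout: one keyword per row in <topic>.csv
        kws = pd.read_csv("{}/{}.csv".format(topic_lists_dir, topicname))
        kp = KeywordProcessor()
        kp.add_keywords_from_list(kws.iloc[:, 0].astype(str).tolist())
        keywordsearchers.append(kp)
    return keywordsearchers

# matcher = load_topic_keyword_matchers()[0]
# matcher.extract_keywords("new influenza outbreak reported")  # -> list of matched keywords
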
13 changes: 6 additions & 7 deletions pymstdncollect/user_scripts/daily_collection.py
@@ -14,19 +14,18 @@
 if __name__ == "__main__":
 
     parallel = False
-    with open("/home/ubuntu/mstdncollect/authorisations/auth_dict.json", "r") as f:
-        auth_dict = json.load(f)
+    authdict_fullpath = "./authorisations/auth_dict.json"
+    with open(authdict_fullpath, "r") as f:
+        auth_dict = json.load(f)
 
-    # timestamp = datetime.now(timezone.utc) - timedelta(days=1)
     upperend = datetime.now(timezone.utc)
-    # upperend = upperend - timedelta(days=2) # as per David: collect past 72h-48h intervals so that we have "favorited post" information
     max_id_snowflake = datetime2snowflake(upperend)
     timestamp = upperend - timedelta(days=1)
     min_id_snowflake = datetime2snowflake(timestamp)
     print(max_id_snowflake, min_id_snowflake)
-    # DIR_out = "./mastodon.social_allpublic_apidirect_nofilter/"
-
-    DIR_out = "/mnt2/dailycollects_pymstdn/"
-
+    # Provide full path to output directory
+    DIR_out = "/tmp/"
     pathlib.Path(DIR_out).mkdir(parents=True, exist_ok=True)
     pathlib.Path("{}/logging/".format(DIR_out)).mkdir(parents=True, exist_ok=True)
     logging.basicConfig(filename="{}/logging/logging_{}.txt".format(DIR_out,
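
The `max_id_snowflake`/`min_id_snowflake` bounds convert the collection window into Mastodon's snowflake ID space, where the upper 48 bits of a 64-bit ID encode a millisecond UNIX timestamp. A plausible sketch of `datetime2snowflake` under that assumption; the package's own implementation may differ:

from datetime import datetime, timezone, timedelta

def datetime2snowflake(dt):
    # upper 48 bits: millisecond UNIX timestamp; lower 16 bits: sequence data,
    # zeroed here to produce a timeline boundary ID
    return int(dt.timestamp() * 1000) << 16

upperend = datetime.now(timezone.utc)
print(datetime2snowflake(upperend), datetime2snowflake(upperend - timedelta(days=1)))
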
21 changes: 11 additions & 10 deletions pymstdncollect/user_scripts/daily_postcollection.py
@@ -1,15 +1,20 @@
-import ipdb
 from datetime import datetime, timezone, timedelta
 from pymstdncollect.src.toots import daily_collection_hashtags_users
 from pymstdncollect.src.db import connectTo_weekly_toots_db, execute_create_sql
 
 
 if __name__ == "__main__":
 
 
+    # Data collection period
     mindate = datetime.now(timezone.utc)
-    maxdate = mindate - timedelta(days=7)
-    database = "/mnt2/dailycollects_pymstdn/toots_db_{}_{}.db".format(mindate.strftime("%Y-%m-%d"), maxdate.strftime("%Y-%m-%d"))
+    maxdate = mindate - timedelta(days=1)
+    # Provide full file path of the SQLite database
+    database = "/tmp/toots_db_{}_{}.db".format(mindate.strftime("%Y-%m-%d"), maxdate.strftime("%Y-%m-%d"))
+    # Provide path for toot output in JSON format, else set to None to utilise the database
+    toot_dir = None
+    # Provide paths of directories that contain the hashtags lists that will be used, as well as the topic specific dictionaries
+    hashtag_lists_dir = "/home/ubuntu/PyMstdnCollect/collection_hashtags/"
+    topic_lists_dir = "/home/ubuntu/PyMstdnCollect/topiclists/"
 
     sql_create_toots_table = """ CREATE TABLE IF NOT EXISTS toots (
         globalID text PRIMARY KEY,
@@ -45,11 +50,7 @@
         UNIQUE(globalID, accountglobalID)
         ); """
 
-
     dbconn = connectTo_weekly_toots_db(database)
-    execute_create_sql(dbconn, sql_create_toots_table)
-    toot_dir = "/home/ubuntu/mstdncollect/"
-    hashtag_lists_dir = "/home/ubuntu/mstdncollect/collection_hashtags/"
-    topic_lists_dir = "/home/ubuntu/mstdncollect/"
-    daily_collection_hashtags_users(dbconn=dbconn, toot_dir=None, hashtag_lists_dir=hashtag_lists_dir,
+    execute_create_sql(dbconn, sql_create_toots_table)
+    daily_collection_hashtags_users(dbconn=dbconn, toot_dir=toot_dir, hashtag_lists_dir=hashtag_lists_dir,
                                     topic_lists_dir=topic_lists_dir, dbtablename="toots")
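
`connectTo_weekly_toots_db` and `execute_create_sql` (from pymstdncollect/src/db.py) are used here as thin sqlite3 wrappers. A minimal sketch consistent with how this script calls them, assuming the actual implementations behave similarly:

import sqlite3

def connectTo_weekly_toots_db(db_file):
    # open (or create) the SQLite database file and return a connection, None on failure
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as err:
        print(err)
        return None

def execute_create_sql(conn, create_table_sql):
    # run a single CREATE TABLE IF NOT EXISTS statement
    try:
        conn.cursor().execute(create_table_sql)
    except sqlite3.Error as err:
        print(err)
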
50 changes: 24 additions & 26 deletions pymstdncollect/user_scripts/hashtags_contexts_collection.py
@@ -1,7 +1,6 @@
 import json
 import requests
 import pandas as pd
-import ipdb
 import time
 import pathlib
 import pytz
@@ -21,7 +20,8 @@
 def collect_timeline_hashtag_apidirect(hashtag=None, url=None, local=False, remote=False, only_media=False,
                                        max_id=None, since_id=None, min_id=None, limit=40,
                                        keywords=[], textprocessor=None, savedir="/tmp/",
-                                       instance_name=None, allcollectedhashtags=[], print_tree=False, dbconn=None, auth_dict=None, cutoff_date="2023-12-02"):
+                                       instance_name=None, allcollectedhashtags=[], print_tree=False, dbconn=None,
+                                       auth_dict=None, cutoff_date="2023-12-02"):
     """collect_timeline_hashtag_apidirect
     Collects timelines and conversation data based on
@@ -204,20 +204,23 @@ def collect_timeline_hashtag_apidirect(hashtag=None, url=None, local=False, remo

 if __name__ == "__main__":
 
-    # DEVISE STOPPING RULE FOR HASHTAGS, INFEASIBLE TO CHECK ALL OF THEM
-
     parallel = False
-    with open("/home/ubuntu/mstdncollect/authorisations/auth_dict.json", "r") as f:
+    authdict_fullpath = "./authorisations/auth_dict.json"
+    with open(authdict_fullpath, "r") as f:
         auth_dict = json.load(f)
-    topics = ["climatechange", "epidemics", "immigration"]
-
+    # Topics for which the data collection will be run - the corresponding hashtags and dictionaries should be
+    # in topiclists/ and collection_hashtags/
+    topics = ["epidemics"]
+    # Provide paths of directories that contain the hashtags lists that will be used
+    hashtag_lists_dir = "/home/ubuntu/PyMstdnCollect/collection_hashtags/"
     upperend = datetime.now(timezone.utc)
-    upperend = upperend - timedelta(days=15) # as per David: collect past 72h-48h intervals so that we have "favorited post" information
+    upperend = upperend - timedelta(days=15)
     max_id_snowflake = datetime2snowflake(upperend)
     timestamp = upperend - timedelta(days=7)
     min_id_snowflake = datetime2snowflake(timestamp)
     print(max_id_snowflake, min_id_snowflake)
-    database = "/mnt2/dailycollects_pymstdn/toots_hashtags_{}_{}.db".format(timestamp.strftime("%Y-%m-%d"), upperend.strftime("%Y-%m-%d"))
+    # Provide full path for the output database
+    database = "/tmp/toots_hashtags_{}_{}.db".format(timestamp.strftime("%Y-%m-%d"), upperend.strftime("%Y-%m-%d"))
     sql_create_toots_table = """ CREATE TABLE IF NOT EXISTS toots (
         globalID text PRIMARY KEY,
         id text NOT NULL,
@@ -255,24 +258,18 @@ def collect_timeline_hashtag_apidirect(hashtag=None, url=None, local=False, remo
); """
dbconn = connectTo_weekly_toots_db(database)
execute_create_sql(dbconn, sql_create_toots_table)
##################################################


# database = "/mnt2/toots_hashtags_{}_{}.db".format(timestamp.strftime("%Y-%m-%d"), upperend.strftime("%Y-%m-%d"))
# dbconn = connectTo_weekly_toots_db(database)
hashtag_lists_dir = "/home/ubuntu/mstdncollect/collection_hashtags/"
tree = False
climate_hashtags = pd.read_csv("{}/climate_hashtags_upd.csv".format(hashtag_lists_dir), header=None) # for subsequent runs change to _upd
covid_hashtags = pd.read_csv("{}/epidemics_hashtags_upd.csv".format(hashtag_lists_dir), header=None)
immigration_hashtags = pd.read_csv("{}/immigration_hashtags_upd.csv".format(hashtag_lists_dir), header=None)
climate_hashtags_list = climate_hashtags.values.flatten().tolist()
covid_hashtags_list = covid_hashtags.values.flatten().tolist()
immigration_hashtags_list = immigration_hashtags.values.flatten().tolist()

hashtag_list_all = [climate_hashtags_list, covid_hashtags_list, immigration_hashtags_list]

# Flag to determine printing of conversation tree - useful for debugging and understanding
tree = False
hashtag_list_all = []
hashtag_list_names = topics
for t in topics:
hashtags = pd.read_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, t), header=None)
hashtags_list = hashtags.values.flatten().tolist()
hashtag_list_all.append(hashtags_list)

allcollectedhashtags = []
for hashtaglistidx in range(3):
for hashtaglistidx in range(len(hashtag_list_names)):
hashtaglist = hashtag_list_all[hashtaglistidx]
name = hashtag_list_names[hashtaglistidx]
for hashtag in hashtaglist:
@@ -290,4 +287,5 @@ def collect_timeline_hashtag_apidirect(hashtag=None, url=None, local=False, remo
             if hashdict[i] >= ninthdec and i not in hashtaglist:
                 hashtaglist.append(i)
         print(hashtaglist)
-        pd.DataFrame.from_dict({"hashtags": list(set(hashtaglist))}).to_csv("{}/{}_hashtags_upd.csv".format(hashtag_lists_dir, name), index=False, header=False)
+        # Note that hashtag list will be overwritten to include new entries
+        pd.DataFrame.from_dict({"hashtags": list(set(hashtaglist))}).to_csv("{}/{}_hashtags.csv".format(hashtag_lists_dir, name), index=False, header=False)
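
`collect_timeline_hashtag_apidirect`'s parameters (local, only_media, max_id, since_id, min_id, limit) mirror Mastodon's public hashtag timeline endpoint, GET /api/v1/timelines/tag/:hashtag. A minimal sketch of fetching one page of that endpoint; the helper name `fetch_hashtag_page` and the omission of authentication are simplifications, not the package's code:

import requests

def fetch_hashtag_page(base_url, hashtag, max_id=None, min_id=None, limit=40, local=False):
    # one page of toots; callers page backwards through the timeline by passing
    # the last toot's id back in as max_id on the next call
    params = {"limit": limit, "local": str(local).lower()}
    if max_id is not None:
        params["max_id"] = max_id
    if min_id is not None:
        params["min_id"] = min_id
    resp = requests.get("{}/api/v1/timelines/tag/{}".format(base_url, hashtag),
                        params=params, timeout=30)
    resp.raise_for_status()
    return resp.json()

# toots = fetch_hashtag_page("https://mastodon.social", "epidemics")
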
33 changes: 17 additions & 16 deletions pymstdncollect/user_scripts/weekly_postcollection.py
@@ -1,4 +1,3 @@
-import ipdb
 import os
 from datetime import datetime, timezone, timedelta
 import pathlib
@@ -33,8 +32,7 @@ def weekly_users_postcollection(sourcedir=None, mindate=None, maxdate=None, dbco
         dbconn (_type_, optional): _description_. Defaults to None.
         outdir (str, optional): _description_. Defaults to "/tmp/".
     """
-    # change head directory structure to toots/year/month/toots.jsonl
 
     if sourcedir is not None:
         years = [f.name for f in os.scandir("{}/toots/".format(sourcedir)) if f.is_dir()]
         months = [f.name for m in years for f in os.scandir("{}/toots/{}/".format(sourcedir, m)) if f.is_dir()]
@@ -50,10 +48,11 @@
         if len(topusers) > 10:
             topactivity = np.percentile(topusers.followers.values, 98, interpolation="higher")
             topusers = topusers.loc[topusers.followers >= topactivity].reset_index(drop=True)
+        if outdir is None:
+            outdir = "/tmp/"
         pathlib.Path("{}/criticalusers/".format(outdir)).mkdir(parents=True, exist_ok=True)
         topusers.to_csv("{}/criticalusers/users_{}_{}.csv".format(outdir, mindate.strftime("%d%m%Y"), maxdate.strftime("%d%m%Y")))
         print(topusers)
 
-
         doneusers = []
         for i, row in topusers.iterrows():
             try:
@@ -100,7 +99,6 @@
         except:
             print(row)
 
-
 def update_relevant_toots(dbconn, data, keywordsearchers, extra_keywords, auth_dict, mindate, maxdate):
 
     tootwords = data["toottext"].split()
Expand Down Expand Up @@ -213,15 +211,18 @@ def weekly_toots_postcollection(sourcedir=None, mindate=None, maxdate=None, dbco

if __name__ == "__main__":

with open("/home/ubuntu/mstdncollect/authorisations/auth_dict.json", "r") as f:
auth_dict = json.load(f)

database = "/mnt2/dailycollects_pymstdn/tootsweekly_db.db"

authdict_fullpath = "./authorisations/auth_dict.json"
with open(authdict_fullpath, "r") as f:
auth_dict = json.load(f)
# Provide full path for the output database
database = "/tmp/toots_weekly_db_{}.db".format(datetime.now().astimezone(pytz.utc).strftime("%Y-%m-%d"))

dbconn = connectTo_weekly_toots_db(database)
toot_dir = None #"/mnt2/mstdndata/"
hashtag_lists_dir = "/home/ubuntu/mstdncollect/collection_hashtags/"
topic_lists_dir = "/home/ubuntu/mstdncollect/topiclists_iscpif/"
# Provide path for toot output in JSON format, else set to None to utilise the database
toot_dir = None
# Provide paths of directories that contain the hashtags lists that will be used, as well as the topic specific dictionaries
hashtag_lists_dir = "/home/ubuntu/PyMstdnCollect/collection_hashtags/"
topic_lists_dir = "/home/ubuntu/PyMstdnCollect/topiclists/"

"""
NOTE: the dictionaries should be stored in topic_lists_dir is csv format
Expand All @@ -230,9 +231,9 @@ def weekly_toots_postcollection(sourcedir=None, mindate=None, maxdate=None, dbco
Special keywords can be stored in extra_kw.csv in topic_lists_dir.
"""
topics = ["climatechange", "epidemics", "immigration"]
topics = ["epidemics"]
maxdate = datetime.now(timezone.utc)
mindate = maxdate - timedelta(days=10)
mindate = maxdate - timedelta(days=7)
daily_collection_hashtags_users(dbconn=dbconn, toot_dir=toot_dir, hashtag_lists_dir=hashtag_lists_dir, topics=topics,
topic_lists_dir=topic_lists_dir, dbtablename="toots")
weekly_users_postcollection(sourcedir=None, mindate=mindate, maxdate=maxdate, dbconn=dbconn,
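
The `if outdir is None` guard added in weekly_users_postcollection sits in front of a 98th-percentile follower filter. A standalone sketch of that selection rule; the helper name `select_top_users` is illustrative, and note that NumPy's interpolation= keyword, used in the original line, was renamed method= in NumPy 1.22:

import numpy as np
import pandas as pd

def select_top_users(topusers):
    # keep only accounts at or above the 98th percentile of follower counts;
    # the >10 guard avoids thresholding tiny samples
    if len(topusers) > 10:
        topactivity = np.percentile(topusers.followers.values, 98, method="higher")
        topusers = topusers.loc[topusers.followers >= topactivity].reset_index(drop=True)
    return topusers

# users = pd.DataFrame({"followers": [5, 12, 7, 40000, 90, 3, 61, 8, 15, 2, 33]})
# select_top_users(users)  # -> only the 40000-follower account remains
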
