-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews_twitter_similarity_proposed.py
155 lines (120 loc) · 5.48 KB
/
news_twitter_similarity_proposed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import configparser
from typing import Dict, List, Tuple
import tweepy
import openai
# Runtime configuration.
# Every setting below is read once, at import time, from `config.ini`
# (resolved relative to the current working directory).
# NOTE: ConfigParser.read() does not raise when the file is missing; in that
# case the first config.get() below fails with NoSectionError instead.
config = configparser.ConfigParser()
config.read('config.ini')
# Twitter credentials from the [TWITTER] section; post_tweet() passes them
# to tweepy.Client.
API_KEY = config.get('TWITTER', 'API_KEY')
API_SECRET_KEY = config.get('TWITTER', 'API_SECRET_KEY')
ACCESS_TOKEN = config.get('TWITTER', 'ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = config.get('TWITTER', 'ACCESS_TOKEN_SECRET')
# OpenAI key from the [OPENAI] section; set globally on the openai module so
# generate_engaging_tweet() can call the API without passing it explicitly.
OPENAI_API_KEY = config.get('OPENAI', 'OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY
# Name handed to SentenceTransformer(...) in main().
MODEL_NAME = config.get('MODEL', 'MODEL_NAME')
# Selection limits from the [TWITTER_THRESHOLDS] section:
#   SIMILARITY_THRESHOLD - a pair must score strictly above this to be kept.
#   TOP_N_ARTICLES       - maximum number of pairs kept after sorting by score.
SIMILARITY_THRESHOLD = config.getfloat('TWITTER_THRESHOLDS', 'SIMILARITY_THRESHOLD')
TOP_N_ARTICLES = config.getint('TWITTER_THRESHOLDS', 'TOP_N_ARTICLES')
def load_data(filepath: str) -> List[Tuple]:
    """Read the pickled article summaries stored at ``filepath``.

    Prints how many entries were loaded and returns the unpickled object
    unchanged.

    NOTE: unpickling can execute arbitrary code, so only point this at cache
    files written by a trusted process.
    """
    with open(filepath, mode='rb') as cache_file:
        loaded = pickle.load(cache_file)

    count = len(loaded)
    print(f"Loaded {count} summaries from {filepath}")
    return loaded
def generate_embeddings(summaries: List[Tuple], model: SentenceTransformer) -> np.ndarray:
    """Encode every summary and return one unit-length embedding per row.

    The text encoded for a record is its field 0, a single space, and the
    first 500 characters of its field 2 (the headline and summary, going by
    how main() unpacks these tuples).

    Args:
        summaries: records indexable at positions 0 and 2 with string values.
        model: object whose ``encode(texts, convert_to_tensor=True)`` result
            supports ``.cpu().numpy()``.

    Returns:
        The encoded vectors with each row divided by its own L2 norm.
    """
    texts = []
    for record in summaries:
        lead, body = record[0], record[2]
        texts.append(lead + ' ' + body[:500])

    encoded = model.encode(texts, convert_to_tensor=True)
    vectors = encoded.cpu().numpy()

    # Scaling rows to length 1 makes a plain dot product between two rows
    # equal to their cosine similarity.
    row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / row_norms
def generate_similarity_matrix(normalized_embeddings: np.ndarray) -> np.ndarray:
    """Build the pairwise dot-product matrix of the embedding rows.

    Entry ``[i, j]`` is the dot product of rows ``i`` and ``j``; for
    unit-length rows that is their cosine similarity. The diagonal is
    overwritten with -1 so a row never counts as similar to itself.
    """
    transposed = normalized_embeddings.T
    pairwise = normalized_embeddings.dot(transposed)
    # Self-similarity is always the maximum; mask it out.
    np.fill_diagonal(pairwise, -1)
    return pairwise
def get_top_articles(similarity_matrix: np.ndarray, summaries: List[Tuple], threshold: float, top_n: int) -> List[Tuple]:
    """Return the highest-scoring article pairs whose similarity exceeds ``threshold``.

    Every matrix entry strictly greater than ``threshold`` is a candidate
    ``(row, col)`` pair. Candidates are ordered by score, highest first, and
    the first ``top_n`` are returned as ``(summaries[row], summaries[col])``.
    For a symmetric matrix each pair therefore shows up in both orders.

    Args:
        similarity_matrix: 2-D array of pairwise scores.
        summaries: records indexed by the matrix's row/column positions.
        threshold: exclusive lower bound on the score of a kept pair.
        top_n: maximum number of pairs to return; must not be negative.

    Returns:
        At most ``top_n`` pairs of summary records, best score first.

    Raises:
        ValueError: if ``top_n`` is negative or no entry exceeds ``threshold``.
    """
    if top_n < 0:
        # A negative slice bound would silently drop the *worst* pairs
        # instead of limiting the result, so reject it explicitly.
        raise ValueError("top_n must not be negative")

    row_indices, col_indices = np.where(similarity_matrix > threshold)
    # np.where returns index arrays of equal length, so one check suffices.
    if row_indices.size == 0:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError("No pair of articles have similarity above the threshold")

    scores = similarity_matrix[row_indices, col_indices]
    best_first = np.argsort(scores)[::-1][:top_n]
    return [(summaries[row_indices[k]], summaries[col_indices[k]]) for k in best_first]
def generate_top_articles_by_category(top_articles: List[Tuple], max_categories: int = 5) -> Dict[str, Tuple]:
    """Pick the first article seen for each category, up to ``max_categories``.

    Only the first article of every pair is considered. Pairs are scanned in
    the given order, so the article kept for a category is the one from the
    earliest pair in ``top_articles``.

    Args:
        top_articles: pairs of article records; each first record must be a
            6-field tuple whose second field is the category.
        max_categories: stop once this many distinct categories have been
            collected. Defaults to 5. Zero or a negative value yields an
            empty result.

    Returns:
        Mapping of category to article record, in order of first appearance.
    """
    top_articles_by_category: Dict[str, Tuple] = {}
    if max_categories <= 0:
        return top_articles_by_category

    for article1, _ in top_articles:
        _, category, _, _, _, _ = article1
        if category in top_articles_by_category:
            continue
        top_articles_by_category[category] = article1
        if len(top_articles_by_category) >= max_categories:
            break
    return top_articles_by_category
def generate_engaging_tweet(headline: str, summary: str, url: str) -> str:
    """Ask the OpenAI chat API to turn an article into tweet text.

    Args:
        headline: article headline, inserted into the user prompt.
        summary: article summary, inserted into the user prompt.
        url: accepted for interface compatibility but not used; the prompt
            always links to NewsPlanetAi.com.

    Returns:
        The content of the first returned choice. If it begins with a double
        quote, double quotes are stripped from both ends.
    """
    system_prompt = "You are a professional news agent, you take news headlines and convert them to tweets to be published ASAP. Transform the following information into an engaging tweet and link to NewsPlanetAi.com: THE ENTIRE TWEET MUST BE LESS THAN 200 CHARACTERS"
    user_prompt = f"Please summarize and turn this article into a tweet, that MUST be less than 200 characters long, including the hashtags:\nHeadline: {headline}\nSummary: {summary}\nURL: NewsPlanetAi.com"

    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.8,
        max_tokens=60,
    )
    text = completion['choices'][0]['message']['content']

    # Unwrap only when the reply opens with a quote; strip() then removes
    # quote characters from both ends.
    return text.strip('"') if text.startswith('"') else text
def post_tweet(tweet: str):
    """Post ``tweet`` to Twitter after an interactive confirmation.

    The user is asked on stdin; any answer other than "yes" (compared
    case-insensitively) cancels the post. On confirmation a tweepy client is
    built from the module-level credentials and the tweet is created.
    """
    answer = input("Do you want to tweet this? (yes/no): ")
    if answer.lower() == "yes":
        credentials = {
            "consumer_key": API_KEY,
            "consumer_secret": API_SECRET_KEY,
            "access_token": ACCESS_TOKEN,
            "access_token_secret": ACCESS_TOKEN_SECRET,
        }
        twitter = tweepy.Client(**credentials)
        twitter.create_tweet(text=tweet)
        print("Tweet posted successfully")
    else:
        print("Tweet not posted.")
def _prompt_for_article_choice(article_count: int) -> int:
    """Ask until the user enters a number in 1..article_count; return it zero-based."""
    while True:
        raw = input("Enter the number of the article you want to choose: ")
        try:
            number = int(raw)
        except ValueError:
            print(f"Please enter a whole number between 1 and {article_count}.")
            continue
        # Reject 0 and negatives explicitly: after subtracting 1 they would
        # index from the end of the list and silently pick the wrong article.
        if 1 <= number <= article_count:
            return number - 1
        print(f"Please enter a whole number between 1 and {article_count}.")


def main():
    """Run the interactive pipeline: rank similar articles, pick one, tweet it.

    Steps: load the cached summaries, embed them, build the similarity
    matrix, take the top pairs above the configured threshold, reduce them
    to one article per category, let the user choose an article, generate
    tweet text for it and post it after confirmation.
    """
    # Load and preprocess data
    print("Loading and preprocessing data")
    summaries = load_data('cache/module_summaries.p')

    # Load model
    print("Loading model")
    model = SentenceTransformer(MODEL_NAME)

    # Generate embedding
    print("Generating embeddings")
    normalized_embeddings = generate_embeddings(summaries, model)

    # Generate similarity matrix
    print("Generating similarity matrix")
    similarity_matrix = generate_similarity_matrix(normalized_embeddings)

    # Get top articles
    print("Getting top articles")
    top_articles = get_top_articles(similarity_matrix, summaries, SIMILARITY_THRESHOLD, TOP_N_ARTICLES)

    # Get top articles by category
    print("Getting top articles by category")
    top_articles_by_category = generate_top_articles_by_category(top_articles)
    articles_list = list(top_articles_by_category.values())
    if not articles_list:
        # Nothing to offer (e.g. TOP_N_ARTICLES is 0); prompting would never
        # be able to succeed.
        print("No articles available to choose from.")
        return

    # Print articles
    print("Printing articles")
    for idx, article in enumerate(articles_list):
        headline, _, _, url, _, _ = article
        print(f"Article {idx + 1}: {headline} ({url})\n")

    # Request article choice (validated, zero-based)
    chosen_article = articles_list[_prompt_for_article_choice(len(articles_list))]

    # Generate tweet data
    headline, _, summary, url, _, _ = chosen_article
    tweet = generate_engaging_tweet(headline, summary, url)

    # Post tweet
    print(f"Prepared tweet: \n{tweet}")
    post_tweet(tweet)


if __name__ == "__main__":
    main()