forked from nanuchi/my-project
Commit
Signed-off-by: Anosh Rezaei <ahmadanosh.ar@gmail.com>
Showing 2 changed files with 347 additions and 0 deletions.
@@ -0,0 +1,33 @@
import pandas as pd


def merge_data():
    # Paths to the CSV files
    csv_file_1 = "sources/qa.csv"  # columns used: Question, Answer, Project
    csv_file_2 = "sources/cncf_stackoverflow_qas.csv"  # columns used: question, answer, tag

    # Read the CSV files
    df1 = pd.read_csv(csv_file_1)
    df2 = pd.read_csv(csv_file_2)

    # Select the columns of interest from the first file
    df1_selected = df1[["Question", "Answer", "Project"]]

    # Select and rename the columns of interest from the second file
    df2_selected = df2[["question", "answer", "tag"]].rename(
        columns={"question": "Question", "answer": "Answer", "tag": "Project"}
    )

    # Concatenate the selected and renamed columns
    merged_df = pd.concat([df1_selected, df2_selected])

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv("merged_qas.csv", index=False)

    print("Columns merged and saved successfully!")


if __name__ == "__main__":
    merge_data()
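As a quick illustration (not part of the diff itself), the merged output can be sanity-checked with a few lines of pandas, assuming merged_qas.csv has been written to the working directory by the script above:

import pandas as pd

# Load the file produced by merge_data() and confirm the unified schema.
merged = pd.read_csv("merged_qas.csv")
assert list(merged.columns) == ["Question", "Answer", "Project"]
print(len(merged), "question-answer pairs in total")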
@@ -0,0 +1,314 @@
import yaml
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import json
import os
import sys
from datetime import datetime, timedelta

API_KEY = ''  # Replace with your Stack Exchange API key
REQUEST_DELAY = 0  # Number of seconds to wait between requests
PROGRESS_FILE = 'sources/stackoverflow_progress.json'
CSV_FILE = 'sources/cncf_stackoverflow_qas.csv'
PROCESSED_IDS_FILE = 'sources/processed_question_ids.json'
TAGS_FILE = 'sources/tags.json'
TAGS_UPDATE_INTERVAL = 7  # Number of days between tag updates
DAILY_REQUEST_LIMIT = 9000

def fetch_with_backoff(api_url, params):
    """Fetch data from the API, backing off when rate limited.
    Args:
        api_url (str): The API endpoint URL.
        params (dict): Dictionary of query parameters for the API request.
    Returns:
        dict: The JSON response data from the API if successful.
        None: If the API request fails.
    """
    while True:
        # print(f"Fetching data with params: {params}")
        response = requests.get(api_url, params=params)
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:
            # Honour the server's retry-after hint (at least 1 second), then retry the request
            retry_after = max(int(response.headers.get('retry-after', REQUEST_DELAY)), 1)
            print(f"Rate limit exceeded. Waiting {retry_after} seconds before retrying...")
            time.sleep(retry_after)
        else:
            print(f"Failed to fetch data: {response.status_code} - {response.text}")
            sys.exit()

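# For reference, a successful Stack Exchange API response is a JSON wrapper of
# roughly this shape (abbreviated; the values shown are illustrative):
#   {"items": [...], "has_more": true, "quota_max": 10000, "quota_remaining": 9842}
# Only the 'items' and 'has_more' fields are consumed below.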
def qa_extractor(request_count, tag, start_page, page_size=100):
    """Fetch questions from StackOverflow for a given tag.
    Args:
        request_count (int): Current count of API requests made.
        tag (str): The tag to search for on StackOverflow.
        start_page (int): The starting page number for the API request.
        page_size (int, optional): Number of results per page. Defaults to 100.
    Returns:
        int: Updated request count after fetching questions.
    """
    api_url = "https://api.stackexchange.com/2.3/search/advanced"
    questions = []

    # Load processed question IDs
    processed_question_ids = load_processed_question_ids()

    while True:
        if request_count >= DAILY_REQUEST_LIMIT:
            break

        params = {
            'page': start_page,
            'pagesize': page_size,
            'order': 'desc',
            'sort': 'activity',
            'answers': 1,
            'tagged': tag,
            'site': 'stackoverflow',
            'filter': 'withbody',  # Ensure the 'body' field is included
            'key': API_KEY
        }

        response_data = fetch_with_backoff(api_url, params)
        request_count += 1
        if not response_data or not response_data['items']:
            save_progress(tag, "null")
            break
        QA_list = []
        if response_data:
            questions.extend(response_data['items'])

            for question in response_data['items']:
                question_id = question['question_id']
                if question_id in processed_question_ids:
                    continue
                if question['answer_count'] > 0:
                    question_text = remove_html_tags(question['body'])
                    request_count += 1

                    answers = fetch_answers(question_id)
                    # formatted_answers = []

                    for count, answer in enumerate(answers, start=1):
                        if count > 3:
                            break
                        if answer['score'] < 0:
                            continue
                        answer_text = remove_html_tags(answer['body'])
                        # formatted_answers.append(f"{count}. {answer_text}")

                        QA_list.append({
                            "question": question_text,
                            # "answer": "\n".join(formatted_answers),
                            "answer": answer_text,
                            "tag": tag,
                        })

                # Add question ID to the set of processed IDs
                processed_question_ids.add(question_id)

            has_more = response_data.get('has_more', False)
            if not has_more:
                save_progress(tag, "finished")
                break

            print(f"Fetched {len(response_data['items'])} questions from page {start_page} for tag '{tag}'. Total so far: {len(questions)}")
            save_to_csv(QA_list, CSV_FILE)
            save_processed_question_ids(processed_question_ids)
            start_page += 1
            save_progress(tag, start_page)
            time.sleep(REQUEST_DELAY)  # Delay between requests to avoid rate limiting
        else:
            break
        if request_count >= DAILY_REQUEST_LIMIT:
            # print(f"Request count is: {request_count}")
            break

    print(f"Request count for questions is: {request_count}")
    return request_count

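# Each record appended to QA_list above has the shape
#   {"question": "<plain-text question>", "answer": "<plain-text answer>", "tag": "<project tag>"},
# so the CSV written by save_to_csv() carries the columns question, answer, tag,
# which are the lowercase columns the merge script later renames to Question, Answer, Project.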
def fetch_answers(question_id):
    """Fetch answers for a specific question from StackOverflow.
    Args:
        question_id (int): The ID of the question to fetch answers for.
    Returns:
        list: List of answer items if successful, otherwise an empty list.
    """
    api_url = f"https://api.stackexchange.com/2.3/questions/{question_id}/answers"
    params = {
        'order': 'desc',
        'sort': 'votes',
        'site': 'stackoverflow',
        'filter': 'withbody',  # Ensure the 'body' field is included
        'key': API_KEY
    }

    response_data = fetch_with_backoff(api_url, params)
    return response_data['items'] if response_data else []

def remove_html_tags(text):
    """Remove HTML tags from a given text.
    Args:
        text (str): The HTML text to be processed.
    Returns:
        str: The text with HTML tags removed.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

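# Illustrative example of the stripping done here:
#   remove_html_tags("<p>Run <code>kubectl get pods</code> to list pods.</p>")
# returns "Run kubectl get pods to list pods."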
def extract_all_projects(tags, request_count):
    """Extract QA pairs for multiple tags.
    Args:
        tags (list): List of tags to process.
        request_count (int): Initial count of API requests made.
    """
    progress = load_progress()
    all_tags_done = True  # Flag to check if all tags are done
    for tag in tags:
        if progress.get(tag) == "null" or progress.get(tag) == "finished":
            continue
        else:
            all_tags_done = False  # Found a tag that still needs processing
            start_page = progress.get(tag, 1)

            request_count = qa_extractor(request_count, tag, start_page=start_page)
            if request_count >= DAILY_REQUEST_LIMIT:
                break
    if all_tags_done:
        print("All question-answer data has already been fetched from StackOverflow.")

def save_to_csv(data, filename):
    """Save extracted data to a CSV file.
    Args:
        data (list): List of dictionaries containing QA data.
        filename (str): The filename for the CSV file.
    """
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        try:
            df = pd.read_csv(filename)
            df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)
        except pd.errors.EmptyDataError:
            df = pd.DataFrame(data)
    else:
        df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    # print(f"Data saved to {filename}")

def load_progress():
    """Load progress data from file.
    Returns:
        dict: Dictionary containing progress data.
    """
    try:
        with open(PROGRESS_FILE, 'r') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"File {PROGRESS_FILE} not found.")
        return {}
    except json.JSONDecodeError:
        print(f"Error decoding JSON data in {PROGRESS_FILE}.")
        return {}

def save_progress(tag, page):
    """Save progress data to file.
    Args:
        tag (str): The tag being processed.
        page (str or int): The current page number or status.
    """
    progress = load_progress()
    progress[tag] = page
    with open(PROGRESS_FILE, 'w') as f:
        json.dump(progress, f)

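# The progress file maps each tag to either the next page to fetch or a terminal
# marker ("finished" once has_more is false, "null" when a page returns no items),
# e.g. with hypothetical tags:
#   {"kubernetes": 12, "prometheus": "finished", "argo": "null"}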
def load_processed_question_ids():
    """Load processed question IDs from file.
    Returns:
        set: Set of processed question IDs.
    """
    try:
        if os.path.getsize(PROCESSED_IDS_FILE) == 0:
            return set()
        with open(PROCESSED_IDS_FILE, 'r') as f:
            return set(json.load(f))
    except FileNotFoundError:
        return set()
    except json.JSONDecodeError:
        return set()

def save_processed_question_ids(processed_ids):
    """Save processed question IDs to file.
    Args:
        processed_ids (set): Set of processed question IDs.
    """
    with open(PROCESSED_IDS_FILE, 'w') as f:
        json.dump(list(processed_ids), f)

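# The processed-IDs file is simply a JSON list of question IDs, e.g.
#   [70193462, 69894521] (hypothetical IDs), reloaded as a set on the next run.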
def load_tags():
    """Load tags from the JSON file if it's not older than the update interval, otherwise from the YAML file.
    Returns:
        list: List of tags.
    """
    if os.path.exists(TAGS_FILE):
        with open(TAGS_FILE, 'r') as f:
            tags_data = json.load(f)
        last_update = datetime.strptime(tags_data['last_update'], "%Y-%m-%d")
        if datetime.now() - last_update < timedelta(days=TAGS_UPDATE_INTERVAL):
            return tags_data['tags']

    # If the JSON file doesn't exist or is older than the update interval, load from YAML
    with open("sources/landscape_augmented.yml", 'r') as f:
        data = yaml.safe_load(f)

    tags = []
    # Initialize a dictionary to save tags corresponding to each file
    tags_dict = {'Project_name': ""}
    # Process the loaded data
    for category in data['landscape']:
        category_list = ["App Definition and Development", "Orchestration & Management", "Runtime",
                         "Provisioning", "Observability and Analysis", "Test_Provisioning"]
        if category['name'] not in category_list:
            continue
        tags_dict['Category'] = category['name']
        for subcategory in category.get('subcategories', []):
            for item in subcategory.get('items', []):
                project_name = item['name'].split('(')[0].strip()
                tags_dict['Project_name'] = project_name
                tags.append(tags_dict['Project_name'])

    # Save the tags to the JSON file with the current date
    tags_data = {
        'tags': tags,
        'last_update': datetime.now().strftime("%Y-%m-%d")
    }
    with open(TAGS_FILE, 'w') as f:
        json.dump(tags_data, f)

    return tags

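# tags.json written by load_tags() has the form
#   {"tags": ["<project name>", ...], "last_update": "YYYY-MM-DD"}
# and is rebuilt from the landscape YAML once it is older than TAGS_UPDATE_INTERVAL days.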
if __name__ == "__main__":
    tags = load_tags()
    request_count = 0
    # Extract and save QA pairs incrementally
    extract_all_projects(tags, request_count)
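Taken together, the two files form a small pipeline: this extractor appends rows to sources/cncf_stackoverflow_qas.csv, and merge_data() from the first file then combines them with sources/qa.csv into merged_qas.csv. A minimal sketch of driving both steps from Python, assuming the scripts are importable as stackoverflow_extractor and merge_qas (hypothetical module names; the diff does not show the file names):

from stackoverflow_extractor import load_tags, extract_all_projects  # hypothetical module name
from merge_qas import merge_data  # hypothetical module name

# Step 1: scrape StackOverflow Q&A pairs for the CNCF project tags
extract_all_projects(load_tags(), request_count=0)

# Step 2: merge the scraped pairs with the curated sources/qa.csv into merged_qas.csv
merge_data()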