# scrape.py
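"""Scrape the URLs listed in starter_studies.csv to markdown files.

Each URL is fetched through the Jina Reader proxy (https://r.jina.ai), screened
for anti-bot pages and topical relevance with an OpenAI call, and saved under
data/. Per-URL progress is recorded with ScrapeTracker so that a re-run can
skip already-completed URLs.

Requires a JINA_API_KEY environment variable (plus whatever credentials
openai_helpers expects, presumably OPENAI_API_KEY).
"""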
import os
from urllib.parse import urlparse

import pandas as pd
import requests

from openai_helpers import make_openai_call
from scrape_tracker import ScrapeTracker

def download_urls():
    os.makedirs("data", exist_ok=True)
    domain_failures = set()
    tracker = ScrapeTracker()

    # Read URLs from the CSV file and queue them in the tracker
    try:
        df = pd.read_csv('starter_studies.csv')
        urls = df['url'].unique().tolist()
        # Add all URLs to the tracker with default priority
        for url in urls:
            tracker.add_todo_url(url)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    # Process pending URLs in batches
    while True:
        pending_urls = tracker.get_next_pending_urls(limit=10)
        if not pending_urls:
            break
        for url in pending_urls:
            # Skip URLs whose domain has already failed during this run
            if urlparse(url).netloc in domain_failures:
                print(f"Skipping {url} due to previous failure.")
                tracker.update_url_status(url, 'failed', error_message="Domain in failure list")
                continue

            # Skip URLs that were already scraped to a file that still exists
            url_info = tracker.get_url_info(url)
            if (url_info and url_info['status'] == 'completed'
                    and url_info['file_path'] and os.path.exists(url_info['file_path'])):
                print(f"Skipping {url} - already scraped to {url_info['file_path']}")
                continue

            try:
                # Mark as in progress
                tracker.update_url_status(url, 'in_progress')
                file_path = download_as_markdown(url, domain_failures)
                if file_path:
                    tracker.update_url_status(url, 'completed', file_path=file_path)
                    print(f"Successfully scraped {url} to {file_path}")
                else:
                    tracker.update_url_status(url, 'failed', error_message="Content not related to topics")
                    print(f"Content not related to topics: {url}")
            except Exception as e:
                error_msg = str(e)
                print(f"Error scraping {url}: {error_msg}")
                tracker.update_url_status(url, 'failed', error_message=error_msg)
                domain_failures.add(urlparse(url).netloc)
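
# The ScrapeTracker interface assumed above (method names come from the calls
# in download_urls; the exact signatures are inferred from how they are
# invoked, not from the project's documented API):
#
#   tracker.add_todo_url(url)
#   tracker.get_next_pending_urls(limit=10)   # -> list of pending URLs
#   tracker.get_url_info(url)                 # -> {'status': ..., 'file_path': ...} or None
#   tracker.update_url_status(url, status, file_path=None, error_message=None)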

def download_as_markdown(url, domain_failures):
    headers = {
        'Authorization': f'Bearer {os.getenv("JINA_API_KEY")}'
    }
    try:
        # Fetch the page through the Jina Reader proxy, which returns a
        # markdown rendering of the page; the timeout avoids hanging
        # indefinitely on unresponsive hosts
        response = requests.get(f'https://r.jina.ai/{url}', headers=headers, timeout=60)
        response.raise_for_status()
        markdown_content = response.text

        is_related, validation_message = validate_content(markdown_content)
        if not is_related:
            print(validation_message)
            return None

        # Derive a filename from the Title: header in the response
        title = None
        for line in markdown_content.split('\n'):
            if line.startswith('Title:'):
                title = line.replace('Title:', '').strip()
                break
        if title and not title.lower().startswith(('just a moment', 'verifying')):
            filename = "".join(c if c.isalnum() or c in ('-', '_') else '_'
                               for c in title.lower()).strip('_')
            filename = filename[:100]
        else:
            filename = urlparse(url).path.split('/')[-1] or 'index'
        filename = f"{filename}.md"

        # Avoid clobbering an existing file by appending a counter
        md_path = f"data/{filename}"
        counter = 1
        while os.path.exists(md_path):
            md_path = f"data/{filename[:-3]}_{counter}.md"
            counter += 1

        with open(md_path, 'w', encoding='utf-8') as file:
            file.write(markdown_content)
        return md_path
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        domain_failures.add(urlparse(url).netloc)
        raise
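
# For reference, the Jina Reader response that the Title: scan above relies on
# begins with header lines roughly of this shape (a sketch of the default text
# format, not a verbatim capture):
#
#   Title: Example Article Title
#   URL Source: https://example.com/article
#   Markdown Content:
#
#   ...page body converted to markdown...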

def validate_content(content):
    """Enhanced content validation with multiple checks across the document"""
    # Check for common anti-bot patterns in the first 1000 chars
    content_start = content[:1000].lower()
    if any(phrase in content_start for phrase in [
        'captcha', 'cloudflare', 'access denied', 'robot check'
    ]):
        return False, "Anti-bot protection detected"

    # Take samples from the start, middle and end of longer texts
    content_length = len(content)
    samples = []
    # Always check the start
    samples.append(content[:1000])
    # For longer texts, also check middle and end sections
    if content_length > 3000:
        mid_point = content_length // 2
        samples.append(content[mid_point - 500:mid_point + 500])
        samples.append(content[-1000:])

    # Check each sample with OpenAI; one related sample is enough
    for sample in samples:
        response = make_openai_call(
            messages=[{
                "role": "user",
                "content": f"""Analyze this text excerpt and determine if it's related to any of these topics:
1. Testosterone or hormone therapy
2. Sports medicine or exercise science
3. Fitness or weightlifting
4. Weight loss or body composition

Text: {sample}

Answer only with: RELATED or UNRELATED"""
            }],
            max_tokens=5
        )
        answer = response.choices[0].message.content.strip().lower()
        # A plain substring test would be wrong here: "unrelated" contains
        # "related", so every answer would count as related
        if "related" in answer and "unrelated" not in answer:
            return True, "Content validation passed"
    return False, "Content not related to topics"
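
# make_openai_call (from openai_helpers) is assumed to be a thin wrapper over
# the OpenAI chat completions API that returns the raw response object, since
# the code above reads response.choices[0].message.content. A minimal sketch,
# assuming the openai>=1.0 client and a hypothetical default model:
#
#   from openai import OpenAI
#   _client = OpenAI()
#
#   def make_openai_call(messages, max_tokens=None, model="gpt-4o-mini"):
#       return _client.chat.completions.create(
#           model=model, messages=messages, max_tokens=max_tokens)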

if __name__ == "__main__":
    download_urls()