doctorslounge.py
# -*- coding: utf-8 -*-
"""Crawler for the doctorslounge.com phpBB forums.

Walks every forum, every discussion and every post, and writes the result
to a single tab-separated file (see Spidey.crawl).
"""
from hashlib import sha1
import math
import os
import re

import requests
from lxml import html


class Spidey:
    """Scrapes forums, discussions and posts from doctorslounge.com."""

    def _get_forums(self, url):
        # Every sub-forum on the index page is an <a class="forumtitle"> link.
        page = requests.get(url)
        tree = html.fromstring(page.content)
        return tree.xpath("//a[@class='forumtitle']")

    def _get_discussions(self, url):
        """Return [url, title] pairs for every topic in the forum at ``url``."""
        discussions = []
        page = requests.get(url)
        tree = html.fromstring(page.content)
        pagination = tree.xpath("//div[@class='pagination']/text()")
        if len(pagination) == 0:
            return discussions
        # The pagination text reads e.g. "133 topics"; pull out the topic count.
        match = re.search(r'(\d+)\s+topics?', ''.join(pagination))
        if match is None:
            return discussions
        num_topics = int(match.group(1))
        # phpBB lists 100 topics per page.
        pages = math.ceil(num_topics / 100)
        for i in range(1, pages + 1):
            elements = tree.xpath("//a[@class='topictitle']")
            for element in elements:
                discussions.append([element.get('href'), element.text])
            # Fetch the next page of topics (&start= is the topic offset).
            page = requests.get(url + '&start=' + str(i * 100))
            tree = html.fromstring(page.content)
        return discussions

    def _get_posts(self, url):
        """Return [id, user, is_medic, date, body] for every post at ``url``."""
        posts = []
        page = requests.get(url)
        tree = html.fromstring(page.content)
        pagination = tree.xpath("//div[@class='pagination']/text()")
        if len(pagination) == 0:
            return posts
        # The pagination text reads e.g. "17 posts"; pull out the post count.
        match = re.search(r'(\d+)\s+posts?', ''.join(pagination))
        if match is None:
            return posts
        num_posts = int(match.group(1))
        # phpBB shows 15 posts per page.
        pages = math.ceil(num_posts / 15)
        for i in range(1, pages + 1):
            elements = tree.xpath("//div[@class='postbody']")
            for element in elements:
                # Authors with the coloured username style are flagged as medics.
                is_medic = False
                user = element.xpath(".//a[@class='username']")
                date = element.xpath(".//p[@class='author']/text()")
                date = ''.join(date).strip()
                if len(user) == 0:
                    user = element.xpath(".//a[@class='username-coloured']")
                    is_medic = True
                if len(user) == 0:
                    continue
                user = user[0].text.strip()
                body = element.xpath(".//div[@class='content']/text()")
                # Flatten the body to a single line so it fits in one TSV field.
                body = ''.join(body).strip().replace('\t', '').replace('\n', '').replace('\r', '')
                # A stable id derived from the post itself.
                post_id = sha1((user + date + body).encode('utf-8')).hexdigest()
                posts.append([post_id, user, is_medic, date, body])
            # Fetch the next page of posts.
            page = requests.get(url + '&start=' + str(i * 15))
            tree = html.fromstring(page.content)
        return posts

    def crawl(self, out_dir='doctorslounge'):
        """Crawl every forum and write one TSV row per post to out_dir/discussions.tsv."""
        base = 'https://www.doctorslounge.com/forums/'
        forums = self._get_forums(base)
        # Make sure the output directory exists before opening the TSV.
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, 'discussions.tsv'), 'w', encoding='utf-8') as f:
            f.write('forum_id\tforum_name\tdiscussion_id\tdiscussion_title\tdiscussion_url\t'
                    'post_id\tposted_date\tusername\tis_medic\tcontent\n')
            for forum in forums:
                forum_url = base + forum.get('href')
                forum_name = forum.text
                forum_id = sha1(forum_url.encode('utf-8')).hexdigest()
                for discuss_url, discuss_title in self._get_discussions(forum_url):
                    discuss_url = base + discuss_url
                    discuss_id = sha1(discuss_url.encode('utf-8')).hexdigest()
                    for post_id, post_user, is_medic, post_date, post_content in self._get_posts(discuss_url):
                        f.write('\t'.join([forum_id, forum_name, discuss_id, discuss_title,
                                           discuss_url, post_id, post_date, post_user,
                                           str(is_medic), post_content]) + '\n')
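

# The tests below follow pytest's naming conventions (a Test* class with
# test_* methods), so they can be collected with ``pytest doctorslounge.py``.
# Note that they hit the live site: the hard-coded counts only hold for the
# forum contents the crawler was originally written against, so treat them
# as a smoke test rather than a stable regression suite.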
class TestSpidey:
    def test_get_forums(self):
        assert len(Spidey()._get_forums('https://www.doctorslounge.com/forums/')) == 96

    def test_get_discussions1(self):
        assert len(Spidey()._get_discussions('https://www.doctorslounge.com/forums/viewforum.php?f=60')) == 133

    def test_get_discussions2(self):
        assert len(Spidey()._get_discussions('https://www.doctorslounge.com/forums/viewforum.php?f=62')) == 604

    def test_get_posts1(self):
        assert len(Spidey()._get_posts('https://www.doctorslounge.com/forums/viewtopic.php?f=62&t=40085')) == 2

    def test_get_posts2(self):
        assert len(Spidey()._get_posts('https://www.doctorslounge.com/forums/viewtopic.php?f=62&t=5528')) == 17


if __name__ == '__main__':
    Spidey().crawl()
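
# Usage sketch (the directory name is just the default; pass out_dir= to change it):
#
#     from doctorslounge import Spidey
#     Spidey().crawl(out_dir='doctorslounge')
#
# crawl() writes <out_dir>/discussions.tsv with one row per post:
# forum_id, forum_name, discussion_id, discussion_title, discussion_url,
# post_id, posted_date, username, is_medic, content.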