This repository has been archived by the owner on Aug 6, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper_2.py
88 lines (76 loc) · 3.36 KB
/
scraper_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import argparse
import os
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support.wait import WebDriverWait
# Spanish month names, 1-indexed: index 0 is a '--' placeholder so that
# meses[1] == 'enero'. Not referenced in the visible code -- presumably
# used for date parsing elsewhere; TODO confirm.
meses = ['--','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre']
class Collector(object):
    """Collector of recent FaceBook posts.

    Note: We bypass the FaceBook-Graph-API by using a
    selenium browser instance!
    This is against the FB guide lines and thus not allowed.
    USE THIS FOR EDUCATIONAL PURPOSES ONLY. DO NOT ACTAULLY RUN IT.
    """

    def __init__(self, permalinks):
        """Start a Chrome instance for scraping.

        permalinks -- iterable of Facebook post permalink URLs.
        """
        super(Collector, self).__init__()
        self.permalinks = permalinks
        # Browser instance; notification pop-ups would block the clicks
        # performed while expanding comment threads.
        options = Options()
        options.add_argument("--disable-notifications")
        self.browser = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.browser, 10)
        #self.browser = webdriver.Firefox()
        self.delay = 3

    def strip(self, string):
        """Helping function to remove all non alphanumeric characters.

        Drops every whitespace-separated word containing '#' (hashtags),
        then keeps only alphanumerics plus spaces, periods and commas.
        """
        words = [word for word in string.split() if "#" not in word]
        cleaned = " ".join(words)
        return "".join(c for c in cleaned if str.isalnum(c) or c in (" ", ".", ","))

    def collect_permalink(self, permalink):
        """Load one post page, expand all replies and dump its HTML.

        Writes post.<id>.html on success; on any scraping failure the
        permalink is appended to allposts.error.txt instead (best effort).
        """
        self.browser.get(permalink)
        # Click every "... respuestas" (replies) link so that all comments
        # are expanded into the DOM before the snapshot is taken.
        self.browser.execute_script("aksjdakd=document.evaluate(`//a[contains(string(), ' respuesta')]`, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);")
        self.browser.execute_script("for (let i = 0, length = aksjdakd.snapshotLength; i < length; i++) { aksjdakd.snapshotItem(i).click(); }")
        time.sleep(2.5)
        # Once the full page is loaded, we can start scraping.
        # Post id is the 7th path segment of the permalink URL -- TODO
        # confirm this holds for every permalink format in allposts.txt.
        postId = permalink.split("/")[6]
        try:
            userContentWrapper = self.browser.find_element_by_class_name("userContentWrapper")
            with open("post.{}.html".format(postId), "w", encoding="utf-8") as f:
                f.write(userContentWrapper.get_attribute("innerHTML"))
        except Exception:
            # Was a bare `except:`: keep the best-effort behaviour but stop
            # swallowing SystemExit/KeyboardInterrupt as well.
            with open('allposts.error.txt', 'a') as file:
                file.write("{}\n".format(permalink))

    def collect(self):
        """Log in to Facebook, then scrape every permalink not yet saved."""
        # navigate to page
        self.browser.get('https://facebook.com')
        username = self.browser.find_element_by_id("email")
        password = self.browser.find_element_by_id("pass")
        submit = self.browser.find_element_by_id("loginbutton")
        # NOTE(review): send_keys() is called with no arguments, so the
        # login form is submitted empty -- credentials appear to be
        # intentionally left out of the committed code; confirm.
        username.send_keys()
        password.send_keys()
        # Step 4) Click Login
        submit.click()
        for j, permalink in enumerate(self.permalinks):
            print(j)
            postId = permalink.split("/")[6]
            # Skip posts already dumped to disk. The original opened the
            # file just to probe existence and never closed it, leaking a
            # file handle per already-scraped post.
            if os.path.exists("post.{}.html".format(postId)):
                continue
            self.collect_permalink(permalink)
# Driver: read one permalink per line and scrape each one.
# splitlines() (without keepends) already removes the line terminators,
# so the final URL keeps its last character even when the file has no
# trailing newline -- the old splitlines(True) + line[:-1] combination
# silently truncated it in that case. Blank lines are dropped because an
# empty permalink would crash permalink.split("/")[6] in collect().
with open("allposts.txt", 'r') as fin:
    permalinks = [line for line in fin.read().splitlines() if line]
C = Collector(permalinks=permalinks)
C.collect()