-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrap.py
92 lines (82 loc) · 2.67 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
import requests
import json
import pandas as pd
import numpy as np
from selenium import webdriver
from bs4 import BeautifulSoup as bs4
from urllib.request import urlopen
from pandas.io.json import json_normalize
import time
import urllib
# --- Credentials, download directory, and Instagram login -----------------
# Credentials are read interactively; the target's media will be saved into
# a directory named after the target account.
username = input("Enter your username :")   # input() already returns str
passwd = input("Enter your password : ")
target = input("Enter target username :")
# Create the download directory; do not crash if it exists from a prior run.
os.makedirs(target, exist_ok=True)
# NOTE(review): find_element_by_* and the positional chromedriver path are
# the Selenium 3 API; Selenium 4 removed both — confirm the pinned version.
browser = webdriver.Chrome('./chromedriver')
browser.get('https://www.instagram.com/')
time.sleep(3)
# Fill in the login form and submit.
browser.find_element_by_name("username").send_keys(username)
browser.find_element_by_name("password").send_keys(passwd)
time.sleep(3)
# The CSS class names below are Instagram's generated classes for the login
# button and the "save info" / notification dialogs — presumably they change
# between site releases; verify against the live page markup.
browser.find_element_by_css_selector(".sqdOP.L3NKy.y3zKF").click()
time.sleep(3)
browser.find_element_by_css_selector(".sqdOP.yWX7d.y3zKF").click()
time.sleep(3)
browser.find_element_by_css_selector(".aOOlW.HoLwm").click()
time.sleep(2)
# Navigate to the target user's profile page.
browser.get('https://www.instagram.com/' + target)
time.sleep(3)
# --- Load the entire post grid by scrolling to the bottom -----------------
# Instagram's profile page is an infinite-scroll feed: keep scrolling until
# the document height stops growing, i.e. no more posts are being loaded.
last_height = browser.execute_script("return document.body.scrollHeight")
print("Scrolling")
links = []
while True:
    # Scroll down to the bottom.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for the next batch of posts to load.
    time.sleep(3)
    # Calculate new scroll height and compare with last scroll height.
    new_height = browser.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
# --- Collect the permalink of every post on the profile -------------------
source = browser.page_source
data = bs4(source, 'html.parser')
body = data.find('body')
# NOTE(review): "_2z6nI" is an Instagram-generated CSS class for the post
# grid and may change between site releases — verify against the live page.
mydiv = body.find("div", {"class": "_2z6nI"})
if mydiv is not None:  # grid is absent for private or empty accounts
    for link in mydiv.findAll('a'):
        href = link.get('href')
        # Post permalinks start with "/p/"; <a> tags may lack an href.
        if href and re.match("/p", href):
            links.append('https://www.instagram.com' + href)
# Drop duplicates while preserving first-seen order.
links = list(dict.fromkeys(links))
print("saving all images")
# --- Download each post: images as <n>.jpg, videos as <n>.mp4 -------------
# Files are numbered 1..N inside the directory created for the target user.
j = 0
for link in links:
    j = j + 1
    browser.get(link)
    time.sleep(3)
    source = browser.page_source
    data = bs4(source, 'html.parser')
    body = data.find('body')
    try:
        # Image post: pull the 1080w candidate URL out of the <img> srcset.
        filename = target + "/" + str(j) + ".jpg"
        mydiv = body.find("img", {"class": "FFVAD"})
        s = str(mydiv)
        a = re.findall('https(.+)1080w', s)
        st = str(a).split(",")[2]
        link = st[:len(st) - 3]
        # The scraped attribute is HTML-escaped; restore literal ampersands.
        link = link.replace("&amp;", "&")
    except (AttributeError, IndexError):
        # No <img> matched (find() returned None, or the srcset parse
        # failed), so treat the post as a video instead.
        filename = target + "/" + str(j) + ".mp4"
        mydiv = body.find("video", {"class": "tWeCl"})
        s = str(mydiv)
        a = re.findall('https(.+)" ', s)
        a = str(a).split('src="')[1]
        a = a[:len(a) - 2]
        a = a.replace("&amp;", "&")
        # BUG FIX: the original never assigned the extracted video URL back
        # to `link`, so it downloaded the post-page URL as the .mp4.
        link = a
    urllib.request.urlretrieve(link, filename)
print("Number of post save : ", j)