generated from Monogramm/project-template
-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrapper.py
66 lines (50 loc) · 1.66 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import asyncio
import os
import random
import string
import aiohttp
from constants import DATASET_FOLDER, INVALID
def generate_url(url='http://i.imgur.com/'):
    """Build a random candidate imgur image URL.

    The image id is either 5 characters drawn from [A-Za-z0-9], or
    6 characters where the first three come from [A-Za-z0-9] and the
    last three from [a-z0-9]; '.jpg' is always appended.
    """
    alnum = string.ascii_letters + string.digits
    lower = string.ascii_lowercase + string.digits
    if random.choice((5, 6)) == 5:
        image_id = ''.join(random.choice(alnum) for _ in range(5))
    else:
        image_id = (''.join(random.choice(alnum) for _ in range(3))
                    + ''.join(random.choice(lower) for _ in range(3)))
    return url + image_id + '.jpg'
def get_filename(url):
return url.rsplit('/', 1)[-1]
async def scrape_pictures():
    """Endlessly download random imgur URLs into DATASET_FOLDER.

    Generates a candidate URL, fetches it, and keeps the payload only
    when its size is not in INVALID (presumably the byte sizes of
    imgur's fixed "removed image" placeholders -- TODO confirm against
    constants.py). Runs forever; cancel the task to stop it.
    """
    while True:
        url = generate_url()
        filename = get_filename(url)  # was assigned twice in the original
        async with aiohttp.request('get', url) as response:
            data = await response.read()
        # Check the payload size BEFORE touching disk: the original wrote
        # every file and deleted it again when its size was a placeholder.
        if len(data) in INVALID:
            print("[-] Invalid: " + url)
        else:
            # Use a distinct name for the handle -- the original shadowed
            # the path string `file` with the file object.
            path = os.path.join(DATASET_FOLDER, filename)
            with open(path, 'wb') as fh:
                fh.write(data)
            print("[+] Valid: " + url)
async def log_time():
    """Print the elapsed second count every three seconds (0, 3, 6, ...).

    Runs forever; intended to be cancelled alongside the scraper tasks.
    """
    elapsed = 0
    while True:
        if not elapsed % 3:
            print("{} seconds passed".format(elapsed))
        elapsed += 1
        await asyncio.sleep(1)
async def main():
    """Run ten concurrent scraper workers alongside a progress logger.

    Runs until interrupted -- every child coroutine loops forever.
    """
    # NOTE(review): the original opened an aiohttp.ClientSession here but
    # never used it (each scrape_pictures() call opens its own connection
    # via aiohttp.request), so the unused session context was removed.
    tasks = [asyncio.create_task(scrape_pictures()) for _ in range(10)]
    # Wrap the logger in a task too, for consistency with the workers
    # (the original passed a bare coroutine to gather).
    tasks.append(asyncio.create_task(log_time()))
    await asyncio.gather(*tasks)
# Script entry point: start the asyncio event loop and run the scraper
# until the process is interrupted (main() never returns on its own).
if __name__ == '__main__':
    asyncio.run(main())