-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
82 lines (69 loc) · 2.64 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import requests
import sys
import os
from bs4 import BeautifulSoup
from pathlib import Path
import chalk
from config import config
# Definitions for filetypes for naming
def fileType(link):
    # Choose the output filename for a fetched document, keyed on the
    # document's Atom category "term" attribute.
    kind = link.category.get("term")

    def first_id_segment():
        # Leading chunk of the dcterms identifier, e.g. "abc" from "abc-123".
        return link.find("dcterms:identifier").text.split("-")[0]

    if kind == "edition":
        return str(link.find("cpi:editiondate").text + "-" + first_id_segment())
    if kind == "puzzle":
        return str(link.find("cpi:puzzletype").text)
    if kind == "section":
        return str(link.title.text)
    if kind == "slot":
        return str(link.find("cpi:times_templateid").text + "-" + first_id_segment())
    if kind == "book":
        return str(link.title.text)
    if kind == "images":
        return str(link.find("cpi:chpid").text)
    if kind == "article":
        # NOTE(review): [0] on the segment keeps only its FIRST CHARACTER —
        # reproduced exactly from the original; confirm this is intended.
        return str(link.find("cpi:slug").text + "-" + first_id_segment()[0])
    # Unknown category: fall back to the bare identifier segment.
    return str(first_id_segment())
# Main function
def writeFile(filepath: str, filelink: str, iteration: int, logLevel: int):
    """Recursively crawl *filelink*, saving each document as prettified XML.

    Each fetched document is written to ``filepath/<name>/<name>.xml`` where
    ``<name>`` comes from :func:`fileType`, then every ``<link>`` child is
    followed recursively until ``config["maxDepth"]`` is reached.

    :param filepath: directory the current document's folder is created under
    :param filelink: URL of the XML document to fetch
    :param iteration: running counter, printed when logLevel > 0
    :param logLevel: 0 silences the per-document progress line
    """
    # Enforce the depth limit BEFORE fetching: the original checked depth
    # after the HTTP request, wasting a network round-trip per pruned branch.
    depth = len(filepath.split("/"))
    if depth > config["maxDepth"]:
        return
    # Fetch the link over HTTP and parse the response as soup.
    data = requests.get(filelink).text
    soup = BeautifulSoup(data, features="html.parser")
    # Determine the naming convention for this document type.
    filetype = fileType(soup)
    path = filepath + "/" + filetype
    # Tell the console where we are.
    if logLevel > 0:
        print(chalk.bold("Depth/Iteration: ") +
              chalk.blue(depth) + "/" + chalk.cyan(iteration))
    # Write the file if it is not already present.
    # BUG FIX: os.makedirs(path) without exist_ok raised FileExistsError when
    # the directory existed but the .xml file did not; exist_ok=True makes
    # the call idempotent.  `with` guarantees the handle is closed.
    target = path + "/" + filetype + ".xml"
    if not os.path.exists(target):
        os.makedirs(path, exist_ok=True)
        with open(target, "w+") as f:
            f.write(soup.prettify())
    # Follow the next set of links.
    for link in soup.find_all("link"):
        href = link.get("href")
        # BUG FIX: a <link> without an href yielded `".png" in None`
        # (TypeError); skip such links instead of crashing.
        if href is None:
            continue
        # Quit this branch entirely on binaries or image saves
        # (intentional early return, per the original comment).
        if link.get("cpi:qualifier") == "binary":
            return
        if ".png" in href:
            return
        # BUG FIX: the original tested soup.link (the document's FIRST link
        # tag) instead of the current link, so the .jpg guard never applied
        # to the link actually being followed.
        if ".jpg" in href:
            return
        iteration = iteration + 1
        writeFile(path, href, iteration, logLevel)
# Script entry point: print the active configuration, then start the crawl
# from the URL given on the command line.
if __name__ == "__main__":
    print(chalk.red("######"))
    print(chalk.bold("Methode crawler"))
    print(chalk.red("######"))
    print(chalk.green("Depth: ") + chalk.white(config["maxDepth"]))
    print("Logging level: " + str(config["logLevel"]))
    # BUG FIX: the original indexed sys.argv[1] unconditionally and died
    # with a bare IndexError when no URL was supplied.
    if len(sys.argv) < 2:
        sys.exit("usage: app.py <start-url>")
    # Run the crawler rooted at the output/ directory.
    writeFile("output/", sys.argv[1], 0, config["logLevel"])