-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathExpansionsSetup.py
168 lines (158 loc) · 7.39 KB
/
ExpansionsSetup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import csv
import sys
import requests, re, time, os
from bs4 import BeautifulSoup
from dataclasses import dataclass
# Opens ".expansions.txt" for reading and writing as soon as the module is
# loaded; mode "w+" creates the file if missing and truncates it otherwise.
# NOTE(review): FILE is not referenced anywhere else in the visible source and
# is never closed — confirm whether this handle is still needed.
FILE = open(".expansions.txt", "w+")
def request_block(link):
    """Download *link* and return the page parsed with BeautifulSoup.

    While the server answers 429 (rate limited) the request is retried after
    a 10 second pause. Any other non-200 status gives up immediately.

    Args:
        link: URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document (``lxml`` parser) on HTTP 200, or ``-1``
        when the server answers with another status or the request itself
        fails (connection error, timeout, ...).
    """
    try:
        # A timeout keeps a stalled connection from hanging the scraper forever.
        page = requests.get(link, timeout=30)
        while page.status_code != 200:
            if page.status_code != 429:  # 404 or any other failure status
                return -1
            # NOTE(review): the message announces ~1 min but each pause is 10 s.
            print("\rIP blocked Around 1min pause.")
            time.sleep(10)
            page = requests.get(link, timeout=30)
    except requests.RequestException as e:
        print("Unexpected Error :\n{}".format(e))
        # No response was obtained, so there is nothing to parse; report the
        # failure with the same sentinel used for bad status codes.
        return -1
    return BeautifulSoup(page.content, "lxml")
# Sleep 5 seconds between scrapes to avoid putting too much load on CM (Cardmarket).
def MTG_scraping_cardmarket(completeLinks=False):
    """Scrape the Cardmarket expansion list for Magic singles and print it.

    The expansion ids are read from the second ``<select>`` element of the
    Magic singles page; each expansion page is then fetched in turn.

    Args:
        completeLinks: when true, only the canonical URL of every expansion
            page is printed. Otherwise, for every expansion the page title is
            printed, the first listed card is opened, and the text inside the
            last pair of parentheses of that card's page title (presumably
            the expansion name as shown on card pages) is printed together
            with the expansion title and id.

    Returns:
        A list of ``[expansion_title, card_title_suffix, expansion_id]`` rows,
        one per expansion that was resolved. Empty when ``completeLinks`` is
        true or when the landing page cannot be read.
    """
    the_list = []
    soup = request_block("https://www.cardmarket.com/en/Magic/Products/Singles")
    if soup == -1:  # request_block signals failure with -1
        print("Could not load the Magic singles page")
        return the_list
    selects = soup.find_all("select")
    if len(selects) < 2:
        print("Could not find the expansion selector on the Magic singles page")
        return the_list
    found = str(selects[1])
    # The first <option> is skipped, as in the original scrape
    # (presumably the "all expansions" placeholder).
    all_ext = re.findall(r'<option(.*?)/option>', found)[1:]
    all_id_list = []
    for option in all_ext:
        values = re.findall(r'value=\"(.*?)\"', option)
        names = re.findall(r'\">(.*?)<', option)
        if values and names:
            all_id_list.append([names[0], values[0]])

    # all_id_list now contains every [name, id] pair offered by the selector
    for expansion in all_id_list:
        # time.sleep(5)  # optional throttling between expansion pages
        url = ("https://www.cardmarket.com/en/Magic/Products/Singles?idCategory=1&idExpansion="
               + expansion[1] + "&idRarity=0&perSite=20")
        soup = request_block(url)
        if soup == -1:
            print("there was a problem with {}, {}".format(expansion, "page could not be loaded"))
            continue

        if completeLinks:
            meta = soup.find('link', rel="canonical")
            hrefs = re.findall(r'href=\"(.*?)\" ', str(meta))
            if hrefs:
                print(hrefs[0])
            else:
                print("there was a problem with {}, {}".format(expansion, "no canonical link found"))
            continue

        expansion_title_soup = str(soup.find('title'))
        print(expansion_title_soup)
        # Best effort per expansion: a page with an unexpected layout is
        # reported and skipped instead of aborting the whole run.
        try:
            expansion_title = re.findall(r'<title>(.*?) - ', expansion_title_soup)[0]
            # scrape the first card, it does not matter which one
            found = str(soup.find_all("div", class_="table-body")[0])
            urls = re.findall(r'<a href="(.*?)"', found)
            if not urls:
                print("Expansion : {} did not contain any link".format(expansion_title))
                continue
            card_soup = request_block("https://www.cardmarket.com" + urls[0])
            if card_soup == -1:
                print("there was a problem with {}, {}".format(expansion, "card page could not be loaded"))
                continue
            title_1 = str(card_soup.find('title'))  # full <title> element
            title_2 = re.findall(r'<title>(.*?) - ', title_1)[0]  # part before " - "
            title_3 = re.findall(r'\((.*?)\)', title_2)  # every parenthesised group
            title = title_3[-1]  # only the last group is kept
            print("\"{}\", \"{}\", {}".format(expansion_title, title, expansion[1]))
            the_list.append([expansion_title, title, expansion[1]])
        except Exception as err:
            print("there was a problem with {}, {}".format(expansion, err))
    return the_list
def YGO_scraping_cardmarket(completeLinks=False):
    """Scrape the Cardmarket expansion list for Yu-Gi-Oh! singles and print it.

    The expansion ids are read from the second ``<select>`` element of the
    Yu-Gi-Oh! singles page. For every expansion the page is fetched (after a
    10 second pause) and one line is printed: the text of the first
    ``<span>`` inside the first ``table-body`` div, the expansion id, the
    canonical URL of the page and the expansion name from the selector.

    Args:
        completeLinks: accepted for signature parity with the other scrapers;
            this function does not read it.

    Returns:
        A list of ``[first_span_text, expansion_id, canonical_url,
        expansion_name]`` rows, one per printed line. Empty when the landing
        page cannot be read.
    """
    the_list = []
    soup = request_block("https://www.cardmarket.com/en/YuGiOh/Products/Singles")
    if soup == -1:  # request_block signals failure with -1
        print("Could not load the YuGiOh singles page")
        return the_list
    selects = soup.find_all("select")
    if len(selects) < 2:
        print("Could not find the expansion selector on the YuGiOh singles page")
        return the_list
    found = str(selects[1])
    # The first <option> is skipped, as in the original scrape
    # (presumably the "all expansions" placeholder).
    all_ext = re.findall(r'<option(.*?)/option>', found)[1:]
    all_id_list = []
    for option in all_ext:
        values = re.findall(r'value=\"(.*?)\"', option)
        names = re.findall(r'\">(.*?)<', option)
        if values and names:
            all_id_list.append([names[0], values[0]])

    # all_id_list now contains every [name, id] pair offered by the selector
    for expansion in all_id_list:
        time.sleep(10)  # throttle: one expansion page every 10 seconds
        url = ("https://www.cardmarket.com/en/YuGiOh/Products/Singles?idCategory=1&idExpansion="
               + expansion[1] + "&idRarity=0&perSite=20")
        soup = request_block(url)
        if soup == -1:
            print("there was a problem with {}, {}".format(expansion, "page could not be loaded"))
            continue
        # Best effort per expansion: a page with an unexpected layout is
        # reported and skipped instead of aborting the whole run.
        try:
            expansion_title_soup = str(soup.find('title'))
            link_title = soup.find_all("link", rel="canonical")[0]
            base_link_url = re.findall(r'href=\"(.*?)\"', str(link_title))
            expansion_title = re.findall(r'<title>(.*?) - ', expansion_title_soup)[0]
            # only the first row of the listing is inspected
            found = str(soup.find_all("div", class_="table-body")[0])
            sets = re.findall(r'<span>(.*?)</span>', found)
            if not sets:
                print("Expansion : {} did not contain any link".format(expansion_title))
                continue
            row = [sets[0], expansion[1], base_link_url[0], expansion[0]]
            print(", ".join(row))
            the_list.append(row)
        except Exception as err:
            print("there was a problem with {}, {}".format(expansion, err))
    return the_list
def Pokemon_scraping_cardmarket(completeLinks=False, start_index=200):
    """Scrape the Cardmarket expansion list for Pokémon singles and print it.

    The expansion ids are read from the second ``<select>`` element of the
    Pokémon singles page; each expansion page from ``start_index`` onwards is
    then fetched in turn.

    Args:
        completeLinks: when true, only the canonical URL of every expansion
            page is printed. Otherwise the first listed card is opened and
            the path segment following ``.../items/<x>/`` in the card's image
            URL is printed together with the expansion id and the canonical
            URL of the expansion page.
        start_index: number of leading expansions to skip. Defaults to 200,
            the offset that was previously hard-coded.

    Returns:
        A list of ``[image_path_segment, expansion_id, canonical_url]`` rows,
        one per printed line. Empty when ``completeLinks`` is true or when
        the landing page cannot be read.
    """
    the_list = []
    soup = request_block("https://www.cardmarket.com/fr/Pokemon/Products/Singles")
    if soup == -1:  # request_block signals failure with -1
        print("Could not load the Pokemon singles page")
        return the_list
    selects = soup.find_all("select")
    if len(selects) < 2:
        print("Could not find the expansion selector on the Pokemon singles page")
        return the_list
    found = str(selects[1])
    # The first <option> is skipped, as in the original scrape
    # (presumably the "all expansions" placeholder).
    all_ext = re.findall(r'<option(.*?)/option>', found)[1:]
    all_id_list = []
    for option in all_ext:
        values = re.findall(r'value=\"(.*?)\"', option)
        names = re.findall(r'\">(.*?)<', option)
        if values and names:
            all_id_list.append([names[0], values[0]])

    # all_id_list now contains every [name, id] pair offered by the selector
    for expansion in all_id_list[start_index:]:
        # time.sleep(5)  # optional throttling between expansion pages
        # The ids come from the Pokemon selector, so the expansion page is
        # requested under the Pokemon path (it previously pointed at YuGiOh).
        url = ("https://www.cardmarket.com/en/Pokemon/Products/Singles?idCategory=1&idExpansion="
               + expansion[1] + "&idRarity=0&perSite=20")
        soup = request_block(url)
        if soup == -1:
            print("Exception with {} : {}".format(expansion, "page could not be loaded"))
            continue
        # Best effort per expansion: a page with an unexpected layout is
        # reported and skipped instead of aborting the whole run.
        try:
            link_title = soup.find_all("link", rel="canonical")[0]
            base_link_url = re.findall(r'href=\"(.*?)\"', str(link_title))
            if completeLinks:
                meta = soup.find('link', rel="canonical")
                print(re.findall(r'href=\"(.*?)\" ', str(meta))[0])
                continue
            # scrape the first card, it does not matter which one
            found = str(soup.find_all("div", class_="table-body")[0])
            urls = re.findall(r'<a href="(.*?)"', found)
            if not urls:
                # No title has been extracted at this point, so the name from
                # the selector identifies the expansion in the message.
                print("Expansion : {} did not contain any link".format(expansion[0]))
                continue
            card_soup = request_block("https://www.cardmarket.com" + urls[0])
            if card_soup == -1:
                print("Exception with {} : {}".format(expansion, "card page could not be loaded"))
                continue
            expansion_title_soup = card_soup.find_all("img", src="/img/transparent.gif")
            expansion_title = \
                re.findall(r'static\.cardmarket\.com\/img\/[0-9a-z]*\/items/[0-9A-Za-z]*\/(.*?)\/',
                           str(expansion_title_soup))[0]
            print("{}, {}, {}".format(expansion_title, expansion[1], base_link_url[0]))
            the_list.append([expansion_title, expansion[1], base_link_url[0]])
        except Exception as err:
            print("Exception with {} : {}".format(expansion, err))
    return the_list
# Script entry point: the Yu-Gi-Oh! scrape starts as soon as this module is
# executed or imported (the call is not behind an ``if __name__ == "__main__"``
# guard).
YGO_scraping_cardmarket()
# Pokemon_scraping_cardmarket()
# Disabled snippet: reads ".pokemon" and "only_name_pkmn.txt", prints how many
# lines each holds, then prints the two files side by side, line by line.
# with open(".pokemon", "r") as w11:
#     with open("only_name_pkmn.txt", "r") as w22:
#         lines1 = w11.read().splitlines()
#         lines2 = w22.read().splitlines()
#         print(len(lines1))
#         print(len(lines2))
#         for i, elem in enumerate(lines1):
#             print("{}, {}".format(elem, lines2[i]))