Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Apple WWDC Scraped Data #230

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions Sample_python_Scripts/Apple WWDC/attending.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import requests
import json
import os
import io
from bs4 import BeautifulSoup
# Python 2/3 text-type shim: prefer the Py2 `unicode` builtin when it
# exists, otherwise keep Py3's `str`.
to_unicode = str
try:
    to_unicode = unicode  # noqa: F821 -- defined only on Python 2
except NameError:
    pass


def clean_data(text):
    """Return *text* with every tab character removed."""
    return "".join(text.split("\t"))


# Scrape the WWDC "Attending" page and dump title / topic / description /
# time lists to Data/Attending/attending_data.json.
site = requests.get("https://developer.apple.com/wwdc/attending/")
data = site.content.decode('utf-8')
soup = BeautifulSoup(data, 'lxml')
path = "Apple WWDC/Data/Attending/"

if not os.path.exists(path):
    os.makedirs(path)

title = soup.find_all("h2", {"class": "typography-subsection-headline"})
topic = soup.find_all("strong")
# BUG FIX: the original filter `x != ('typography-caption' and 'date-time')`
# reduced to `x != 'date-time'` because `and` of two truthy strings returns
# the second operand -- so 'typography-caption' paragraphs were never
# excluded.  Exclude both classes, as the two literals clearly intended.
# The trailing [:-2] drops the last two paragraphs (presumably page footer
# boilerplate -- TODO confirm against the live page).
description = soup.find_all(
    "p",
    class_=lambda x: x not in ('typography-caption', 'date-time'))[:-2]
time = soup.find_all("p", {"class": "date-time"})

# Reuse `path` instead of repeating the directory as a second literal.
with io.open(os.path.join(path, 'attending_data.json'),
             'w', encoding='utf8') as outfile:
    Attending = {"title": [], "topic": [],
                 "description": [], "time": []}
    for ele in title:
        Attending["title"].append(ele.text.strip())
    for ele in topic:
        Attending["topic"].append(ele.text.strip())
    for ele in description:
        # Collapse tabs and newlines so each description is one line.
        Attending["description"].append(ele.text.strip().
                                        replace("\t", "").replace("\n", ""))
    for ele in time:
        # Multi-line date/time text becomes "date, time".
        Attending["time"].append(
            clean_data(ele.text.strip()).replace("\n", ", "))

    str_ = json.dumps(Attending, indent=2, sort_keys=False,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
46 changes: 46 additions & 0 deletions Sample_python_Scripts/Apple WWDC/consultations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import requests
import json
import os
import io
from bs4 import BeautifulSoup
# Py2/Py3 shim: use the Py2-only `unicode` builtin if present, else `str`.
to_unicode = str
try:
    to_unicode = unicode  # noqa: F821 -- Python 2 only
except NameError:
    pass


def clean_data(text):
    """Strip all tab characters from *text* and return the result."""
    return "".join(ch for ch in text if ch != "\t")


# Scrape the WWDC "Consultations" page: section headlines, bold topic
# names, each topic's sibling description paragraph, and the date/time
# entries, all dumped to one JSON file.
site = requests.get("https://developer.apple.com/wwdc/consultations/")
data = site.content.decode('utf-8')
soup = BeautifulSoup(data, 'lxml')
path = "Apple WWDC/Data/Consultations/"

if not os.path.exists(path):
    os.makedirs(path)

title = soup.find_all("h2", {"class": "typography-subsection-headline"})
# [:-1] drops the final <strong>, presumably a non-topic element at the
# bottom of the page -- TODO confirm against the live markup.
topic = soup.find_all("strong")[:-1]
time = soup.find_all("p", {"class": "date-time"})

with io.open('Apple WWDC/Data/Consultations/consultations_data.json',
             'w', encoding='utf8') as outfile:
    consultation = {"title": [], "topic": [],
                    "description": [], "time": []}
    for ele in title:
        consultation["title"].append(ele.text.strip())
    for ele in topic:
        consultation["topic"].append(ele.text.strip())
        # Assumes the description text sits two siblings after each
        # <strong> (whitespace text node, then the description).
        # NOTE(review): fragile -- a markup change breaks this; verify.
        consultation["description"].append(
            clean_data(ele.next_sibling.next_sibling).strip())
    # The last section's description is taken from the first <p> following
    # the final headline rather than from a <strong> sibling.
    consultation["description"].append(
        clean_data(title[-1].findNext("p").text.strip()))
    for ele in time:
        # Multi-line date/time text becomes "date, time".
        consultation["time"].append(
            clean_data(ele.text.strip()).replace("\n", ", "))

    str_ = json.dumps(consultation, indent=2, sort_keys=False,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
31 changes: 31 additions & 0 deletions Sample_python_Scripts/Apple WWDC/eveningEvents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import json
import os
import io
import requests
from bs4 import BeautifulSoup as BS
# Py2/Py3 shim: fall back to `str` when the Py2 `unicode` builtin is absent.
to_unicode = str
try:
    to_unicode = unicode  # noqa: F821 -- Python 2 only
except NameError:
    pass
# Scrape the WWDC special-events page and write name / description /
# date-time lists as pretty-printed JSON.
site = requests.get('https://developer.apple.com/wwdc/events/')
data = site.content.decode('utf-8')
Soup = BS(data, 'lxml')

path = "Apple WWDC/Data/Special Events"
if not os.path.exists(path):
    os.makedirs(path)

filename = 'Apple WWDC/Data/Special Events/evening_events_data.json'
with io.open(filename, 'w', encoding='utf8') as outfile:
    events = {
        # Skip decorative <h2> tags whose text is just two newlines.
        'name': [h2.text for h2 in Soup.find_all('h2')
                 if h2.text != "\n\n"],
        # Class-less paragraphs hold the descriptions; flatten whitespace.
        'description': [p.text.replace("\t", "").replace("\n", "")
                        for p in Soup.find_all('p', {'class': None})],
        'date-time': [p.text.replace("\t", "").replace("\n", "")
                      for p in Soup.find_all('p', {'class': 'date-time'})],
    }

    str_ = json.dumps(events, indent=2, sort_keys=False,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
37 changes: 37 additions & 0 deletions Sample_python_Scripts/Apple WWDC/getTogether.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os
import json
from bs4 import BeautifulSoup
import requests
import io
# Py2/Py3 shim: prefer the Py2 `unicode` builtin; otherwise keep `str`.
to_unicode = str
try:
    to_unicode = unicode  # noqa: F821 -- Python 2 only
except NameError:
    pass


def clean_data(text):
    """Return a copy of *text* with all tab characters deleted."""
    return "".join(text.split("\t"))


# Scrape the WWDC get-togethers page into parallel title / description /
# location-and-time lists and dump them to JSON.
site = requests.get('https://developer.apple.com/wwdc/get-togethers/')
data = site.content.decode('utf-8')
soup = BeautifulSoup(data, 'lxml')

title = soup.find_all(attrs={'class': 'typography-subsection-headline'})
# Every <p> that is not a date/time entry; the leading one is skipped via
# [1:] (presumably intro boilerplate -- verify against the live page).
description = soup.find_all('p', class_=lambda x: x != 'date-time')[1:]
locationAndTime = soup.find_all('p', attrs={'class': 'date-time'})

path = "Apple WWDC/Data/Get Together/"
if not os.path.exists(path):
    os.makedirs(path)

dictionary = {"title": [], "description": [], "location and time": []}
# zip() stops at the shortest list, keeping the three columns aligned.
for heading, blurb, where_when in zip(title, description, locationAndTime):
    dictionary["title"].append(heading.text)
    dictionary["description"].append(clean_data(blurb.text))
    dictionary["location and time"].append(clean_data(where_when.text))

with io.open('Apple WWDC/Data/Get Together/get_togethers_data.json',
             'w', encoding='utf8') as outfile:
    str_ = json.dumps(dictionary, indent=2, sort_keys=False,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
35 changes: 35 additions & 0 deletions Sample_python_Scripts/Apple WWDC/guestSpeakers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import json
import os
import io
import requests
import subprocess
from bs4 import BeautifulSoup as BS
# Py2/Py3 shim: use the Py2-only `unicode` builtin if it exists.
to_unicode = str
try:
    to_unicode = unicode  # noqa: F821 -- Python 2 only
except NameError:
    pass
# Scrape the WWDC guest-speakers page: speaker names, descriptions and
# date/time entries go to JSON; speaker photos are downloaded with wget.
site = requests.get('https://developer.apple.com/wwdc/guest-speakers/')
data = site.content.decode('utf-8')
Soup = BS(data, 'lxml')
path = "Apple WWDC/Data/Guest Speakers/"
if not os.path.exists(path):
    os.makedirs(path)
with io.open('Apple WWDC/Data/Guest Speakers/guest_speakers_data.json',
             'w', encoding='utf8') as outfile:
    guest = {'speakers': [], 'description': [], 'date-time': []}
    for ele in Soup.find_all('p', {'class': 'heading'}):
        guest['speakers'].append(ele.text)
    for ele in Soup.find_all('p', {'class': 'description'}):
        # Flatten tabs/newlines so each description is a single line.
        guest['description'].append(
            ele.text.replace("\t", "").replace("\n", ""))
    for ele in Soup.find_all('p', {'class': 'date-time'}):
        guest['date-time'].append(ele.text.replace("\t", "").replace("\n", ""))

    str_ = json.dumps(guest, indent=2, sort_keys=False,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
# [1:] skips the first <img>, presumably the page banner -- TODO confirm.
speaker_pictures = Soup.find_all('img')[1:]
for image in speaker_pictures:
    url = "https://developer.apple.com" + image.get('src')
    # SECURITY FIX: the original interpolated the scraped URL into a
    # shell=True command string, so a crafted `src` attribute could inject
    # arbitrary shell commands.  Pass an argv list with shell=False; no
    # quoting/escaping of the path or user-agent is needed either.
    subprocess.call(
        ['wget', '-P', 'Apple WWDC/Data/Guest Speakers/',
         '--user-agent',
         'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:11.0) '
         'Gecko/20100101 Firefox/11.0',
         url])
44 changes: 44 additions & 0 deletions Sample_python_Scripts/Apple WWDC/schedule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import json
import os
import io
from selenium import webdriver
from bs4 import BeautifulSoup
# Py2/Py3 shim: `unicode` exists only on Python 2; default to `str`.
to_unicode = str
try:
    to_unicode = unicode  # noqa: F821 -- Python 2 only
except NameError:
    pass


def clean_data(text):
    """Delete every tab character from *text*."""
    pieces = text.split("\t")
    return "".join(pieces)


# The schedule page is rendered client-side, so a headless browser is used
# to obtain the final DOM before scraping day / title / venue lists.
# NOTE(review): PhantomJS is unmaintained; consider headless Chrome/Firefox.
driver = webdriver.PhantomJS()
driver.set_window_size(1120, 520)
driver.get("https://developer.apple.com/wwdc/schedule/#/")
data = driver.page_source
# FIX: quit the browser once the page source is captured -- the original
# leaked the PhantomJS process.
driver.quit()
soup = BeautifulSoup(data, 'lxml')
path = "Apple WWDC/Data/schedule/"

if not os.path.exists(path):
    os.makedirs(path)

day = soup.find_all("h4", {"class": "small-caps"})
title = soup.find_all("h4", {"class": "event-item-title"})
venue = soup.find_all("span", {"class": "event-item-byline block smaller"})

with io.open('Apple WWDC/Data/schedule/schedule_data.json',
             'w', encoding='utf8') as outfile:
    # FIX: the dict was copy-pasted from consultations.py and still named
    # `consultation`; renamed to describe what it actually holds.
    schedule = {"day": [], "title": [], "venue": []}
    for ele in title:
        schedule["title"].append(ele.text.strip())
    for ele in day:
        schedule["day"].append(ele.text.strip())
    for ele in venue:
        # Multi-line venue text becomes "venue, detail".
        schedule["venue"].append(
            clean_data(ele.text.strip()).replace("\n", ", "))

    str_ = json.dumps(schedule, indent=2, sort_keys=False,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
82 changes: 82 additions & 0 deletions Sample_python_Scripts/Apple WWDC/scholarships.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import json
import os
import io
import re
import requests
from bs4 import BeautifulSoup as BS

# Py2/Py3 shim: keep `str` unless the Py2 `unicode` builtin is available.
to_unicode = str
try:
    to_unicode = unicode  # noqa: F821 -- Python 2 only
except NameError:
    pass
# Scrape the WWDC scholarships page.  For every section headline (h2) and
# sub-headline (h4), collect the text of the sibling nodes that follow it;
# deadline captions are gathered separately.  Result is one JSON object
# keyed by headline text.
site = requests.get('https://developer.apple.com/wwdc/scholarships/')
data = site.content.decode('utf-8')
Soup = BS(data, 'lxml')
path = os.path.join(os.getcwd(), "Data/Scholarships/")
if not os.path.exists(path):
    os.makedirs(path)
with io.open(os.path.join(os.getcwd(), "Data/Scholarships/scholarships.json"),
             'w', encoding='utf8') as outfile:
    scholarships = {}

    # NOTE(review): the regex class [a-z,A-Z] also matches a literal comma;
    # harmless here but probably unintended.
    for ele in Soup.find_all('h2',
                             {'class': re.compile(
                                 "typography-section-headline [a-z,A-Z]*")}):
        scholarships[ele.string] = []
        nextNode = ele
        # Walk every following sibling to the end of the parent, keeping
        # any non-empty flattened text.  NOTE(review): siblings are not
        # stopped at the next h2, so sections may overlap -- verify.
        while nextNode is not None:
            nextNode = nextNode.nextSibling
            if nextNode is None:
                break
            if nextNode.string is None and nextNode.text is not None:
                # Composite tag: use .text (concatenated descendants).
                string = nextNode.text.replace("\t", "").replace("\n", "")
                if len(string) != 0:
                    scholarships[ele.string].append(string)
            else:
                # Plain string node (or single-string tag): use .string.
                string = nextNode.string.replace("\t", "").replace("\n", "")
                if len(string) != 0:
                    scholarships[ele.string].append(string)

    for ele in Soup.find_all('h4',
                             {'class': re.compile(
                                 "typography-subsection-headline [a-z,A-Z]*")}):
        scholarships[ele.string] = []
        nextNode = ele
        # Same sibling walk, but stop at the next h4 sub-headline.
        while nextNode is not None:
            nextNode = nextNode.nextSibling
            if nextNode is None:
                break
            try:
                tag_name = nextNode.name
            except AttributeError:
                # Node without a .name attribute (defensive; NavigableString
                # actually reports name=None, handled below).
                tag_name = ""
            if tag_name != "h4" and tag_name is not None:
                if tag_name == "ul":
                    # Bullet lists: keep each non-empty line separately.
                    scholarships[ele.string].extend(
                        filter(lambda x: len(x) > 0,
                               nextNode.text.split('\n')))
                # NOTE(review): for <ul> nodes this second branch appends
                # the joined text AGAIN after the per-line extend above --
                # looks like an unintended double-append; confirm whether
                # this should be `elif`.
                if nextNode.string is None and nextNode.text is not None:
                    scholarships[ele.string]\
                        .append(nextNode.text.replace("\t", "")
                                .replace("\n", ""))
                else:
                    scholarships[ele.string]\
                        .append(nextNode.string.replace("\t", "")
                                .replace("\n", ""))
            elif tag_name == "h4":
                # Next sub-section reached; stop collecting for this one.
                break

    # Deadline captions are collected page-wide under a fixed key.
    scholarships['Deadline'] = []
    for ele in Soup.find_all('p', {'class': 'typography-caption'}):
        if ele.string is None and ele.text is not None:
            string = ele.text.replace("\t", "").replace("\n", "")
            if len(string) != 0:
                scholarships['Deadline'].append(string)
        else:
            string = ele.string.replace("\t", "").replace("\n", "")
            if len(string) != 0:
                scholarships['Deadline'].append(string)

    str_ = json.dumps(scholarships, indent=2, sort_keys=False,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
35 changes: 35 additions & 0 deletions Sample_python_Scripts/Apple WWDC/thirdPartyEvents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import json
import os
import io
import requests
from bs4 import BeautifulSoup as BS
# Py2/Py3 shim: `unicode` on Python 2, `str` everywhere else.
to_unicode = str
try:
    to_unicode = unicode  # noqa: F821 -- Python 2 only
except NameError:
    pass
# Scrape the WWDC "more" page for third-party events: name, description,
# location, and a combined "date, day" string per event, saved as JSON.
site = requests.get('https://developer.apple.com/wwdc/more/')
data = site.content.decode('utf-8')
Soup = BS(data, 'lxml')
path = "Apple WWDC/Data/Special Events"
if not os.path.exists(path):
    os.makedirs(path)
filename = 'Apple WWDC/Data/Special Events/third_party_events_data.json'
with io.open(filename, 'w', encoding='utf8') as outfile:
    events = {'name': [], 'description': [], 'location': [], 'date-day': []}
    for ele in Soup.find_all('h4'):
        events['name'].append(ele.text)
    for ele in Soup.find_all('p', {'class': 'description'}):
        # Flatten tabs/newlines into single-line descriptions.
        events['description'].append(
            ele.text.replace("\t", "").replace("\n", ""))
    for ele in Soup.find_all('p', {'class': 'location'}):
        events['location'].append(ele.text.replace("\t", "").replace("\n", ""))
    dates = Soup.find_all('p', {'class': 'date'})
    days = Soup.find_all('p', {'class': 'day'})
    # FIX: pair dates and days with zip() instead of indexing dates[x]
    # inside range(len(days)) -- the old loop raised IndexError whenever
    # the page carried fewer date entries than day entries.
    for date_tag, day_tag in zip(dates, days):
        date = date_tag.text.replace("\t", "").replace("\n", "")
        day = day_tag.text.replace("\t", "").replace("\n", "")
        events['date-day'].append(date + ", " + day)

    str_ = json.dumps(events, indent=2, sort_keys=False,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
Binary file added sample/Apple WWDC.zip
Binary file not shown.
Loading