-
Notifications
You must be signed in to change notification settings - Fork 0
/
simple_downloader.py
executable file
·150 lines (120 loc) · 3.74 KB
/
simple_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This script downloads the topic tree JSON from Khan Academy,
# then parses it and creates a temporary folder with one text file per video.
# Kept simple on purpose, just to show how parsing the topic tree JSON works in Python.
import json
import pprint
import inspect
import os
import re
import urllib
# Running count of video nodes found so far. list_dict_keys() increments it
# for every node that has a "download_urls" key, and make_video_file() uses
# it as the numeric prefix of the file it writes.
global_index = 0
#http://stackoverflow.com/questions/3663450/python-remove-substring-only-at-the-end-of-string
def rchop(thestring, ending):
    """Return ``thestring`` with one trailing ``ending`` removed, if present.

    Args:
        thestring: The string to trim.
        ending: The suffix to strip from the end of ``thestring``.

    Returns:
        ``thestring`` without the suffix when it ends with ``ending``;
        otherwise ``thestring`` unchanged. An empty ``ending`` never
        changes the input.
    """
    # An empty suffix matches every string and ``thestring[:-0]`` evaluates
    # to the empty string, so it must be excluded or the input gets wiped.
    if ending and thestring.endswith(ending):
        return thestring[:-len(ending)]
    return thestring
#somestring = rchop(somestring, ' rec')
def make_video_file(data, dirname, title, video_index):
    """Write a small UTF-8 text file describing one video node.

    The file contains the video's title, description, download URL and id,
    one per line, and is named ``<global_index>_<sanitized title>``.

    Args:
        data: Dict for a single video node. The keys ``translated_title``
            and ``download_urls`` are required; ``translated_description``
            and ``id`` are optional.
        dirname: Folder path the caller built for this node.
        title: Suffix stripped from the end of ``dirname`` to obtain the
            target folder. The special value ``"New_and_noteworthy"``
            disables the stripping and ``dirname`` is used as is.
        video_index: Unused; kept so existing callers keep working.

    Returns:
        None. When the node offers neither an ``mp4-low`` nor an ``mp4``
        download, a short diagnostic is printed and no file is written.
    """
    raw_title = data["translated_title"]
    # Sanitize only when a title exists; re.sub() raises TypeError on None.
    if raw_title is None:
        video_title = "untitled"
    else:
        video_title = re.sub('[^A-Za-z0-9]+', '_', raw_title)

    # A null "download_urls" value is treated like an empty mapping.
    download_urls = data["download_urls"] or {}
    # Prefer the low-bandwidth encoding, fall back to the regular mp4.
    download_url = download_urls.get("mp4-low") or download_urls.get("mp4")
    if not download_url:
        # Some nodes carry download_urls without any video encoding.
        print("No mp4-low or mp4 for some videos")
        print(dirname)
        print(video_title)
        for key in download_urls:
            print(key)
        return

    if title == "New_and_noteworthy":
        directory = dirname
    else:
        directory = rchop(dirname, title)

    # Fall back to the file-name form when the node has no title.
    full_title = video_title if raw_title is None else raw_title
    full_description = data.get("translated_description")
    if full_description is None:
        full_description = "No description"
    video_id = data.get("id")
    if video_id is None:
        video_id = ""

    # An empty directory means "current folder"; os.makedirs("") would fail.
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    # global_index is the module-level video counter; it is only read here.
    path = os.path.join(directory, str(global_index) + "_" + video_title)

    # Build the record as text and encode it once, so non-ASCII titles,
    # URLs and ids are all written as UTF-8 instead of relying on implicit
    # ASCII conversion.
    record = u"".join([
        u"Video Title : ", full_title, u"\n",
        u"Video Description : ", full_description, u"\n",
        u"Download URL : ", download_url, u"\n",
        u"Video ID : ", u"{0}".format(video_id), u"\n",
    ])
    # The context manager closes the file even if the write fails.
    with open(path, "wb") as fp:
        fp.write(record.encode("utf-8"))
def list_dict_keys(data, level, dirname, title):
    """Recursively walk one topic-tree node, writing a file per video found.

    Args:
        data: The node to inspect. Anything that is not a dict is ignored.
        level: Nesting depth of this node; it is only forwarded, incremented
            by one, to the node's children.
        dirname: Folder path accumulated from the ancestors of this node.
        title: Sanitized title inherited from the parent; replaced by this
            node's own title when it has one.

    Returns:
        None. Side effects: increments the module-level ``global_index`` for
        every node that has a ``download_urls`` key and calls
        ``make_video_file`` for it; recurses into ``children`` through
        ``list_members``.
    """
    global global_index
    if not isinstance(data, dict):
        return

    # Resolve this node's own title first, so the folder path handed on
    # below does not depend on the order in which the dict yields its keys.
    raw_title = data.get('translated_title')
    if raw_title is not None:
        title = re.sub('[^A-Za-z0-9]+', '_', raw_title)
        dirname = dirname + "/" + title

    if 'download_urls' in data:
        # A node carrying "download_urls" is a video: number it and write
        # its description file. The per-video index was always 0.
        global_index = global_index + 1
        make_video_file(data, dirname, title, 0)

    if 'children' in data:
        list_members(data['children'], level + 1, dirname, title)
def list_list_keys(data):
    """Print every key of ``data`` when it is a dict; do nothing otherwise.

    Args:
        data: Any object; only dicts produce output.

    Returns:
        None.
    """
    if isinstance(data, dict):
        # Iterate the argument itself: the builtin ``dict`` type is not
        # iterable and looping over it raises TypeError.
        for key in data:
            print(key)
def list_members(data, level, dirname, title):
    """Run ``list_dict_keys`` on every dict entry of a list of child nodes.

    Args:
        data: Iterable of child nodes; entries that are not dicts are
            skipped. ``None`` or an empty list is accepted and ignored.
        level: Nesting depth passed through to ``list_dict_keys``.
        dirname: Folder path passed through to ``list_dict_keys``.
        title: Title passed through to ``list_dict_keys``.

    Returns:
        None.
    """
    # Nothing to do for a missing or empty list (e.g. a JSON null);
    # iterating None would raise TypeError.
    if not data:
        return
    for item in data:
        if isinstance(item, dict):
            list_dict_keys(item, level, dirname, title)
# Download the topic tree only when no cached copy named 'topictree' exists
# in the working directory; later runs reuse the cached file.
if not os.path.exists('topictree'):
    print("Downloading Topictree. This can take a while")
    topictree = urllib.urlretrieve(
        "http://www.khanacademy.org/api/v1/topictree",
        "topictree",
    )

# Parse the cached JSON document.
with open('topictree') as data_file:
    data = json.load(data_file)

# Walk the tree from its root, creating the output files under "myfolder".
list_dict_keys(data, 0, "myfolder", "title")
quit()