-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathscraper.py
173 lines (120 loc) · 5.2 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import errno
import os
import shutil
import StringIO
import tempfile
import zipfile
from operator import itemgetter
from pprint import pprint
from string import strip

import requests
from bs4 import BeautifulSoup
base_url = 'https://developer.chrome.com/extensions/'
page_url = base_url + 'samples'


def _collect_links(link_list):
    """Return one {'call': text, 'link': absolute url} dict per link in a <ul>.

    Only anchors matching ``li code a`` are collected; each href is resolved
    against ``base_url`` by plain string concatenation.
    """
    return [{'call': anchor.string, 'link': base_url + anchor.get('href')}
            for anchor in link_list.select('li code a')]


def _folder_name(zip_url):
    """Return the last path segment of *zip_url* with its '.zip' part removed.

    A segment without '.zip' is returned whole, instead of losing its final
    character to a -1 slice bound.
    """
    segment = zip_url[zip_url.rfind('/') + 1:]
    cut = segment.rfind('.zip')
    return segment if cut == -1 else segment[:cut]


## Scrape data from Examples page
# NOTE(review): verify=False switches off TLS certificate verification for
# this request. Kept to preserve behaviour -- confirm it is still required.
r = requests.get(page_url, verify=False)
soup = BeautifulSoup(r.text)

# One dict per <div class="sample">, with the keys consumed further down:
# name, desc, zip, folder, doc, files.
examples = []
for sample_section in soup('div', class_='sample'):
    title_link = sample_section.a
    zip_url = title_link.get('href')

    # The first <ul> feeds 'doc' and the second feeds 'files' (presumably API
    # calls and source files -- confirm against the page markup). A sample
    # lacking either list gets an empty list rather than aborting the scrape.
    links = sample_section('ul')

    examples.append({
        'name': title_link.string,
        'desc': title_link.parent.next_sibling.strip(),
        'zip': zip_url,
        'folder': _folder_name(zip_url),
        'doc': _collect_links(links[0]) if len(links) > 0 else [],
        'files': _collect_links(links[1]) if len(links) > 1 else [],
    })
## Setup subfolder for new git repo
# Every project folder and the main README are written beneath this directory.
# os.mkdir raises OSError when the directory already exists, so the script
# stops here if a previous run's output is still in place.
subfolder = 'chrome-extensions-examples'
os.mkdir(subfolder)
## Download and extract example projects
# Markdown bullet for one entry of project['doc']; filled with that entry's
# 'call' and 'link' keys via %-formatting.
bullet_list_template = '* [%(call)s](%(link)s)'
# Header of each per-project README; filled with the project dict via
# %-formatting and therefore needs its 'name', 'desc' and 'zip' keys.
readme_template = """
%(name)s
=======
%(desc)s
[Zipfile](http://developer.chrome.com/extensions/%(zip)s)
Content is licensed under the [Google BSD License](https://developers.google.com/open-source/licenses/bsd).
Calls
-----
"""
def _copy_to_free_path(source, target):
    """Copy the directory tree *source* to *target* and return the path used.

    If *target* already exists, ``target_1``, ``target_2``, ... are tried in
    turn until a copy succeeds. Any OSError other than EEXIST is re-raised
    instead of being swallowed.
    """
    candidate = target
    num = 0
    while True:
        try:
            shutil.copytree(source, candidate)
            return candidate
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        num += 1
        candidate = target + '_' + str(num)
        print('>> Got existing subfolder: "' + target + '", trying "' + candidate + '"')


# Projects whose archive was fetched and unpacked. Collected separately so
# that `examples` is never mutated while it is being iterated.
downloaded = []
for project in examples:
    print(project['name'])
    zip_location = base_url + project['zip']

    # NOTE(review): verify=False switches off TLS certificate verification.
    # Kept to preserve behaviour -- confirm it is still required.
    r = requests.get(zip_location, verify=False)
    if not r.ok:
        print('>> error downloading ' + zip_location)
        continue

    # The archive is unpacked into a scratch directory and the project folder
    # copied out of it; the scratch directory is removed even on failure.
    tmp_dir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(StringIO.StringIO(r.content)) as archive:
            archive.extractall(path=tmp_dir)
        project_path = _copy_to_free_path(
            os.path.join(tmp_dir, project['folder']),
            os.path.join(subfolder, project['folder']))
    finally:
        shutil.rmtree(tmp_dir)

    # Record the folder actually used (it may carry a _N suffix) so the main
    # README can link to it.
    project['subfolder'] = os.path.split(project_path)[1]

    bullets = [bullet_list_template % item for item in project['doc']]
    readme = readme_template % project + '\n'.join(bullets)

    # Append mode: a README shipped inside the archive is kept, with the
    # generated text added after it.
    with open(os.path.join(project_path, 'README.md'), 'a') as outfile:
        outfile.write(readme)

    downloaded.append(project)

# Drop the projects that could not be downloaded, in place, so that `examples`
# keeps its identity.
examples[:] = downloaded
## Write main readme with correct project subfolders
project_list_template = '* [%(name)s](/%(subfolder)s/)'

# One bullet per project, ordered by name. A project without a 'subfolder'
# entry cannot be linked, so it is reported and left out of the list.
project_list = []
projects_by_name = sorted(examples, key=itemgetter('name'))
for project in projects_by_name:
    if 'subfolder' in project:
        project_list.append(project_list_template % project)
    else:
        print('>> got keyerror creating link for ' + project['name'])

# Fixed introduction of the main README; the project bullets follow it.
main_readme_header = """
**This is not an official mirror of the Chrome extension examples. Report any issues with the examples themselves to Google's issue trackers/forums.**
**There is an ongoing effort to standardize the Extensions on different browsers, as [discussed on MDN](https://developer.mozilla.org/en-US/Add-ons/WebExtensions) and defined in the [WebExtensions Spec Draft](https://browserext.github.io/browserext/). The resources on [browser support](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/Browser_support_for_JavaScript_APIs) and [incompatibilities](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/Chrome_incompatibilities) may be of interest.**
chrome-extensions-examples
==========================
The [Chrome Extensions examples](http://developer.chrome.com/extensions/samples) did not
exist as a Git repository, and browsing both the samples page and the VCViewer did not seem particularly
handy. So, I decided to scrape the content into this repository for easier browsing and (possible)
editing.
If you would like to clone a part of this repository, use git
[sparse checkouts](http://jasonkarns.com/blog/subdirectory-checkouts-with-git-sparse-checkout/).
You can find the scraper used to generate this repository (except for a `git init` and push)
on [github](https://github.com/orbitbot/chrome-extension-scraper).
Content is licensed under the [Google BSD License](https://developers.google.com/open-source/licenses/bsd).
Example projects
----------------
"""
main_readme = main_readme_header + '\n'.join(project_list)

main_readme_path = os.path.join(subfolder, 'README.md')
with open(main_readme_path, 'w') as outfile:
    outfile.write(main_readme)