-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add spec data for features w/ Specifications table
This change adds spec data in the `*.json` sources for all features that have an `mdn_url` for an MDN article with a **Specification(s)** table, with the following exceptions:

* no spec data is added for `"status": "deprecated"` features
* no spec data is added for any cases where a URL found in an MDN **Specification(s)** table has no fragment-ID part

Here’s an example of the data it adds for a particular feature, the `api.History` feature:

```
"specs": [
  { "name": "HTML WHATWG", "url": "https://html.spec.whatwg.org/multipage/browsers.html#the-history-interface" },
  { "name": "HTML5 W3C", "url": "https://www.w3.org/TR/html50/browsers.html#the-history-interface" },
  { "name": "Custom Scroll Restoration", "url": "https://majido.github.io/scroll-restoration-proposal/history-based-api.html#web-idl" }
]
```

The change also includes an `add-specs.py` script that can be used to (re)generate all the spec data and update all the `*.json` sources. (The script works by scraping MDN **Specification(s)** tables.)
- Loading branch information
1 parent
c6d80e5
commit 1e76365
Showing
1,409 changed files
with
45,573 additions
and
4,885 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
#!/usr/bin/env python2 | ||
import certifi | ||
import io | ||
import json | ||
import os.path | ||
import sys | ||
import time | ||
import urllib3 | ||
from collections import OrderedDict | ||
from lxml.html import parse | ||
from termcolor import cprint | ||
from urlparse import urlparse | ||
|
||
|
||
def alarm(message):
    """Print *message*, prefixed with ``Alarm: ``, in bold red via cprint."""
    text = 'Alarm: %s' % message
    cprint(text, 'red', attrs=['bold'])
|
||
|
||
def _fetchSection(http, url):
    """Fetch *url* through *http*, retrying once after a pause on a 5xx.

    Returns the response object, or None when the page is a 404 or the
    server still answers with a status above 499 after the single retry.
    """
    print('Trying %s' % url)
    response = http.request('GET', url)
    if response.status > 499:
        sys.stderr.write('50x for %s. Will retry after 60s...\n' % url)
        time.sleep(61)
        print('Retrying %s' % url)
        response = http.request('GET', url)
        if response.status > 499:
            sys.stderr.write('50x for %s. Giving up.\n' % url)
            return None
    if response.status == 404:
        return None
    return response


def getSpecsArray(mdn_url, sectionname, spec_urls, http):
    """Scrape the spec links from one section of an MDN article.

    Only the path component of *mdn_url* is used; it is appended to
    ``https://developer.mozilla.org`` together with a
    ``?raw&macros&section=<sectionname>`` query. The first table of the
    returned HTML is scanned, and for every row that has ``td`` cells the
    first link in the first cell is taken as the spec URL.

    Args:
        mdn_url: URL of the MDN article for the feature.
        sectionname: Name of the article section to request.
        spec_urls: Mapping of spec base URL -> dict with a ``'name'`` key,
            used to look up the spec name for each scraped URL.
        http: Object exposing ``request(method, url)`` whose result has
            ``status`` and ``data`` attributes.

    Returns:
        A list of OrderedDicts with ``'name'`` and ``'url'`` keys (in that
        order). The list is empty when the page is missing, the server keeps
        failing, the body is empty, no table rows are found, or anything
        goes wrong while parsing. Rows whose URL has no fragment or no
        hostname are reported through ``alarm`` and skipped.
    """
    # NOTE(review): 'raw', 'macros' and 'section' are presumably MDN query
    # parameters selecting the rendered HTML of a single section -- confirm
    # against the MDN/Kuma documentation.
    url = 'https://developer.mozilla.org' + urlparse(mdn_url).path + \
        '?raw&macros&section=' + sectionname
    response = _fetchSection(http, url)
    if response is None:
        return []
    html = response.data.decode('utf-8')
    if html == '':
        return []
    try:
        # decode() already produced a text string, so it can be handed to
        # StringIO directly.
        doc = parse(io.StringIO(html))
        rows = doc.xpath('//table[1]//tr[td]')
        if not rows:
            return []
        specs = []
        for row in rows:
            hrefs = row.xpath('td[1]/a/@href')
            if not hrefs:
                continue
            spec_url = hrefs[0]
            parsed = urlparse(spec_url)
            if not parsed.fragment:
                alarm(mdn_url + ' has spec URL with no fragment: ' + spec_url)
                continue
            if not parsed.hostname:
                alarm(mdn_url + ' has spec URL with no hostname: ' + spec_url)
                continue
            # NOTE(review): there is no break, so when several base URLs are
            # a prefix of spec_url the last one iterated wins; when none
            # matches the name stays empty and the spec is still recorded.
            spec_name = ''
            for base_url in spec_urls:
                if spec_url.startswith(base_url):
                    spec_name = spec_urls[base_url]['name']
            cprint('Adding %s (%s)' % (spec_url, spec_name), 'green')
            spec = OrderedDict()
            spec['name'] = spec_name
            spec['url'] = spec_url
            specs.append(spec)
        return specs
    except Exception as e:
        # Best effort: a page that cannot be parsed yields no specs rather
        # than aborting the whole run.
        sys.stderr.write('Something went wrong: %s\n' % str(e))
        return []
|
||
|
||
def walkBaseData(basedata, filename, spec_urls, http, basename, sectionname,
                 bcd_data):
    """Run processTarget over the entries of *basedata* and their children.

    Every key of *basedata* is processed and the result stored back at
    ``bcd_data[sectionname][basename][key]``; then every key of that entry is
    processed the same way, one level further down. The dotted path passed
    to processTarget is built from the section, base, feature and (for the
    second level) sub-feature names.
    """
    for featurename in basedata:
        feature = basedata[featurename]
        feature_path = '%s.%s.%s' % (sectionname, basename, featurename)
        processed = processTarget(feature, filename, spec_urls, http,
                                  feature_path)
        bcd_data[sectionname][basename][featurename] = processed
        for childname in feature:
            child = feature[childname]
            child_path = '%s.%s.%s.%s' % (sectionname, basename, featurename,
                                          childname)
            processed_child = processTarget(child, filename, spec_urls, http,
                                            child_path)
            bcd_data[sectionname][basename][featurename][childname] = \
                processed_child
|
||
|
||
def processTarget(target, filename, spec_urls, http, path):
    """Add scraped ``specs`` data to *target*'s ``__compat`` entry if possible.

    *target* is returned unchanged when it has no ``__compat`` key, when its
    compat data has no ``mdn_url`` (an alarm is raised unless *path* contains
    an underscore), when the feature's status is marked deprecated, when it
    already has ``specs`` and the script was not started with ``fullupdate``
    as its first argument, or when no specs could be scraped. Otherwise the
    scraped list is stored under ``target['__compat']['specs']``.

    Args:
        target: A feature object from the compat data (any JSON value is
            tolerated; values that do not support the lookups are returned
            as-is).
        filename: Name of the file being processed, used in alarm messages.
        spec_urls: Mapping handed through to getSpecsArray.
        http: HTTP client handed through to getSpecsArray.
        path: Dotted feature path, used in alarm messages.

    Returns:
        *target* itself, possibly with ``specs`` added in place.
    """
    try:
        if '__compat' not in target:
            return target
        target_data = target['__compat']
        if 'mdn_url' not in target_data:
            if '_' not in path:
                alarm('%s in %s has no mdn_url' % (path, filename))
            return target
        # A feature without status information is treated as not deprecated
        # instead of aborting the whole run with a KeyError.
        try:
            deprecated = target_data['status']['deprecated']
        except KeyError:
            deprecated = False
        if deprecated:
            return target
        if 'specs' in target_data:
            # Existing spec data is only regenerated on an explicit
            # "fullupdate" run.
            if not (len(sys.argv) > 1 and sys.argv[1] == 'fullupdate'):
                return target
        mdn_url = target_data['mdn_url']
        # Try the plural section name first, then the singular one.
        specs = getSpecsArray(mdn_url, 'Specifications', spec_urls, http)
        if not specs:
            specs = getSpecsArray(mdn_url, 'Specification', spec_urls, http)
        if not specs:
            return target
        target['__compat']['specs'] = specs
    except TypeError:
        # Best effort: values that do not support the membership tests or
        # lookups above (e.g. booleans) are passed through untouched.
        pass
    return target
|
||
|
||
def main():
    """Regenerate spec data in every ``*.json`` file under the data dirs.

    Downloads ``SpecData.json`` from the mdn/kumascript repository to build a
    mapping of spec base URL -> ``{'name': <spec name>}``, then walks a fixed
    list of directories (relative to the current working directory). Each
    JSON file is loaded, every ``section.base`` entry and the two levels
    below it are passed through processTarget, and the file is rewritten in
    place with 2-space indentation and a trailing newline.
    """
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=certifi.where())
    response = http.request('GET', 'https://raw.githubusercontent.com/mdn/' +
                            'kumascript/master/macros/SpecData.json')
    spec_data = json.loads(response.data, object_pairs_hook=OrderedDict)
    spec_urls = {}
    for spec_name in spec_data:
        spec_urls[spec_data[spec_name]['url']] = {'name': spec_name}
    dirnames = [
        'api',
        'css',
        'html',
        'http',
        'javascript',
        'mathml',
        'svg',
        'webdriver',
        'xpath',
        'xslt',
    ]
    for dirname in dirnames:
        # Only regular file entries are considered; directory names are not
        # candidates even if they happen to end in ".json".
        json_paths = sorted(
            os.path.join(dirpath, entry)
            for (dirpath, _subdirs, entries) in os.walk(dirname)
            for entry in entries
            if os.path.splitext(entry)[1] == '.json')
        for filename in json_paths:
            # The context manager closes the file even when processing fails
            # part-way through.
            with io.open(filename, 'r+', encoding='utf-8') as f:
                bcd_data = json.load(f, object_pairs_hook=OrderedDict)
                for sectionname in bcd_data:
                    for basename in bcd_data[sectionname]:
                        basedata = bcd_data[sectionname][basename]
                        path = '%s.%s' % (sectionname, basename)
                        bcd_data[sectionname][basename] = \
                            processTarget(basedata, filename, spec_urls,
                                          http, path)
                        if basedata:
                            walkBaseData(basedata, filename, spec_urls, http,
                                         basename, sectionname, bcd_data)
                # Overwrite from the start and cut off whatever is left of
                # the old content.
                f.seek(0)
                f.write(unicode(json.dumps(bcd_data, indent=2,
                                           separators=(',', ': '),
                                           ensure_ascii=False) + '\n'))
                f.truncate()
|
||
|
||
# Run only when executed as a script, so importing this module does not
# trigger network requests and file rewrites.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.