Add spec_url data for JavaScript features
This change adds spec URLs to the `*.json` sources for all JavaScript
features that have an `mdn_url` pointing to an MDN article with a
**Specification(s)** table, with the following exceptions:

* no spec data is added for `"status": "deprecated"` features

* no spec data is added in cases where a URL found in an MDN
  **Specification(s)** table has no fragment-ID part

The new field that holds the spec-URL data is named `spec_url`, and its
value is either a single URL (when the feature is associated with just
one specification reference) or an array of URLs (when the feature is
associated with multiple specification references).
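
For illustration, the value takes one of these two shapes inside a
feature's `__compat` block (the URLs below are made-up placeholders,
not data from this commit):

```json
"spec_url": "https://tc39.github.io/ecma262/#sec-somefeature"
```

```json
"spec_url": [
  "https://tc39.github.io/ecma262/#sec-somefeature",
  "https://tc39.github.io/proposal-somefeature/#sec-somefeature"
]
```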

The change also:

* Includes an `add-specs.py` script that can be used to (re)generate all
  the spec data and update all the `*.json` sources. (The script works
  by scraping MDN **Specification(s)** tables.)

* Updates `schemas/compat-data.schema.json` to allow the additional spec
  data and to specify the structure it must conform to.
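
For reference, the schema addition boils down to letting `spec_url` be
either a single URI string or an array of URI strings. A minimal sketch
of such a rule (not the exact text added to
`schemas/compat-data.schema.json`) would be:

```json
"spec_url": {
  "anyOf": [
    { "type": "string", "format": "uri" },
    {
      "type": "array",
      "items": { "type": "string", "format": "uri" }
    }
  ]
}
```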
sideshowbarker committed Nov 6, 2018
1 parent de679e7 commit eca786d
Showing 80 changed files with 1,514 additions and 603 deletions.
add-specs.py: 230 additions, 0 deletions
@@ -0,0 +1,230 @@
#!/usr/bin/env python2
import certifi
import io
import json
import os.path
import sys
import time
import urllib3
from collections import OrderedDict
from lxml.html import parse
from termcolor import cprint
from urlparse import urlparse


def alarm(message):
    cprint('Alarm: %s' % message, 'red', attrs=['bold'])


def getAdjustedSpecURL(url):
    # Rewrite known outdated or slightly malformed spec URLs to their
    # current canonical form before recording them.
    if url.startswith('http://drafts.csswg.org/css-scoping/'):
        return url.replace('http://drafts.csswg', 'https://drafts.csswg')
    if url.startswith('https://drafts.csswg.org/css-logical-props/'):
        return url.replace('/css-logical-props/', '/css-logical/')
    if url.startswith('https://www.w3.org/TR/xpath-20/'):
        return url.replace('/TR/xpath-20/', '/TR/xpath20/')
    if url.startswith('https://w3c.github.io/input-events/index.html'):
        return url.replace('/input-events/index.html', '/input-events/')
    if url.startswith('https://w3c.github.io/webappsec-csp/embedded/'):
        return url.replace('/webappsec-csp/embedded/', '/webappsec-cspee/')
    if url.startswith('https://wicg.github.io/media-capabilities#'):
        return url.replace('/media-capabilities#', '/media-capabilities/#')
    if url.startswith('https://w3c.github.io/keyboard-lock#'):
        return url.replace('/keyboard-lock#', '/keyboard-lock/#')
    if url.startswith('https://dev.w3.org/geo/api/spec-source.html'):
        return url.replace('https://dev.w3.org/geo/api/spec-source.html',
                           'https://www.w3.org/TR/geolocation-API/')
    if '/deviceorientation/spec-source-orientation.html' in url:
        return url.replace('spec-source-orientation.html', '')
    if 'spec.whatwg.org#' in url:
        return url.replace('spec.whatwg.org#', 'spec.whatwg.org/#')
    return url


def isObsolete(url):
    # True for spec URLs that point at obsolete or superseded
    # specifications; those are skipped rather than recorded.
    if url.startswith('https://www.w3.org/TR/REC-DOM-Level-1/'):
        return True
    if url.startswith('https://www.w3.org/TR/DOM-Level-2-'):
        return True
    if url.startswith('https://www.w3.org/TR/DOM-Level-3-Core/'):
        return True
    if url.startswith('https://www.w3.org/TR/ElementTraversal/'):
        return True
    if url.startswith('https://www.w3.org/TR/selectors-api/'):
        return True
    if url.startswith('https://dev.w3.org/2006/webapi/selectors-api2'):
        return True
    if url.startswith('https://w3c.github.io/webcomponents/spec/shadow/'):
        return True
    if url.startswith('https://w3c.github.io/staticrange/'):
        return True
    if url.startswith('https://www.w3.org/TR/dom/'):
        return True
    if url.startswith('https://w3c.github.io/microdata/'):
        return True
    if url.startswith('https://www.w3.org/TR/html5'):
        return True
    if url.startswith('https://www.ecma-international.org/'):
        return True
    if url.startswith('https://www.w3.org/TR/CSS1/'):
        return True
    if 'html401' in url:
        return True
    if 'developer.apple.com/library/safari' in url:
        return True
    if 'https://www.w3.org/TR/2014/WD-DOM-Level-3-Events-20140925/' in url:
        return True
    return False


def getSpecURLsArray(mdn_url, sectionname, http):
    # Fetch the named section of the MDN article (raw, with macros
    # expanded) and return the spec URLs scraped from the first column
    # of its first table.
    url = 'https://developer.mozilla.org' + urlparse(mdn_url).path + \
        '?raw&macros&section=' + sectionname
    print 'Trying %s' % url
    response = http.request('GET', url)
    if response.status == 404:
        return []
    if response.status > 499:
        sys.stderr.write('50x for %s. Will retry after 60s...\n' % url)
        time.sleep(61)
        print 'Retrying %s' % url
        response = http.request('GET', url)
        if response.status == 404:
            return []
        if response.status > 499:
            sys.stderr.write('50x for %s. Giving up.\n' % url)
            return []
    html = response.data.decode('utf-8')
    if html == '':
        return []
    try:
        doc = parse(io.StringIO(unicode(html)))
        rows = doc.xpath('//table[1]//tr[td]')
        if not(rows):
            return []
        spec_urls = []
        has_spec_url = False
        for row in rows:
            hrefs = row.xpath('td[1]//a/@href')
            if not(hrefs):
                continue
            spec_url = hrefs[0].strip()
            if isObsolete(spec_url):
                continue
            if not(urlparse(spec_url).fragment):
                alarm(mdn_url + ' has spec URL with no fragment: ' + spec_url)
                continue
            if not(urlparse(spec_url).hostname):
                alarm(mdn_url + ' has spec URL with no hostname: ' + spec_url)
                continue
            if has_spec_url:
                cprint('Note: ' + mdn_url + ' has multiple spec URLs', 'cyan')
            spec_url = getAdjustedSpecURL(spec_url)
            cprint('Adding %s' % (spec_url), 'green')
            spec_urls.append(spec_url)
            has_spec_url = True
        return spec_urls
    except Exception, e:
        sys.stderr.write('Something went wrong: %s\n' % str(e))
        return []


def walkBaseData(basedata, filename, http, basename, sectionname,
                 bcd_data):
    # Walk each feature (and subfeature) below a top-level entry and add
    # spec_url data to every __compat block found.
    for featurename in basedata:
        feature_data = basedata[featurename]
        path = '%s.%s.%s' % (sectionname, basename, featurename)
        bcd_data[sectionname][basename][featurename] = \
            processTarget(feature_data, filename, http, path)
        for subfeaturename in feature_data:
            subfeaturedata = feature_data[subfeaturename]
            path = '%s.%s.%s.%s' % (sectionname, basename, featurename,
                                    subfeaturename)
            bcd_data[sectionname][basename][featurename][subfeaturename] = \
                processTarget(subfeaturedata, filename, http, path)


def processTarget(target, filename, http, path):
    # Add a spec_url field to this target's __compat block, scraping the
    # URLs from its MDN article. Deprecated features are skipped, as are
    # features that already have spec data (unless running a 'fullupdate').
    try:
        if not('__compat' in target):
            return target
        target_data = target['__compat']
        if not('mdn_url' in target_data):
            if '_' not in path:
                alarm('%s in %s has no mdn_url' % (path, filename))
            return target
        if target_data['status']['deprecated']:
            return target
        if 'spec_urls' in target_data:
            if not(len(sys.argv) > 1 and sys.argv[1] == 'fullupdate'):
                return target
            else:
                del target['__compat']['spec_urls']
        if 'spec_url' in target_data:
            if not(len(sys.argv) > 1 and sys.argv[1] == 'fullupdate'):
                return target
            else:
                del target['__compat']['spec_url']
        mdn_url = target_data['mdn_url']
        spec_urls = getSpecURLsArray(mdn_url, 'Specifications', http)
        if not(spec_urls):
            spec_urls = getSpecURLsArray(mdn_url, 'Specification', http)
        if not(spec_urls):
            cprint('Note: ' + mdn_url + ' has no spec URL', 'yellow')
            return target
        if len(spec_urls) == 1:
            spec_urls = spec_urls[0]
        target['__compat']['spec_url'] = spec_urls
    except TypeError:
        pass
    return target


def main():
    # Load every *.json file under the content directories (or under the
    # single directory named as the second CLI argument), add spec_url
    # data to its features, and rewrite each file in place.
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=certifi.where())
    dirnames = \
        [
            'api',
            'css',
            'html',
            'http',
            'javascript',
            'mathml',
            'svg',
            'webdriver',
            'xpath',
            'xslt'
        ]
    if len(sys.argv) > 2:
        dirnames = [sys.argv[2]]
    for dirname in dirnames:
        files = [os.path.join(dirpath, filename)
                 for (dirpath, dirs, files)
                 in os.walk(dirname)
                 for filename in (dirs + files)]
        files.sort()
        for filename in files:
            if os.path.splitext(filename)[1] != '.json':
                continue
            f = io.open(filename, 'r+', encoding='utf-8')
            bcd_data = json.load(f, object_pairs_hook=OrderedDict)
            for sectionname in bcd_data:
                for basename in bcd_data[sectionname]:
                    basedata = bcd_data[sectionname][basename]
                    path = '%s.%s' % (sectionname, basename)
                    path = sectionname + '.' + basename
                    bcd_data[sectionname][basename] = \
                        processTarget(basedata, filename, http, path)
                    if basedata:
                        walkBaseData(basedata, filename, http, basename,
                                     sectionname, bcd_data)
            f.seek(0)
            f.write(unicode(json.dumps(bcd_data, indent=2,
                                       separators=(',', ': '),
                                       ensure_ascii=False) + '\n'))
            f.truncate()
            f.close()


main()
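
A note on running the script, as inferred from the argument handling in
`main()` and `processTarget()` (the commit itself does not document the
invocation): running `add-specs.py` with no arguments updates every
directory listed in `main()` and leaves features that already have
`spec_url` data untouched; `add-specs.py fullupdate` regenerates the data
even for features that already have it; and a second argument, as in
`add-specs.py fullupdate javascript`, restricts the run to a single
directory.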