-
Notifications
You must be signed in to change notification settings - Fork 0
/
cap_index_parse.py
executable file
·148 lines (115 loc) · 4.07 KB
/
cap_index_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/python2.4
#
# Copyright 2009 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CAP parsing utilities.
ParseCapIndex can parse either RSS or ATOM feed indices of CAP files.
"""
__author__ = 'Matthew.H.Frantz@gmail.com (Matt Frantz)'
from xml.dom import minidom
try:
# Google3 environment.
from google3.apphosting.runtime.apiproxy_errors import DeadlineExceededError
from google3.dotorg.gongo.appengine_cap2kml import xml_util
except ImportError:
from google.appengine.runtime import DeadlineExceededError
import xml_util
class Error(Exception):
    """Base class for the exceptions defined in this module."""
class CapIndexFormatError(Error):
    """Raised when a CAP index document is not one of the supported formats."""

    def __init__(self, text, root_cause):
        """Initializes a CapIndexFormatError object.

        The exception message combines the explanation and the offending
        text, in that order.

        Args:
          text: Text that was parsed (string)
          root_cause: Further explanation of the error (string)
        """
        message = 'CAP index format error: %s: %s' % (root_cause, text)
        # Explicit base-class call (rather than super()) keeps this working
        # where Exception is an old-style class.
        Error.__init__(self, message)
def ParseCapIndex(index_text):
    """Parses a CAP index and returns references to CAP files.

    The document is handled as RSS if it contains any <rss> element.
    Otherwise it is handled as ATOM if it contains a <feed> element (or,
    failing that, an element literally named <atom:feed>).

    Args:
      index_text: XML (RSS or ATOM) with links to CAP files (string)

    Returns:
      List of CAP URL's (strings)

    Raises:
      CapIndexFormatError: if the text cannot be parsed as XML, is neither
        RSS nor ATOM, or any other unexpected error occurs while extracting
        the links.
      DeadlineExceededError: propagated unchanged.
      AssertionError: propagated unchanged.
    """
    # Function-scope import so this block does not depend on the file's
    # import section; sys.exc_info() is used below instead of the
    # Python-2-only "except Exception, e" syntax, which is a SyntaxError on
    # Python 3.
    import sys

    try:
        # Must be XML.
        doc = minidom.parseString(index_text)

        # See if it is RSS.
        rss_nodes = doc.getElementsByTagName('rss')
        if rss_nodes:
            # Shouldn't have more than one, but it's easy enough to support.
            urls = []
            for rss_node in rss_nodes:
                urls.extend(_ParseCapIndexRss(rss_node))
            return urls

        # See if it is ATOM.
        feed_nodes = doc.getElementsByTagName('feed')
        if not feed_nodes:
            # TODO(Matt Frantz): Really support XML namespaces.
            feed_nodes = doc.getElementsByTagName('atom:feed')
        if feed_nodes:
            # Shouldn't have more than one, but it's easy enough to support.
            urls = []
            for feed_node in feed_nodes:
                urls.extend(_ParseCapIndexAtom(feed_node))
            return urls

        # Not sure what it is.
        raise CapIndexFormatError(index_text, 'Unrecognized document type')
    except (CapIndexFormatError, DeadlineExceededError, AssertionError):
        # These are meaningful to callers as-is; do not wrap them.
        raise
    except Exception:
        # Anything else (e.g. malformed XML) is reported as a format error.
        e = sys.exc_info()[1]
        raise CapIndexFormatError(index_text, 'Parse error: %s' % e)
def _ParseCapIndexRss(rss):
    """Parses a CAP index in the RSS format.

    Collects the text of every <link> element found beneath every <item>
    element of the given node. Links whose text is empty are skipped.

    Args:
      rss: RSS CAP index document (xml.dom.Node object)

    Returns:
      List of CAP URL's (strings)
    """
    link_urls = []
    for item in rss.getElementsByTagName('item'):
        for link_node in item.getElementsByTagName('link'):
            # Read the text of the node being visited. Previously this always
            # read the first link of the item, so an item with several
            # <link> elements yielded the first URL repeated.
            link_url = xml_util.GetText(link_node.childNodes)
            if link_url:
                link_urls.append(link_url)
    return link_urls
def _ParseCapIndexAtom(atom):
    """Parses a CAP index in the ATOM format.

    Collects the href attribute of every <link> element found beneath every
    <entry> element of the given node. If no unprefixed <entry> (or <link>)
    elements exist, elements literally named <atom:entry> (or <atom:link>)
    are used instead. Links with a missing or empty href are skipped.

    Args:
      atom: ATOM CAP index document (xml.dom.Node object)

    Returns:
      List of CAP URL's (strings)
    """
    link_urls = []
    entries = atom.getElementsByTagName('entry')
    if not entries:
        # TODO(Matt Frantz): Really support XML namespaces.
        entries = atom.getElementsByTagName('atom:entry')
    for entry in entries:
        link_nodes = entry.getElementsByTagName('link')
        if not link_nodes:
            # TODO(Matt Frantz): Really support XML namespaces.
            link_nodes = entry.getElementsByTagName('atom:link')
        for link_node in link_nodes:
            # TODO(Matt Frantz): What about other kinds of links?
            # http://b/2188342
            # Read the href of the node being visited. Previously this always
            # read the first link of the entry, so an entry with several
            # links yielded the first href repeated.
            href = link_node.getAttribute('href')
            if href:
                link_urls.append(href)
    return link_urls