-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathdecode_xml.py
167 lines (129 loc) · 5.12 KB
/
decode_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: 'arvin'
import xml.sax
class CVEHandler(xml.sax.ContentHandler):
    """SAX event handler that collects vulnerability records from CVRF XML.

    A streaming SAX handler is used instead of ElementTree because the CVE
    XML dumps are large and building a full tree needs too much memory.

    Every completed ``Vulnerability`` element whose title sorts after
    ``skip`` is appended to ``cves`` as the list
    ``[title, noteDescription, published, modified, cve, references]``,
    where ``references`` is a list of ``[url, description]`` pairs.

    Attributes:
        currentData: logical name of the node currently being read
        title: cve title
        noteDescription: cve description taken from the Note node
        published: cve publish time taken from the Note node
        modified: cve modified time taken from the Note node
        cve: cve number
        url: reset for every vulnerability (kept for compatibility)
        description: reset for every vulnerability (kept for compatibility)
        references: [url, description] pairs of the current vulnerability
        referenceCount: index of the last reference in the current
            vulnerability, -1 when there is none yet
        cveCount: index of the current vulnerability, -1 before the first
        cves: list of all collected vulnerability records
        skip: only vulnerabilities whose title compares greater than this
            value are collected
    """

    # Logical node name -> (attribute accumulating its text, replacement
    # for newlines embedded in that text).
    _TEXT_FIELDS = {
        'Title': ('title', ''),
        'NoteDescription': ('noteDescription', ' '),
        'Published': ('published', ''),
        'Modified': ('modified', ''),
        'CVE': ('cve', ''),
    }

    # Child node of a Reference -> (slot inside the [url, description]
    # pair, replacement for newlines embedded in that text).
    _REFERENCE_FIELDS = {
        'URL': (0, ''),
        'Description': (1, ' '),
    }

    def __init__(self, skip=''):
        """Initialise the handler and its per-record state.

        Args:
            skip: value stored in self.skip; vulnerabilities whose title
                does not compare greater than it are not collected
        """
        # Explicit base call (not super()) so it also works where
        # ContentHandler is an old-style class.
        xml.sax.ContentHandler.__init__(self)
        self.currentData = ''
        self._reset_record()
        self.referenceCount = -1
        self.cveCount = -1
        self.cves = []
        self.skip = skip

    def _reset_record(self):
        """Clear every field that belongs to a single vulnerability."""
        self.title = ''
        self.noteDescription = ''
        self.published = ''
        self.modified = ''
        self.cve = ''
        self.url = ''
        self.description = ''
        # Rebind instead of clearing: the previous list is already stored
        # inside self.cves and must not be emptied.
        self.references = []
        self._inReference = False

    def startElement(self, name, attrs):
        """Callback invoked at the start tag of every element.

        Args:
            name: element name, e.g. 'Note' for <Note Type="1"></Note>
            attrs: element attributes, e.g. {'Type': '1'}
        """
        if name == 'Vulnerability':
            # A new vulnerability begins: start from a clean record.
            self._reset_record()
            self.cveCount += 1
            self.referenceCount = -1
            self.currentData = name
        elif name == 'Note' and self.cveCount >= 0:
            # The meaning of a Note is decided by its attributes. Use
            # .get() so a Note without Type/Title is ignored, not fatal.
            note_type = attrs.get('Type')
            note_title = attrs.get('Title')
            if note_type == 'Description':
                self.currentData = 'NoteDescription'
            elif note_type == 'Other' and note_title in ('Published', 'Modified'):
                self.currentData = note_title
            else:
                # Unrecognised note: its text matches no collected field.
                self.currentData = name
        elif name == 'Reference' and self.cveCount >= 0:
            self.referenceCount += 1
            # One [url, description] pair per reference.
            self.references.append(['', ''])
            self._inReference = True
            self.currentData = name
        else:
            self.currentData = name

    def characters(self, content):
        """Record text content for the node selected by startElement().

        The parser may deliver the text of one node in several calls, so
        every field is accumulated rather than overwritten.

        Args:
            content: a chunk of the text between the start and end tag
        """
        if self.cveCount < 0:
            # Text before the first Vulnerability is not part of a record.
            return
        text_field = self._TEXT_FIELDS.get(self.currentData)
        if text_field is not None:
            attr_name, newline = text_field
            setattr(self, attr_name,
                    getattr(self, attr_name) + content.replace('\n', newline))
            return
        reference_field = self._REFERENCE_FIELDS.get(self.currentData)
        # URL/Description only belong to a reference while one is open;
        # the same tag names elsewhere must not touch the reference list.
        if reference_field is not None and self._inReference:
            slot, newline = reference_field
            self.references[-1][slot] += content.replace('\n', newline)

    def endElement(self, name):
        """Callback invoked at the end tag of every element.

        Args:
            name: element name
        """
        if name == 'Reference':
            self._inReference = False
        elif name == 'Vulnerability' and self.cveCount >= 0 and self.title > self.skip:
            # Store the finished record; note that with the default
            # skip='' a vulnerability with an empty title is not stored.
            self.cves.append([self.title, self.noteDescription, self.published,
                              self.modified, self.cve, self.references])
        # Text following an end tag belongs to no collected node.
        self.currentData = ''
if __name__ == '__main__':
    # Stream the XML file through CVEHandler and report how many
    # vulnerability records were collected.
    parser = xml.sax.make_parser()
    handler = CVEHandler()
    parser.setContentHandler(handler)
    parser.parse('resource/allitems-cvrf-year-2015.xml')
    # Other inputs that were used with this script:
    #   'resource/cvrf-template.xml'
    #   'resource/allitems-cvrf.xml'
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(len(handler.cves))