-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfact_extractor.py
114 lines (75 loc) · 5.06 KB
/
fact_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Implements the class FactExtractor which automatically extracts the set of assumption and guarantee facts
# from (a subset of) the NVD
import sys
import os
from handler_json_nvd import HandlerCVEJson
import re
from handler_stanford_corenlp_xml_output import HandlerStanfordCoreNLPOutputXML
class FactExtractor:
    """Extract assumption and guarantee facts from NVD CVE entries.

    Two text files are written into the work folder:

    * ``facts-assumptions.txt`` -- ``affectedPlatform(...)`` facts built from
      the "affects" information of each CVE entry.
    * ``facts-guarantees.txt`` -- ``allowedAction(...)`` facts built from the
      Stanford CoreNLP parse of each CVE description that mentions
      "allow"/"permit".

    Each output line starts with the CVE identifier followed by ``|``.
    """

    # Cheap pre-filter applied to the lower-cased description before the
    # (more expensive) CoreNLP XML output is loaded.
    _TRIGGER_RE = re.compile(r'allows?|permits?')

    def __init__(self, db_fldr, prs_fldr, stopwrd_fl, years):
        """Set up the NVD handler and load the stop-word list.

        db_fldr    -- folder with the NVD data; output files are written here too
        prs_fldr   -- folder with the Stanford CoreNLP XML output files
        stopwrd_fl -- path of a stop-word file, one term per line; when the
                      file does not exist no stop words are loaded
        years      -- years handed over to HandlerCVEJson
        """
        self.work_folder = db_fldr
        self.parser_folder = prs_fldr
        self.nvd_handler = HandlerCVEJson(db_fldr, years)
        if os.path.isfile(stopwrd_fl):
            # Context manager so the file handle is released right away.
            with open(stopwrd_fl, 'rt') as stopwrd_in:
                self.stopwords = set(ln.strip() for ln in stopwrd_in)
        else:
            self.stopwords = set()
        # The empty string is always treated as a stop word so that empty
        # terms never end up in a bag of words.
        self.stopwords.add('')

    def do_extraction(self):
        """Write both fact files (assumptions first, then guarantees)."""
        self.extract_affected_platforms_facts()
        self.extract_allowed_actions_facts()

    def extract_affected_platforms_facts(self):
        """Write ``facts-assumptions.txt`` into the work folder.

        For every CVE entry with at least one "affects" item, one line
        ``<id>|<facts>`` is written. The facts are one ``affectedPlatform``
        term per full item (fields joined with ``:``), followed by one term
        per distinct pair of the first two fields of the items.
        """
        out_path = os.path.join(self.work_folder, 'facts-assumptions.txt')
        with open(out_path, 'wt') as ant_fact_set_fl:
            for i in range(0, self.nvd_handler.countCVEDescr()):
                affect_info_items = self.nvd_handler.getAffectsInfo(i)
                # Presumably (vendor, product) without the version -- TODO confirm
                # against HandlerCVEJson.getAffectsInfo.
                affect_info_items_no_version = set((item[0], item[1]) for item in affect_info_items)
                if len(affect_info_items) >= 1 or len(affect_info_items_no_version) >= 1:
                    full_facts = ' '.join(
                        'affectedPlatform(' + ':'.join(item) + ')'
                        for item in affect_info_items)
                    short_facts = ' '.join(
                        'affectedPlatform(' + ':'.join(item) + ')'
                        for item in affect_info_items_no_version)
                    ant_fact_set_fl.write(
                        self.nvd_handler.getID(i) + '|' + full_facts + ' ' + short_facts + '\n')

    def extract_allowed_actions_facts(self):
        """Write ``facts-guarantees.txt`` into the work folder.

        Only descriptions containing "allow"/"permit" are parsed. A line is
        written for a CVE entry only if at least one ``allowedAction`` fact
        was extracted from it; every sentence in which the terms occur as
        verbs contributes a ``|`` separator followed by its facts.
        """
        out_path = os.path.join(self.work_folder, 'facts-guarantees.txt')
        with open(out_path, 'wt') as cons_fact_set_fl:
            for i in range(0, self.nvd_handler.countCVEDescr()):
                descr = self.nvd_handler.getCVEDescrOrd(i).lower()
                # First check if the (possibly multi-sentence) description
                # contains the terms at all.
                if self._TRIGGER_RE.search(descr) is None:
                    continue
                cve_id = self.nvd_handler.getID(i)
                cons_fact_set_ln = cve_id
                useful_sent_found = False
                corenlp_handler = HandlerStanfordCoreNLPOutputXML(
                    os.path.join(self.parser_folder, cve_id + '.txt.xml'))
                # Use the sentence splitting made by Stanford CoreNLP. The
                # sentence index has its own name so it cannot clobber the
                # CVE index of the enclosing loop.
                for sent_idx in range(0, corenlp_handler.sentence_count()):
                    # Determine if the sentence contains the terms acting as verbs.
                    allow_clause_heads = corenlp_handler.get_occurrences(
                        sent_idx, [('allow', 'VB'), ('permit', 'VB')])
                    if len(allow_clause_heads) >= 1:
                        cons_fact_set_ln += '|'
                        facts = self._allowed_action_facts(
                            corenlp_handler, sent_idx, allow_clause_heads)
                        if facts:
                            cons_fact_set_ln += ''.join(facts)
                            useful_sent_found = True
                if useful_sent_found:
                    cons_fact_set_fl.write(cons_fact_set_ln + '\n')

    def _allowed_action_facts(self, corenlp_handler, sent_idx, allow_clause_heads):
        """Return the ``allowedAction(...) `` strings found in one sentence.

        For each allow/permit head, the 'dobj' dependents are taken as agents
        and the 'xcomp' dependents as action verbs; for each action verb its
        'dobj' dependents are taken as effects. A fact is produced for every
        (agent, action, effect) combination whose agent and effect bags of
        words are non-empty after cleaning.
        """
        facts = []
        for head in allow_clause_heads:
            agent_heads = corenlp_handler.get_dependents(sent_idx, ['dobj'], head)
            action_vbs = corenlp_handler.get_dependents(sent_idx, ['xcomp'], head)
            for ah in agent_heads:
                # Element 1 of a dependent is what the handler accepts as a
                # token reference -- presumably the token itself; TODO confirm.
                agent_bow = self.clean_bow(corenlp_handler.get_noun_phrase_bow(sent_idx, ah[1]))
                if len(agent_bow) < 1:
                    continue
                for av in action_vbs:
                    effect_heads = corenlp_handler.get_dependents(sent_idx, ['dobj'], av[1])
                    for eh in effect_heads:
                        effect_bow = self.clean_bow(
                            corenlp_handler.get_noun_phrase_bow(sent_idx, eh[1]))
                        if len(effect_bow) >= 1:
                            facts.append(
                                'allowedAction({' + ','.join(agent_bow) + '},'
                                + corenlp_handler.get_lemma(sent_idx, av[1])
                                + ',{' + ','.join(effect_bow) + '}) ')
        return facts

    def clean_bow(self, bow):
        """Return the set of terms in *bow* that are neither stop words nor purely numeric."""
        return set(
            term for term in bow
            if term not in self.stopwords and self.not_just_numbers(term))

    def not_just_numbers(self, term):
        """Return True if *term* has a character other than a digit, '-' or '+'.

        The empty string yields False.
        """
        return any(ch not in '-+0123456789' for ch in term)
# Command-line arguments:
#   argv[1]     NVD folder
#   argv[2]     parser output folder
#   argv[3]     stopwords
#   argv[4]...  years to consider from NVD data
# With fewer than four arguments nothing is done.
if len(sys.argv) >= 5:
    years = list(map(int, sys.argv[4:]))
    extractor = FactExtractor(sys.argv[1], sys.argv[2], sys.argv[3], years)
    extractor.do_extraction()