From 2f23c5e44055c7190d943ad5305077545fc0ca96 Mon Sep 17 00:00:00 2001 From: Majid Date: Thu, 7 Dec 2017 12:10:13 -0800 Subject: [PATCH 1/2] entity table data extraction with glossary added. --- etk/core.py | 15 ++++++- etk/data_extractors/table_extractor.py | 40 +++++++++++++++++++ .../extraction_config_table_content.json | 20 +++++++++- 3 files changed, 72 insertions(+), 3 deletions(-) diff --git a/etk/core.py b/etk/core.py index 42275cca..5464a396 100644 --- a/etk/core.py +++ b/etk/core.py @@ -387,6 +387,7 @@ def process(self, doc, create_knowledge_graph=True, html_description=False): for input_path in input_paths: if _FIELDS in de_config: if input_path not in self.data_extraction_path: + print input_path self.data_extraction_path[input_path] = parse(input_path) matches = self.data_extraction_path[input_path].find(doc) for match in matches: @@ -652,7 +653,7 @@ def process(self, doc, create_knowledge_graph=True, html_description=False): print 'LOG: {},{},{},{}'.format(doc_id, 'TOTAL', 'TOTAL', time_taken_process) self.log('Document: {} took {} seconds'.format(doc[_DOCUMENT_ID], str(time_taken_process)), _INFO, doc_id=doc[_DOCUMENT_ID], url=doc[_URL] if _URL in doc else None, extra=extra) - + print 'Correct etk!' return doc def convert_json_content(self, doc, json_content_extractor): @@ -1705,6 +1706,18 @@ def extract_table(self, d, config): te = table_extractor.TableExtraction() return te.extract(d) + def entity_table_extractor(self, d, config): + dic = set() + # print config + if 'dic' in config: + # config = config[_CONFIG] + dic = config['dic'] + dic = self.load_json(dic) + + te = table_extractor.EntityTableDataExtraction() + res = te.extract(d, dic) + return res if len(res) > 0 else None + @staticmethod def extract_landmark(html, url, extraction_rules, threshold=0.5): return landmark_extraction.extract(html, url, extraction_rules, threshold) diff --git a/etk/data_extractors/table_extractor.py b/etk/data_extractors/table_extractor.py index 1e3409f7..8bd9ec7f 100644 --- a/etk/data_extractors/table_extractor.py +++ b/etk/data_extractors/table_extractor.py @@ -53,6 +53,46 @@ def clean_cells(t): # modifies t r[i] = re.sub('\s+', ' ', r[i]) r[i] = r[i].strip() +class EntityTableDataExtraction: + def wrap_context(self, text): + return {'value': text, + 'context': {'start': 0, + 'end': 0 + } + } + + def extract(self, table, dic): + # print dic + if table['features']['max_cols_in_a_row'] != 2 and table['features']['no_of_rows'] < 2: + return None + res = [] + for row in table['rows']: + if len(row['cells']) != 2: + continue + text = [row['cells'][0]['text'], row['cells'][1]['text']] + for x in dic: + # print x,text + + if self.matches_cell(text[0], x): + print x,text + res.append(self.wrap_context(text[1])) + if self.matches_cell(text[1], x): + print x, text + res.append(self.wrap_context(text[0])) + # if any([self.matches_cell(text[0], x) for x in dic]): + # return text[1] + # if any([self.matches_cell(text[0], x) for x in dic]): + # return text[0] + return res + + def matches_cell(self, cell_text, text): + cell_text = cell_text.lower() + text = text.lower() + if text in cell_text and float(len(cell_text))/float(len(text)) < 1.5: + return True + return False + + class TableExtraction: @staticmethod diff --git a/etk/resources/extraction_config_table_content.json b/etk/resources/extraction_config_table_content.json index 2b4e45c1..29633a83 100644 --- a/etk/resources/extraction_config_table_content.json +++ b/etk/resources/extraction_config_table_content.json @@ -1,5 +1,5 @@ { - "document_id": "cdr_id", + "document_id": "doc_id", "extraction_policy": "replace", "error_handling": "raise_error", "resources": { @@ -9,6 +9,7 @@ "landmark": [ ], "pickle": { + "my_dic": "/Users/majid/DIG/test_fields.txt" } }, "content_extraction": { @@ -23,7 +24,22 @@ } }, "data_extraction": [ - + { + "input_path": [ + "*.table.tables[*]" + ], + "fields": { + "my_field": { + "extractors": { + "entity_table_extractor": { + "config": { + "dic": "my_dic" + } + } + } + } + } + } ], "kg_enhancement": [ From 81fa394eabd1d7c363a6414f4cd8297ecaac0f0d Mon Sep 17 00:00:00 2001 From: Majid Date: Thu, 7 Dec 2017 12:18:42 -0800 Subject: [PATCH 2/2] conflicts resolved. --- etk/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/etk/core.py b/etk/core.py index 5dcfca16..b8dec599 100644 --- a/etk/core.py +++ b/etk/core.py @@ -722,7 +722,6 @@ def process(self, doc, create_knowledge_graph=True, html_description=False): print 'LOG: {},{},{},{}'.format(doc_id, 'TOTAL', 'TOTAL', time_taken_process) self.log('Document: {} took {} seconds'.format(doc[_DOCUMENT_ID], str(time_taken_process)), _INFO, doc_id=doc[_DOCUMENT_ID], url=doc[_URL] if _URL in doc else None, extra=extra) - print 'Correct etk!' return doc def convert_json_content(self, doc, json_content_extractor):