Skip to content

Commit

Permalink
Merge pull request #154 from majidghgol/development
Browse files Browse the repository at this point in the history
entity table data extraction
  • Loading branch information
saggu committed Dec 7, 2017
2 parents 251af9d + 81fa394 commit 65a7a79
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 3 deletions.
13 changes: 12 additions & 1 deletion etk/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,6 @@ def process(self, doc, create_knowledge_graph=True, html_description=False):
print 'LOG: {},{},{},{}'.format(doc_id, 'TOTAL', 'TOTAL', time_taken_process)
self.log('Document: {} took {} seconds'.format(doc[_DOCUMENT_ID], str(time_taken_process)), _INFO,
doc_id=doc[_DOCUMENT_ID], url=doc[_URL] if _URL in doc else None, extra=extra)

return doc

def convert_json_content(self, doc, json_content_extractor):
Expand Down Expand Up @@ -1793,6 +1792,18 @@ def extract_table(self, d, config):
te = table_extractor.TableExtraction()
return te.extract(d)

def entity_table_extractor(self, d, config):
    """Extract values from a table by matching its cells against a dictionary.

    Args:
        d: Table object handed unchanged to
            ``table_extractor.EntityTableDataExtraction.extract``.
        config: Extractor configuration. When it contains a ``'dic'`` key,
            that value is passed to ``self.load_json`` and the result is
            used as the collection of terms to look for; otherwise an
            empty set is used.

    Returns:
        The list of extractions when at least one was produced, otherwise
        ``None``.
    """
    dic = set()
    if 'dic' in config:
        dic = self.load_json(config['dic'])

    te = table_extractor.EntityTableDataExtraction()
    res = te.extract(d, dic)
    # extract() returns None (not an empty list) for tables it rejects, so
    # test truthiness instead of calling len() on a possible None.
    return res if res else None

@staticmethod
def extract_landmark(html, url, extraction_rules, threshold=0.5):
    """Delegate to ``landmark_extraction.extract`` and return its result.

    All four arguments are forwarded positionally and unchanged;
    ``threshold`` defaults to 0.5.
    """
    landmark_result = landmark_extraction.extract(
        html, url, extraction_rules, threshold)
    return landmark_result
Expand Down
40 changes: 40 additions & 0 deletions etk/data_extractors/table_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,46 @@ def clean_cells(t): # modifies t
r[i] = re.sub('\s+', ' ', r[i])
r[i] = r[i].strip()

class EntityTableDataExtraction:
    """Extract values from two-cell table rows whose other cell names a known term."""

    def wrap_context(self, text):
        """Wrap ``text`` in an extraction dict with a zeroed ``context`` span."""
        return {
            'value': text,
            'context': {
                'start': 0,
                'end': 0,
            },
        }

    def extract(self, table, dic):
        """Collect the cell opposite every dictionary match in two-cell rows.

        Args:
            table: Dict with a ``'features'`` dict (``'max_cols_in_a_row'``,
                ``'no_of_rows'``) and a ``'rows'`` list; each row has a
                ``'cells'`` list whose items carry a ``'text'`` string.
            dic: Iterable of term strings to look for.

        Returns:
            ``None`` when the table is rejected by the shape check below,
            otherwise a (possibly empty) list of ``wrap_context`` dicts.
            One entry is appended per (row, term, side) match, so a value
            can appear more than once.
        """
        features = table['features']
        # NOTE(review): with ``and`` the table is rejected only when it is
        # BOTH not two-column AND shorter than two rows; ``or`` may have
        # been intended. Behaviour is kept as-is -- confirm with the author.
        if features['max_cols_in_a_row'] != 2 and features['no_of_rows'] < 2:
            return None

        res = []
        for row in table['rows']:
            cells = row['cells']
            if len(cells) != 2:
                continue
            left = cells[0]['text']
            right = cells[1]['text']
            for term in dic:
                # The term may sit in either column; the value is the
                # opposite cell.
                if self.matches_cell(left, term):
                    res.append(self.wrap_context(right))
                if self.matches_cell(right, term):
                    res.append(self.wrap_context(left))
        return res

    def matches_cell(self, cell_text, text):
        """Return True when ``text`` occurs in ``cell_text`` and fills most of it.

        The comparison is case-insensitive. The cell must be less than 1.5
        times as long as ``text``, so a term buried in a long sentence does
        not count as a match. An empty ``text`` never matches.
        """
        if not text:
            # '' is a substring of everything and would make the length
            # ratio below divide by zero.
            return False
        cell_text = cell_text.lower()
        text = text.lower()
        return text in cell_text and float(len(cell_text)) / len(text) < 1.5



class TableExtraction:
@staticmethod
Expand Down
20 changes: 18 additions & 2 deletions etk/resources/extraction_config_table_content.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"document_id": "cdr_id",
"document_id": "doc_id",
"extraction_policy": "replace",
"error_handling": "raise_error",
"resources": {
Expand All @@ -9,6 +9,7 @@
"landmark": [
],
"pickle": {
"my_dic": "/Users/majid/DIG/test_fields.txt"
}
},
"content_extraction": {
Expand All @@ -23,7 +24,22 @@
}
},
"data_extraction": [

{
"input_path": [
"*.table.tables[*]"
],
"fields": {
"my_field": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "my_dic"
}
}
}
}
}
}
],
"kg_enhancement": [

Expand Down

0 comments on commit 65a7a79

Please sign in to comment.