Skip to content

Commit

Permalink
When using extract_using_spacy, the field name can be anything with 'date' in it
Browse files Browse the repository at this point in the history
  • Loading branch information
saggu committed Dec 7, 2017
1 parent 11ae748 commit 251af9d
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 6 deletions.
6 changes: 4 additions & 2 deletions etk/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
# String constants; extract_using_spacy compares the configured field name
# against _AGE and _POSTING_DATE to choose an extractor.
_URL = 'url'
_AGE = 'age'
_POSTING_DATE = 'posting_date'
# Substring tested against the field name in extract_using_spacy, so any field
# whose name contains 'date' is routed to the spaCy date extractor.
_DATE = 'date'
_SOCIAL_MEDIA = 'social_media'
_ADDRESS = 'address'
_RESOURCES = 'resources'
Expand Down Expand Up @@ -1466,11 +1467,12 @@ def extract_using_spacy(self, d, config):
if field_name == _AGE:
results = self._relevant_text_from_context(d[_SIMPLE_TOKENS],
spacy_age_extractor.extract(nlp_doc, self.matchers[_AGE]), _AGE)
elif field_name == _POSTING_DATE:
elif field_name == _POSTING_DATE or _DATE in field_name:
self.load_matchers(_POSTING_DATE)
results = self._relevant_text_from_context(d[_SIMPLE_TOKENS],
spacy_date_extractor.extract(nlp_doc,
self.matchers[_POSTING_DATE]),
_POSTING_DATE)
field_name)
if _POST_FILTER in config:
post_filters = config[_POST_FILTER]
results = self.run_post_filters_results(results, post_filters)
Expand Down
44 changes: 40 additions & 4 deletions etk/unit_tests/test_extractions_using_spacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import sys
import os
import json

sys.path.append('../../')
sys.path.append('../')
from etk.core import Core


class TestExtractionsUsingSpacy(unittest.TestCase):

def setUp(self):

e_config = {
Expand Down Expand Up @@ -89,8 +89,8 @@ def create_list_from_social_media(extractions):
if not isinstance(ps, list):
ps = [ps]
for p in ps:
x = p['qualifiers']['social_network']
results[x] = [p['extracted_value']]
x = p['qualifiers']['social_network']
results[x] = [p['extracted_value']]
return results

def test_spacy_extractions(self):
Expand Down Expand Up @@ -227,7 +227,43 @@ def test_spacy_extractions(self):

correct_addresses = t['extracted']
self.assertEquals(extracted_addresses.sort(), correct_addresses.sort())


def test_spacy_date(self):
    """Check spaCy date extraction for a field whose name only contains 'date'.

    The field is configured as ``event_date`` (not ``posting_date``) with the
    ``extract_using_spacy`` extractor and the ``parse_date`` post filter. The
    processed document is expected to carry the date found in the text as
    ``2006-02-07T00:00:00`` in its knowledge graph.
    """
    doc = {
        "url": "http://date.test.com",
        "doc_id": "12344",
        "content_extraction": {
            "useful_text": {
                "text": u"Alert: Tue, 2006-02-07"
            }
        }
    }
    e_config = {
        "document_id": "doc_id",
        'data_extraction': [
            {
                "fields": {
                    "event_date": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {
                                    "post_filter": "parse_date"
                                }
                            }
                        }
                    }
                },
                "input_path": [
                    "content_extraction.useful_text.text.`parent`"
                ]
            }
        ]
    }
    core = Core(extraction_config=e_config)
    r = core.process(doc)
    kg = r['knowledge_graph']
    # assertIn names the missing key and the container on failure, whereas
    # assertTrue('event_date' in kg) would only report "False is not true".
    self.assertIn('event_date', kg)
    self.assertEqual(kg['event_date'][0]['value'], '2006-02-07T00:00:00')


# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 251af9d

Please sign in to comment.