Skip to content

Commit

Permalink
Add an integration test that feeds the HTML metadata extractor's output into the spaCy rule extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
saggu committed Jul 27, 2018
1 parent d79a972 commit f9b1c46
Show file tree
Hide file tree
Showing 3 changed files with 494 additions and 0 deletions.
298 changes: 298 additions & 0 deletions etk/unit_tests/ground_truth/news.html

Large diffs are not rendered by default.

170 changes: 170 additions & 0 deletions etk/unit_tests/ground_truth/sample_spacy_rule.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
{
"rules": [
{
"polarity": true,
"description": "",
"pattern": [
{
"prefix": "",
"suffix": "",
"capitalization": [],
"part_of_speech": [],
"length": [],
"maximum": "",
"minimum": "",
"shapes": [],
"token": [
"Iranian",
"General",
"Locks",
"Horns",
"With"
],
"numbers": [],
"is_in_vocabulary": false,
"is_out_of_vocabulary": false,
"is_required": true,
"type": "word",
"is_in_output": false,
"match_all_forms": false,
"contain_digit": false
},
{
"prefix": "",
"suffix": "",
"capitalization": [],
"part_of_speech": [],
"length": [],
"maximum": "",
"minimum": "",
"shapes": [],
"token": [
"Trump"
],
"numbers": [],
"is_in_vocabulary": false,
"is_out_of_vocabulary": false,
"is_required": true,
"type": "word",
"is_in_output": true,
"match_all_forms": false,
"contain_digit": false
}
],
"output_format": "",
"is_active": true,
"dependencies": [],
"identifier": "current rule"
}
],
"test_text": "Iranian General Locks Horns With Trump, Escalating Threat-Filled Feud - The New York Times",
"field_name": "dummy",
"test_tokens": [
{
"index": 0,
"whitespace": " ",
"text": "Iranian"
},
{
"index": 1,
"whitespace": " ",
"text": "General"
},
{
"index": 2,
"whitespace": " ",
"text": "Locks"
},
{
"index": 3,
"whitespace": " ",
"text": "Horns"
},
{
"index": 4,
"whitespace": " ",
"text": "With"
},
{
"index": 5,
"whitespace": "",
"text": "Trump"
},
{
"index": 6,
"whitespace": " ",
"text": ","
},
{
"index": 7,
"whitespace": " ",
"text": "Escalating"
},
{
"index": 8,
"whitespace": "",
"text": "Threat"
},
{
"index": 9,
"whitespace": "",
"text": "-"
},
{
"index": 10,
"whitespace": " ",
"text": "Filled"
},
{
"index": 11,
"whitespace": " ",
"text": "Feud"
},
{
"index": 12,
"whitespace": " ",
"text": "-"
},
{
"index": 13,
"whitespace": " ",
"text": "The"
},
{
"index": 14,
"whitespace": " ",
"text": "New"
},
{
"index": 15,
"whitespace": " ",
"text": "York"
},
{
"index": 16,
"whitespace": "",
"text": "Times"
}
],
"results": [
{
"confidence": 1.0,
"start_token": 4,
"end_token": 6,
"start_char": 28,
"end_char": 38,
"identifier": "current rule",
"text": "Trump",
"token_based_match_mapping": {
"0": [
0,
1
],
"1": [
1,
2
]
}
}
]
}
26 changes: 26 additions & 0 deletions etk/unit_tests/test_spacy_rule_and_html_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import unittest
from etk.extractors.spacy_rule_extractor import SpacyRuleExtractor
import spacy
from etk.extractors.html_metadata_extractor import HTMLMetadataExtractor
import json


class TestSpacyRuleExtractor(unittest.TestCase):
    """Integration test: HTMLMetadataExtractor output fed into SpacyRuleExtractor."""

    def test_SpacyRuleExtractor(self) -> None:
        """Run the sample spaCy rule over the title extracted from news.html.

        Both fixture paths are relative, so the test must be launched from
        the directory that contains the ``etk`` package.
        """
        hme = HTMLMetadataExtractor()
        with open('etk/unit_tests/ground_truth/news.html', 'r') as f:
            sample_html = f.read()

        # Load the rules inside a context manager so the file handle is
        # closed deterministically instead of being left to the GC.
        with open('etk/unit_tests/ground_truth/sample_spacy_rule.json') as rules_file:
            sample_rules = json.load(rules_file)

        # Only the first extraction returned for extract_title=True is used.
        title_extraction = hme.extract(sample_html, extract_title=True)[0].value

        sample_rule_extractor = SpacyRuleExtractor(spacy.load("en_core_web_sm"), sample_rules, "dummy")
        extractions = sample_rule_extractor.extract(title_extraction)
        expected_extraction = 'Trump'
        self.assertEqual(extractions[0].value, expected_extraction)


# Allow running this test module directly as a script; unittest.main()
# runs the TestCase classes defined in this module.
if __name__ == '__main__':
    unittest.main()

0 comments on commit f9b1c46

Please sign in to comment.