diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 36f0daae75..f477a44714 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -164,23 +164,24 @@ def find_interlinks(raw): Returns ------- - dict - Mapping from the linked article to the actual text found. + list + List of tuples in format [(linked article, the actual text found), ...]. """ filtered = filter_wiki(raw, promote_remaining=False, simplify_links=False) interlinks_raw = re.findall(RE_P16, filtered) - interlinks = {} + interlinks = [] for parts in [i.split('|') for i in interlinks_raw]: actual_title = parts[0] try: interlink_text = parts[1] - interlinks[actual_title] = interlink_text except IndexError: - interlinks[actual_title] = actual_title + interlink_text = actual_title + interlink_tuple = (actual_title, interlink_text) + interlinks.append(interlink_tuple) - legit_interlinks = {i: j for i, j in interlinks.items() if '[' not in i and ']' not in i} + legit_interlinks = [(i, j) for i, j in interlinks if '[' not in i and ']' not in i] return legit_interlinks diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index db15619fd6..209b83424c 100644 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -90,8 +90,9 @@ def segment_all_articles(file_path, min_article_character=200, workers=None, inc Yields ------ - (str, list of (str, str), (Optionally) dict of str: str) - Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}). + (str, list of (str, str), (Optionally) list of (str, str)) + Structure contains (title, [(section_heading, section_content), ...], + (Optionally) [(interlink_article, interlink_text), ...]). 
""" with gensim.utils.open(file_path, 'rb') as xml_fileobj: @@ -215,8 +216,9 @@ def segment(page_xml, include_interlinks=False): Returns ------- - (str, list of (str, str), (Optionally) dict of (str: str)) - Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}). + (str, list of (str, str), (Optionally) list of (str, str)) + Structure contains (title, [(section_heading, section_content), ...], + (Optionally) [(interlink_article, interlink_text), ...]). """ elem = cElementTree.fromstring(page_xml) @@ -313,8 +315,9 @@ def get_texts_with_sections(self): Yields ------ - (str, list of (str, str), dict of (str: str)) - Structure contains (title, [(section_heading, section_content), ...], (Optionally){interlinks}). + (str, list of (str, str), list of (str, str)) + Structure contains (title, [(section_heading, section_content), ...], + (Optionally) [(interlink_article, interlink_text), ...]). """ skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0 @@ -378,7 +381,7 @@ def get_texts_with_sections(self): parser.add_argument( '-i', '--include-interlinks', help='Include a mapping for interlinks to other articles in the dump. 
The mappings format is: ' - '"interlinks": {"article_title_1": "interlink_text_1", "article_title_2": "interlink_text_2", ...}', + '"interlinks": [("article_title_1", "interlink_text_1"), ("article_title_2", "interlink_text_2"), ...]', action='store_true' ) args = parser.parse_args() diff --git a/gensim/test/test_scripts.py b/gensim/test/test_scripts.py index 1e0144e2af..8f7bc5d9eb 100644 --- a/gensim/test/test_scripts.py +++ b/gensim/test/test_scripts.py @@ -70,9 +70,10 @@ def test_segment_all_articles(self): self.assertTrue(first_sentence in first_section_text) # Check interlinks - self.assertTrue(interlinks['self-governance'] == 'self-governed') - self.assertTrue(interlinks['Hierarchy'] == 'hierarchical') - self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon') + self.assertEqual(len(interlinks), 685) + self.assertTrue(interlinks[0] == ("political philosophy", "political philosophy")) + self.assertTrue(interlinks[1] == ("self-governance", "self-governed")) + self.assertTrue(interlinks[2] == ("stateless society", "stateless societies")) def test_generator_len(self): expected_num_articles = 106 @@ -105,9 +106,11 @@ def test_segment_and_write_all_articles(self): self.assertEqual(section_titles, self.expected_section_titles) # Check interlinks - self.assertTrue(interlinks['self-governance'] == 'self-governed') - self.assertTrue(interlinks['Hierarchy'] == 'hierarchical') - self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon') + # JSON has no tuples, only lists. So, we convert lists to tuples explicitly before comparison. + self.assertEqual(len(interlinks), 685) + self.assertEqual(tuple(interlinks[0]), ("political philosophy", "political philosophy")) + self.assertEqual(tuple(interlinks[1]), ("self-governance", "self-governed")) + self.assertEqual(tuple(interlinks[2]), ("stateless society", "stateless societies")) class TestWord2Vec2Tensor(unittest.TestCase):