From 485b248091a6dffd8f4c0cd77a8fcb4fde8eca09 Mon Sep 17 00:00:00 2001 From: Dan Lee <71398022+dandhlee@users.noreply.github.com> Date: Wed, 25 Aug 2021 12:19:52 -0400 Subject: [PATCH] fix: parse markdown header more carefully (#111) * fix: check for markdown header more carefully * test: update unit test * fix: update parser and unit test * fix: removing redundant code and adding comment * test: update lint and open file formats * fix: update to parse_markdown_header --- docfx_yaml/extension.py | 42 ++++- tests/markdown_example.md | 2 +- tests/markdown_example_alternate.md | 7 + tests/markdown_example_alternate_bad.md | 6 + tests/markdown_example_alternate_header.md | 20 +++ tests/markdown_example_alternate_less.md | 7 + tests/markdown_example_bad_header.md | 4 + ..._example_bad.md => markdown_example_h2.md} | 2 +- tests/markdown_example_header.md | 2 +- tests/markdown_example_noheader.md | 2 +- tests/test_unit.py | 170 ++++++++++++++---- 11 files changed, 221 insertions(+), 43 deletions(-) create mode 100644 tests/markdown_example_alternate.md create mode 100644 tests/markdown_example_alternate_bad.md create mode 100644 tests/markdown_example_alternate_header.md create mode 100644 tests/markdown_example_alternate_less.md create mode 100644 tests/markdown_example_bad_header.md rename tests/{markdown_example_bad.md => markdown_example_h2.md} (59%) diff --git a/docfx_yaml/extension.py b/docfx_yaml/extension.py index dac1bfe4..9d28bb8b 100644 --- a/docfx_yaml/extension.py +++ b/docfx_yaml/extension.py @@ -1039,18 +1039,48 @@ def pretty_package_name(package_group): return " ".join(capitalized_name) +# Check is the current lines conform to markdown header format. +def parse_markdown_header(header_line, prev_line): + # Markdown h1 prefix should have only 1 of '#' character followed by exactly one space. + h1_header_prefix = "# " + if h1_header_prefix in header_line and header_line.count("#") == 1: + # Check for proper h1 header formatting, ensure there's more than just + # the hashtag character, and exactly only one space after the hashtag. + if not header_line[header_line.index(h1_header_prefix)+2].isspace() and \ + len(header_line) > 2: + + return header_line.strip("#").strip() + + elif "=" in header_line: + # Check if we're inspecting an empty or undefined lines. + if not prev_line: + return "" + + # Check if the current line only has equal sign divider. + if header_line.count("=") == len(header_line.strip()): + # Update header to the previous line. + return prev_line.strip() + + return "" + + # For a given markdown file, extract its header line. def extract_header_from_markdown(mdfile_iterator): + mdfile_name = mdfile_iterator.name.split("/")[-1].split(".")[0].capitalize() + prev_line = "" + for header_line in mdfile_iterator: + # Ignore licenses and other non-headers prior to the header. - if "#" in header_line: - break + header = parse_markdown_header(header_line, prev_line) + # If we've found the header, return the header. + if header != "": + return header - if header_line.count("#") != 1: - raise ValueError(f"The first header of {mdfile_iterator.name} is not a h1 header: {header_line}") + prev_line = header_line - # Extract the header name. - return header_line.strip("#").strip() + print(f"Could not find a title for {mdfile_iterator.name}. Using {mdfile_name} as the title instead.") + return mdfile_name # Given generated markdown files, incorporate them into the docfx_yaml output. diff --git a/tests/markdown_example.md b/tests/markdown_example.md index 04f9b339..a2208b1d 100644 --- a/tests/markdown_example.md +++ b/tests/markdown_example.md @@ -1,4 +1,4 @@ -#Test header for a simple markdown file. +# Test header for a simple markdown file. ##Content header This is a simple line followed by an h2 header. diff --git a/tests/markdown_example_alternate.md b/tests/markdown_example_alternate.md new file mode 100644 index 00000000..9945014e --- /dev/null +++ b/tests/markdown_example_alternate.md @@ -0,0 +1,7 @@ +This is a simple alternate header +================================= + +With a different style +---------------------- + +This is a simple markdown file testing for different header style. diff --git a/tests/markdown_example_alternate_bad.md b/tests/markdown_example_alternate_bad.md new file mode 100644 index 00000000..e500ba9a --- /dev/null +++ b/tests/markdown_example_alternate_bad.md @@ -0,0 +1,6 @@ +============== + +There should be a header line before the divider. + +##Content header +This is a simple line followed by an h2 header. diff --git a/tests/markdown_example_alternate_header.md b/tests/markdown_example_alternate_header.md new file mode 100644 index 00000000..ec83c703 --- /dev/null +++ b/tests/markdown_example_alternate_header.md @@ -0,0 +1,20 @@ + + +This is a simple alternate header +================================= + +With a different style +---------------------- + +This is a simple markdown file testing for different header style. diff --git a/tests/markdown_example_alternate_less.md b/tests/markdown_example_alternate_less.md new file mode 100644 index 00000000..38194ce9 --- /dev/null +++ b/tests/markdown_example_alternate_less.md @@ -0,0 +1,7 @@ +This is a simple alternate header +========= + +With less divider length but it's still a header. +-------- + +This is a markdown file to test for alternate header style with shorter divider. diff --git a/tests/markdown_example_bad_header.md b/tests/markdown_example_bad_header.md new file mode 100644 index 00000000..6a3a1893 --- /dev/null +++ b/tests/markdown_example_bad_header.md @@ -0,0 +1,4 @@ + #Test header for a bad formatted markdown file. + +##Content header +This is a simple line followed by an h2 header. diff --git a/tests/markdown_example_bad.md b/tests/markdown_example_h2.md similarity index 59% rename from tests/markdown_example_bad.md rename to tests/markdown_example_h2.md index 8697361a..bc90dbe2 100644 --- a/tests/markdown_example_bad.md +++ b/tests/markdown_example_h2.md @@ -1,4 +1,4 @@ -##Test header for a simple markdown file. +## Test header for a simple markdown file. ##Content header This is a simple line followed by an h2 header. diff --git a/tests/markdown_example_header.md b/tests/markdown_example_header.md index fb392b54..9a534ce3 100644 --- a/tests/markdown_example_header.md +++ b/tests/markdown_example_header.md @@ -12,7 +12,7 @@ limitations under the License. --> -# Test header for a simple markdown file. +# Test header for a simple markdown file. ##Content header This is a simple line followed by an h2 header. diff --git a/tests/markdown_example_noheader.md b/tests/markdown_example_noheader.md index 1c27dd02..6a69ee45 100644 --- a/tests/markdown_example_noheader.md +++ b/tests/markdown_example_noheader.md @@ -2,4 +2,4 @@ This is a simple markdown file with no header. When running the test on this file to extract its header, -it should throw an exception. +it should use the filename as the title. diff --git a/tests/test_unit.py b/tests/test_unit.py index 39443bb1..91fd57ed 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -8,6 +8,7 @@ from docfx_yaml.extension import pretty_package_name from docfx_yaml.extension import group_by_package from docfx_yaml.extension import extract_header_from_markdown +from docfx_yaml.extension import parse_markdown_header import unittest @@ -40,40 +41,35 @@ def test_find_unique_name(self): def test_disambiguate_toc_name(self): - want_file = open('tests/yaml_post.yaml', 'r') - yaml_want = load(want_file, Loader=Loader) + with open('tests/yaml_post.yaml', 'r') as want_file: + yaml_want = load(want_file, Loader=Loader) disambiguated_names_want = { 'google.cloud.spanner_admin_database_v1.types': 'spanner_admin_database_v1.types', - 'google.cloud.spanner_admin_instance_v1.types': 'spanner_admin_instance_v1.types', + 'google.cloud.spanner_admin_instance_v1.types': 'spanner_admin_instance_v1.types', 'google.cloud.spanner_v1.types': 'spanner_v1.types' } - test_file = open('tests/yaml_pre.yaml', 'r') - yaml_got = load(test_file, Loader=Loader) + with open('tests/yaml_pre.yaml', 'r') as test_file: + yaml_got = load(test_file, Loader=Loader) disambiguated_names_got = disambiguate_toc_name(yaml_got) - want_file.close() - test_file.close() - self.assertEqual(yaml_want, yaml_got) self.assertEqual(disambiguated_names_want, disambiguated_names_got) def test_disambiguate_toc_name_duplicate(self): - want_file = open('tests/yaml_post_duplicate.yaml', 'r') - yaml_want = load(want_file, Loader=Loader) + with open('tests/yaml_post_duplicate.yaml', 'r') as want_file: + yaml_want = load(want_file, Loader=Loader) disambiguated_names_want = { - 'google.api_core.client_info': 'client_info', + 'google.api_core.client_info': 'client_info', 'google.api_core.gapic_v1.client_info': 'gapic_v1.client_info' } - - test_file = open('tests/yaml_pre_duplicate.yaml', 'r') - yaml_got = load(test_file, Loader=Loader) + + with open('tests/yaml_pre_duplicate.yaml', 'r') as test_file: + yaml_got = load(test_file, Loader=Loader) disambiguated_names_got = disambiguate_toc_name(yaml_got) - want_file.close() - test_file.close() self.assertEqual(yaml_want, yaml_got) self.assertEqual(disambiguated_names_want, disambiguated_names_got) @@ -538,41 +534,149 @@ def test_group_by_package(self): self.assertCountEqual(toc_yaml_got, toc_yaml_want) + def test_parse_markdown_header(self): + # Test for simple header_line. + header_line_want = "Test header" + + header_line = "# Test header" + prev_line = "" + + header_line_got = parse_markdown_header(header_line, prev_line) + + self.assertEqual(header_line_got, header_line_want) + + # Test for invalid input. + header_line_want = "" + + header_line = "#Test header" + prev_line = "" + + header_line_got = parse_markdown_header(header_line, prev_line) + + self.assertEqual(header_line_got, header_line_want) + + # Test for invalid input. + header_line_want = "" + + header_line = "# Test header" + prev_line = "" + + header_line_got = parse_markdown_header(header_line, prev_line) + + self.assertEqual(header_line_got, header_line_want) + + # Test for no header. + header_line_want = "" + + header_line = "-->" + prev_line = "limitations under the License.\n" + + header_line_got = parse_markdown_header(header_line, prev_line) + + self.assertEqual(header_line_got, header_line_want) + + + def test_parse_markdown_header_alternate(self): + # Test for simple alternate header. + header_line_want = "Test header" + + header_line = "============\n" + prev_line = "Test header" + + header_line_got = parse_markdown_header(header_line, prev_line) + + self.assertEqual(header_line_got, header_line_want) + + # Test for no header. + header_line_want = "" + + header_line = "============\n" + prev_line = "" + + header_line_got = parse_markdown_header(header_line, prev_line) + + self.assertEqual(header_line_got, header_line_want) + + + # Test for shorter divider. + header_line_want = "Test header" + + header_line = "======\n" + prev_line = "Test header" + + header_line_got = parse_markdown_header(header_line, prev_line) + + self.assertEqual(header_line_got, header_line_want) + + def test_extract_header_from_markdown(self): # Check the header for a normal markdown file. header_line_want = "Test header for a simple markdown file." - mdfile = open('tests/markdown_example.md', 'r') - header_line_got = extract_header_from_markdown(mdfile) + with open('tests/markdown_example.md', 'r') as mdfile: + header_line_got = extract_header_from_markdown(mdfile) self.assertEqual(header_line_got, header_line_want) - mdfile.close() # The header should be the same even with the license header. header_line_with_license_want = header_line_want - mdfile = open('tests/markdown_example_header.md', 'r') - header_line_with_license_got = extract_header_from_markdown(mdfile) + with open('tests/markdown_example_header.md', 'r') as mdfile_license: + header_line_with_license_got = extract_header_from_markdown(mdfile_license) self.assertEqual(header_line_with_license_got, header_line_with_license_want) - mdfile.close() - def test_extract_header_from_markdown_check_error(self): - # Check an exception is thrown for markdown files that's not well - # formatted. - mdfile = open('tests/markdown_example_bad.md', 'r') - with self.assertRaises(ValueError): - header_line = extract_header_from_markdown(mdfile) + def test_extract_header_from_markdown_alternate_header(self): + # Check the header for an alternate header style. + header_line_want = "This is a simple alternate header" - mdfile.close() + with open('tests/markdown_example_alternate.md', 'r') as mdfile: + header_line_got = extract_header_from_markdown(mdfile) - mdfile = open('tests/markdown_example_noheader.md', 'r') - with self.assertRaises(ValueError): - header_line = extract_header_from_markdown(mdfile) + self.assertEqual(header_line_got, header_line_want) + + # The header should be the same even with the license header. + header_line_with_license_want = header_line_want + + with open('tests/markdown_example_alternate_header.md', 'r') as mdfile: + header_line_with_license_got = extract_header_from_markdown(mdfile) + + self.assertEqual(header_line_with_license_got, header_line_with_license_want) + + # Check the header for an alternate header style. + header_line_want = "This is a simple alternate header" + + with open('tests/markdown_example_alternate_less.md', 'r') as mdfile: + header_line_got = extract_header_from_markdown(mdfile) + + self.assertEqual(header_line_got, header_line_want) + + + def test_extract_header_from_markdown_bad_headers(self): + # Check that the filename is used as header if no valid header is found. + header_line_want = "Markdown_example_bad_header" + + with open('tests/markdown_example_bad_header.md', 'r') as mdfile: + header_line_got = extract_header_from_markdown(mdfile) + + self.assertEqual(header_line_want, header_line_got) + + # Check that only h1 headers are parsed. + header_line_want = "Markdown_example_h2" + + with open('tests/markdown_example_h2.md', 'r') as mdfile: + header_line_got = extract_header_from_markdown(mdfile) + + self.assertEqual(header_line_want, header_line_got) + + # Check that there must be a line before the h1 header breaker. + header_line_want = "Markdown_example_alternate_bad" - mdfile.close() + with open('tests/markdown_example_alternate_bad.md', 'r') as mdfile: + header_line_got = extract_header_from_markdown(mdfile) + self.assertEqual(header_line_want, header_line_got) if __name__ == '__main__':