fix: parse markdown header more carefully (#111)

* fix: check for markdown header more carefully * test: update unit test * fix: update parser and unit test * fix: removing redundant code and adding comment * test: update lint and open file formats * fix: update to parse_markdown_header
googleapis · Aug 25, 2021 · 485b248 · 485b248
1 parent 18bf0de
commit 485b248
Show file tree

Hide file tree

Showing 11 changed files with 221 additions and 43 deletions.
diff --git a/docfx_yaml/extension.py b/docfx_yaml/extension.py
@@ -1039,18 +1039,48 @@ def pretty_package_name(package_group):
     return " ".join(capitalized_name)
 
 
+# Check is the current lines conform to markdown header format.
+def parse_markdown_header(header_line, prev_line):
+    # Markdown h1 prefix should have only 1 of '#' character followed by exactly one space.
+    h1_header_prefix = "# "
+    if h1_header_prefix in header_line and header_line.count("#") == 1:
+        # Check for proper h1 header formatting, ensure there's more than just
+        # the hashtag character, and exactly only one space after the hashtag.
+        if not header_line[header_line.index(h1_header_prefix)+2].isspace() and \
+            len(header_line) > 2:
+
+            return header_line.strip("#").strip()
+
+    elif "=" in header_line:
+        # Check if we're inspecting an empty or undefined lines.
+        if not prev_line:
+            return ""
+
+        # Check if the current line only has equal sign divider.
+        if header_line.count("=") == len(header_line.strip()):
+            # Update header to the previous line.
+            return prev_line.strip()
+
+    return ""
+
+
 # For a given markdown file, extract its header line.
 def extract_header_from_markdown(mdfile_iterator):
+    mdfile_name = mdfile_iterator.name.split("/")[-1].split(".")[0].capitalize()
+    prev_line = ""
+
     for header_line in mdfile_iterator:
+
         # Ignore licenses and other non-headers prior to the header.
-        if "#" in header_line:
-            break
+        header = parse_markdown_header(header_line, prev_line)
+        # If we've found the header, return the header.
+        if header != "":
+            return header
 
-    if header_line.count("#") != 1:
-        raise ValueError(f"The first header of {mdfile_iterator.name} is not a h1 header: {header_line}")
+        prev_line = header_line
 
-    # Extract the header name.
-    return header_line.strip("#").strip()
+    print(f"Could not find a title for {mdfile_iterator.name}. Using {mdfile_name} as the title instead.")
+    return mdfile_name
 
 
 # Given generated markdown files, incorporate them into the docfx_yaml output.

diff --git a/tests/markdown_example.md b/tests/markdown_example.md
@@ -1,4 +1,4 @@
-#Test header for a simple markdown file.  
+# Test header for a simple markdown file.  
 
 ##Content header
 This is a simple line followed by an h2 header.
diff --git a/tests/markdown_example_alternate.md b/tests/markdown_example_alternate.md
@@ -0,0 +1,7 @@
+This is a simple alternate header
+=================================
+
+With a different style
+----------------------
+
+This is a simple markdown file testing for different header style.
diff --git a/tests/markdown_example_alternate_bad.md b/tests/markdown_example_alternate_bad.md
@@ -0,0 +1,6 @@
+==============
+
+There should be a header line before the divider.
+
+##Content header
+This is a simple line followed by an h2 header.
diff --git a/tests/markdown_example_alternate_header.md b/tests/markdown_example_alternate_header.md
@@ -0,0 +1,20 @@
+<!--
+Copyright 2021 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+This is a simple alternate header
+=================================
+
+With a different style
+----------------------
+
+This is a simple markdown file testing for different header style.
diff --git a/tests/markdown_example_alternate_less.md b/tests/markdown_example_alternate_less.md
@@ -0,0 +1,7 @@
+This is a simple alternate header
+=========
+
+With less divider length but it's still a header.
+--------
+
+This is a markdown file to test for alternate header style with shorter divider.
diff --git a/tests/markdown_example_bad_header.md b/tests/markdown_example_bad_header.md
@@ -0,0 +1,4 @@
+ #Test header for a bad formatted markdown file.  
+
+##Content header
+This is a simple line followed by an h2 header.
diff --git a/tests/markdown_example_bad.md → tests/markdown_example_h2.md b/tests/markdown_example_bad.md → tests/markdown_example_h2.md
@@ -1,4 +1,4 @@
-##Test header for a simple markdown file.  
+## Test header for a simple markdown file.  
 
 ##Content header
 This is a simple line followed by an h2 header.
diff --git a/tests/markdown_example_header.md b/tests/markdown_example_header.md
@@ -12,7 +12,7 @@ limitations under the License.
 -->
 
 
-#  Test header for a simple markdown file.
+# Test header for a simple markdown file.
 
 ##Content header
 This is a simple line followed by an h2 header.
diff --git a/tests/markdown_example_noheader.md b/tests/markdown_example_noheader.md
@@ -2,4 +2,4 @@ This is a simple markdown file with no header.
 
 When running the test on this file to extract its header,
 
-it should throw an exception.
+it should use the filename as the title.
diff --git a/tests/test_unit.py b/tests/test_unit.py
@@ -8,6 +8,7 @@
 from docfx_yaml.extension import pretty_package_name
 from docfx_yaml.extension import group_by_package
 from docfx_yaml.extension import extract_header_from_markdown
+from docfx_yaml.extension import parse_markdown_header
 
 import unittest
 
@@ -40,40 +41,35 @@ def test_find_unique_name(self):
 
     def test_disambiguate_toc_name(self):
 
-        want_file = open('tests/yaml_post.yaml', 'r')
-        yaml_want = load(want_file, Loader=Loader)
+        with open('tests/yaml_post.yaml', 'r') as want_file:
+            yaml_want = load(want_file, Loader=Loader)
         disambiguated_names_want = {
             'google.cloud.spanner_admin_database_v1.types': 'spanner_admin_database_v1.types',
-            'google.cloud.spanner_admin_instance_v1.types': 'spanner_admin_instance_v1.types', 
+            'google.cloud.spanner_admin_instance_v1.types': 'spanner_admin_instance_v1.types',
             'google.cloud.spanner_v1.types': 'spanner_v1.types'
         }
 
-        test_file = open('tests/yaml_pre.yaml', 'r')
-        yaml_got = load(test_file, Loader=Loader)
+        with open('tests/yaml_pre.yaml', 'r') as test_file:
+            yaml_got = load(test_file, Loader=Loader)
         disambiguated_names_got = disambiguate_toc_name(yaml_got)
 
-        want_file.close()
-        test_file.close()
-
         self.assertEqual(yaml_want, yaml_got)
         self.assertEqual(disambiguated_names_want, disambiguated_names_got)
 
 
     def test_disambiguate_toc_name_duplicate(self):
 
-        want_file = open('tests/yaml_post_duplicate.yaml', 'r')
-        yaml_want = load(want_file, Loader=Loader)
+        with open('tests/yaml_post_duplicate.yaml', 'r') as want_file:
+            yaml_want = load(want_file, Loader=Loader)
         disambiguated_names_want = {
-            'google.api_core.client_info': 'client_info', 
+            'google.api_core.client_info': 'client_info',
             'google.api_core.gapic_v1.client_info': 'gapic_v1.client_info'
         }
-        
-        test_file = open('tests/yaml_pre_duplicate.yaml', 'r')
-        yaml_got = load(test_file, Loader=Loader)
+
+        with open('tests/yaml_pre_duplicate.yaml', 'r') as test_file:
+            yaml_got = load(test_file, Loader=Loader)
         disambiguated_names_got = disambiguate_toc_name(yaml_got)
 
-        want_file.close()
-        test_file.close()
 
         self.assertEqual(yaml_want, yaml_got)
         self.assertEqual(disambiguated_names_want, disambiguated_names_got)
@@ -538,41 +534,149 @@ def test_group_by_package(self):
         self.assertCountEqual(toc_yaml_got, toc_yaml_want)
 
 
+    def test_parse_markdown_header(self):
+        # Test for simple header_line.
+        header_line_want = "Test header"
+
+        header_line = "# Test header"
+        prev_line = ""
+
+        header_line_got = parse_markdown_header(header_line, prev_line)
+
+        self.assertEqual(header_line_got, header_line_want)
+
+        # Test for invalid input.
+        header_line_want = ""
+
+        header_line = "#Test header"
+        prev_line = ""
+
+        header_line_got = parse_markdown_header(header_line, prev_line)
+
+        self.assertEqual(header_line_got, header_line_want)
+
+        # Test for invalid input.
+        header_line_want = ""
+
+        header_line = "#  Test header"
+        prev_line = ""
+
+        header_line_got = parse_markdown_header(header_line, prev_line)
+
+        self.assertEqual(header_line_got, header_line_want)
+
+        # Test for no header.
+        header_line_want = ""
+
+        header_line = "-->"
+        prev_line = "limitations under the License.\n"
+
+        header_line_got = parse_markdown_header(header_line, prev_line)
+
+        self.assertEqual(header_line_got, header_line_want)
+
+
+    def test_parse_markdown_header_alternate(self):
+        # Test for simple alternate header.
+        header_line_want = "Test header"
+
+        header_line = "============\n"
+        prev_line = "Test header"
+
+        header_line_got = parse_markdown_header(header_line, prev_line)
+
+        self.assertEqual(header_line_got, header_line_want)
+
+        # Test for no header.
+        header_line_want = ""
+
+        header_line = "============\n"
+        prev_line = ""
+
+        header_line_got = parse_markdown_header(header_line, prev_line)
+
+        self.assertEqual(header_line_got, header_line_want)
+
+
+        # Test for shorter divider.
+        header_line_want = "Test header"
+
+        header_line = "======\n"
+        prev_line = "Test header"
+
+        header_line_got = parse_markdown_header(header_line, prev_line)
+
+        self.assertEqual(header_line_got, header_line_want)
+
+
     def test_extract_header_from_markdown(self):
         # Check the header for a normal markdown file.
         header_line_want = "Test header for a simple markdown file."
 
-        mdfile = open('tests/markdown_example.md', 'r')
-        header_line_got = extract_header_from_markdown(mdfile)
+        with open('tests/markdown_example.md', 'r') as mdfile:
+            header_line_got = extract_header_from_markdown(mdfile)
 
         self.assertEqual(header_line_got, header_line_want)
-        mdfile.close()
 
         # The header should be the same even with the license header.
         header_line_with_license_want = header_line_want
 
-        mdfile = open('tests/markdown_example_header.md', 'r')
-        header_line_with_license_got = extract_header_from_markdown(mdfile)
+        with open('tests/markdown_example_header.md', 'r') as mdfile_license:
+            header_line_with_license_got = extract_header_from_markdown(mdfile_license)
 
         self.assertEqual(header_line_with_license_got, header_line_with_license_want)
-        mdfile.close()
 
 
-    def test_extract_header_from_markdown_check_error(self):
-        # Check an exception is thrown for markdown files that's not well
-        # formatted.
-        mdfile = open('tests/markdown_example_bad.md', 'r')
-        with self.assertRaises(ValueError):
-            header_line = extract_header_from_markdown(mdfile)
+    def test_extract_header_from_markdown_alternate_header(self):
+        # Check the header for an alternate header style.
+        header_line_want = "This is a simple alternate header"
 
-        mdfile.close()
+        with open('tests/markdown_example_alternate.md', 'r') as mdfile:
+            header_line_got = extract_header_from_markdown(mdfile)
 
-        mdfile = open('tests/markdown_example_noheader.md', 'r')
-        with self.assertRaises(ValueError):
-            header_line = extract_header_from_markdown(mdfile)
+        self.assertEqual(header_line_got, header_line_want)
+
+        # The header should be the same even with the license header.
+        header_line_with_license_want = header_line_want
+
+        with open('tests/markdown_example_alternate_header.md', 'r') as mdfile:
+            header_line_with_license_got = extract_header_from_markdown(mdfile)
+
+        self.assertEqual(header_line_with_license_got, header_line_with_license_want)
+
+        # Check the header for an alternate header style.
+        header_line_want = "This is a simple alternate header"
+
+        with open('tests/markdown_example_alternate_less.md', 'r') as mdfile:
+            header_line_got = extract_header_from_markdown(mdfile)
+
+        self.assertEqual(header_line_got, header_line_want)
+
+
+    def test_extract_header_from_markdown_bad_headers(self):
+        # Check that the filename is used as header if no valid header is found.
+        header_line_want = "Markdown_example_bad_header"
+
+        with open('tests/markdown_example_bad_header.md', 'r') as mdfile:
+            header_line_got = extract_header_from_markdown(mdfile)
+
+        self.assertEqual(header_line_want, header_line_got)
+
+        # Check that only h1 headers are parsed.
+        header_line_want = "Markdown_example_h2"
+
+        with open('tests/markdown_example_h2.md', 'r') as mdfile:
+            header_line_got = extract_header_from_markdown(mdfile)
+
+        self.assertEqual(header_line_want, header_line_got)
+
+        # Check that there must be a line before the h1 header breaker.
+        header_line_want = "Markdown_example_alternate_bad"
 
-        mdfile.close()
+        with open('tests/markdown_example_alternate_bad.md', 'r') as mdfile:
+            header_line_got = extract_header_from_markdown(mdfile)
 
+        self.assertEqual(header_line_want, header_line_got)
 
 
 if __name__ == '__main__':