Skip to content

Commit

Permalink
fix: parse markdown header more carefully (#111)
Browse files Browse the repository at this point in the history
* fix: check for markdown header more carefully

* test: update unit test

* fix: update parser and unit test

* fix: removing redundant code and adding comment

* test: update lint and open file formats

* fix: update to parse_markdown_header
  • Loading branch information
dandhlee committed Aug 25, 2021
1 parent 18bf0de commit 485b248
Show file tree
Hide file tree
Showing 11 changed files with 221 additions and 43 deletions.
42 changes: 36 additions & 6 deletions docfx_yaml/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,18 +1039,48 @@ def pretty_package_name(package_group):
return " ".join(capitalized_name)


# Check whether the current line conforms to the markdown h1 header format.
def parse_markdown_header(header_line, prev_line):
    """Return the h1 title described by ``header_line``, or ``""`` if none.

    Two markdown h1 styles are recognised:

    * Hash style: the line contains exactly one ``#`` character, followed by
      exactly one space and then non-whitespace text (``# Title``).
    * Divider style: the line consists solely of ``=`` characters, in which
      case the title is the text of ``prev_line``.

    Args:
        header_line: The line currently being inspected.
        prev_line: The line that preceded ``header_line``. Only used for the
            divider style; may be empty.

    Returns:
        The header text with surrounding whitespace removed, or an empty
        string when ``header_line`` is not a well-formed h1 header.
    """
    # Markdown h1 prefix should have only 1 of '#' character followed by
    # exactly one space.
    h1_header_prefix = "# "
    if h1_header_prefix in header_line and header_line.count("#") == 1:
        # Look at the character right after the prefix. A slice is used
        # instead of an index so that a line ending with the bare prefix
        # (for example "# " on the last line of a file without a trailing
        # newline) yields an empty string rather than raising IndexError.
        after_prefix = header_line.index(h1_header_prefix) + len(h1_header_prefix)
        first_char = header_line[after_prefix:after_prefix + 1]

        # There must be header text, and it must start immediately after the
        # single space (a second space means the header is malformed).
        if first_char and not first_char.isspace():
            return header_line.strip("#").strip()

    elif "=" in header_line:
        # A divider without a preceding line has no title to refer to.
        if not prev_line:
            return ""

        # The line is a divider only if it contains nothing but equal signs.
        if header_line.count("=") == len(header_line.strip()):
            # The header text is the previous line.
            return prev_line.strip()

    return ""


# For a given markdown file, extract its header line.
def extract_header_from_markdown(mdfile_iterator):
    """Return the first h1 header of a markdown file, or a name-based fallback.

    Every line is passed to ``parse_markdown_header`` together with the line
    before it; the first non-empty result is returned. If no line yields a
    header, a message is printed and a title derived from the file name is
    returned instead.

    Args:
        mdfile_iterator: An iterable of text lines that also exposes a
            ``name`` attribute holding the file path (for example an open
            file object).

    Returns:
        The header text, or the capitalized file name (last ``/`` separated
        path component, cut at its first ``.``) when no header was found.
    """
    # Fallback title: base name without extension, capitalized.
    mdfile_name = mdfile_iterator.name.split("/")[-1].split(".")[0].capitalize()
    prev_line = ""

    for header_line in mdfile_iterator:
        # Ignore licenses and other non-headers prior to the header; the
        # parser returns "" for any line that is not a well-formed h1.
        header = parse_markdown_header(header_line, prev_line)
        # If we've found the header, return the header.
        if header != "":
            return header

        # Remember this line: a "====" divider on the next line would make it
        # the header.
        prev_line = header_line

    print(f"Could not find a title for {mdfile_iterator.name}. Using {mdfile_name} as the title instead.")
    return mdfile_name


# Given generated markdown files, incorporate them into the docfx_yaml output.
Expand Down
2 changes: 1 addition & 1 deletion tests/markdown_example.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#Test header for a simple markdown file.
# Test header for a simple markdown file.

##Content header
This is a simple line followed by an h2 header.
7 changes: 7 additions & 0 deletions tests/markdown_example_alternate.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
This is a simple alternate header
=================================

With a different style
----------------------

This is a simple markdown file testing for different header style.
6 changes: 6 additions & 0 deletions tests/markdown_example_alternate_bad.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
==============

There should be a header line before the divider.

##Content header
This is a simple line followed by an h2 header.
20 changes: 20 additions & 0 deletions tests/markdown_example_alternate_header.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!--
Copyright 2021 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

This is a simple alternate header
=================================

With a different style
----------------------

This is a simple markdown file testing for different header style.
7 changes: 7 additions & 0 deletions tests/markdown_example_alternate_less.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
This is a simple alternate header
=========

With less divider length but it's still a header.
--------

This is a markdown file to test for alternate header style with shorter divider.
4 changes: 4 additions & 0 deletions tests/markdown_example_bad_header.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#Test header for a bad formatted markdown file.

##Content header
This is a simple line followed by an h2 header.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
##Test header for a simple markdown file.
## Test header for a simple markdown file.

##Content header
This is a simple line followed by an h2 header.
2 changes: 1 addition & 1 deletion tests/markdown_example_header.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ limitations under the License.
-->


# Test header for a simple markdown file.
# Test header for a simple markdown file.

##Content header
This is a simple line followed by an h2 header.
2 changes: 1 addition & 1 deletion tests/markdown_example_noheader.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ This is a simple markdown file with no header.

When running the test on this file to extract its header,

it should throw an exception.
it should use the filename as the title.
170 changes: 137 additions & 33 deletions tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from docfx_yaml.extension import pretty_package_name
from docfx_yaml.extension import group_by_package
from docfx_yaml.extension import extract_header_from_markdown
from docfx_yaml.extension import parse_markdown_header

import unittest

Expand Down Expand Up @@ -40,40 +41,35 @@ def test_find_unique_name(self):

def test_disambiguate_toc_name(self):
    # Load the expected YAML; the context manager closes the fixture file
    # as soon as it has been parsed.
    with open('tests/yaml_post.yaml', 'r') as want_file:
        yaml_want = load(want_file, Loader=Loader)
    disambiguated_names_want = {
        'google.cloud.spanner_admin_database_v1.types': 'spanner_admin_database_v1.types',
        'google.cloud.spanner_admin_instance_v1.types': 'spanner_admin_instance_v1.types',
        'google.cloud.spanner_v1.types': 'spanner_v1.types'
    }

    # Load the input YAML and run the disambiguation on it.
    with open('tests/yaml_pre.yaml', 'r') as test_file:
        yaml_got = load(test_file, Loader=Loader)
    disambiguated_names_got = disambiguate_toc_name(yaml_got)

    # The loaded input is compared against the expected YAML only after
    # disambiguate_toc_name has processed it.
    self.assertEqual(yaml_want, yaml_got)
    self.assertEqual(disambiguated_names_want, disambiguated_names_got)


def test_disambiguate_toc_name_duplicate(self):
    # Load the expected YAML; the context manager closes the fixture file
    # as soon as it has been parsed.
    with open('tests/yaml_post_duplicate.yaml', 'r') as want_file:
        yaml_want = load(want_file, Loader=Loader)
    disambiguated_names_want = {
        'google.api_core.client_info': 'client_info',
        'google.api_core.gapic_v1.client_info': 'gapic_v1.client_info'
    }

    # Load the input YAML and run the disambiguation on it.
    with open('tests/yaml_pre_duplicate.yaml', 'r') as test_file:
        yaml_got = load(test_file, Loader=Loader)
    disambiguated_names_got = disambiguate_toc_name(yaml_got)

    self.assertEqual(yaml_want, yaml_got)
    self.assertEqual(disambiguated_names_want, disambiguated_names_got)
Expand Down Expand Up @@ -538,41 +534,149 @@ def test_group_by_package(self):
self.assertCountEqual(toc_yaml_got, toc_yaml_want)


def test_parse_markdown_header(self):
    # Test for simple header_line.
    header_line_want = "Test header"

    header_line = "# Test header"
    prev_line = ""

    header_line_got = parse_markdown_header(header_line, prev_line)

    self.assertEqual(header_line_got, header_line_want)

    # Test for invalid input: no space after the hashtag.
    header_line_want = ""

    header_line = "#Test header"
    prev_line = ""

    header_line_got = parse_markdown_header(header_line, prev_line)

    self.assertEqual(header_line_got, header_line_want)

    # Test for invalid input: more than one space after the hashtag. The
    # parser requires exactly one space, so two spaces must be rejected.
    header_line_want = ""

    header_line = "#  Test header"
    prev_line = ""

    header_line_got = parse_markdown_header(header_line, prev_line)

    self.assertEqual(header_line_got, header_line_want)

    # Test for no header.
    header_line_want = ""

    header_line = "-->"
    prev_line = "limitations under the License.\n"

    header_line_got = parse_markdown_header(header_line, prev_line)

    self.assertEqual(header_line_got, header_line_want)


def test_parse_markdown_header_alternate(self):
    # Each case is (divider line, preceding line, expected header) for the
    # "=====" divider style of h1 header.
    cases = (
        # Simple alternate header.
        ("============\n", "Test header", "Test header"),
        # No preceding line, so there is no header.
        ("============\n", "", ""),
        # A divider shorter than the title still marks a header.
        ("======\n", "Test header", "Test header"),
    )

    for divider, preceding, expected in cases:
        actual = parse_markdown_header(divider, preceding)
        self.assertEqual(actual, expected)


def test_extract_header_from_markdown(self):
    # Check the header for a normal markdown file.
    header_line_want = "Test header for a simple markdown file."

    with open('tests/markdown_example.md', 'r') as mdfile:
        header_line_got = extract_header_from_markdown(mdfile)

    self.assertEqual(header_line_got, header_line_want)

    # The header should be the same even with the license header.
    header_line_with_license_want = header_line_want

    with open('tests/markdown_example_header.md', 'r') as mdfile_license:
        header_line_with_license_got = extract_header_from_markdown(mdfile_license)

    self.assertEqual(header_line_with_license_got, header_line_with_license_want)


def test_extract_header_from_markdown_alternate_header(self):
    # Check the header for an alternate ("=====" divider) header style.
    header_line_want = "This is a simple alternate header"

    with open('tests/markdown_example_alternate.md', 'r') as mdfile:
        header_line_got = extract_header_from_markdown(mdfile)

    self.assertEqual(header_line_got, header_line_want)

    # The header should be the same even with the license header.
    header_line_with_license_want = header_line_want

    with open('tests/markdown_example_alternate_header.md', 'r') as mdfile:
        header_line_with_license_got = extract_header_from_markdown(mdfile)

    self.assertEqual(header_line_with_license_got, header_line_with_license_want)

    # Check the header for an alternate header style whose divider is
    # shorter than the title.
    header_line_want = "This is a simple alternate header"

    with open('tests/markdown_example_alternate_less.md', 'r') as mdfile:
        header_line_got = extract_header_from_markdown(mdfile)

    self.assertEqual(header_line_got, header_line_want)


def test_extract_header_from_markdown_bad_headers(self):
    # Check that the filename is used as header if no valid header is found.
    header_line_want = "Markdown_example_bad_header"

    with open('tests/markdown_example_bad_header.md', 'r') as mdfile:
        header_line_got = extract_header_from_markdown(mdfile)

    self.assertEqual(header_line_want, header_line_got)

    # Check that only h1 headers are parsed.
    header_line_want = "Markdown_example_h2"

    with open('tests/markdown_example_h2.md', 'r') as mdfile:
        header_line_got = extract_header_from_markdown(mdfile)

    self.assertEqual(header_line_want, header_line_got)

    # Check that there must be a line before the h1 header breaker.
    header_line_want = "Markdown_example_alternate_bad"

    with open('tests/markdown_example_alternate_bad.md', 'r') as mdfile:
        header_line_got = extract_header_from_markdown(mdfile)

    self.assertEqual(header_line_want, header_line_got)


if __name__ == '__main__':
Expand Down

0 comments on commit 485b248

Please sign in to comment.