Skip to content

Commit

Permalink
Merge pull request #1513 from fishtown-analytics/fix/only-parse-docs-blocks
Browse files Browse the repository at this point in the history

Only parse docs blocks contents when reading md files (#988)
  • Loading branch information
beckjake authored Jun 6, 2019
2 parents ddb1785 + a4a9221 commit c1387c5
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 25 deletions.
43 changes: 40 additions & 3 deletions core/dbt/clients/_jinja_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def end_pat(self):
RAW_START_PATTERN = regex(
r'(?:\s*\{\%\-|\{\%)\s*(?P<raw_start>(raw))\s*(?:\-\%\}\s*|\%\})'
)
EXPR_START_PATTERN = regex(r'(?P<expr_start>(\{\{\s*))')
EXPR_END_PATTERN = regex(r'(?P<expr_end>(\s*\}\}))')

BLOCK_START_PATTERN = regex(''.join((
r'(?:\s*\{\%\-|\{\%)\s*',
Expand Down Expand Up @@ -92,6 +94,8 @@ def end_pat(self):
r'"([^"\\]*(?:\\.[^"\\]*)*)"))'
)

QUOTE_START_PATTERN = regex(r'''(?P<quote>(['"]))''')

# any number of non-quote characters, followed by:
# - quote: a quote mark indicating start of a string (you'll want to backtrack
# the regex end on quotes and then match with the string pattern)
Expand Down Expand Up @@ -179,6 +183,31 @@ def _expect_match(self, expected_name, *patterns, **kwargs):
dbt.exceptions.raise_compiler_error(msg)
return match

def handle_expr(self):
    """Consume the body of a jinja expression up to and including its
    closing ``}}``.

    On entry the parser sits just past the opening ``{{``::

        {{ 1 + 2 }}
           ^ right here

    Blocks and comments are not allowed inside an expression, so the
    only thing that can hide a ``}}`` from us is a quoted string --
    think ``{{ 2 * "}}" }}``. Whenever a quote turns up before the
    closing braces, swallow the whole string literal and keep scanning.
    """
    while True:
        found = self._expect_match('}}',
                                   EXPR_END_PATTERN,
                                   QUOTE_START_PATTERN)
        if found.groupdict().get('expr_end') is None:
            # A quote came first. We have not advanced past this match
            # yet, so just consume the entire string literal and resume
            # scanning -- no rewind needed.
            literal = self._expect_match('string', STRING_PATTERN)
            self.advance(literal.end())
        else:
            # Genuine end of the expression: step past the ``}}``.
            self.advance(found.end())
            return

def handle_block(self, match, block_start=None):
"""Handle a block. The current state of the parser should be after the
open block is completed:
Expand All @@ -197,12 +226,18 @@ def handle_block(self, match, block_start=None):

self._block_contents = ''

search = [found.end_pat(), COMMENT_START_PATTERN, RAW_START_PATTERN,
EXPR_START_PATTERN]

# docs and macros do not honor embedded quotes
if found.block_type_name not in ('docs', 'macro'):
# is this right?
search.append(QUOTE_START_PATTERN)

# you can have as many comments in your block as you'd like!
while True:
match = self._expect_match(
'"{}"'.format(found.end_block_type_name),
found.end_pat(), COMMENT_START_PATTERN, RAW_START_PATTERN,
regex('''(?P<quote>(['"]))''')
'"{}"'.format(found.end_block_type_name), *search
)
groups = match.groupdict()
if groups.get('endblock') is not None:
Expand All @@ -218,6 +253,8 @@ def handle_block(self, match, block_start=None):
self.rewind()
match = self._expect_match('any string', STRING_PATTERN)
self.advance(match.end())
elif groups.get('expr_start') is not None:
self.handle_expr()
else:
raise dbt.exceptions.InternalException(
'unhandled regex in handle_block, no match: {}'
Expand Down
2 changes: 1 addition & 1 deletion core/dbt/node_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class NodeType(object):
Macro = 'macro'
Operation = 'operation'
Seed = 'seed'
Documentation = 'documentation'
Documentation = 'docs'
Source = 'source'
RPCCall = 'rpc'

Expand Down
34 changes: 24 additions & 10 deletions core/dbt/parser/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from dbt.parser.base import BaseParser
from dbt.contracts.graph.unparsed import UnparsedDocumentationFile
from dbt.contracts.graph.parsed import ParsedDocumentation
from dbt.clients.jinja import extract_toplevel_blocks, get_template
from dbt.clients import system

import jinja2.runtime
import os
Expand All @@ -16,14 +18,12 @@ def load_file(cls, package_name, root_dir, relative_dirs):
"""
extension = "[!.#~]*.md"

file_matches = dbt.clients.system.find_matching(
root_dir,
relative_dirs,
extension)
file_matches = system.find_matching(root_dir, relative_dirs, extension)

for file_match in file_matches:
file_contents = dbt.clients.system.load_file_contents(
file_match.get('absolute_path'), strip=False)
file_contents = system.load_file_contents(
file_match.get('absolute_path'),
strip=False)

parts = dbt.utils.split_path(file_match.get('relative_path', ''))
name, _ = os.path.splitext(parts[-1])
Expand All @@ -44,12 +44,26 @@ def load_file(cls, package_name, root_dir, relative_dirs):

def parse(self, docfile):
    """Parse a single documentation file, yielding parsed documentation
    nodes for each ``{% docs %}`` block it contains.

    Only the contents of docs blocks are rendered as jinja templates;
    any other (possibly invalid) jinja-looking text in the md file is
    left alone.

    :param docfile: an UnparsedDocumentationFile whose ``file_contents``
        will be scanned for docs blocks.
    :raises dbt.exceptions.CompilationException: if block extraction or
        template rendering fails; the exception is annotated with the
        offending docfile before being re-raised.
    """
    # NOTE(review): the scraped diff interleaved the deleted pre-commit
    # lines with the added ones here (producing a `try` with a statement
    # between two `except` clauses); this is the reconstructed
    # post-commit version.
    try:
        blocks = extract_toplevel_blocks(docfile.file_contents)
    except dbt.exceptions.CompilationException as exc:
        if exc.node is None:
            exc.node = docfile
        raise

    for block in blocks:
        # skip anything that is not a docs block
        if block.block_type_name != NodeType.Documentation:
            continue

        try:
            template = get_template(block.full_block, {})
        except dbt.exceptions.CompilationException as e:
            e.node = docfile
            raise
        # in python 3.x this can just be "yield from" instead of a loop
        for d in self._parse_template_docs(template, docfile):
            yield d

def _parse_template_docs(self, template, docfile):
for key, item in template.module.__dict__.items():
if type(item) != jinja2.runtime.Macro:
continue
Expand Down
1 change: 1 addition & 0 deletions test/integration/029_docs_generate_tests/models/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a readme.md file with {{ invalid-ish jinja }} in it
12 changes: 6 additions & 6 deletions test/integration/029_docs_generate_tests/test_docs_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1282,7 +1282,7 @@ def expected_postgres_references_manifest(self, model_database=None):
'original_file_path': docs_path,
'package_name': 'test',
'path': 'docs.md',
'resource_type': 'documentation',
'resource_type': 'docs',
'root_path': os.getcwd(),
'unique_id': 'test.ephemeral_summary'
},
Expand All @@ -1293,7 +1293,7 @@ def expected_postgres_references_manifest(self, model_database=None):
'original_file_path': docs_path,
'package_name': 'test',
'path': 'docs.md',
'resource_type': 'documentation',
'resource_type': 'docs',
'root_path': os.getcwd(),
'unique_id': 'test.source_info',
},
Expand All @@ -1304,7 +1304,7 @@ def expected_postgres_references_manifest(self, model_database=None):
'original_file_path': docs_path,
'package_name': 'test',
'path': 'docs.md',
'resource_type': 'documentation',
'resource_type': 'docs',
'root_path': os.getcwd(),
'unique_id': 'test.summary_count'
},
Expand All @@ -1315,7 +1315,7 @@ def expected_postgres_references_manifest(self, model_database=None):
'original_file_path': docs_path,
'package_name': 'test',
'path': 'docs.md',
'resource_type': 'documentation',
'resource_type': 'docs',
'root_path': os.getcwd(),
'unique_id': 'test.summary_first_name'
},
Expand All @@ -1326,7 +1326,7 @@ def expected_postgres_references_manifest(self, model_database=None):
'original_file_path': docs_path,
'package_name': 'test',
'path': 'docs.md',
'resource_type': 'documentation',
'resource_type': 'docs',
'root_path': os.getcwd(),
'unique_id': 'test.table_info'
},
Expand All @@ -1340,7 +1340,7 @@ def expected_postgres_references_manifest(self, model_database=None):
'original_file_path': docs_path,
'package_name': 'test',
'path': 'docs.md',
'resource_type': 'documentation',
'resource_type': 'docs',
'root_path': os.getcwd(),
'unique_id': 'test.view_summary'
},
Expand Down
7 changes: 2 additions & 5 deletions test/unit/test_docs_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,12 @@
import mock
import unittest

from dbt.config import RuntimeConfig
from dbt.node_types import NodeType
import dbt.utils
from dbt.parser import docs
from dbt.contracts.graph.unparsed import UnparsedDocumentationFile

from .utils import config_from_parts_or_dicts

#DocumentationParser


SNOWPLOW_SESSIONS_DOCS = r'''
This table contains one record for every session recorded by Snowplow.
Expand Down Expand Up @@ -98,7 +94,8 @@ def setUp(self):
self.subdir_project_config = config_from_parts_or_dicts(
project=subdir_project, profile=profile_data
)
@mock.patch('dbt.clients.system')

@mock.patch('dbt.parser.docs.system')
def test_load_file(self, system):
system.load_file_contents.return_value = TEST_DOCUMENTATION_FILE
system.find_matching.return_value = [{
Expand Down
22 changes: 22 additions & 0 deletions test/unit/test_jinja.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,28 @@ def test_quoted_endblock_within_block(self):
self.assertEqual(blocks[0].block_type_name, 'myblock')
self.assertEqual(blocks[0].contents, '{% set x = ("{% endmyblock %}") %} ')

def test_docs_block(self):
    """Two docs blocks in one body: comments inside a docs block do not
    hide an enddocs tag, and neither does a stray quote character."""
    body = '{% docs __my_doc__ %} asdf {# nope {% enddocs %}} #} {% enddocs %} {% docs __my_other_doc__ %} asdf "{% enddocs %}'
    found = [
        b for b in extract_toplevel_blocks(body)
        if b.block_type_name != '__dbt__data'
    ]
    expected = [
        ('__my_doc__', ' asdf {# nope {% enddocs %}} #} '),
        ('__my_other_doc__', ' asdf "'),
    ]
    self.assertEqual(len(found), len(expected))
    for block, (name, contents) in zip(found, expected):
        self.assertEqual(block.block_type_name, 'docs')
        self.assertEqual(block.block_name, name)
        self.assertEqual(block.contents, contents)

def test_docs_block_expr(self):
    """An expression inside a docs block may contain strings holding
    both an enddocs tag and closing braces without ending the block."""
    body = '{% docs more_doc %} asdf {{ "{% enddocs %}" ~ "}}" }}{% enddocs %}'
    found = [
        b for b in extract_toplevel_blocks(body)
        if b.block_type_name != '__dbt__data'
    ]
    self.assertEqual(len(found), 1)
    block = found[0]
    self.assertEqual(block.block_type_name, 'docs')
    self.assertEqual(block.block_name, 'more_doc')
    self.assertEqual(block.contents, ' asdf {{ "{% enddocs %}" ~ "}}" }}')


bar_block = '''{% mytype bar %}
{# a comment
that inside it has
Expand Down

0 comments on commit c1387c5

Please sign in to comment.