Re-structure: move search utils tasks into ...tasks.search
I wasn't able to find or solve the problem with the `cyclic-import` error
reported by pylint. So, I tried moving the utils functions used by search tasks
out of `readthedocs.projects.tasks.utils` and into
`readthedocs.projects.tasks.search` instead.

This made `prospector` and `pylint` pass locally without reporting any issues.
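
For context, this is the general shape of the cycle that pylint's `cyclic-import` checker reports; a minimal, illustrative sketch only, since the exact import chain was never pinned down (the imported names below are hypothetical):

```python
# tasks/utils.py (hypothetical sketch)
#     from .search import index_build              # utils -> search
#
# tasks/search.py (before this commit)
#     from .utils import _create_imported_files    # search -> utils
#
# pylint walks the module import graph and reports cyclic-import when two
# modules (possibly transitively) import each other. Moving the search-only
# helpers into tasks/search.py deletes the search -> utils edge, so no cycle
# remains for the checker to find.
```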
humitos committed Feb 7, 2022
1 parent fde6a1a commit f8ab09e
Showing 3 changed files with 195 additions and 196 deletions.
197 changes: 194 additions & 3 deletions readthedocs/projects/tasks/search.py
@@ -1,14 +1,20 @@
+from fnmatch import fnmatch
+import json
+
+from sphinx.ext import intersphinx
 import structlog
 
-from readthedocs.worker import app
+from django.conf import settings
 
 from readthedocs.builds.constants import EXTERNAL
 from readthedocs.builds.models import Version
-from readthedocs.projects.models import HTMLFile, ImportedFile
+from readthedocs.projects.models import HTMLFile, ImportedFile, Project
 from readthedocs.projects.signals import files_changed
 from readthedocs.search.utils import remove_indexed_files, index_new_files
 from readthedocs.sphinx_domains.models import SphinxDomain
 from readthedocs.storage import build_media_storage
+from readthedocs.worker import app
 
-from .utils import _create_imported_files, _create_intersphinx_data
-
 log = structlog.get_logger(__name__)

@@ -108,3 +114,188 @@ def remove_search_indexes(project_slug, version_slug=None):
        project_slug=project_slug,
        version_slug=version_slug,
    )


def _create_intersphinx_data(version, commit, build):
    """
    Create intersphinx data for this version.

    :param version: Version instance
    :param commit: Commit that updated path
    :param build: Build id
    """
    if not version.is_sphinx_type:
        return

    html_storage_path = version.project.get_storage_path(
        type_='html', version_slug=version.slug, include_file=False
    )
    json_storage_path = version.project.get_storage_path(
        type_='json', version_slug=version.slug, include_file=False
    )

    object_file = build_media_storage.join(html_storage_path, 'objects.inv')
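    # 'objects.inv' is the machine-readable inventory Sphinx writes alongside
    # the HTML output; it is the same file the intersphinx extension consumes.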
    if not build_media_storage.exists(object_file):
        log.debug('No objects.inv, skipping intersphinx indexing.')
        return

    type_file = build_media_storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
    types = {}
    titles = {}
    if build_media_storage.exists(type_file):
        try:
            data = json.load(build_media_storage.open(type_file))
            types = data['types']
            titles = data['titles']
        except Exception:
            log.exception('Exception parsing readthedocs-sphinx-domain-names.json')

    # These classes are copied from Sphinx
    # https://github.com/sphinx-doc/sphinx/blob/d79d041f4f90818e0b495523fdcc28db12783caf/sphinx/ext/intersphinx.py#L400-L403  # noqa
    class MockConfig:
        intersphinx_timeout = None
        tls_verify = False
        user_agent = None

    class MockApp:
        srcdir = ''
        config = MockConfig()

        def warn(self, msg):
            log.warning('Sphinx MockApp.', msg=msg)

    # Re-create all objects from the new build of the version
    object_file_url = build_media_storage.url(object_file)
    if object_file_url.startswith('/'):
        # Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
        # This can cause an issue if MEDIA_URL is not fully qualified
        object_file_url = settings.RTD_INTERSPHINX_URL + object_file_url

    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
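    # fetch_inventory returns a mapping of '<domain>:<type>' keys (e.g.
    # 'py:function') to {name: (project, version, url, display_name)} entries.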
    for key, value in sorted(invdata.items() or {}):
        domain, _type = key.split(':', 1)
        for name, einfo in sorted(value.items()):
            # project, version, url, display_name
            # ('Sphinx', '1.7.9', 'faq.html#epub-faq', 'Epub info')
            try:
                url = einfo[2]
                if '#' in url:
                    doc_name, anchor = url.split(
                        '#',
                        # The anchor can contain ``#`` characters
                        maxsplit=1
                    )
                else:
                    doc_name, anchor = url, ''
                display_name = einfo[3]
            except Exception:
                log.exception(
                    'Error while getting sphinx domain information. Skipping...',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    sphinx_domain=f'{domain}->{name}',
                )
                continue

            # HACK: This is done because of the difference between
            # ``sphinx.builders.html.StandaloneHTMLBuilder``
            # and ``sphinx.builders.dirhtml.DirectoryHTMLBuilder``.
            # They both have different ways of generating HTML files,
            # and therefore the doc_name generated is different.
            # More info on: http://www.sphinx-doc.org/en/master/usage/builders/index.html#builders
            # Also see issue: https://github.com/readthedocs/readthedocs.org/issues/5821
            if doc_name.endswith('/'):
                doc_name += 'index.html'

            html_file = HTMLFile.objects.filter(
                project=version.project, version=version,
                path=doc_name, build=build,
            ).first()

            if not html_file:
                log.debug(
                    'HTMLFile object not found.',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    build_id=build,
                    doc_name=doc_name,
                )

                # Don't create Sphinx Domain objects
                # if the HTMLFile object is not found.
                continue

            SphinxDomain.objects.create(
                project=version.project,
                version=version,
                html_file=html_file,
                domain=domain,
                name=name,
                display_name=display_name,
                type=_type,
                type_display=types.get(f'{domain}:{_type}', ''),
                doc_name=doc_name,
                doc_display=titles.get(doc_name, ''),
                anchor=anchor,
                commit=commit,
                build=build,
            )


def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
    """
    Create imported files for version.

    :param version: Version instance
    :param commit: Commit that updated path
    :param build: Build id
    """
    # Re-create all objects from the new build of the version
    storage_path = version.project.get_storage_path(
        type_='html', version_slug=version.slug, include_file=False
    )
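    # ``walk`` mirrors ``os.walk`` but runs against the storage backend,
    # yielding (root, dirnames, filenames) tuples.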
    for root, __, filenames in build_media_storage.walk(storage_path):
        for filename in filenames:
            # We don't care about non-HTML files
            if not filename.endswith('.html'):
                continue

            full_path = build_media_storage.join(root, filename)

            # Generate a relative path for storage similar to os.path.relpath
            relpath = full_path.replace(storage_path, '', 1).lstrip('/')

            page_rank = 0
            # Last pattern to match takes precedence
            # XXX: see if we can implement another type of precedence,
            # like the longest pattern.
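            # Example: with search_ranking == {'api/*': -1, 'api/v2/*': 2},
            # the path 'api/v2/index.html' matches both patterns but gets
            # rank 2, because reverse iteration reaches the later entry first.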
            reverse_rankings = reversed(list(search_ranking.items()))
            for pattern, rank in reverse_rankings:
                if fnmatch(relpath, pattern):
                    page_rank = rank
                    break

            ignore = False
            for pattern in search_ignore:
                if fnmatch(relpath, pattern):
                    ignore = True
                    break

            # Create imported files from new build
            HTMLFile.objects.create(
                project=version.project,
                version=version,
                path=relpath,
                name=filename,
                rank=page_rank,
                commit=commit,
                build=build,
                ignore=ignore,
            )

    # This signal is used for purging the CDN.
    files_changed.send(
        sender=Project,
        project=version.project,
        version=version,
    )