From 6651b1bb6a6a3773dd0a546d15a5138729d15488 Mon Sep 17 00:00:00 2001 From: Timo Tijhof Date: Wed, 11 Oct 2023 14:50:54 -0700 Subject: [PATCH] Build: Enable typesense scraper This includes the sitemap so that we're sure no content is missed. Unlike api.jquery.com, api.jquerymobile.com does not start with an index that links to all content pages. This means the crawler would have to rely on category pages to discover all content, except we don't want the crawler to index /category/ pages, so those pages are matched by stop_urls, which means they are never crawled. If there were a variant of `stop_urls` that behaved like `follow,noindex` instead of `noindex,nofollow` we could use that, but I'm not aware of such a feature. The sitemap accomplishes the same thing in a more efficient manner. Ref https://github.com/jquery/infrastructure-puppet/issues/33 --- .github/workflows/typesense.yaml | 27 ++++++++++++++ docsearch.config.json | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 .github/workflows/typesense.yaml create mode 100644 docsearch.config.json diff --git a/.github/workflows/typesense.yaml b/.github/workflows/typesense.yaml new file mode 100644 index 0000000..32a5dbb --- /dev/null +++ b/.github/workflows/typesense.yaml @@ -0,0 +1,27 @@ +name: typesense +on: + # Or after a deployment + push: + branches: + - main + - add-typesense + # Or manually + workflow_dispatch: + +jobs: + typesense: + name: Update Typesense + if: ${{ github.repository_owner == 'jquery' }} # skip on forks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Docsearch Scraper + shell: bash + run: | + docker run \ + -e TYPESENSE_API_KEY=${{ secrets.TYPESENSE_ADMIN_KEY }} \ + -e TYPESENSE_HOST="${{ secrets.TYPESENSE_HOST }}" \ + -e TYPESENSE_PORT="443" \ + -e TYPESENSE_PROTOCOL="https" \ + -e CONFIG="$(cat docsearch.config.json | jq -r tostring)" \ + typesense/docsearch-scraper:0.8.0 diff --git a/docsearch.config.json b/docsearch.config.json 
new file mode 100644 index 0000000..f33fb64 --- /dev/null +++ b/docsearch.config.json @@ -0,0 +1,63 @@ +{ + "index_name": "jquerymobile_com", + "start_urls": [ + { "url": "https://api.jquerymobile.com", "selectors_key": "api", "page_rank": 20 }, + { "url": "https://jquerymobile.com", "page_rank": 10 } + ], + "sitemap_urls": [ + "https://api.jquerymobile.com/wp-sitemap.xml" + ], + "// stop_urls": [ + "// Exclude URLs containing '?' such as /themeroller/?...", + "// Avoid excluding https://jquerymobile.com/resources/ itself" + ], + "stop_urls": [ + "\\?", + ".com/category/", + ".com/resources/.+", + ".com\\/\\d\\." + ], + "selectors": { + "default": { + "lvl0": { + "selector": "#menu-top .menu-item.current > a", + "global": true, + "default_value": "Documentation" + }, + "lvl1": "#content h1", + "lvl2": "#content h2", + "lvl3": "#content h3", + "lvl4": "#content h4", + "lvl5": "#content h5", + "text": "#content p, #content li, #content tr" + }, + "api": { + "lvl0": { + "selector": "#categories .cat-item.current-cat > a", + "global": true, + "default_value": "API" + }, + "lvl1": "#content h1", + "lvl2": "#content h2, #content h4.name", + "lvl3": "#content h3, #content h4:not(.name)", + "lvl4": "#content h5, #content strong:first-child", + "text": ".entry-content p, .entry-content li" + } + }, + "custom_settings": { + "token_separators": ["_", "-", "."] + }, + "selectors_exclude": [ + "header ~ article", + ".returns", + ".version-details", + ".section-title", + ".icon-link.toc-link", + "[class^=toclevel]", + "#toctitle", + ".desc strong:first-child", + "#quick-nav header h2" + ], + "min_indexed_level": 2, + "scrape_start_urls": false +}