
Commit 92767ed: Merge branch 'main' into dev
guoyao committed Aug 9, 2024
2 parents 39cd5e9 + d27ec9d
Showing 49 changed files with 1,188 additions and 472 deletions.
9 changes: 5 additions & 4 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -7,11 +7,12 @@ body:
  - type: checkboxes
    id: existingcheck
    attributes:
-     label: Is there an existing issue for this?
-     description: Please search to see if an issue already exists for the bug you encountered.
+     label: Do you need to file an issue?
+     description: Please help us manage our time by avoiding duplicates and common questions with the steps below.
      options:
-       - label: I have searched the existing issues
-       - label: I have checked [#657](https://github.com/microsoft/graphrag/issues/657) to validate if my issue is covered by community support
+       - label: I have searched the existing issues and this bug is not already filed.
+       - label: My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here.
+       - label: I believe this is a legitimate bug, not just a question. If this is a question, please use the Discussions area.
  - type: textarea
    id: description
    attributes:
9 changes: 9 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.yml
@@ -4,6 +4,15 @@ labels: ["enhancement"]
title: "[Feature Request]: <title>"

body:
- type: checkboxes
id: existingcheck
attributes:
label: Do you need to file an issue?
description: Please help us manage our time by avoiding duplicates and common questions with the steps below.
options:
- label: I have searched the existing issues and this feature is not already filed.
- label: My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here.
- label: I believe this is a legitimate feature request, not just a question. If this is a question, please use the Discussions area.
- type: textarea
id: problem_description
attributes:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/issues-autoresolve.yml
@@ -10,15 +10,15 @@ jobs:
    issues: write
    pull-requests: write
  steps:
-   - uses: actions/stale@v5
+   - uses: actions/stale@v9
      with:
        days-before-issue-stale: 7
        days-before-issue-close: 5
        stale-issue-label: "stale"
        close-issue-label: "autoresolved"
        stale-issue-message: "This issue has been marked stale due to inactivity after repo maintainer or community member responses that request more information or suggest a solution. It will be closed after five additional days."
        close-issue-message: "This issue has been closed after being marked as stale for five days. Please reopen if needed."
        exempt-issue-label: "triage"
        any-of-labels: "awaiting_response"
        days-before-pr-stale: -1
        days-before-pr-close: -1
        repo-token: ${{ secrets.GITHUB_TOKEN }}
30 changes: 15 additions & 15 deletions .github/workflows/python-ci.yml
@@ -108,18 +108,18 @@ jobs:
        run: |
          poetry run poe test_integration
-     - name: Smoke Test
-       if: steps.changes.outputs.python == 'true'
-       run: |
-         poetry run poe test_smoke
-     - uses: actions/upload-artifact@v4
-       if: always()
-       with:
-         name: smoke-test-artifacts-${{ matrix.python-version }}-${{ matrix.poetry-version }}-${{ runner.os }}
-         path: tests/fixtures/*/output
-
-     - name: E2E Test
-       if: steps.changes.outputs.python == 'true'
-       run: |
-         ./scripts/e2e-test.sh
+     # - name: Smoke Test
+     #   if: steps.changes.outputs.python == 'true'
+     #   run: |
+     #     poetry run poe test_smoke
+
+     # - uses: actions/upload-artifact@v4
+     #   if: always()
+     #   with:
+     #     name: smoke-test-artifacts-${{ matrix.python-version }}-${{ matrix.poetry-version }}-${{ runner.os }}
+     #     path: tests/fixtures/*/output
+
+     # - name: E2E Test
+     #   if: steps.changes.outputs.python == 'true'
+     #   run: |
+     #     ./scripts/e2e-test.sh
3 changes: 0 additions & 3 deletions .github/workflows/python-publish.yml
@@ -36,9 +36,6 @@ jobs:
      with:
        poetry-version: ${{ env.POETRY_VERSION }}

-   - name: Add poetry-dynamic-versioning plugin
-     run: poetry self add "poetry-dynamic-versioning[plugin]"
-
    - name: Install dependencies
      shell: bash
      run: poetry install
70 changes: 70 additions & 0 deletions .semversioner/0.2.1.json
@@ -0,0 +1,70 @@
{
"changes": [
{
"description": "Added default columns for vector store at create_pipeline_config. No change for other cases.",
"type": "patch"
},
{
"description": "Change json parsing error in the map step of global search to warning",
"type": "patch"
},
{
"description": "Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config",
"type": "patch"
},
{
"description": "Fix json parsing when LLM returns faulty responses",
"type": "patch"
},
{
"description": "Fix missing community reports and refactor community context builder",
"type": "patch"
},
{
"description": "Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.",
"type": "patch"
},
{
"description": "Try parsing json before even repairing",
"type": "patch"
},
{
"description": "Update Prompt Tuning meta prompts with finer examples",
"type": "patch"
},
{
"description": "Update default entity extraction and gleaning prompts to reduce hallucinations",
"type": "patch"
},
{
"description": "add encoding-model to entity/claim extraction config",
"type": "patch"
},
{
"description": "add encoding-model to text chunking config",
"type": "patch"
},
{
"description": "add user prompt to history-tracking llm",
"type": "patch"
},
{
"description": "update config reader to allow for zero gleans",
"type": "patch"
},
{
"description": "update config-reader to allow for empty chunk-by arrays",
"type": "patch"
},
{
"description": "update history-tracking LLm to use 'assistant' instead of 'system' in output history.",
"type": "patch"
},
{
"description": "use history argument in hash key computation; add history input to cache data",
"type": "patch"
}
],
"created_at": "2024-08-06T00:25:52+00:00",
"version": "0.2.1"
}
19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,25 @@
# Changelog
Note: version releases in the 0.x.y range may introduce breaking changes.

+## 0.2.1
+
+- patch: Added default columns for vector store at create_pipeline_config. No change for other cases.
+- patch: Change json parsing error in the map step of global search to warning
+- patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config
+- patch: Fix json parsing when LLM returns faulty responses
+- patch: Fix missing community reports and refactor community context builder
+- patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.
+- patch: Try parsing json before even repairing
+- patch: Update Prompt Tuning meta prompts with finer examples
+- patch: Update default entity extraction and gleaning prompts to reduce hallucinations
+- patch: add encoding-model to entity/claim extraction config
+- patch: add encoding-model to text chunking config
+- patch: add user prompt to history-tracking llm
+- patch: update config reader to allow for zero gleans
+- patch: update config-reader to allow for empty chunk-by arrays
+- patch: update history-tracking LLM to use 'assistant' instead of 'system' in output history.
+- patch: use history argument in hash key computation; add history input to cache data
+
## 0.2.0

- minor: Add content-based KNN for selecting prompt tune few shot examples
23 changes: 21 additions & 2 deletions dictionary.txt
@@ -132,11 +132,30 @@ MSRC
Arrary

# Prompt Inputs
-dulce
-Asadi
+ABILA
+Abila
+ALHAMIA
+Alhamia
+Asadi
+Aurelians
+Bataglani
+BATAGLANI
+Bratinas
+dulce
+Durke
+Firuzabad
+Firuzabad's
+FIRUZABAD
+Krohaara
+KROHAARA
+POKRALLY
+Tazbah
+TIRUZIA
+Tiruzia
+Tiruzia's
+Verdantis
+Verdantis's


# English
skippable
13 changes: 8 additions & 5 deletions docsite/posts/config/env_vars.md
@@ -132,11 +132,12 @@ These settings control the data input used by the pipeline. Any settings with a

## Data Chunking

-| Parameter                   | Description                                                                                  | Type  | Required or Optional | Default |
-| --------------------------- | -------------------------------------------------------------------------------------------- | ----- | -------------------- | ------- |
-| `GRAPHRAG_CHUNK_SIZE`       | The chunk size in tokens for text-chunk analysis windows.                                     | `str` | optional             | 1200    |
-| `GRAPHRAG_CHUNK_OVERLAP`    | The chunk overlap in tokens for text-chunk analysis windows.                                  | `str` | optional             | 100     |
-| `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to group by when performing TextUnit chunking.  | `str` | optional             | `id`    |
+| Parameter                       | Description                                                                                  | Type  | Required or Optional | Default                       |
+| ------------------------------- | -------------------------------------------------------------------------------------------- | ----- | -------------------- | ----------------------------- |
+| `GRAPHRAG_CHUNK_SIZE`           | The chunk size in tokens for text-chunk analysis windows.                                     | `str` | optional             | 1200                          |
+| `GRAPHRAG_CHUNK_OVERLAP`        | The chunk overlap in tokens for text-chunk analysis windows.                                  | `str` | optional             | 100                           |
+| `GRAPHRAG_CHUNK_BY_COLUMNS`     | A comma-separated list of document attributes to group by when performing TextUnit chunking.  | `str` | optional             | `id`                          |
+| `GRAPHRAG_CHUNK_ENCODING_MODEL` | The encoding model to use for chunking.                                                       | `str` | optional             | The top-level encoding model. |
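
To make the new table concrete, here is a minimal shell sketch of these settings; the values are illustrative only, and `cl100k_base` is an assumed tiktoken encoding name, not one taken from this diff:

```bash
# Example values only; GRAPHRAG_CHUNK_ENCODING_MODEL is the variable added in this change.
export GRAPHRAG_CHUNK_SIZE=1200                     # tokens per text-chunk analysis window
export GRAPHRAG_CHUNK_OVERLAP=100                   # token overlap between adjacent windows
export GRAPHRAG_CHUNK_BY_COLUMNS="id"               # group documents by these attributes before chunking
export GRAPHRAG_CHUNK_ENCODING_MODEL="cl100k_base"  # assumed encoding name; unset means the top-level model
```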

## Prompting Overrides

@@ -145,12 +146,14 @@ These settings control the data input used by the pipeline. Any settings with a
| `GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE`      | The path (relative to the root) of an entity extraction prompt template text file.        | `str`    | optional | `None`                                                           |
| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS`    | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop.  | `int`    | optional | 1                                                                |
| `GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES`     | A comma-separated list of entity types to extract.                                        | `str`    | optional | `organization,person,event,geo`                                  |
+| `GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL`  | The encoding model to use for entity extraction.                                          | `str`    | optional | The top-level encoding model.                                    |
| `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE` | The path (relative to the root) of a description summarization prompt template text file. | `str`    | optional | `None`                                                           |
| `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH`  | The maximum number of tokens to generate per description summarization.                   | `int`    | optional | 500                                                              |
| `GRAPHRAG_CLAIM_EXTRACTION_ENABLED`           | Whether claim extraction is enabled for this pipeline.                                    | `bool`   | optional | `False`                                                          |
| `GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION`       | The claim_description prompting argument to utilize.                                      | `string` | optional | "Any claims or facts that could be relevant to threat analysis." |
| `GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE`       | The claim extraction prompt to utilize.                                                   | `string` | optional | `None`                                                           |
| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS`     | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop.    | `int`    | optional | 1                                                                |
+| `GRAPHRAG_CLAIM_EXTRACTION_ENCODING_MODEL`   | The encoding model to use for claim extraction.                                           | `str`    | optional | The top-level encoding model.                                    |
| `GRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE`      | The community reports extraction prompt to utilize.                                       | `string` | optional | `None`                                                           |
| `GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH`       | The maximum number of tokens to generate per community report.                            | `int`    | optional | 1500                                                             |
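
The two per-step encoding overrides added here can be set the same way; a minimal sketch with assumed values:

```bash
# If either variable is unset, the step falls back to the top-level encoding model.
export GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL="cl100k_base"  # assumed example value
export GRAPHRAG_CLAIM_EXTRACTION_ENCODING_MODEL="cl100k_base"   # assumed example value
```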

3 changes: 3 additions & 0 deletions docsite/posts/config/json_yaml.md
@@ -102,6 +102,7 @@ This is the base LLM configuration section. Other steps may override this config
- `size` **int** - The max chunk size in tokens.
- `overlap` **int** - The chunk overlap in tokens.
- `group_by_columns` **list[str]** - Group documents by fields before chunking.
+- `encoding_model` **str** - The text encoding model to use. Default is to use the top-level encoding model.
- `strategy` **dict** - Fully override the chunking strategy.

## cache
@@ -144,6 +145,7 @@ This is the base LLM configuration section. Other steps may override this config
- `prompt` **str** - The prompt file to use.
- `entity_types` **list[str]** - The entity types to identify.
- `max_gleanings` **int** - The maximum number of gleaning cycles to use.
+- `encoding_model` **str** - The text encoding model to use. By default, this will use the top-level encoding model.
- `strategy` **dict** - Fully override the entity extraction strategy.

## summarize_descriptions
@@ -168,6 +170,7 @@ This is the base LLM configuration section. Other steps may override this config
- `prompt` **str** - The prompt file to use.
- `description` **str** - Describes the types of claims we want to extract.
- `max_gleanings` **int** - The maximum number of gleaning cycles to use.
+- `encoding_model` **str** - The text encoding model to use. By default, this will use the top-level encoding model.
- `strategy` **dict** - Fully override the claim extraction strategy.
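
To show where the new `encoding_model` keys sit, here is a hedged sketch that writes a fragment of a hypothetical `settings.yaml`; the keys come from the docs above and every value is an assumption:

```bash
# Append a settings fragment using the keys documented above (illustrative values only).
cat >> settings.yaml <<'EOF'
chunks:
  size: 1200
  overlap: 100
  encoding_model: cl100k_base   # assumed name; defaults to the top-level encoding model
entity_extraction:
  max_gleanings: 1
  encoding_model: cl100k_base   # per-step override documented above
claim_extraction:
  max_gleanings: 1
  encoding_model: cl100k_base   # per-step override documented above
EOF
```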

## community_reports
3 changes: 2 additions & 1 deletion docsite/posts/query/3-cli.md
@@ -9,11 +9,12 @@ date: 2024-27-03
The GraphRAG query CLI allows for no-code usage of the GraphRAG Query engine.

```bash
-python -m graphrag.query --data <path-to-data> --community_level <community-level> --response_type <response-type> --method <"local"|"global"> <query>
+python -m graphrag.query --config <config_file.yml> --data <path-to-data> --community_level <community-level> --response_type <response-type> --method <"local"|"global"> <query>
```

## CLI Arguments

+- `--config <config_file.yml>` - The configuration YAML file to use when running the query. If this is used, then none of the environment variables below will apply.
- `--data <path-to-data>` - Folder containing the `.parquet` output files from running the Indexer.
- `--community_level <community-level>` - Community level in the Leiden community hierarchy from which we will load the community reports; a higher value means we use reports on smaller communities. Default: 2
- `--response_type <response-type>` - Free-form text describing the response type and format; can be anything, e.g. `Multiple Paragraphs`, `Single Paragraph`, `Single Sentence`, `List of 3-7 Points`, `Single Page`, `Multi-Page Report`. Default: `Multiple Paragraphs`.
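A hedged usage sketch of the new `--config` flag; the paths and the query string are hypothetical:

```bash
# Hypothetical paths; per the docs above, --config means the environment variables no longer apply.
python -m graphrag.query \
  --config ./settings.yml \
  --data ./output/artifacts \
  --method global \
  "What are the top themes in this dataset?"
```
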
17 changes: 6 additions & 11 deletions examples_notebooks/global_search.ipynb
@@ -115,7 +115,10 @@
"\n",
"reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)\n",
"entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)\n",
"print(f\"Report records: {len(report_df)}\")\n",
"print(f\"Total report count: {len(report_df)}\")\n",
"print(\n",
" f\"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}\"\n",
")\n",
"report_df.head()"
]
},
@@ -223,17 +226,9 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 31,
+  "execution_count": null,
   "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "LLM calls: 13. LLM tokens: 184660\n"
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
    "# inspect number of LLM calls and tokens\n",
    "print(f\"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}\")"