Merge branch 'microsoft:main' into main

KylinMountain · Jul 25, 2024 · ddc4dbb · ddc4dbb
2 parents 6e9d072 + 61b5eea
commit ddc4dbb
Show file tree

Hide file tree

Showing 46 changed files with 592 additions and 365 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -4,6 +4,14 @@ title: "[Bug]: <title>"
 labels: ["bug", "triage"]
 
 body:
+  - type: checkboxes
+    id: existingcheck
+    attributes:
+      label: Is there an existing issue for this?
+      description: Please search to see if an issue already exists for the bug you encountered.
+      options:
+        - label: I have searched the existing issues
+        - label: I have checked [#657](https://github.com/microsoft/graphrag/issues/657) to validate if my issue is covered by community support
   - type: textarea
     id: description
     attributes:
@@ -34,6 +42,11 @@ body:
       label: GraphRAG Config Used
       description: The GraphRAG configuration used for the run.
       placeholder: The settings.yaml content or GraphRAG configuration
+      value: |
+        ```yaml
+        # Paste your config here
+
+        ```
   - type: textarea
     id: screenshotslogs
     attributes:

diff --git a/.github/ISSUE_TEMPLATE/general_issue.yml b/.github/ISSUE_TEMPLATE/general_issue.yml
@@ -4,6 +4,14 @@ title: "[Issue]: <title> "
 labels: ["triage"]
 
 body:
+  - type: checkboxes
+    id: existingcheck
+    attributes:
+      label: Is there an existing issue for this?
+      description: Please search to see if an issue already exists for the bug you encountered.
+      options:
+        - label: I have searched the existing issues
+        - label: I have checked [#657](https://github.com/microsoft/graphrag/issues/657) to validate if my issue is covered by community support
   - type: textarea
     id: description
     attributes:
@@ -28,6 +36,11 @@ body:
       label: GraphRAG Config Used
       description: The GraphRAG configuration used for the run.
       placeholder: The settings.yaml content or GraphRAG configuration
+      value: |
+        ```yaml
+        # Paste your config here
+
+        ```
   - type: textarea
     id: screenshotslogs
     attributes:

diff --git a/.github/workflows/issues-autoresolve.yml b/.github/workflows/issues-autoresolve.yml
@@ -0,0 +1,24 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "30 1 * * *"  # every day at 01:30; GitHub evaluates schedules in UTC
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:  # GITHUB_TOKEN scopes granted to this job
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v5
+        with:
+          days-before-issue-stale: 7
+          days-before-issue-close: 5  # further idle days after the stale label is applied
+          stale-issue-label: "stale"
+          close-issue-label: "autoresolved"
+          stale-issue-message: "This issue has been marked stale due to inactivity after repo maintainer or community member responses that request more information or suggest a solution. It will be closed after five additional days."
+          close-issue-message: "This issue has been closed after being marked as stale for five days. Please reopen if needed."
+          exempt-issue-labels: "triage"  # input name is plural; the singular form is ignored by actions/stale
+          days-before-pr-stale: -1  # -1: never mark pull requests stale
+          days-before-pr-close: -1  # -1: never auto-close pull requests
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
@@ -21,7 +21,7 @@ jobs:
   python-ci:
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11"] # add 3.12 once gensim supports it. TODO: watch this issue - https://github.com/piskvorky/gensim/issues/3510
         os: [ubuntu-latest, windows-latest]
     env:
       DEBUG: 1
@@ -79,7 +79,10 @@ jobs:
 
       - name: Install dependencies
         shell: bash
-        run: poetry self add setuptools && poetry run python -m pip install gensim && poetry install
+        run: |
+          poetry self add setuptools wheel
+          poetry run python -m pip install gensim
+          poetry install
 
       - name: Check Semversioner
         run: |

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -40,6 +40,9 @@ jobs:
         shell: bash
         run: poetry install
 
+      - name: Export Publication Version
+        run: echo "version=`poetry version --short`" >> $GITHUB_OUTPUT
+
       - name: Build Distributable
         shell: bash
         run: poetry build

diff --git a/.semversioner/0.2.0.json b/.semversioner/0.2.0.json
@@ -0,0 +1,94 @@
+{
+  "changes": [
+    {
+      "description": "Add content-based KNN for selecting prompt tune few shot examples",
+      "type": "minor"
+    },
+    {
+      "description": "Add dynamic community report rating to the prompt tuning engine",
+      "type": "minor"
+    },
+    {
+      "description": "Add Minute-based Rate Limiting and fix rpm, tpm settings",
+      "type": "patch"
+    },
+    {
+      "description": "Add N parameter support",
+      "type": "patch"
+    },
+    {
+      "description": "Add cli flag to overlay default values onto a provided config.",
+      "type": "patch"
+    },
+    {
+      "description": "Add exception handling on file load",
+      "type": "patch"
+    },
+    {
+      "description": "Add language support to prompt tuning",
+      "type": "patch"
+    },
+    {
+      "description": "Add llm params to local and global search",
+      "type": "patch"
+    },
+    {
+      "description": "Fix broken prompt tuning link on docs",
+      "type": "patch"
+    },
+    {
+      "description": "Fix delta none on query calls",
+      "type": "patch"
+    },
+    {
+      "description": "Fix docsite base url",
+      "type": "patch"
+    },
+    {
+      "description": "Fix encoding model parameter on prompt tune",
+      "type": "patch"
+    },
+    {
+      "description": "Fix for --limit exceeding the dataframe length",
+      "type": "patch"
+    },
+    {
+      "description": "Fix for Ruff 0.5.2",
+      "type": "patch"
+    },
+    {
+      "description": "Fixed an issue where base OpenAI embeddings can't work with Azure OpenAI LLM",
+      "type": "patch"
+    },
+    {
+      "description": "Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls",
+      "type": "patch"
+    },
+    {
+      "description": "fix community_report doesn't work in settings.yaml",
+      "type": "patch"
+    },
+    {
+      "description": "fix llm response content is None in query",
+      "type": "patch"
+    },
+    {
+      "description": "fix the organization parameter is ineffective during queries",
+      "type": "patch"
+    },
+    {
+      "description": "remove duplicate file read",
+      "type": "patch"
+    },
+    {
+      "description": "support non-open ai model config to prompt tune",
+      "type": "patch"
+    },
+    {
+      "description": "use binary io processing for all file io operations",
+      "type": "patch"
+    }
+  ],
+  "created_at": "2024-07-25T02:01:38+00:00",
+  "version": "0.2.0"
+}
diff --git a/.semversioner/next-release/minor-20240710183748086411.json b/.semversioner/next-release/minor-20240710183748086411.json
diff --git a/.semversioner/next-release/patch-20240701233152787373.json b/.semversioner/next-release/patch-20240701233152787373.json
diff --git a/.semversioner/next-release/patch-20240703152422358587.json b/.semversioner/next-release/patch-20240703152422358587.json
diff --git a/.semversioner/next-release/patch-20240703182750529114.json b/.semversioner/next-release/patch-20240703182750529114.json
diff --git a/.semversioner/next-release/patch-20240704181236015699.json b/.semversioner/next-release/patch-20240704181236015699.json
diff --git a/.semversioner/next-release/patch-20240705184142723331.json b/.semversioner/next-release/patch-20240705184142723331.json
diff --git a/.semversioner/next-release/patch-20240705235656897489.json b/.semversioner/next-release/patch-20240705235656897489.json
diff --git a/.semversioner/next-release/patch-20240707063053679262.json b/.semversioner/next-release/patch-20240707063053679262.json
diff --git a/.semversioner/next-release/patch-20240709225514193665.json b/.semversioner/next-release/patch-20240709225514193665.json
diff --git a/.semversioner/next-release/patch-20240710114442871595.json b/.semversioner/next-release/patch-20240710114442871595.json
diff --git a/.semversioner/next-release/patch-20240710165603516866.json b/.semversioner/next-release/patch-20240710165603516866.json
diff --git a/.semversioner/next-release/patch-20240711004716103302.json b/.semversioner/next-release/patch-20240711004716103302.json
diff --git a/.semversioner/next-release/patch-20240711092703710242.json b/.semversioner/next-release/patch-20240711092703710242.json
diff --git a/.semversioner/next-release/patch-20240711223132221685.json b/.semversioner/next-release/patch-20240711223132221685.json
diff --git a/.semversioner/next-release/patch-20240712035356859335.json b/.semversioner/next-release/patch-20240712035356859335.json
diff --git a/.semversioner/next-release/patch-20240712210400518089.json b/.semversioner/next-release/patch-20240712210400518089.json
diff --git a/.semversioner/next-release/patch-20240712235357550877.json b/.semversioner/next-release/patch-20240712235357550877.json
diff --git a/.semversioner/next-release/patch-20240716225953784804.json b/.semversioner/next-release/patch-20240716225953784804.json
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,31 @@
+# Changelog
+Note: version releases in the 0.x.y range may introduce breaking changes.
+
+## 0.2.0
+
+- minor: Add content-based KNN for selecting prompt tune few shot examples
+- minor: Add dynamic community report rating to the prompt tuning engine
+- patch: Add Minute-based Rate Limiting and fix rpm, tpm settings
+- patch: Add N parameter support
+- patch: Add cli flag to overlay default values onto a provided config.
+- patch: Add exception handling on file load
+- patch: Add language support to prompt tuning
+- patch: Add llm params to local and global search
+- patch: Fix broken prompt tuning link on docs
+- patch: Fix delta none on query calls
+- patch: Fix docsite base url
+- patch: Fix encoding model parameter on prompt tune
+- patch: Fix for --limit exceeding the dataframe length
+- patch: Fix for Ruff 0.5.2
+- patch: Fixed an issue where base OpenAI embeddings can't work with Azure OpenAI LLM
+- patch: Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls
+- patch: fix community_report doesn't work in settings.yaml
+- patch: fix llm response content is None in query
+- patch: fix the organization parameter is ineffective during queries
+- patch: remove duplicate file read
+- patch: support non-open ai model config to prompt tune
+- patch: use binary io processing for all file io operations
+
+## 0.1.0
+
+- minor: Initial Release
diff --git a/docsite/index.md b/docsite/index.md
@@ -44,7 +44,7 @@ GraphRAG builds upon our prior [research](https://www.microsoft.com/en-us/workla
 
 ### Index
 
-- Slice up an input corpus into a series of TextUnits, which act as analyzable units for the rest of the process, and provide fine-grained references into our outputs.
+- Slice up an input corpus into a series of TextUnits, which act as analyzable units for the rest of the process, and provide fine-grained references in our outputs.
 - Extract all entities, relationships, and key claims from the TextUnits using an LLM.
 - Perform a hierarchical clustering of the graph using the [Leiden technique](https://arxiv.org/pdf/1810.08473.pdf). To see this visually, check out Figure 1 above. Each circle is an entity (e.g., a person, place, or organization), with the size representing the degree of the entity, and the color representing its community.
 - Generate summaries of each community and its constituents from the bottom-up. This aids in holistic understanding of the dataset.

diff --git a/graphrag/config/models/claim_extraction_config.py b/graphrag/config/models/claim_extraction_config.py
@@ -43,7 +43,9 @@ def resolved_strategy(self, root_dir: str) -> dict:
             "type": ExtractClaimsStrategyType.graph_intelligence,
             "llm": self.llm.model_dump(),
             **self.parallelization.model_dump(),
-            "extraction_prompt": (Path(root_dir) / self.prompt).read_text()
+            "extraction_prompt": (Path(root_dir) / self.prompt)
+            .read_bytes()
+            .decode(encoding="utf-8")
             if self.prompt
             else None,
             "claim_description": self.description,

diff --git a/graphrag/config/models/community_reports_config.py b/graphrag/config/models/community_reports_config.py
@@ -38,7 +38,9 @@ def resolved_strategy(self, root_dir) -> dict:
             "type": CreateCommunityReportsStrategyType.graph_intelligence,
             "llm": self.llm.model_dump(),
             **self.parallelization.model_dump(),
-            "extraction_prompt": (Path(root_dir) / self.prompt).read_text()
+            "extraction_prompt": (Path(root_dir) / self.prompt)
+            .read_bytes()
+            .decode(encoding="utf-8")
             if self.prompt
             else None,
             "max_report_length": self.max_length,

diff --git a/graphrag/config/models/entity_extraction_config.py b/graphrag/config/models/entity_extraction_config.py
@@ -38,7 +38,9 @@ def resolved_strategy(self, root_dir: str, encoding_model: str) -> dict:
             "type": ExtractEntityStrategyType.graph_intelligence,
             "llm": self.llm.model_dump(),
             **self.parallelization.model_dump(),
-            "extraction_prompt": (Path(root_dir) / self.prompt).read_text()
+            "extraction_prompt": (Path(root_dir) / self.prompt)
+            .read_bytes()
+            .decode(encoding="utf-8")
             if self.prompt
             else None,
             "max_gleanings": self.max_gleanings,

diff --git a/graphrag/config/models/summarize_descriptions_config.py b/graphrag/config/models/summarize_descriptions_config.py
@@ -34,7 +34,9 @@ def resolved_strategy(self, root_dir: str) -> dict:
             "type": SummarizeStrategyType.graph_intelligence,
             "llm": self.llm.model_dump(),
             **self.parallelization.model_dump(),
-            "summarize_prompt": (Path(root_dir) / self.prompt).read_text()
+            "summarize_prompt": (Path(root_dir) / self.prompt)
+            .read_bytes()
+            .decode(encoding="utf-8")
             if self.prompt
             else None,
             "max_summary_length": self.max_length,