fix: the cosine similarity is evaluated for top comments and bot comments are ignored #225

Open · wants to merge 10 commits into base: development
3 changes: 2 additions & 1 deletion .cspell.json
@@ -31,7 +31,8 @@
"Rpcs",
"sonarjs",
"pico",
"timespan"
"timespan",
"tfidf"
gentlementlegen marked this conversation as resolved.
],
"dictionaries": ["typescript", "node", "software-terms"],
"import": [
Binary file modified bun.lockb
Binary file not shown.
1 change: 1 addition & 0 deletions package.json
@@ -41,6 +41,7 @@
"js-tiktoken": "1.0.15",
"jsdom": "24.0.0",
"markdown-it": "14.1.0",
"natural": "^8.0.1",
"openai": "4.56.0",
"yaml": "^2.6.1"
},
59 changes: 59 additions & 0 deletions src/helpers/tf-idf.ts
@@ -0,0 +1,59 @@
import natural from "natural";
import { AllComments } from "../types/content-evaluator-module-type";

export class TfIdf {
private _tfidf: natural.TfIdf;

constructor() {
this._tfidf = new natural.TfIdf();
}

private _preprocessText(text: string): string {
return text
.toLowerCase()
.replace(/[^\w\s]/g, " ")
.replace(/\s+/g, " ")
.trim();
}

public calculateSimilarity(text1: string, text2: string): number {
this._tfidf = new natural.TfIdf();
const processed1 = this._preprocessText(text1);
const processed2 = this._preprocessText(text2);

this._tfidf.addDocument(processed1);
this._tfidf.addDocument(processed2);

const vector1 = this._tfidf.listTerms(0);
const vector2 = this._tfidf.listTerms(1);

const terms = new Set([...vector1.map((v) => v.term), ...vector2.map((v) => v.term)]);

const v1: number[] = [];
const v2: number[] = [];

terms.forEach((term) => {
const term1 = vector1.find((v) => v.term === term);
const term2 = vector2.find((v) => v.term === term);
v1.push(term1 ? term1.tfidf : 0);
v2.push(term2 ? term2.tfidf : 0);
});

const dotProduct = v1.reduce((sum, val, i) => sum + val * v2[i], 0);
const magnitude1 = Math.sqrt(v1.reduce((sum, val) => sum + val * val, 0));
const magnitude2 = Math.sqrt(v2.reduce((sum, val) => sum + val * val, 0));

if (magnitude1 === 0 || magnitude2 === 0) return 0;

return dotProduct / (magnitude1 * magnitude2);
}

getTopComments(specification: string, comments: AllComments, limit = 10) {
return comments
.map((comment) => {
return { similarity: this.calculateSimilarity(specification, comment.comment), comment };
})
.sort((a, b) => b.similarity - a.similarity)
.slice(0, limit);
}
}
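For reference, the similarity computation in `tf-idf.ts` can be reproduced without the `natural` dependency. The following is a minimal, self-contained sketch: the helper names (`tokenize`, `termFrequencies`, `cosineSimilarity`) are illustrative, and the IDF formula shown is one common variant, not necessarily the exact one `natural.TfIdf` implements.

```typescript
// Dependency-free sketch of TF-IDF cosine similarity over a two-document corpus.

function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^\w\s]/g, " ") // same punctuation stripping as _preprocessText
    .split(/\s+/)
    .filter(Boolean);
}

function termFrequencies(tokens: string[]): Map<string, number> {
  const tf = new Map<string, number>();
  for (const t of tokens) tf.set(t, (tf.get(t) ?? 0) + 1);
  return tf;
}

function cosineSimilarity(text1: string, text2: string): number {
  const tf1 = termFrequencies(tokenize(text1));
  const tf2 = termFrequencies(tokenize(text2));
  const terms = new Set([...tf1.keys(), ...tf2.keys()]);

  // IDF over the two-document corpus; ln(1 + N/df) keeps shared terms non-zero.
  const idf = (term: string): number => {
    const df = (tf1.has(term) ? 1 : 0) + (tf2.has(term) ? 1 : 0);
    return Math.log(1 + 2 / df);
  };

  let dot = 0;
  let mag1 = 0;
  let mag2 = 0;
  for (const term of terms) {
    const w1 = (tf1.get(term) ?? 0) * idf(term);
    const w2 = (tf2.get(term) ?? 0) * idf(term);
    dot += w1 * w2;
    mag1 += w1 * w1;
    mag2 += w2 * w2;
  }
  if (mag1 === 0 || mag2 === 0) return 0;
  return dot / Math.sqrt(mag1 * mag2);
}
```

Identical texts score 1, texts with no shared terms score 0, and partial overlap lands in between, which is the property `getTopComments` relies on when ranking comments against the specification.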
27 changes: 24 additions & 3 deletions src/parser/content-evaluator-module.ts
@@ -15,6 +15,9 @@ import {
import { BaseModule } from "../types/module";
import { ContextPlugin } from "../types/plugin-input";
import { GithubCommentScore, Result } from "../types/results";
import { TfIdf } from "../helpers/tf-idf";

const TOKEN_MODEL_LIMIT = 124000;
Member: This depends on the model and possibly should be an environment variable, because we might change models.

Member: Hard-coding the 124000 doesn't seem like a solution there either.

Member Author: @0x4007 It is not hard-coded but configurable within the config file. There is no API to retrieve a model's max token value, as far as I know.

Member: Line 179 is hard-coded.

Member Author: What should I do if the configuration is undefined? Should I throw an error and stop the run?

Member: Yes, if we don't have it saved in our library or collection of known amounts, then it should throw.

Member Author: There are no known amounts, because no API or endpoint can give this information, so I'll just throw when undefined.

Member: Manually get the numbers from their docs then.

Member Author: The problem is that this number is arbitrary. Didn't you just ask OpenAI to raise the limits on the account we're using, with the same model as before, and they did increase the limit? I'm afraid this number can't be guessed or hard-coded.

/**
* Evaluates and rates comments.
@@ -61,7 +64,7 @@ export class ContentEvaluatorModule extends BaseModule {
const allCommentsUnClean = data.allComments || [];
const allComments: { id: number; comment: string; author: string }[] = [];
for (const commentObj of allCommentsUnClean) {
if (commentObj.user) {
if (commentObj.user && commentObj.user.type !== "Bot") {
gentlementlegen marked this conversation as resolved.
allComments.push({ id: commentObj.id, comment: commentObj.body ?? "", author: commentObj.user.login });
}
}
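The bot-filtering change above can be isolated into a small helper. A sketch, assuming the GitHub API's `user.type` field distinguishes `"User"` from `"Bot"` accounts; the `RawComment` shape and the `filterHumanComments` name are illustrative, not from the PR.

```typescript
// Hypothetical shape of a raw GitHub issue comment, reduced to the fields used here.
interface RawComment {
  id: number;
  body?: string;
  user: { login: string; type: string } | null;
}

// Keep only comments from non-bot users with a known author.
function filterHumanComments(raw: RawComment[]): { id: number; comment: string; author: string }[] {
  const out: { id: number; comment: string; author: string }[] = [];
  for (const c of raw) {
    if (c.user && c.user.type !== "Bot") {
      out.push({ id: c.id, comment: c.body ?? "", author: c.user.login });
    }
  }
  return out;
}
```

This keeps automated comments (CI reports, bot replies) from diluting the relevance scoring.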
@@ -178,15 +181,33 @@ export class ContentEvaluatorModule extends BaseModule {
const dummyResponse = JSON.stringify(this._generateDummyResponse(comments), null, 2);
const maxTokens = this._calculateMaxTokens(dummyResponse);

const promptForComments = this._generatePromptForComments(specification, comments, allComments);
let promptForComments = this._generatePromptForComments(specification, comments, allComments);
if (this._calculateMaxTokens(promptForComments, Infinity) > TOKEN_MODEL_LIMIT) {
gentlementlegen marked this conversation as resolved.
const tfidf = new TfIdf();
const mostImportantComments = tfidf.getTopComments(specification, allComments);
promptForComments = this._generatePromptForComments(
specification,
comments,
mostImportantComments.map((o) => o.comment)
);
}
commentRelevances = await this._submitPrompt(promptForComments, maxTokens);
}

if (prComments.length) {
const dummyResponse = JSON.stringify(this._generateDummyResponse(prComments), null, 2);
const maxTokens = this._calculateMaxTokens(dummyResponse);

const promptForPrComments = this._generatePromptForPrComments(specification, prComments);
let promptForPrComments = this._generatePromptForPrComments(specification, prComments);
if (this._calculateMaxTokens(promptForPrComments, Infinity) > TOKEN_MODEL_LIMIT) {
const tfidf = new TfIdf();
const mostImportantComments = tfidf.getTopComments(specification, allComments);
promptForPrComments = this._generatePromptForComments(
specification,
prComments,
mostImportantComments.map((o) => o.comment)
);
}
prCommentRelevances = await this._submitPrompt(promptForPrComments, maxTokens);
}

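Both branches of the diff follow the same pattern: build the full prompt, and only when it exceeds the token budget, rebuild it from the TF-IDF-ranked top comments. A generalized sketch of that flow; the parameters `countTokens`, `buildPrompt`, and `rankByRelevance` are hypothetical stand-ins for `_calculateMaxTokens`, `_generatePromptForComments`, and `TfIdf.getTopComments`.

```typescript
interface RankedComment {
  id: number;
  comment: string;
  author: string;
}

// Build a prompt, falling back to only the most spec-relevant comments
// when the full prompt would exceed the model's token budget.
function buildBoundedPrompt(
  specification: string,
  comments: RankedComment[],
  tokenLimit: number,
  countTokens: (text: string) => number,
  buildPrompt: (spec: string, selected: RankedComment[]) => string,
  rankByRelevance: (spec: string, all: RankedComment[], limit: number) => RankedComment[],
  topN = 10
): string {
  // First try the full prompt; only trim when it exceeds the budget.
  let prompt = buildPrompt(specification, comments);
  if (countTokens(prompt) > tokenLimit) {
    const topComments = rankByRelevance(specification, comments, topN);
    prompt = buildPrompt(specification, topComments);
  }
  return prompt;
}
```

The design choice here is graceful degradation: small issues are evaluated with full context, and only oversized threads lose their least-relevant comments.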