Merged llm-indexer.git testnet into subfolder indexer

indexnetwork · Jan 14, 2024 · 487e5ae · 487e5ae
2 parents e515e61 + e505fca
commit 487e5ae
Show file tree

Hide file tree

Showing 9 changed files with 506 additions and 0 deletions.
diff --git a/indexer/.github/workflows/build.yaml b/indexer/.github/workflows/build.yaml
@@ -0,0 +1,70 @@
+# Builds the llm-indexer image, pushes it to Amazon ECR and rolls the new
+# image out with kubectl. Runs on every push to the testnet and dev branches;
+# the branch name selects both the "latest-<branch>" image tag and the
+# "env-<branch>" Kubernetes namespace.
+name: deploy
+on:
+  push:
+    branches:
+      - testnet
+      - dev
+jobs:
+  main:
+    runs-on: ubuntu-latest
+    steps:
+      # kubectl is installed exactly once, pinned. A second, unpinned install
+      # step would put the latest stable kubectl on PATH and defeat the pin.
+      - name: Install kubectl
+        id: install
+        uses: azure/setup-kubectl@v2.0
+        with:
+          version: 'v1.23.6' # default is latest stable
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v1
+
+      - name: Set kubectl context
+        uses: azure/k8s-set-context@v3
+        with:
+          method: kubeconfig
+          kubeconfig: ${{ secrets.KUBECONFIG }}
+          context: microk8s
+
+      # Fail early if the cluster is unreachable, before spending time on the
+      # image build.
+      - name: Check k8s connection
+        run: kubectl get pods
+
+      # The Unix timestamp is used as the unique image tag for this run.
+      # Step outputs are written to the file named by $GITHUB_OUTPUT; the
+      # older "::set-output" stdout command is deprecated.
+      - name: Store build time
+        id: build-time
+        shell: bash
+        run: |
+          echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"
+
+      - name: Check out the repo
+        uses: actions/checkout@v2
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Build, tag, and push image to Amazon ECR
+        env:
+          DOCKER_TAG: indexnetwork/llm-indexer:${{ steps.build-time.outputs.time }}
+          DOCKER_REGISTRY: 236785930124.dkr.ecr.us-east-1.amazonaws.com
+        run: |
+          # Strip the "refs/heads/" prefix to get the bare branch name.
+          BRANCH="${GITHUB_REF#refs/heads/}"
+          docker build -t "$DOCKER_TAG" .
+          docker tag "$DOCKER_TAG" "$DOCKER_REGISTRY/$DOCKER_TAG"
+          docker push "$DOCKER_REGISTRY/$DOCKER_TAG"
+          docker tag "$DOCKER_TAG" "$DOCKER_REGISTRY/indexnetwork/llm-indexer:latest-$BRANCH"
+          docker push "$DOCKER_REGISTRY/indexnetwork/llm-indexer:latest-$BRANCH"
+
+      # Point the llm-indexer deployment at the timestamp-tagged image that
+      # was just pushed, in the namespace matching the pushed branch.
+      - name: Deploy
+        env:
+          IMAGE: 236785930124.dkr.ecr.us-east-1.amazonaws.com/indexnetwork/llm-indexer:${{ steps.build-time.outputs.time }}
+        run: |-
+          kubectl set image deployment/llm-indexer "llm-indexer=$IMAGE" --namespace "env-${GITHUB_REF#refs/heads/}"
diff --git a/indexer/.gitignore b/indexer/.gitignore
@@ -0,0 +1,7 @@
+# Contents of the Chroma directories (presumably local vector-store data; confirm)
+.chroma/*
+chroma-indexes/*
+# JetBrains IDE project settings
+.idea/*
+# Python bytecode cache of the app package
+app/__pycache__/*
+# macOS Finder metadata
+.DS_Store
+# Local environment file
+.env
+# NOTE(review): purpose of "files" is not visible here -- possibly a local download/output directory; confirm
+files
diff --git a/indexer/Dockerfile b/indexer/Dockerfile
@@ -0,0 +1,8 @@
+# Image that runs app.main:app under uvicorn on port 80.
+FROM python:3.11.5
+WORKDIR /code
+
+# Core dependencies. The PyPI "uuid" package is deliberately not installed:
+# the application only uses uuid.uuid4(), which the standard library provides,
+# and the PyPI package of the same name is an obsolete backport.
+# "jq" is needed at runtime by langchain's JSONLoader.
+RUN pip3 install --no-cache-dir \
+    numpy==1.24.2 \
+    fastapi==0.99.1 \
+    pydantic==1.10.11 \
+    langchain==0.0.326 \
+    llama_index==0.8.28 \
+    chromadb==0.4.10 \
+    openai==0.28.0 \
+    redis \
+    jq
+
+# Kept as separate layers so the large document-parsing stack is cached
+# independently of the core dependencies above.
+RUN pip3 install --no-cache-dir "unstructured[all-docs]"
+RUN pip3 install --no-cache-dir opencv-python-headless
+
+COPY ./app /code/app
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"]
diff --git a/indexer/LICENSE b/indexer/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 indexas
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/indexer/README.md b/indexer/README.md
@@ -0,0 +1 @@
+# llm-indexer
diff --git a/indexer/TODO.md b/indexer/TODO.md
@@ -0,0 +1,17 @@
+- [x] Add link
+- [x] Get sources
+- [x] Get answer
+- [ ] Frontend Integration
+  - [x] Chat
+  - [x] LlamaIndex & NextJS Chat Integration
+  - [x] Conversation history
+  - [ ] per session hash
+  - [ ] chat ui with index_id
+  - [x] chat api
+  - [ ] indexer deploy
+- [ ] Bonus
+  - [x] Composability
+  - [ ] Hugging Face Models
+- [ ] Others
+  - [ ] Remove link
+  - [ ] Update link
diff --git a/indexer/app/__init__.py b/indexer/app/__init__.py
diff --git a/indexer/app/document_parser.py b/indexer/app/document_parser.py
@@ -0,0 +1,132 @@
+import mimetypes
+import os
+import requests
+import uuid
+from unstructured.partition.auto import partition
+from langchain.document_loaders import JSONLoader, UnstructuredFileLoader
+
+
+from fastapi.responses import JSONResponse
+
+
+class Transformers:
+    @staticmethod
+    def unstructured(fileName, file_type):
+        loader = UnstructuredFileLoader(file_path=fileName, mode="paged")
+        return loader.load()        
+
+    @staticmethod
+    def apify(url):
+        print("Processing with Apify...")
+        # Actual logic for JSON Transformer goes here
+        return url
+
+    @staticmethod
+    def langchainJSON(fileName, file_type):
+        print("Processing with Langchain JSON Loader...")
+        loader = JSONLoader(file_path=fileName, jq_schema='.', text_content=False)
+        return loader.load()
+
+fileTypes = {
+    'csv': Transformers.unstructured,
+    'epub': Transformers.unstructured,
+    'xlsx': Transformers.unstructured,
+    'xls': Transformers.unstructured,
+    'md': Transformers.unstructured,
+    'org': Transformers.unstructured,
+    'odt': Transformers.unstructured,
+    'pdf': Transformers.unstructured,
+    'txt': Transformers.unstructured,
+    'ppt': Transformers.unstructured,
+    'pptx': Transformers.unstructured,
+    'rst': Transformers.unstructured,
+    'rtf': Transformers.unstructured,
+    'tsv': Transformers.unstructured,
+    'doc': Transformers.unstructured,
+    'docx': Transformers.unstructured,
+    'xml': Transformers.unstructured,
+    'json': Transformers.langchainJSON,
+    'html': Transformers.unstructured, # ? 
+}
+
+def save_file(url, file_type):
+    # Fetch the content from the URL
+    response = requests.get(url)
+    response.raise_for_status()  # Raise exception if the request was unsuccessful
+
+    # Generate a unique filename based on UUID
+    unique_filename = f"{uuid.uuid4()}.{file_type}"
+
+    output_path = os.environ.get('OUTPUT_PATH', '.') 
+    full_path = os.path.join(output_path, unique_filename)
+
+    # Save the content to the specified path with the generated filename
+    with open(full_path, 'wb') as f:
+        f.write(response.content)
+
+    return full_path
+
+
+def delete_file(file_path):
+    try:
+        os.remove(file_path)
+    except FileNotFoundError:
+        print(f"File {file_path} not found!")
+    except OSError as e:
+        print(f"Error deleting file {file_path}. Reason: {e}")
+
+def guess_file_type(url_or_file_path):
+    mime_type, encoding = mimetypes.guess_type(url_or_file_path)
+    extension = url_or_file_path.split('.')[-1] if '.' in url_or_file_path else None
+    return extension, mime_type
+
+def get_mime_from_head_request(url):
+    try:
+        response = requests.head(url, allow_redirects=True)
+        return response.headers.get('Content-Type', '').split(';')[0]  # Exclude charset if present
+    except requests.RequestException:
+        return None
+
+def map_mime_to_filetype(mime_type):
+    mime_to_filetype_map = {
+        'text/csv': 'csv',
+        'application/epub+zip': 'epub',
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
+        'application/vnd.ms-excel': 'xls',
+        'text/html': 'html',
+        'text/markdown': 'md',
+        'text/org': 'org',
+        'application/vnd.oasis.opendocument.text': 'odt',
+        'application/pdf': 'pdf',
+        'text/plain': 'txt',
+        'application/vnd.ms-powerpoint': 'ppt',
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
+        'text/rtf': 'rtf',
+        'text/tab-separated-values': 'tsv',
+        'application/msword': 'doc',
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
+        'application/xml': 'xml',
+        'application/json': 'json'
+    }
+    return mime_to_filetype_map.get(mime_type)
+
+
+def resolve_file(url):
+    file_type, _ = guess_file_type(url)
+    transformer = fileTypes.get(file_type)
+    if transformer:
+        return transformer, file_type
+
+    mime_type = get_mime_from_head_request(url)
+    if mime_type:
+        file_type = map_mime_to_filetype(mime_type)
+        transformer = fileTypes.get(file_type)
+        return transformer, file_type
+
+def get_document(url):
+    transformer, file_type = resolve_file(url)
+    fileName = save_file(url, file_type)
+    print("Processing", url, file_type, transformer, fileName)
+    nodes = transformer(fileName, file_type)
+    delete_file(fileName)
+    return nodes