Skip to content

Commit

Permalink
Merge pull request #158 from amosproj/feature/98_integration_tests_fo…
Browse files Browse the repository at this point in the history
…r_confluence_data_reading

Signed-off-by: Hafidz Arifin <hafidz.harifin@gmail.com>
  • Loading branch information
zenzeii authored Jun 27, 2023
2 parents 114e60e + ac80937 commit 7716636
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 6 deletions.
8 changes: 4 additions & 4 deletions QAChat/Data_Processing/confluence_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def __init__(self):
self.last_update_lookup = dict()
self.chunk_id_lookup_table = dict()
self.g_docs_proc = GoogleDocPreProcessor()
self.pdf_reder = PDFReader()
self.pdf_reader = PDFReader()

def init_blacklist(self):
# Retrieve blacklist data from Supabase table
Expand Down Expand Up @@ -151,7 +151,7 @@ def get_relevant_data_from_pages(self):
google_doc_content = self.get_content_from_google_drive(urls)

# get content from confluence attachments
pdf_content = self.add_content_of_pdf_to_all_page_information(page_id)
pdf_content = self.get_content_from_page_attachments(page_id)

# replace consecutive occurrences of \n into one space
text = re.sub(
Expand Down Expand Up @@ -203,11 +203,11 @@ def get_content_from_google_drive(self, urls):
pdf_bytes = self.g_docs_proc.export_pdf(google_drive_id)

# get content from pdf
pdf_content += self.pdf_reder.read_pdf(pdf_bytes) + " "
pdf_content += self.pdf_reader.read_pdf(pdf_bytes) + " "

return pdf_content

def add_content_of_pdf_to_all_page_information(self, page_id) -> str:
def get_content_from_page_attachments(self, page_id) -> str:
start = 0
limit = 100
attachments = []
Expand Down
2 changes: 1 addition & 1 deletion QAChat/Data_Processing/data_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from datetime import datetime
from typing import List

from document_embedder import DataInformation
from QAChat.Data_Processing.document_embedder import DataInformation


class DataPreprocessor(ABC):
Expand Down
2 changes: 1 addition & 1 deletion QAChat/Data_Processing/google_doc_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@

from __future__ import print_function
import io
from QAChat.Data_Processing.pdf_reader import PDFReader
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload
from google_auth_oauthlib.flow import InstalledAppFlow
from QAChat.Data_Processing.pdf_reader import PDFReader
from google.oauth2 import service_account
import os

Expand Down
115 changes: 115 additions & 0 deletions Testing/confluence_integration_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Hafidz Arifin

from dotenv import load_dotenv
from atlassian import Confluence
from bs4 import BeautifulSoup
import unittest
import requests
import random
import string
import os
import io

from QAChat.Data_Processing.pdf_reader import PDFReader


# Load credentials from QAChat/tokens.env. The path is anchored to this
# file's directory instead of the current working directory, so the tokens
# are found no matter where the test runner is started from.
_TOKENS_ENV_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    os.pardir,
    "QAChat",
    "tokens.env",
)
load_dotenv(_TOKENS_ENV_PATH)

# Get Confluence API credentials from environment variables
CONFLUENCE_ADDRESS = os.getenv("CONFLUENCE_ADDRESS")
CONFLUENCE_USERNAME = os.getenv("CONFLUENCE_USERNAME")
CONFLUENCE_TOKEN = os.getenv("CONFLUENCE_TOKEN")

# Get Supabase API credentials from environment variables
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")


class ConfluenceIntegrationTest(unittest.TestCase):
    """Integration test that round-trips a page and a PDF through Confluence.

    The test talks to the Confluence instance configured through the
    module-level ``CONFLUENCE_*`` values, so it needs network access and
    valid credentials.
    """

    def setUp(self):
        """Create the Confluence client used by the test."""
        self.confluence = Confluence(
            url=CONFLUENCE_ADDRESS,
            username=CONFLUENCE_USERNAME,
            password=CONFLUENCE_TOKEN,
            cloud=True,
        )

    def test_reading(self):
        """Create a page with a PDF attachment and read both back.

        Checks that the page body returned by Confluence (with markup
        stripped) equals the text that was uploaded, and that the text
        extracted from the downloaded attachment equals the text extracted
        from the local copy of the same PDF.
        """
        # set parameter for new page; the random suffix keeps repeated runs
        # from colliding on the page title
        space = "Test2"
        page_title = "Example Page " + "".join(
            random.choice(string.ascii_lowercase) for _ in range(4)
        )
        page_body = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."

        # create page
        self.confluence.create_page(
            space,
            page_title,
            page_body,
            parent_id=None,
            type="page",
            representation="storage",
            editor="v2",
            full_width=False,
        )

        # get page id
        page_id = self.confluence.get_page_id(space, page_title)

        # delete page when the test ends, even if a later step raises or an
        # assertion fails, so failed runs do not leave pages behind
        self.addCleanup(
            self.confluence.remove_page, page_id, status=None, recursive=False
        )

        # get page body
        page_raw_body = self.confluence.get_page_by_id(
            page_id, expand="body.storage, version", status=None, version=None
        )["body"]["storage"]["value"]

        # filter page body
        page_filtered_body = BeautifulSoup(
            page_raw_body, features="html.parser"
        ).get_text()

        # set pdf; resolved relative to this file so the test does not
        # depend on the current working directory
        test_pdf = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            "Deliverables",
            "sprint-03",
            "planning-documents.pdf",
        )

        # attach pdf
        self.confluence.attach_file(
            test_pdf,
            name=None,
            content_type=None,
            page_id=page_id,
            title=None,
            space=space,
            comment=None,
        )

        # get pdf
        attachments = self.confluence.get_attachments_from_content(
            page_id=page_id, start=0, limit=100
        )["results"]
        self.assertTrue(attachments, "uploaded PDF is not listed as an attachment")
        attachment = attachments[0]

        # download pdf
        response = requests.get(
            self.confluence.url + attachment["_links"]["download"],
            auth=(self.confluence.username, self.confluence.password),
            timeout=30,
        )
        # fail here on an HTTP error instead of parsing an error page as PDF
        response.raise_for_status()

        # read pdf from confluence
        retrieved_pdf_content = PDFReader().read_pdf(response.content)

        # read pdf from local
        with open(test_pdf, "rb") as f:
            local_pdf_content = PDFReader().read_pdf(f.read())

        # assert page body
        self.assertEqual(page_filtered_body, page_body)

        # assert page attachments
        self.assertEqual(retrieved_pdf_content, local_pdf_content)


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 7716636

Please sign in to comment.