Merge pull request #8 from pablomarin/pages_tabular

Pages tabular
pablomarin · Apr 7, 2023 · 7c4d0c6 · 7c4d0c6
2 parents 315e9fa + dacdf2c
commit 7c4d0c6
Show file tree

Hide file tree

Showing 4 changed files with 186 additions and 60 deletions.
diff --git a/app/Home.py b/app/Home.py
@@ -0,0 +1,67 @@
+import streamlit as st
+import urllib
+import os
+import time
+import requests
+import random
+from collections import OrderedDict
+from openai.error import OpenAIError
+from langchain.docstore.document import Document
+
+from components.sidebar import sidebar
+from utils import (
+    embed_docs,
+    get_answer,
+    get_sources,
+    search_docs
+)
+from credentials import (
+    DATASOURCE_CONNECTION_STRING,
+    AZURE_SEARCH_API_VERSION,
+    AZURE_SEARCH_ENDPOINT,
+    AZURE_SEARCH_KEY,
+    COG_SERVICES_NAME,
+    COG_SERVICES_KEY,
+    AZURE_OPENAI_ENDPOINT,
+    AZURE_OPENAI_KEY,
+    AZURE_OPENAI_API_VERSION
+
+)
+
+# Publish the Azure OpenAI settings imported from credentials.py: each value is written to
+# the environment under both the OPENAI_* and AZURE_OPENAI_* names, and the endpoint and key
+# are also stored in st.session_state.
+# The session-state key for the endpoint must be exactly "AZURE_OPENAI_ENDPOINT" (it used to
+# carry a trailing space, which no lookup of "AZURE_OPENAI_ENDPOINT" could ever match).
+os.environ["OPENAI_API_BASE"] = os.environ["AZURE_OPENAI_ENDPOINT"] = st.session_state["AZURE_OPENAI_ENDPOINT"] = AZURE_OPENAI_ENDPOINT
+os.environ["OPENAI_API_KEY"] = os.environ["AZURE_OPENAI_API_KEY"] = st.session_state["AZURE_OPENAI_API_KEY"] = AZURE_OPENAI_KEY
+os.environ["OPENAI_API_VERSION"] = os.environ["AZURE_OPENAI_API_VERSION"] = AZURE_OPENAI_API_VERSION
+
+# Landing page: configure the page, then render banner, title, intro copy and sidebar hint
+# from top to bottom.
+st.set_page_config(page_title="GPT Smart Search", page_icon="📖", layout="wide")
+
+_BANNER_URL = "https://user-images.githubusercontent.com/113465005/226238596-cc76039e-67c2-46b6-b0bb-35d037ae66e1.png"
+_DIVIDER = "---"
+
+# Intro copy shown between the two dividers; handed to st.markdown as-is.
+_INTRO_MARKDOWN = """
+    GPT Smart Search allows you to ask questions about your
+    documents and get accurate answers with instant citations.
+    
+    This engine finds information from the following:
+    - ~10k [Computer Science Publications in Arxiv from 2020-2022](https://www.kaggle.com/datasets/1b6883fb66c5e7f67c697c2547022cc04c9ee98c3742f9a4d6c671b4f4eda591)
+    - ~52k [COVID-19 literature in LitCovid from 2020-2023](https://www.ncbi.nlm.nih.gov/research/coronavirus/)
+    
+    **👈 Select a demo from the sidebar** to see some examples
+    of what Azure Cognitive Search and Azure OpenAI Service can do!
+    ### Want to learn more?
+    - Check out [Github Repo](https://github.com/pablomarin/GPT-Azure-Search-Engine/)
+    - Jump into [Azure OpenAI documentation](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/)
+    - Ask a question or submit a [GitHub Issue!](https://github.com/pablomarin/GPT-Azure-Search-Engine/issues/new)
+
+
+    
+"""
+
+st.image(_BANNER_URL)
+st.header("GPT Smart Search Engine")
+
+st.markdown(_DIVIDER)
+st.markdown(_INTRO_MARKDOWN)
+st.markdown(_DIVIDER)
+
+st.sidebar.success("Select a demo above.")
diff --git a/app/pages/1_Chat_(Preview).py b/app/pages/1_Chat_(Preview).py
diff --git a/app/main.py → app/pages/1_GPT_Smart_Search.py b/app/main.py → app/pages/1_GPT_Smart_Search.py
@@ -71,28 +71,27 @@ def get_search_results(query, indexes):
 st.set_page_config(page_title="GPT Smart Search", page_icon="📖", layout="wide")
 st.header("GPT Smart Search Engine")
 
-sidebar()
-
-with st.expander("Instructions"):
+with st.sidebar:
+    st.markdown("""# Instructions""")
     st.markdown("""
-                Ask a question that you think can be answered with the information in about 10k Arxiv Computer Science publications from 2020-2021 or in 52k Medical Covid-19 Publications from 2020.
-                
-                For example:
-                - What are markov chains?
-                - List the authors that talk about Gradient Boosting Machines
-                - How does random forest work?
-                - What kind of problems can I solve with reinforcement learning? Give me some real life examples
-                - What kind of problems Turing Machines solve?
-                - What are the main risk factors for Covid-19?
-                - What medicine reduces inflamation in the lungs?
-                - Why Covid doesn't affect kids that much compared to adults?
-                
-                \nYou will notice that the answers to these questions are diferent from the open ChatGPT, since these papers are the only possible context. This search engine does not look at the open internet to answer these questions. If the context doesn't contain information, the engine will respond: I don't know.
-                """)
+Ask a question that you think can be answered with the information in about 10k Arxiv Computer Science publications from 2020-2021 or in 52k Medical Covid-19 Publications from 2020.
+
+For example:
+- What are markov chains?
+- List the authors that talk about Gradient Boosting Machines
+- How does random forest work?
+- What kind of problems can I solve with reinforcement learning? Give me some real life examples
+- What kind of problems Turing Machines solve?
+- What are the main risk factors for Covid-19?
+- What medicine reduces inflamation in the lungs?
+- Why Covid doesn't affect kids that much compared to adults?
+    
+    \nYou will notice that the answers to these questions are diferent from the open ChatGPT, since these papers are the only possible context. This search engine does not look at the open internet to answer these questions. If the context doesn't contain information, the engine will respond: I don't know.
+    """)
     st.markdown("""
-                - ***Quick Answer***: GPT model only uses, as context, the captions of the results coming from Azure Search
-                - ***Best Answer***: GPT model uses, as context. all of the content of the documents coming from Azure Search
-                """)
+            - ***Quick Answer***: GPT model only uses, as context, the captions of the results coming from Azure Search
+            - ***Best Answer***: GPT model uses, as context. all of the content of the documents coming from Azure Search
+            """)
 
 query = st.text_input("Ask a question to your enterprise data lake", value= "What is CLP?", on_change=clear_submit)
 

diff --git a/app/pages/2_Tabular_Data_(Preview).py b/app/pages/2_Tabular_Data_(Preview).py
@@ -0,0 +1,100 @@
+import streamlit as st
+import os
+import pandas as pd
+from langchain.llms import AzureOpenAI
+from langchain.chat_models import AzureChatOpenAI
+from langchain.agents import create_pandas_dataframe_agent
+from langchain.agents import create_csv_agent
+
+
+def sidebar():
+    """Render this page's sidebar: usage notes plus the Azure OpenAI connection inputs.
+
+    The values returned by the three text inputs are written to ``st.session_state``
+    under ``AZURE_OPENAI_GPT4_NAME``, ``AZURE_OPENAI_ENDPOINT`` and
+    ``AZURE_OPENAI_API_KEY`` every time the function runs, so those entries always hold
+    whatever the widgets currently return. Returns nothing.
+    """
+    with st.sidebar:
+        st.markdown("""# Instructions""")
+        st.markdown("---")
+        # The product name used to be duplicated ("GPT GPT ..."); it now matches the
+        # page title and header used by this page.
+        st.markdown("""
+            **GPT Tabular data Q&A** allows you to ask questions to your Tabular CSV files.
+        """
+        )
+        st.markdown("**Note**: GPT-4 is in preview and with limited availability. There is a lot of limitation on the API, so it takes longer than needed and it fails some times. Retry if it fails.")
+        st.markdown("---")
+
+        # Connection settings entered by the user; the key input is masked.
+        st.session_state["AZURE_OPENAI_GPT4_NAME"] = st.text_input("Enter your GPT-4 deployment name:")
+        st.session_state["AZURE_OPENAI_ENDPOINT"] = st.text_input("Enter your Azure OpenAI Endpoint:")
+        st.session_state["AZURE_OPENAI_API_KEY"] = st.text_input("Enter Azure OpenAI Key:", type="password")
+
+# Text placed in front of the user's question when the prompt for the agent is assembled
+# (prompt = preffix + question + suffix).
+preffix = 'First set the pandas display options to show all the columns, then get the column names, then answer the question: '
+# Text appended after the user's question in the same prompt.
+suffix = '. ALWAYS before giving the Final Answer, reflect on the answer and ask yourself if it answers correctly the original question. If you are not sure, try another method. \n If the two runs does not give the same result, reflect again two more times until you have two runs that have the same result. If you still cannot arrive to a consistent result, say that you are not sure of the answer. But, if you are sure of the correct answer, create a beautiful and thorough response. ALWAYS, as part of your final answer, explain how you got to the answer. Format the final answer in Markdown language'
+
+# Upper bound on how many times the agent call is attempted for one question.
+max_retries = 5
+
+# Page title/icon/layout and the visible header, followed by the sidebar defined above.
+st.set_page_config(page_title="GPT Tabular data Q&A", page_icon="📖", layout="wide")
+st.header("GPT Tabular data Q&A (preview)")
+
+sidebar()
+
+def clear_submit():
+    """Set the ``submit`` flag in ``st.session_state`` back to ``False``.
+
+    Used as the ``on_change`` callback of the question input, so editing the question
+    clears the flag that a previous "Generate Answer" run left set.
+    """
+    st.session_state.update(submit=False)
+
+
+# Two equal-width columns; only the left one is populated (the right-hand sample-dataset
+# picker below is commented out).
+col1, col2 = st.columns([1, 1])
+uploaded_file = col1.file_uploader(
+    label="Upload your tabular CSV file",
+    type="csv",
+    accept_multiple_files=False,
+    key=None,
+    help="Upload your CSV file that contains tabular data, make sure that the first row corresponds to the columns",
+    on_change=None,
+    disabled=False,
+)
+# with col2:
+#     st.markdown("Or pick from these sample datasets:")
+#     st.markdown("[Covid Tracking Project](https://learn.microsoft.com/en-us/azure/open-datasets/dataset-covid-tracking?tabs=azure-storage) ")
+#     ingest_button = st.button("Load Sample CSV") # Give button a variable name
+
+# if ingest_button: # Make button a condition.
+#     uploaded_file = "https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv"
+
+# Main flow: once a CSV is uploaded, preview it, take a question, validate the sidebar
+# settings, then run the pandas agent (with retries) and render its answer.
+if uploaded_file is not None:
+    # Parse the upload into a DataFrame and show its first two rows as a preview.
+    df = pd.read_csv(uploaded_file)
+    st.write("Here is the first two rows of your file:", df.head(2))
+
+    query_str = st.text_input("Ask a question:", on_change=clear_submit)
+
+    qbutton = st.button('Generate Answer')
+
+    # Proceed on a button press, or when the "submit" flag from an earlier run is still set.
+    if (qbutton or st.session_state.get("submit")) and uploaded_file:
+        if not query_str:
+            st.error("Please enter a question")
+        else:
+            st.session_state["submit"] = True
+            # Slot created here; the answer is rendered into it further down.
+            placeholder = st.empty()
+
+            # All three sidebar values are required before any call is made.
+            if not st.session_state.get("AZURE_OPENAI_ENDPOINT"):
+                st.error("Please set your Azure OpenAI API Endpoint on the side bar!")
+            elif not st.session_state.get("AZURE_OPENAI_API_KEY"):
+                st.error("Please configure your Azure OpenAI API key on the side bar!")
+            elif not st.session_state.get("AZURE_OPENAI_GPT4_NAME"):
+                st.error("Please configure your GPT-4 Deployment Name in the sidebar")
+            else:
+                # Export the sidebar values to the environment under both the OPENAI_* and
+                # AZURE_OPENAI_* names; the API version is pinned here.
+                os.environ["OPENAI_API_BASE"] = os.environ["AZURE_OPENAI_ENDPOINT"] = st.session_state["AZURE_OPENAI_ENDPOINT"]
+                os.environ["OPENAI_API_KEY"] = os.environ["AZURE_OPENAI_API_KEY"] = st.session_state["AZURE_OPENAI_API_KEY"]
+                os.environ["OPENAI_API_VERSION"] = os.environ["AZURE_OPENAI_API_VERSION"] = "2023-03-15-preview"
+
+                llm = AzureChatOpenAI(deployment_name=st.session_state["AZURE_OPENAI_GPT4_NAME"], temperature=0.5, max_tokens=999)
+                agent = create_pandas_dataframe_agent(llm, df, verbose=True)
+
+                try:
+                    with st.spinner("Coming up with an answer... ⏳"):
+                        for _ in range(max_retries):
+                            try:
+                                response = agent.run(preffix + query_str + suffix)
+                                break
+                            except Exception:
+                                # Deliberate best-effort retry: a failed attempt is simply
+                                # tried again. Catching Exception (not a bare except) lets
+                                # KeyboardInterrupt/SystemExit propagate.
+                                continue
+                        else:
+                            # Reached only when no attempt succeeded (including the case
+                            # of zero attempts), so `response` is always bound below.
+                            response = "Error too many failed retries - GPT-4 still in preview and just for testing"
+
+                    with placeholder.container():
+                        st.markdown("#### Answer")
+                        # Prefix every "$" with a backslash before rendering; presumably so
+                        # the Markdown renderer does not treat dollar signs as math
+                        # delimiters (TODO confirm). "\\$" is the same two characters the
+                        # old "\$" produced, without the invalid-escape warning.
+                        st.markdown(response.replace("$", "\\$"))
+
+                except Exception as e:
+                    st.error(e)