diff --git a/mdagent/agent/agent.py b/mdagent/agent/agent.py
index 9d9e6532..f67df6de 100644
--- a/mdagent/agent/agent.py
+++ b/mdagent/agent/agent.py
@@ -3,8 +3,6 @@
 from dotenv import load_dotenv
 from langchain.agents import AgentExecutor, OpenAIFunctionsAgent
 from langchain.agents.structured_chat.base import StructuredChatAgent
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.chat_models import ChatOpenAI
 
 from ..tools import get_tools, make_all_tools
 from ..utils import PathRegistry, SetCheckpoint, _make_llm
@@ -38,9 +36,10 @@ def __init__(
         tools=None,
         agent_type="OpenAIFunctionsAgent",  # this can also be structured_chat
         model="gpt-4-1106-preview",  # current name for gpt-4 turbo
-        tools_model="gpt-4-1106-preview",
+        tools_model=None,
         temp=0.1,
-        verbose=True,
+        streaming=True,
+        verbose=False,
         ckpt_dir="ckpt",
         top_k_tools=20,  # set "all" if you want to use all tools
         use_human_tool=False,
@@ -48,10 +47,15 @@
         uploaded_files=[],
         run_id="",
         use_memory=False,
     ):
+        self.llm = _make_llm(model, temp, streaming)
+        if tools_model is None:
+            tools_model = model
+        self.tools_llm = _make_llm(tools_model, temp, streaming)
+
         self.use_memory = use_memory
         self.path_registry = PathRegistry.get_instance(ckpt_dir=ckpt_dir)
         self.ckpt_dir = self.path_registry.ckpt_dir
-        self.memory = MemoryManager(self.path_registry, run_id=run_id)
+        self.memory = MemoryManager(self.path_registry, self.tools_llm, run_id=run_id)
         self.run_id = self.memory.run_id
         self.uploaded_files = uploaded_files
@@ -60,18 +64,10 @@
 
         self.agent = None
         self.agent_type = agent_type
-        self.user_tools = tools
-        self.tools_llm = _make_llm(tools_model, temp, verbose)
         self.top_k_tools = top_k_tools
         self.use_human_tool = use_human_tool
-
-        self.llm = ChatOpenAI(
-            temperature=temp,
-            model=model,
-            client=None,
-            streaming=True,
-            callbacks=[StreamingStdOutCallbackHandler()],
-        )
+        self.user_tools = tools
+        self.verbose = verbose
 
     def _initialize_tools_and_agent(self, user_input=None):
         """Retrieve tools and initialize the agent."""
@@ -83,6 +79,7 @@
             self.tools = get_tools(
                 query=user_input,
                 llm=self.tools_llm,
+                top_k_tools=self.top_k_tools,
                 human=self.use_human_tool,
             )
         else:
@@ -97,6 +94,7 @@
                 self.llm,
                 self.tools,
             ),
+            verbose=self.verbose,
             handle_parsing_errors=True,
         )
 
@@ -107,7 +105,7 @@ def run(self, user_input, callbacks=None):
         elif self.agent_type == "OpenAIFunctionsAgent":
             self.prompt = openaifxn_prompt.format(input=user_input, context=run_memory)
         self.agent = self._initialize_tools_and_agent(user_input)
-        model_output = self.agent.run(self.prompt, callbacks=callbacks)
+        model_output = self.agent.invoke(self.prompt, callbacks=callbacks)
         if self.use_memory:
             self.memory.generate_agent_summary(model_output)
         print("Your run id is: ", self.run_id)
diff --git a/mdagent/agent/memory.py b/mdagent/agent/memory.py
index 06b98c53..f450e47c 100644
--- a/mdagent/agent/memory.py
+++ b/mdagent/agent/memory.py
@@ -3,10 +3,8 @@
 import random
 import string
 
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.chains import LLMChain
-from langchain.chat_models import ChatOpenAI
 from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
 
 from mdagent.utils import PathRegistry
 
@@ -32,8 +30,7 @@ class MemoryManager:
     def __init__(
         self,
         path_registry: PathRegistry,
-        model="gpt-3.5-turbo",
-        temp=0.1,
+        llm,
run_id="", ): self.path_registry = path_registry @@ -46,14 +43,7 @@ def __init__( else: pull_mem = True - llm = ChatOpenAI( - temperature=temp, - model=model, - client=None, - streaming=True, - callbacks=[StreamingStdOutCallbackHandler()], - ) - self.llm_agent_trace = LLMChain(llm=llm, prompt=agent_summary_template) + self.llm_agent_trace = agent_summary_template | llm | StrOutputParser() self._make_all_dirs() if pull_mem: @@ -138,7 +128,7 @@ def generate_agent_summary(self, agent_trace): Returns: - None """ - llm_out = self.llm_agent_trace({"agent_trace": agent_trace})["text"] + llm_out = self.llm_agent_trace.invoke({"agent_trace": agent_trace}) key_str = f"{self.run_id}.{self.get_summary_number()}" run_summary = {key_str: llm_out} self._write_to_json(run_summary, self.agent_trace_summary) diff --git a/mdagent/agent/prompt.py b/mdagent/agent/prompt.py index 53983f65..dbfbd669 100644 --- a/mdagent/agent/prompt.py +++ b/mdagent/agent/prompt.py @@ -3,30 +3,35 @@ structured_prompt = PromptTemplate( input_variables=["input, context"], template=""" - You are an expert molecular dynamics scientist and + You are an expert molecular dynamics scientist, and your task is to respond to the question or solve the problem to the best of your ability using the provided tools. You can only respond with a single complete - Thought, Action, Action Input' format + 'Thought, Action, Action Input' format OR a single 'Final Answer' format. Complete format: - Thought: (reflect on your progress and decide what " "to do next) - Action: (the action name, should be the name of a tool) - Action Input: (the input string to the action) + Thought: (reflect on your progress and decide what to do next) + Action: + ``` + {{ + "action": (the action name, it should be the name of a tool), + "action_input": (the input string for the action) + }} + ''' OR Final Answer: (the final response to the original input - question, when all steps are complete) + question, once all steps are complete) You are required to use the tools provided, using the most specific tool available for each action. Your final answer should contain all information - necessary to answer the question and subquestions. + necessary to answer the question and its subquestions. Before you finish, reflect on your progress and make sure you have addressed the question in its entirety. @@ -41,81 +46,10 @@ Question: {input} """, ) - -modular_analysis_prompt = PromptTemplate( - input_variables=[ - "Main_Task", - "Subtask_types", - "Proteins", - "Parameters", - "UserProposedPlan", - "context", - ], - template=""" - Approach the molecular dynamics inquiry by dissecting it into its modular - components: - Main Task: {Main_Task} - Subtasks: {Subtask_types} - Target Proteins: {Proteins} - Parameters: {Parameters} - Initial Plan Proposed by User: {UserProposedPlan} - - The Main Task is the user's request. - - The Subtasks are (some of/all) the individual steps that may need to be taken - to complete the Main Task; Preprocessing/Preparation usually involves - cleaning the initial pdb file (adding hydrogens, removing/adding water, etc.) - or making the required box for the simulation, Simulation involves running the - simulation and/or modifying the simulation script, Postprocessing involves - analyzing the results of the simulation (either using provided tools or figuring - it out on your own). Finally, Question is used if the user query is more - of a question than a request for a specific task. 
-
-    the Target Proteins are the protein(s) that the user wants to focus on,
-    the Parameters are the 'special' conditions that the user wants to set and use
-    for the simulation, preprocessing and or analysis.
-
-    Sometimes users already have an idea of what is needed to be done.
-    Initial Plan Proposed by User is the user's initial plan for the simulation. You
-    can use this as a guide to understand what the user wants to do. You can also
-    modify it if you think is necessary.
-
-    You can only respond with a single complete
-    'Thought, Action, Action Input' format
-    OR a single 'Final Answer' format.
-
-    Complete format:
-    Thought: (reflect on your progress and decide what " "to do next)
-    Action: (the action name, should be the name of a tool)
-    Action Input: (the input string to the action)
-
-    OR
-
-    Final Answer: (the final answer to the original input
-    question)
-
-    Use the tools provided, using the most specific tool
-    available for each action.
-    Your final answer should contain all information
-    necessary to answer the question and subquestions.
-    Your thought process should be clean and clear,
-    and you must explicitly state the actions you are taking.
-
-    If you are asked to continue
-    or reference previous runs,
-    the context will be provided to you.
-    If context is provided, you should assume
-    you are continuing a chat.
-
-    Here is the input:
-    Previous Context: {context}
-    """,
-)
-
 openaifxn_prompt = PromptTemplate(
     input_variables=["input", "context"],
     template="""
-    You are an expert molecular dynamics scientist and your
+    You are an expert molecular dynamics scientist, and your
     task is to respond to the question or solve the problem
     to the best of your ability using the provided tools.
     Once you map a path to a short name,
diff --git a/mdagent/tools/base_tools/__init__.py b/mdagent/tools/base_tools/__init__.py
index 4482997e..dab5d3fc 100644
--- a/mdagent/tools/base_tools/__init__.py
+++ b/mdagent/tools/base_tools/__init__.py
@@ -44,11 +44,7 @@
     UniprotID2Name,
 )
 from .simulation_tools.create_simulation import ModifyBaseSimulationScriptTool
-from .simulation_tools.setup_and_run import (
-    SetUpandRunFunction,
-    SetUpAndRunTool,
-    SimulationFunctions,
-)
+from .simulation_tools.setup_and_run import SetUpandRunFunction
 from .util_tools.git_issues_tool import SerpGitTool
 from .util_tools.registry_tools import ListRegistryPaths, MapPath2Name
 from .util_tools.search_tools import Scholar2ResultLLM
@@ -92,9 +88,7 @@
     "RMSDCalculator",
     "Scholar2ResultLLM",
     "SerpGitTool",
-    "SetUpAndRunTool",
     "SetUpandRunFunction",
-    "SimulationFunctions",
     "SimulationOutputFigures",
     "SmallMolPDB",
     "UniprotID2Name",
diff --git a/mdagent/tools/base_tools/preprocess_tools/pdb_get.py b/mdagent/tools/base_tools/preprocess_tools/pdb_get.py
index 80d37f5c..675390f0 100644
--- a/mdagent/tools/base_tools/preprocess_tools/pdb_get.py
+++ b/mdagent/tools/base_tools/preprocess_tools/pdb_get.py
@@ -27,7 +27,7 @@ def get_pdb(query_string: str, path_registry: PathRegistry):
     }
     r = requests.post(url, json=query)
     if r.status_code == 204:
-        return None
+        return None, None
     if "cif" in query_string or "CIF" in query_string:
         filetype = "cif"
     else:
@@ -57,7 +57,7 @@ def get_pdb(query_string: str, path_registry: PathRegistry):
         )
         return filename, file_id
 
-    return None
+    return None, None
 
 
 class ProteinName2PDBTool(BaseTool):
diff --git a/mdagent/tools/base_tools/simulation_tools/__init__.py b/mdagent/tools/base_tools/simulation_tools/__init__.py
index 56933956..0022cf88 100644
--- a/mdagent/tools/base_tools/simulation_tools/__init__.py
+++ b/mdagent/tools/base_tools/simulation_tools/__init__.py
@@ -1,9 +1,7 @@
 from .create_simulation import ModifyBaseSimulationScriptTool
-from .setup_and_run import SetUpandRunFunction, SetUpAndRunTool, SimulationFunctions
+from .setup_and_run import SetUpandRunFunction
 
 __all__ = [
     "ModifyBaseSimulationScriptTool",
     "SetUpandRunFunction",
-    "SetUpAndRunTool",
-    "SimulationFunctions",
 ]
diff --git a/mdagent/tools/base_tools/simulation_tools/create_simulation.py b/mdagent/tools/base_tools/simulation_tools/create_simulation.py
index 8fdaf918..51ce6e36 100644
--- a/mdagent/tools/base_tools/simulation_tools/create_simulation.py
+++ b/mdagent/tools/base_tools/simulation_tools/create_simulation.py
@@ -2,9 +2,9 @@
 from typing import Optional
 
 from langchain.base_language import BaseLanguageModel
-from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain.tools import BaseTool
+from langchain_core.output_parsers import StrOutputParser
 from pydantic import BaseModel, Field
 
 from mdagent.utils import FileType, PathRegistry
@@ -48,7 +48,7 @@ def _prompt_summary(self, query: str):
 
         prompt = PromptTemplate(
             template=prompt_template, input_variables=["base_script", "query"]
         )
-        llm_chain = LLMChain(prompt=prompt, llm=self.llm)
+        llm_chain = prompt | self.llm | StrOutputParser()
         return llm_chain.invoke(query)
 
diff --git a/mdagent/tools/base_tools/simulation_tools/setup_and_run.py b/mdagent/tools/base_tools/simulation_tools/setup_and_run.py
index 9c926ab6..413ce5e1 100644
--- a/mdagent/tools/base_tools/simulation_tools/setup_and_run.py
+++ b/mdagent/tools/base_tools/simulation_tools/setup_and_run.py
@@ -1,6 +1,4 @@
 # Standard Library Imports
-import ast
-import json
 import os
 import re
@@ -8,15 +6,11 @@
 import textwrap
 from typing import Any, Dict, List, Optional, Type
 
-import langchain
 import requests
 import streamlit as st
-from langchain.chains import LLMChain
-from langchain.prompts import PromptTemplate
 from langchain.tools import BaseTool
 from openff.toolkit.topology import Molecule
 from openmm import (
-    AndersenThermostat,
     BrownianIntegrator,
     LangevinIntegrator,
     LangevinMiddleIntegrator,
@@ -24,7 +18,6 @@
     OpenMMException,
     Platform,
     VerletIntegrator,
-    app,
     unit,
 )
 from openmm.app import (
@@ -45,7 +38,7 @@
     Simulation,
     StateDataReporter,
 )
-from openmm.unit import bar, femtoseconds, kelvin, nanometers, picosecond, picoseconds
+from openmm.unit import bar, kelvin, nanometers, picoseconds
 from openmmforcefields.generators import SMIRNOFFTemplateGenerator
 from pydantic import BaseModel, Field
 from rdkit import Chem
@@ -110,430 +103,6 @@
 ]
 
 
-class SimulationFunctions:
-    def __init__(
-        self,
-        path_registry,
-        temperature: float = 0.05,
-        model_name: str = "gpt-4",
-        request_timeout: int = 1000,
-        max_tokens: int = 2000,
-    ):
-        self.path_registry = path_registry
-        self.temperature = temperature
-        self.model_name = model_name
-        self.request_timeout = request_timeout
-        self.max_tokens = max_tokens
-
-        self.llm = langchain.chat_models.ChatOpenAI(
-            temperature=self.temperature,
-            model_name=self.model_name,
-            request_timeout=self.request_timeout,
-            max_tokens=self.request_timeout,
-        )
-
-    #######==================System Congifuration==================########
-    # System Configuration initialization.
-
-    def _create_system(
-        pdb,
-        forcefield,
-        nonbondedMethod="NoCutoff",
-        nonbondedCutoff=None,
-        ewaldErrorTolerance=None,
-        constraints="None",
-        rigidWater=False,
-        constraintTolerance=None,
-        **kwargs,
-    ):
-        # Create a dictionary to hold system parameters
-        system_params = {
-            "nonbondedMethod": nonbondedMethod,
-            "constraints": constraints,
-            "rigidWater": rigidWater,
-        }
-
-        # Set nonbondedCutoff if applicable
-        if (
-            nonbondedMethod in ["PME", "CutoffNonPeriodic", "CutoffPeriodic"]
-            and nonbondedCutoff is not None
-        ):
-            system_params["nonbondedCutoff"] = nonbondedCutoff
-
-        # Set ewaldErrorTolerance if PME is used
-        if nonbondedMethod == "PME" and ewaldErrorTolerance is not None:
-            system_params["ewaldErrorTolerance"] = ewaldErrorTolerance
-
-        # Set constraintTolerance if constraints are used
-        if constraints in ["HBonds", " AllBonds"] and constraintTolerance is not None:
-            system_params["constraintTolerance"] = constraintTolerance
-        elif system_params["rigidWater"] and constraintTolerance is not None:
-            system_params["constraintTolerance"] = constraintTolerance
-
-        # Update system_params with any additional parameters provided
-        system_params.update(kwargs)
-        system = forcefield.createSystem(pdb.topology, **system_params)
-        return system
-
-    ########==================Integrator==================########
-    # Integrator
-    def _define_integrator(
-        integrator_type="LangevinMiddle",
-        temperature=300 * kelvin,
-        friction=1.0 / picoseconds,
-        timestep=0.004 * picoseconds,
-        **kwargs,
-    ):
-        # Create a dictionary to hold integrator parameters
-        integrator_params = {
-            "temperature": temperature,
-            "friction": friction,
-            "timestep": timestep,
-        }
-
-        # Update integrator_params with any additional parameters provided
-        integrator_params.update(kwargs)
-
-        # Create the integrator
-        if integrator_type == "LangevinMiddle":
-            integrator = LangevinMiddleIntegrator(**integrator_params)
-        elif integrator_type == "Verlet":
-            integrator = VerletIntegrator(**integrator_params)
-        elif integrator_type == "Brownian":
-            integrator = BrownianIntegrator(**integrator_params)
-        else:
-            raise Exception("Integrator type not recognized")
-
-        return integrator
-
-    def _prompt_summary(self, query: str):
-        prompt_template = """Your input is the original query. Your
-        task is to parse through the user query.
-        and provide a summary of the file path input,
-        the type of preprocessing needed (this is the
-        same as cleaning the file), the forcefield
-        used for the simulation,
-        the ensemble of the simulation, the integrator needed,
-        the number of steps, the timestep, the temperature,
-        and other instructions.
-        and follow the format "name: description.
-
-        File Path: what is the file path of the file
-        you are using? it must include a .cif or .pdb extension.
-        Preprocessing: what preprocessing is needed?
-        you can choose from the following: standard cleaning,
-        remove water, add hydrogens, add hydrogens and remove
-        water. The default is add hydrogens and remove water.
-        Forcefield: what forcefields are you using?
-        you can choose from the following: AMBER, CHARMM,
-        OPLS, GROMACS. Default --> "amber14-all.xml, tip3p.xml".
-        Ensemble: what ensemble are you using?
-        you can choose from the following:
-        NPT, NVT, NVE. Default --> "NVT".
-        Integrator: what integrator are you using?
-        you can choose from the following:
-        Langevin, Verlet, Brownian.
-        The default depends on the ensemble
-        (NPT -> Langevin, NVT -> Langevin, NVE -> Verlet).
-        Number of Steps: how many steps
-        are you using? The default is 10000.
-        Timestep: what is the timestep?
-        Default --> "1 fs".
-        Temperature: what is the temperature?
-        Default --> "300 K".
-        Pressure: What is the pressure?
-        If NPT ensemble, the default is 1.0 bar, otherwise None.
-        Friction: what is the friction coefficient?
-        Default --> "1.0"
-        record_params: what parameters do you want to record?
-        you can choose from the following:
-        step, time, potentialEnergy, kineticEnergy,
-        totalEnergy, temperature, volume, density,
-        progress, remainingTime, speed, elapsedTime,
-        separator, systemMass, totalSteps, append.
-        Default --> ["step", "potentialEnergy", "temperature"].
-        Other Instructions: what other instructions do you have?
-        The default is none.
-        Example of the final output:
-        File Path: 1a1p.pdb
-        Preprocessing: standard cleaning
-        Forcefield: amber14-all.xml, tip3p.xml
-        Ensemble: NPT
-        Integrator: Langevin
-        Number of Steps: 10000
-        Timestep: 1 fs
-        Temperature: 300 K
-        Pressure: 1.0 bar
-        Friction: 1.0
-        record_params: ["step", "potentialEnergy", "temperature"]
-        Other Instructions: none
-        If there is not enough information in a category,
-        you may fill in with the default, but explicitly state so.
-        Here is the information:{query}"""
-        prompt = PromptTemplate(template=prompt_template, input_variables=["query"])
-        llm_chain = LLMChain(prompt=prompt, llm=self.llm)
-
-        return llm_chain.run(" ".join(query))
-
-    def _save_to_file(self, summary: str, filename: str):
-        """Parse the summary string and
-        save it to a file in JSON format."""
-        # Split the summary into lines
-        lines = summary.strip().split("\n")
-
-        # Parse each line into a key and a value
-        summary_dict = {}
-        for line in lines:
-            key, value = line.split(":")
-            summary_dict[key.strip()] = value.strip()
-
-        # Save the dictionary to a file
-        with open(filename, "w") as f:
-            json.dump(summary_dict, f)
-
-        # add filename to registry
-        file_description = "Simulation Parameters"
-        self.path_registry.map_path(filename, filename, file_description)
-
-    def _instruction_summary(self, query: str):
-        summary = self._prompt_summary(query)
-        self._save_to_file(
-            summary, f"{self.path_registry.ckpt_files}/simulation_parameters.json"
-        )
-        return summary
-
-    def _setup_simulation_from_json(self, file_name):
-        # Open the json file and load the parameters
-        with open(file_name, "r") as f:
-            params = json.load(f)
-        return params
-
-    def _setup_and_run_simulation(self, query):
-        # Load the force field
-        # ask for inputs from the user
-        params = self._setup_simulation_from_json(query)
-
-        # forcefield key can be forcefield_files or Forcefield
-        if "forcefield_files" in params:
-            params["forcefield_files"] = (
-                params["forcefield_files"]
-                .replace("(default)", "")
-                .replace(" and ", ",")
-                .strip()
-            )
-            Forcefield_files = [
-                file.strip() for file in params["forcefield_files"].split(",")
-            ]
-            Forcefield = Forcefield_files[0]
-            Water_model = Forcefield_files[1]
-        else:
-            params["Forcefield"] = (
-                params["Forcefield"]
-                .replace("(default)", "")
-                .replace(" and ", ",")
-                .strip()
-            )
-            Forcefield_files = [
-                file.strip() for file in params["Forcefield"].split(",")
-            ]
-            Forcefield = Forcefield_files[0]
-            Water_model = Forcefield_files[1]
-        print("Setting up forcefields :", Forcefield, Water_model)
-        st.markdown("Setting up forcefields", unsafe_allow_html=True)
-        # check if forcefields end in .xml
-        if Forcefield.endswith(".xml") and Water_model.endswith(".xml"):
-            forcefield = ForceField(Forcefield, Water_model)
-        # adding forcefield to registry
-
-        # Load the PDB file
-        pdbfile = self.path_registry.get_mapped_path(params["File Path"])
-        name = pdbfile.split(".")[0]
-        end = pdbfile.split(".")[1]
-        if end == "pdb":
-            pdb = PDBFile(pdbfile)
-        elif end == "cif":
-            pdb = PDBxFile(pdbfile)
-
-        modeller = Modeller(pdb.topology, pdb.positions)
-        system = forcefield.createSystem(
-            modeller.topology,
-            nonbondedMethod=app.PME,
-            nonbondedCutoff=1.0 * nanometers,
-            constraints=app.PME,
-        )
-
-        _integrator = params["Integrator"].split(" ")[0].strip()
-        _temp = params["Temperature"].split(" ")[0].strip()
-        _friction_coef = params["Friction"].split(" ")[0].strip()
-        _timestep = params["Timestep"].split(" ")[0].strip()
-
-        if _integrator == "Langevin":
-            print(
-                "Setting up Langevin integrator with Parameters:",
-                _temp,
-                "K",
-                _friction_coef,
-                "1/ps",
-                _timestep,
-                "fs",
-            )
-            st.markdown("Setting up Langevin integrator", unsafe_allow_html=True)
-            if params["Ensemble"] == "NPT":
-                _pressure = params["Pressure"].split(" ")[0].strip()
-                system.addForce(MonteCarloBarostat(_pressure * bar, _temp * kelvin))
-            integrator = LangevinIntegrator(
-                float(_temp) * kelvin,
-                float(_friction_coef) / picosecond,
-                float(_timestep) * femtoseconds,
-            )
-        elif _integrator == "Verlet":
-            if params["Ensemble"] == "NPT":
-                _pressure = params["Pressure"].split(" ")[0].strip()
-                system.addForce(AndersenThermostat(_temp * kelvin, 1 / picosecond))
-                system.addForce(MonteCarloBarostat(_pressure * bar, _temp * kelvin))
-                print(
-                    "Setting up Verlet integrator with Parameters:",
-                    _timestep,
-                    "fs",
-                    _temp,
-                    "K",
-                    _pressure,
-                    "bar",
-                )
-            print("Setting up Verlet integrator with Parameters:", _timestep, "fs")
-            st.markdown("Setting up Verlet integrator", unsafe_allow_html=True)
-            integrator = VerletIntegrator(float(_timestep) * picoseconds)
-
-        simulation = Simulation(modeller.topology, system, integrator)
-        simulation.context.setPositions(modeller.positions)
-        simulation.minimizeEnergy()
-        # save initial positions to registry
-        file_name = "initial_positions.pdb"
-        with open(file_name, "w") as f:
-            PDBFile.writeFile(
-                simulation.topology,
-                simulation.context.getState(getPositions=True).getPositions(),
-                f,
-            )
-        print("Initial Positions saved to initial_positions.pdb")
-        simulation.reporters.append(PDBReporter(f"{name}.pdb", 1000))
-        # reporter_args = {"reportInterval": 1000}
-        reporter_args = {}
-        params["record_params"] = ast.literal_eval(params["record_params"])
-        for param in params["record_params"]:
-            if param in [
-                "step",
-                "time",
-                "potentialEnergy",
-                "kineticEnergy",
-                "totalEnergy",
-                "temperature",
-                "volume",
-                "density",
-                "progress",
-                "remainingTime",
-                "speed",
-                "elapsedTime",
-                "separator",
-                "systemMass",
-                "totalSteps",
-                "append",
-            ]:
-                # The params from the json file should be booleans
-                reporter_args[param] = True
-        simulation.reporters.append(
-            StateDataReporter(f"{name}.csv", 1000, **reporter_args)
-        )
-
-        simulation.step(int(params["Number of Steps"].split(" ")[0].strip()))
-
-        # add filenames to registry
-        file_name1 = "simulation_trajectory.pdb"
-        file_description1 = "Simulation PDB, containing the simulation trajectory"
-        self.path_registry.map_path(file_name1, f"{name}.pdb", file_description1)
-        file_name2 = "simulation_data.csv"
-        file_description2 = (
-            "Simulation Data, containing step, potential energy, and temperature"
-        )
-        self.path_registry.map_path(file_name2, f"{name}.csv", file_description2)
-
-        return simulation
-
-    def _extract_parameters_path(self):
-        """Check directory for parameters.json file."""
-        # Check if there is a parameters.json file in the directory.
-        if os.path.exists("simulation_parameters_summary.json"):
-            return "simulation_parameters_summary.json"
-        # If there's no exact match, check for
-        # any JSON file that contains 'parameters' in its name.
-        else:
-            for file in os.listdir("."):
-                if "parameters" in file and file.endswith(".json"):
-                    return file
-        # If no matching file is found, raise an exception.
-        raise ValueError("No parameters.json file found in directory.")
-
-
-class SetUpAndRunTool(BaseTool):
-    name = "SetUpAndRunTool"
-    description = """This tool will set up the simulation objects
-    and run the simulation.
-    It will ask for the parameters path.
-    input: json file
-    """
-    path_registry: Optional[PathRegistry]
-
-    def __init__(
-        self,
-        path_registry: Optional[PathRegistry],
-    ):
-        super().__init__()
-        self.path_registry = path_registry
-
-    def _run(self, query: str) -> str:
-        """Use the tool"""
-        # find the parameters in the directory
-        try:
-            if self.path_registry is None:  # this should not happen
-                return "Registry not initialized"
-            sim_fxns = SimulationFunctions(path_registry=self.path_registry)
-            parameters = sim_fxns._extract_parameters_path()
-
-        except ValueError as e:
-            return (
-                str(e)
-                + """\nPlease use the Instruction summary tool with the
-                to create a parameters.json file in the directory."""
-            )
-        self.log("This are the parameters:")
-        self.log(parameters)
-        # print the parameters in json file
-        with open(parameters) as f:
-            params = json.load(f)
-        for key, value in params.items():
-            print(key, ":", value)
-
-        self.log("Are you sure you want to run the simulation? (y/n)")
-        response = input("yes or no: ")
-        if response.lower() in ["yes", "y"]:
-            sim_fxns._setup_and_run_simulation(parameters)
-        else:
-            return "Simulation interrupted due to human input"
-        return "Simulation Completed, simulation trajectory and data files saved."
-
-    def log(self, text, color="blue"):
-        if color == "blue":
-            print("\033[1;34m\t{}\033[00m".format(text))
-        if color == "red":
-            print("\033[31m\t{}\033[00m".format(text))
-
-    async def _arun(self, query: str) -> str:
-        """Use the tool asynchronously."""
-        raise NotImplementedError("custom_search does not support async")
-
-
-#######==================System Configuration==================########
-# System Configuration
 class SetUpandRunFunctionInput(BaseModel):
     pdb_id: str
     forcefield_files: List[str]
diff --git a/mdagent/tools/base_tools/util_tools/git_issues_tool.py b/mdagent/tools/base_tools/util_tools/git_issues_tool.py
index 8ce86f1d..1feb0852 100644
--- a/mdagent/tools/base_tools/util_tools/git_issues_tool.py
+++ b/mdagent/tools/base_tools/util_tools/git_issues_tool.py
@@ -2,27 +2,17 @@
 import requests
 import tiktoken
-from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain.tools import BaseTool
+from langchain_core.output_parsers import StrOutputParser
 from serpapi import GoogleSearch
 
-from mdagent.utils import _make_llm
-
 
 class GitToolFunctions:
     """Class to store the functions of the tool."""
 
-    def __init__(
-        self,
-        model: str = "gpt-3.5-turbo-16k",
-        temp: float = 0.05,
-        verbose: bool = False,
-    ):
-        self.model = model
-        self.temp = temp
-        self.verbose = verbose
-        self.llm = _make_llm(model=self.model, temp=self.temp, verbose=self.verbose)
+    def __init__(self, llm):
+        self.llm = llm
 
     def _prompt_summary(self, query: str, output: str):
         prompt_template = """You're receiving the following github issues and comments.
@@ -54,9 +44,9 @@ def _prompt_summary(self, query: str, output: str):
         prompt = PromptTemplate(
             template=prompt_template, input_variables=["query", "output"]
         )
-        llm_chain = LLMChain(prompt=prompt, llm=self.llm)
+        llm_chain = prompt | self.llm | StrOutputParser()
 
-        return llm_chain.run({"query": query, "output": output})
+        return llm_chain.invoke({"query": query, "output": output})
 
     """Function to get the number of requests remaining for the Github API
     """
@@ -80,12 +70,13 @@ class SerpGitTool(BaseTool):
     Input:
     """
     serp_key: Optional[str]
 
-    def __init__(self, serp_key):
+    def __init__(self, serp_key, llm):
         super().__init__()
         self.serp_key = serp_key
+        self.llm = llm
 
     def _run(self, query: str):
-        fxns = GitToolFunctions()
+        fxns = GitToolFunctions(self.llm)
         # print("this is the key", self.serp_key)
         params = {
             "engine": "google",
diff --git a/mdagent/tools/base_tools/util_tools/search_tools.py b/mdagent/tools/base_tools/util_tools/search_tools.py
index 003c8847..1015d69d 100644
--- a/mdagent/tools/base_tools/util_tools/search_tools.py
+++ b/mdagent/tools/base_tools/util_tools/search_tools.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import re
 from typing import Optional
@@ -8,11 +9,22 @@
 import paperscraper
 from langchain.base_language import BaseLanguageModel
 from langchain.tools import BaseTool
+from langchain_core.output_parsers import StrOutputParser
 from pypdf.errors import PdfReadError
 
 from mdagent.utils import PathRegistry
 
 
+def configure_logging(path):
+    # to log all runtime errors from paperscraper, which can be VERY noisy
+    log_file = os.path.join(path, "scraping_errors.log")
+    logging.basicConfig(
+        filename=log_file,
+        level=logging.ERROR,
+        format="%(asctime)s:%(levelname)s:%(message)s",
+    )
+
+
 def paper_scraper(search: str, pdir: str = "query") -> dict:
     try:
         return paperscraper.search_papers(search, pdir=pdir)
@@ -32,10 +44,11 @@ def paper_search(llm, query, path_registry):
     )
 
     path = f"{path_registry.ckpt_files}/query"
-    query_chain = langchain.chains.llm.LLMChain(llm=llm, prompt=prompt)
+    query_chain = prompt | llm | StrOutputParser()
     if not os.path.isdir(path):
         os.mkdir(path)
-    search = query_chain.run(query)
+    configure_logging(path)
+    search = query_chain.invoke(query)
     print("\nSearch:", search)
     papers = paper_scraper(search, pdir=f"{path}/{re.sub(' ', '', search)}")
     return papers
@@ -44,10 +57,14 @@
 def scholar2result_llm(llm, query, path_registry, k=5, max_sources=2):
     """Useful to answer questions that require
     technical knowledge. Ask a specific question."""
+    if llm.model_name.startswith("gpt"):
+        docs = paperqa.Docs(llm=llm.model_name)
+    else:
+        docs = paperqa.Docs()  # uses default gpt model in paperqa
+
     papers = paper_search(llm, query, path_registry)
     if len(papers) == 0:
         return "Failed. Not enough papers found"
Not enough papers found" - docs = paperqa.Docs(llm=llm.model_name) not_loaded = 0 for path, data in papers.items(): try: diff --git a/mdagent/tools/maketools.py b/mdagent/tools/maketools.py index 5e373f94..ab985d43 100644 --- a/mdagent/tools/maketools.py +++ b/mdagent/tools/maketools.py @@ -1,9 +1,11 @@ +import os + import streamlit as st from dotenv import load_dotenv from langchain import agents from langchain.base_language import BaseLanguageModel -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.vectorstores import Chroma +from langchain_chroma import Chroma +from langchain_openai import OpenAIEmbeddings from mdagent.utils import PathRegistry @@ -70,8 +72,9 @@ def make_all_tools( # all_tools += [PythonREPLTool()] all_tools += [ ModifyBaseSimulationScriptTool(path_registry=path_instance, llm=llm), - Scholar2ResultLLM(llm=llm, path_registry=path_instance), ] + if "OPENAI_API_KEY" in os.environ: + all_tools += [Scholar2ResultLLM(llm=llm, path_registry=path_instance)] if human: all_tools += [agents.load_tools(["human"], llm)[0]] @@ -151,7 +154,6 @@ def get_tools( ids=[tool.name], metadatas=[{"tool_name": tool.name, "index": i}], ) - vectordb.persist() # retrieve 'k' tools k = min(top_k_tools, vectordb._collection.count()) diff --git a/mdagent/utils/makellm.py b/mdagent/utils/makellm.py index 82bd4803..9eaf6738 100644 --- a/mdagent/utils/makellm.py +++ b/mdagent/utils/makellm.py @@ -1,23 +1,47 @@ -import langchain from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -def _make_llm(model, temp, verbose): +def _make_llm(model, temp, streaming): if model.startswith("gpt-3.5-turbo") or model.startswith("gpt-4"): - llm = langchain.chat_models.ChatOpenAI( + from langchain_openai import ChatOpenAI + + llm = ChatOpenAI( + temperature=temp, + model_name=model, + request_timeout=1000, + streaming=streaming, + callbacks=[StreamingStdOutCallbackHandler()] if streaming else None, + ) + elif model.startswith("accounts/fireworks"): + from langchain_fireworks import ChatFireworks + + llm = ChatFireworks( temperature=temp, model_name=model, request_timeout=1000, - streaming=True if verbose else False, - callbacks=[StreamingStdOutCallbackHandler()] if verbose else None, + streaming=streaming, + callbacks=[StreamingStdOutCallbackHandler()] if streaming else None, ) - elif model.startswith("text-"): - llm = langchain.OpenAI( + elif model.startswith("together/"): + # user needs to add 'together/' prefix to use TogetherAI provider + from langchain_together import ChatTogether + + llm = ChatTogether( + temperature=temp, + model=model.replace("together/", ""), + request_timeout=1000, + streaming=streaming, + callbacks=[StreamingStdOutCallbackHandler()] if streaming else None, + ) + elif model.startswith("claude"): + from langchain_anthropic import ChatAnthropic + + llm = ChatAnthropic( temperature=temp, model_name=model, - streaming=True if verbose else False, - callbacks=[StreamingStdOutCallbackHandler()] if verbose else None, + streaming=streaming, + callbacks=[StreamingStdOutCallbackHandler()] if streaming else None, ) else: - raise ValueError(f"Invalid model name: {model}") + raise ValueError(f"Unrecognized or unsupported model name: {model}") return llm diff --git a/setup.py b/setup.py index f94b4fd6..cba4e7f5 100644 --- a/setup.py +++ b/setup.py @@ -17,10 +17,15 @@ license="MIT", packages=find_packages(), install_requires=[ - "chromadb==0.4.24", + "chromadb", "google-search-results", - "langchain==0.0.336", - "langchain_experimental", + "langchain==0.2.12", 
+ "langchain-anthropic==0.1.22", + "langchain-chroma", + "langchain-community", + "langchain-fireworks==0.1.7", + "langchain-openai==0.1.19", + "langchain-together==0.1.4", "matplotlib", "nbformat", "openai", diff --git a/tests/test_general_tools/test_search_tools.py b/tests/test_general_tools/test_search_tools.py index 9e28d201..e457ee6e 100644 --- a/tests/test_general_tools/test_search_tools.py +++ b/tests/test_general_tools/test_search_tools.py @@ -1,5 +1,5 @@ import pytest -from langchain.chat_models import ChatOpenAI +from langchain_openai import ChatOpenAI from mdagent.tools.base_tools import Scholar2ResultLLM diff --git a/tests/test_preprocess/test_uniprot.py b/tests/test_preprocess/test_uniprot.py index f3eaba30..370dda9b 100644 --- a/tests/test_preprocess/test_uniprot.py +++ b/tests/test_preprocess/test_uniprot.py @@ -504,6 +504,7 @@ def test_get_ids(query_uniprot): "P02091", ] all_ids = query_uniprot.get_ids("hemoglobin") + print(all_ids) single_id = query_uniprot.get_ids("hemoglobin", single_id=True) assert single_id[0] in hg_ids assert len(single_id) == 1 diff --git a/tests/test_sim/test_simulation_tools.py b/tests/test_sim/test_simulation_tools.py deleted file mode 100644 index 5b8a7cc6..00000000 --- a/tests/test_sim/test_simulation_tools.py +++ /dev/null @@ -1,46 +0,0 @@ -from unittest.mock import mock_open, patch - -import pytest - -from mdagent.tools.base_tools import SimulationFunctions - - -@pytest.fixture -def sim_fxns(get_registry): - return SimulationFunctions(get_registry("raw", False)) - - -@patch("os.path.exists") -@patch("os.listdir") -def test_extract_parameters_path(mock_listdir, mock_exists, sim_fxns): - # Test when parameters.json exists - mock_exists.return_value = True - assert sim_fxns._extract_parameters_path() == "simulation_parameters_summary.json" - mock_exists.assert_called_once_with("simulation_parameters_summary.json") - mock_exists.reset_mock() # Reset the mock for the next scenario - - # Test when parameters.json does not exist, but some_parameters.json does - mock_exists.return_value = False - mock_listdir.return_value = ["some_parameters.json", "other_file.txt"] - assert sim_fxns._extract_parameters_path() == "some_parameters.json" - - # Test when no appropriate file exists - mock_listdir.return_value = ["other_file.json", "other_file.txt"] - with pytest.raises(ValueError) as e: - sim_fxns._extract_parameters_path() - assert str(e.value) == "No parameters.json file found in directory." 
-
-
-@patch(
-    "builtins.open",
-    new_callable=mock_open,
-    read_data='{"param1": "value1", "param2": "value2"}',
-)
-@patch("json.load")
-def test_setup_simulation_from_json(mock_json_load, mock_file_open, sim_fxns):
-    # Define the mock behavior for json.load
-    mock_json_load.return_value = {"param1": "value1", "param2": "value2"}
-    params = sim_fxns._setup_simulation_from_json("test_file.json")
-    mock_file_open.assert_called_once_with("test_file.json", "r")
-    mock_json_load.assert_called_once()
-    assert params == {"param1": "value1", "param2": "value2"}
diff --git a/tests/test_utils/test_memory.py b/tests/test_utils/test_memory.py
index 0f7d7199..1f233000 100644
--- a/tests/test_utils/test_memory.py
+++ b/tests/test_utils/test_memory.py
@@ -2,6 +2,7 @@
 import os
 
 import pytest
+from langchain_openai import ChatOpenAI
 
 from mdagent.agent.agent import MDAgent
 from mdagent.agent.memory import MemoryManager
@@ -9,7 +10,8 @@
 
 @pytest.fixture
 def memory_manager(get_registry):
-    return MemoryManager(get_registry("raw", False))
+    llm = ChatOpenAI()
+    return MemoryManager(get_registry("raw", False), llm)
 
 
 def test_mdagent_memory():
@@ -26,11 +28,13 @@
 def test_memory_init(memory_manager, get_registry):
+    llm = ChatOpenAI()
+
     assert memory_manager is not None
     assert memory_manager.run_id is not None
     assert len(memory_manager.run_id) == 8
 
-    mm_path_id = MemoryManager(get_registry("raw", False), run_id="TESTRUNN")
+    mm_path_id = MemoryManager(get_registry("raw", False), llm, run_id="TESTRUNN")
     assert mm_path_id.run_id == "TESTRUNN"