Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alfoa/hybrid model for batching and ensemble model #2322

Merged
merged 36 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
1879ae4
moving ahead fixing hybrid model and ensemble model (and batching wit…
alfoa May 17, 2024
1b78cd1
added test
alfoa May 17, 2024
fafedbb
Update ravenframework/JobHandler.py
alfoa May 18, 2024
e8c57bb
identifiers
alfoa May 20, 2024
d18282a
Merge branch 'alfoa/hybrid_model_for_batching_and_ensemble_model' of …
alfoa May 20, 2024
b09eacc
Update ravenframework/Models/Model.py
alfoa May 20, 2024
7114f63
Added test
alfoa May 20, 2024
b131c8a
t push
alfoa May 20, 2024
2e94d8c
Apply suggestions from code review
alfoa May 20, 2024
a332bd0
Merge branch 'devel' into alfoa/hybrid_model_for_batching_and_ensembl…
alfoa May 20, 2024
648c163
updated addFinishedJob
alfoa May 20, 2024
135bcf7
identifier is added from the job runner
alfoa May 20, 2024
1084d1e
removed identifier factory...too many objects rely on reusing identif…
alfoa May 20, 2024
3269196
forgot model file
alfoa May 20, 2024
ab91508
inputs 'types' should be requested to the users and not guessed in th…
alfoa May 20, 2024
5230636
test local jobhandler in ensemble model
alfoa May 22, 2024
09d5b63
casting the batch solution from rlz into float is not necessary and c…
alfoa May 22, 2024
e648a1d
fix ensemble
alfoa Jun 24, 2024
d7cab4d
added option to run volume calc
alfoa Jun 26, 2024
2cf8dd2
addition of command separator option
alfoa Jun 26, 2024
9c4aede
updated serpent documentation
alfoa Jul 17, 2024
71bc89c
removed some comments
alfoa Jul 17, 2024
b25a176
added doc for command separator
alfoa Jul 17, 2024
7067497
addressed comments
alfoa Jul 17, 2024
40ad61b
jobhandler only if parallelstrategy is == 2
alfoa Jul 17, 2024
8d4a3f5
serpent
alfoa Jul 17, 2024
691547e
Merge branch 'devel' into alfoa/hybrid_model_for_batching_and_ensembl…
alfoa Jul 17, 2024
5527d23
fixed doc
alfoa Jul 18, 2024
c81ffea
Merge branch 'alfoa/hybrid_model_for_batching_and_ensemble_model' of …
alfoa Jul 18, 2024
76e91c3
Merge branch 'devel' into alfoa/hybrid_model_for_batching_and_ensembl…
alfoa Aug 26, 2024
64611a5
addressed Josh's and Congjians comments
alfoa Aug 26, 2024
5193b22
added property to set if an hybrid model/logical model or ensemble mo…
alfoa Aug 27, 2024
8113d33
Merge branch 'devel' into alfoa/hybrid_model_for_batching_and_ensembl…
alfoa Sep 11, 2024
31f466f
merge
alfoa Oct 3, 2024
bf652c5
makedir
alfoa Oct 3, 2024
751f751
fixed typo
alfoa Oct 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 76 additions & 2 deletions ravenframework/JobHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import copy
import sys
import threading
from random import randint
import socket
import re

Expand All @@ -47,6 +46,70 @@
# FIXME: Finished jobs can bog down the queue waiting for other objects to take
# them away. Can we shove them onto a different list and free up the job queue?

#class IdentifiersFactory(BaseType):
#"""
#Identifier Factory. This class contains the memory of identifiers used to execute
#JOBS in the job handler. The identifiers are removed from the Factory once out of
#scope (i.e. once the job is collected)
#"""
#def __init__(self, **kwargs):
#"""
#Constructor
#@ In, None
#@ Out, None
#"""
#super().__init__(**kwargs)
#self.__IDENTIFIERS_FACTORY = {} # {identifier:uniqueHandler}
#self.__counter = 0

#def __len__(self):
#"""
#length (number of identifiers)
#"""
#return len(self.__IDENTIFIERS_FACTORY)

#def addIdentifier(self, identifier: str, uniqueHandler: str | None) -> None:
#"""
#Add identifier in factory
#@ In, identifier, str, new identifier to add
#@ In, uniqueHandler, str, optional, the `uniqueHandler` if associated with this identifier
#@ Out, None
#"""
#if identifier in self.__IDENTIFIERS_FACTORY:
#self.raiseAnError(RuntimeError, f"Identifier {identifier} is still in use and cannot be re-used yet!")

#self.__IDENTIFIERS_FACTORY[identifier] = uniqueHandler
#self.__counter += 1

#def removeIdentifier(self, identifier: str) -> None:
#"""
#Remove identifier in factory
#@ In, identifier, str, identifier to remove
#@ Out, None
#"""
#if identifier not in self.__IDENTIFIERS_FACTORY:
#self.raiseAnError(RuntimeError, f"Identifier {identifier} is not present in identifier factory. It cannot be removed!")

#self.__IDENTIFIERS_FACTORY.pop(identifier)

#def checkIfIdentifierIsInUse(self, identifier: str) -> bool:
#"""
#This method is a utility method used to check if an identifier is in use.
#@ In, identifier, str, the identifier to check
#@ Out, checkIfIdentifierIsInUse, bool, is the Identifier in use?
#"""
#return identifier in list(self.__IDENTIFIERS_FACTORY.keys())

#def clear(self) -> None:
#"""
#Clear
#@ In, None
#@ Out, None
#"""
#self.__IDENTIFIERS_FACTORY = {}

#IDENTIFIERS_COLLECTOR = IdentifiersFactory()
alfoa marked this conversation as resolved.
Show resolved Hide resolved

class JobHandler(BaseType):
"""
JobHandler class. This handles the execution of any job in the RAVEN
Expand Down Expand Up @@ -162,7 +225,7 @@ def initialize(self):
# initialize PBS
with self.__queueLock:
self.__running = [None]*self.runInfoDict['batchSize']
self.__clientRunning = [None]*self.runInfoDict['batchSize']
self.__clientRunning = [None]*self.runInfoDict['batchSize'] * 2
alfoa marked this conversation as resolved.
Show resolved Hide resolved
self._parallelLib = ParallelLibEnum.shared
if self.runInfoDict['parallelMethod'] is not None and self.runInfoDict['parallelMethod'] != ParallelLibEnum.distributed:
self._parallelLib = self.runInfoDict['parallelMethod']
Expand Down Expand Up @@ -640,6 +703,7 @@ def addJob(self, args, functionToRun, identifier, metadata=None, forceUseThreads
clientQueue
@ Out, None
"""
#global IDENTIFIERS_COLLECTOR
assert "original_function" in dir(functionToRun), "to parallelize a function, it must be" \
" decorated with RAVEN Parallel decorator"
if self._server is None or forceUseThreads:
Expand Down Expand Up @@ -686,6 +750,8 @@ def addJob(self, args, functionToRun, identifier, metadata=None, forceUseThreads
self.__batching[groupId]["ids"].append(identifier)
# add the runner in the Queue
self.reAddJob(internalJob)
# update identifier factory
#IDENTIFIERS_COLLECTOR.addIdentifier(internalJob.identifier, uniqueHandler)

def reAddJob(self, runner):
"""
Expand Down Expand Up @@ -747,6 +813,8 @@ def addFinishedJob(self, data, metadata=None, uniqueHandler="any", profile=False
@ In, profile, bool, optional, if True then at de-construction timing statements will be printed
@ Out, None
"""
#global IDENTIFIERS_COLLECTOR

# create a placeholder runner
run = Runners.factory.returnInstance('PassthroughRunner', data, None,
metadata=metadata,
Expand All @@ -755,6 +823,8 @@ def addFinishedJob(self, data, metadata=None, uniqueHandler="any", profile=False
# place it on the finished queue
with self.__queueLock:
self.__finished.append(run)
# update identifier factory
#IDENTIFIERS_COLLECTOR.addIdentifier(run.identifier, uniqueHandler)

def isFinished(self, uniqueHandler=None):
"""
Expand Down Expand Up @@ -912,6 +982,7 @@ def getFinished(self, removeFinished=True, jobIdentifier='', uniqueHandler="any"
finished = [job1, job2, [job3.1, job3.2], job4 ] (job3.1/3.2 belong to the same groupID)
or [job1, job2, job3, job4]
"""
#global IDENTIFIERS_COLLECTOR
# If the user does not specify a jobIdentifier, then set it to the empty
# string because every job will match this starting string.
if jobIdentifier is None:
Expand Down Expand Up @@ -955,6 +1026,7 @@ def getFinished(self, removeFinished=True, jobIdentifier='', uniqueHandler="any"
if removeFinished:
for i in reversed(runsToBeRemoved):
self.__finished[i].trackTime('collected')
#IDENTIFIERS_COLLECTOR.removeIdentifier(self.__finished[i].identifier)
del self.__finished[i]

# end with self.__queueLock
Expand Down Expand Up @@ -1140,8 +1212,10 @@ def shutdown(self):
@ In, None
@ Out, None
"""
#global IDENTIFIERS_COLLECTOR
self.completed = True
self.__shutdownParallel()
#IDENTIFIERS_COLLECTOR.clear()

def terminateAll(self):
"""
Expand Down
9 changes: 8 additions & 1 deletion ravenframework/Models/EnsembleModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def localInputAndChecks(self,xmlNode):
self.raiseAnError(IOError, "Input XML node for Model" + modelName +" has not been inputted!")
if len(self.modelsInputDictionary[modelName].values()) > allowedEntriesLen:
self.raiseAnError(IOError, "TargetEvaluation, Input and metadataToTransfer XML blocks are the only XML sub-blocks allowed!")
if child.attrib['type'].strip() == "Code":
if child.attrib['type'].strip() in ["Code", 'HybridModel', 'LogicalModel']:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

very ugly :(

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be slightly less ugly if you used a set:

       if child.attrib['type'].strip() in {"Code", 'HybridModel', 'LogicalModel'}:

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

self.createWorkingDir = True
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is ugly and in addition this creates a sub-working directory even if, for example, the Logical/hybrid models do not use a Code. In case of HybridModel/LogicalModel using only ExternalModels/ROMs, the subdirectory is created but will stay empty. Not very elegant. @joshua-cogliati-inl @wangcj05 any ideas on how to improve this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wangcj05 @joshua-cogliati-inl any ideas for this? I cannot find a better solution at this stage

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have a class attribute to indicate whether there is a code associated with the Model? For example, in the Hybrid/Logical/Ensemble Model, we define a self._isCodeAvail and set it to True when we detect a code in the Model. @alfoa

Copy link
Contributor

@joshua-cogliati-inl joshua-cogliati-inl Jul 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I am trying to fully understand why you need to create the directory. Just to check: is it needed if the Logical/Hybrid models use a Code? (Congjian's idea sounds reasonable.)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes that's why. If there is a Code in the underlying Logical/Hybrid model (contained in the ensemble model) the subfolder is required.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wangcj05 @joshua-cogliati-inl can you tell me exactly how you would like that flag to be coded? I prefer not to make a code design decision on my own that might need to be modified later.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wangcj05 @joshua-cogliati-inl FYI: if you can send feedback by tomorrow, I can try to address it before leaving on Friday. Otherwise, it will need to wait until September.

if child.tag == 'settings':
self.__readSettings(child)
Expand Down Expand Up @@ -246,6 +246,7 @@ def initialize(self,runInfo,inputs,initDict=None):
for modelClass, modelType, modelName, modelInstance in self.assemblerDict['Model']:
if not isThereACode:
isThereACode = modelType == 'Code'

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest we add these lines in Logical model and Hybrid model.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I addressed this as well.

self.modelsDictionary[modelName]['Instance'] = modelInstance
inputInstancesForModel = []
for inputName in self.modelsInputDictionary[modelName]['Input']:
Expand All @@ -267,6 +268,12 @@ def initialize(self,runInfo,inputs,initDict=None):

# initialize model
self.modelsDictionary[modelName]['Instance'].initialize(runInfo,inputInstancesForModel,initDict)
if modelType in ['HybridModel', 'LogicalModel']:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this should be replaced by isinstance(self.modelsDictionary[modelName]['Instance'], HybridModelBase)?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think your suggestion will be better here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

for submodelInst in self.modelsDictionary[modelName]['Instance'].modelInstances.values():
if not isThereACode:
isThereACode = submodelInst.type == 'Code'


# retrieve 'TargetEvaluation' DataObjects
targetEvaluation = self.retrieveObjectFromAssemblerDict('TargetEvaluation',self.modelsInputDictionary[modelName]['TargetEvaluation'], True)
# assert acceptable TargetEvaluation types are used
Expand Down
33 changes: 29 additions & 4 deletions ravenframework/Models/HybridModels/HybridModelBase.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,32 @@ def submit(self,myInput,samplerType,jobHandler,**kwargs):
## Hybrid models need access to the job handler, so let's stuff it in our
## catch all kwargs where evaluateSample can pick it up, not great, but
## will suffice until we can better redesign this whole process.
prefix = kwargs['prefix']
kwargs['jobHandler'] = jobHandler
jobHandler.addClientJob((self, myInput, samplerType, kwargs), self.__class__.evaluateSample, prefix, kwargs)

nRuns = 1
alfoa marked this conversation as resolved.
Show resolved Hide resolved
batchMode = kwargs.get("batchMode", False)
if batchMode:
nRuns = kwargs["batchInfo"]['nRuns']

for index in range(nRuns):
if batchMode:
kw = kwargs['batchInfo']['batchRealizations'][index]
kw['batchMode'] = False
else:
kw = kwargs

kw['jobHandler'] = jobHandler

prefix = kw.get("prefix")
uniqueHandler = kw.get("uniqueHandler",'any')
## These kw are updated by createNewInput, so the job either should not
## have access to the metadata, or it needs to be updated from within the
## evaluateSample function, which currently is not possible since that
## function does not know about the job instance.
metadata = kw

jobHandler.addClientJob((self, myInput, samplerType, kw), self.__class__.evaluateSample, prefix, metadata=metadata,
uniqueHandler=uniqueHandler,
groupInfo={'id': kwargs['batchInfo']['batchId'], 'size': nRuns} if batchMode else None)

@Parallel()
def evaluateSample(self, myInput, samplerType, kwargs):
Expand All @@ -187,7 +210,9 @@ def evaluateSample(self, myInput, samplerType, kwargs):
# assure rlz has all metadata
rlz = dict((var,np.atleast_1d(kwargsToKeep[var])) for var in kwargsToKeep.keys())
# update rlz with input space from inRun and output space from result
rlz.update(dict((var,np.atleast_1d(kwargsToKeep['SampledVars'][var] if var in kwargs['SampledVars'] else result[var])) for var in set(itertools.chain(result.keys(),kwargsToKeep['SampledVars'].keys()))))
rlz.update(dict((var,np.atleast_1d(kwargsToKeep['SampledVars'][var]
if var in kwargs['SampledVars'] else result[var]))
for var in set(itertools.chain(result.keys(),kwargsToKeep['SampledVars'].keys()))))
return rlz

@abc.abstractmethod
Expand Down
2 changes: 1 addition & 1 deletion ravenframework/Models/HybridModels/LogicalModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def _externalRun(self, inRun, jobHandler):
# TODO: execute control function, move this to createNewInput
modelToRun = inputKwargs.pop('modelToRun')
inputKwargs['prefix'] = modelToRun + utils.returnIdSeparator() + identifier
inputKwargs['uniqueHandler'] = self.name + identifier
inputKwargs['uniqueHandler'] = self.name + utils.returnIdSeparator() + identifier

moveOn = False
while not moveOn:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
x,y,poly
0.73162765242,1.73162765242,0.0720237169458
0.492704456326,1.49270445633,1.29349617899
0.555294141261,1.55529414126,0.197763300797
0.765845655665,1.76584565567,0.0548282569708
0.503382503605,1.50338250361,0.246628937725
0.587434256432,1.58743425643,0.170210492765
0.639866882027,1.63986688203,0.129695862661
0.823657390038,1.82365739004,0.0310967160881
0.458268852019,1.45826885202,1.34107648262
0.419415902449,1.41941590245,1.40084817762
0.379586681845,1.37958668184,1.46948600818
0.819461917952,1.81946191795,0.0325939990697
0.324619247789,1.32461924779,1.5779699205
0.690076483411,1.69007648341,0.0960525861347
0.875131506812,1.87513150681,0.0155921405911
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#############################################
# #
# Dummy Input File for Poly Python Module #
# #
#############################################

case = $RAVEN-output$
auxfile = $RAVEN-aux$
x = $RAVEN-x$
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#############################################
# #
# Dummy Input File for Poly Python Module #
# #
#############################################

y = $RAVEN-y$
Loading