Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replaces datasethelpers with pkg_resources. #1902

Merged
merged 7 commits into from
Mar 5, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ recursive-include extensions *.hpp *.cpp
recursive-include external/common/share/swig *.*
recursive-include external/darwin64 swig *.a
recursive-include external/linux64 swig *.a

recursive-include nupic/datafiles *.csv *.txt
10 changes: 6 additions & 4 deletions examples/network/hierarchy_network_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@
import os
import math

from pkg_resources import resource_filename
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: put blank line before pkg_resources, it is a third-party lib

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same elsewhere


from nupic.algorithms.anomaly import computeRawAnomalyScore
from nupic.data.datasethelpers import findDataset
from nupic.data.file_record_stream import FileRecordStream
from nupic.engine import Network
from nupic.encoders import MultiEncoder
Expand All @@ -45,7 +46,9 @@

# Seed used for random number generation
_SEED = 2045
_INPUT_FILE_PATH = "../prediction/data/extra/hotgym/rec-center-hourly.csv"
_INPUT_FILE_PATH = resource_filename(
"nupic.datafiles", "extra/hotgym/rec-center-hourly.csv"
)
_OUTPUT_FILE_NAME = "hierarchy-demo-output.csv"

# Parameter dict for SPRegion
Expand Down Expand Up @@ -347,8 +350,7 @@ def runNetwork(network, numRecords, writer):


def runDemo():
trainFile = findDataset(_INPUT_FILE_PATH)
dataSource = FileRecordStream(streamID=trainFile)
dataSource = FileRecordStream(streamID=_INPUT_FILE_PATH)
numRecords = dataSource.getDataRowCount()
print "Creating network"
network = createNetwork(dataSource)
Expand Down
10 changes: 6 additions & 4 deletions examples/network/network_api_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,18 @@
import json
import os

from pkg_resources import resource_filename

from nupic.algorithms.anomaly import computeRawAnomalyScore
from nupic.data.datasethelpers import findDataset
from nupic.data.file_record_stream import FileRecordStream
from nupic.engine import Network
from nupic.encoders import MultiEncoder

_VERBOSITY = 0 # how chatty the demo should be
_SEED = 1956 # the random seed used throughout
_DATA_PATH = "extra/hotgym/rec-center-hourly.csv"
_INPUT_FILE_PATH = resource_filename(
"nupic.datafiles", "extra/hotgym/rec-center-hourly.csv"
)
_OUTPUT_PATH = "network-demo-output.csv"
_NUM_RECORDS = 2000

Expand Down Expand Up @@ -213,8 +216,7 @@ def runNetwork(network, writer):


if __name__ == "__main__":
trainFile = findDataset(_DATA_PATH)
dataSource = FileRecordStream(streamID=trainFile)
dataSource = FileRecordStream(streamID=_INPUT_FILE_PATH)

network = createNetwork(dataSource)
outputPath = os.path.join(os.path.dirname(__file__), _OUTPUT_PATH)
Expand Down
13 changes: 7 additions & 6 deletions examples/opf/clients/hotgym/anomaly/hotgym_anomaly.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# ----------------------------------------------------------------------

"""
A simple client to create a CLA anomaly detection model for hotgym.
A simple client to create a HTM anomaly detection model for hotgym.
The script prints out all records that have an abnormally high anomaly
score.
"""
Expand All @@ -30,16 +30,17 @@
import datetime
import logging

from nupic.data.datasethelpers import findDataset
from pkg_resources import resource_filename

from nupic.frameworks.opf.modelfactory import ModelFactory
from nupic.frameworks.opf.predictionmetricsmanager import MetricsManager

import model_params

_LOGGER = logging.getLogger(__name__)

_DATA_PATH = "extra/hotgym/rec-center-hourly.csv"

_INPUT_DATA_FILE = resource_filename(
"nupic.datafiles", "extra/hotgym/rec-center-hourly.csv"
)
_OUTPUT_PATH = "anomaly_scores.csv"

_ANOMALY_THRESHOLD = 0.9
Expand All @@ -52,7 +53,7 @@ def createModel():
def runHotgymAnomaly():
model = createModel()
model.enableInference({'predictedField': 'consumption'})
with open (findDataset(_DATA_PATH)) as fin:
with open (_INPUT_DATA_FILE) as fin:
reader = csv.reader(fin)
csvWriter = csv.writer(open(_OUTPUT_PATH,"wb"))
csvWriter.writerow(["timestamp", "consumption", "anomaly_score"])
Expand Down
9 changes: 6 additions & 3 deletions examples/opf/clients/hotgym/simple/hotgym.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
import datetime
import logging

from nupic.data.datasethelpers import findDataset
from pkg_resources import resource_filename

from nupic.frameworks.opf.metrics import MetricSpec
from nupic.frameworks.opf.modelfactory import ModelFactory
from nupic.frameworks.opf.predictionmetricsmanager import MetricsManager
Expand All @@ -35,7 +36,9 @@

_LOGGER = logging.getLogger(__name__)

_DATA_PATH = "extra/hotgym/rec-center-hourly.csv"
_INPUT_FILE_PATH = resource_filename(
"nupic.datafiles", "extra/hotgym/rec-center-hourly.csv"
)

_METRIC_SPECS = (
MetricSpec(field='consumption', metric='multiStep',
Expand Down Expand Up @@ -66,7 +69,7 @@ def runHotgym():
model.enableInference({'predictedField': 'consumption'})
metricsManager = MetricsManager(_METRIC_SPECS, model.getFieldInfo(),
model.getInferenceType())
with open (findDataset(_DATA_PATH)) as fin:
with open (_INPUT_FILE_PATH) as fin:
reader = csv.reader(fin)
headers = reader.next()
reader.next()
Expand Down
10 changes: 5 additions & 5 deletions nupic/data/aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@
import datetime
from collections import defaultdict

from pkg_resources import resource_filename

from nupic.data import SENTINEL_VALUE_FOR_MISSING_DATA
from nupic.data.file_record_stream import FileRecordStream
from nupic.data.datasethelpers import findDataset


"""The aggregator aggregates PF datasets
Expand Down Expand Up @@ -755,8 +756,7 @@ def generateDataset(aggregationInfo, inputFilename, outputFilename=None):
have values of 0, then aggregation will be suppressed, and the given
inputFile parameter value will be returned.

inputFilename: filename (or relative path from NTA_DATA_PATH) of
the input dataset
inputFilename: filename of the input dataset within examples/prediction/data

outputFilename: name for the output file. If not given, a name will be
generated based on the input filename and the aggregation params
Expand All @@ -781,7 +781,7 @@ def generateDataset(aggregationInfo, inputFilename, outputFilename=None):


# Create the input stream
inputFullPath = findDataset(inputFilename)
inputFullPath = resource_filename("nupic.datafiles", inputFilename)
inputObj = FileRecordStream(inputFullPath)


Expand Down Expand Up @@ -864,7 +864,7 @@ def getFilename(aggregationInfo, inputFile):
"""

# Find the actual file, with an absolute path
inputFile = findDataset(inputFile)
inputFile = resource_filename("nupic.datafiles", inputFile)

a = defaultdict(lambda: 0, aggregationInfo)
outputDir = os.path.dirname(inputFile)
Expand Down
184 changes: 0 additions & 184 deletions nupic/data/datasethelpers.py

This file was deleted.

5 changes: 3 additions & 2 deletions nupic/data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
import os
import pickle

from nupic.data.datasethelpers import findDataset
from pkg_resources import resource_filename

from nupic.regions.RecordSensor import RecordSensor
from nupic.data.file_record_stream import FileRecordStream

Expand Down Expand Up @@ -128,7 +129,7 @@ def generateStats(filename, statsInfo, maxSamples = None, filters=[], cache=True
raise RuntimeError("statsInfo must be a dict -- "
"found '%s' instead" % type(statsInfo))

filename = findDataset(filename)
filename = resource_filename("nupic.datafiles", filename)

if cache:
statsFilename = getStatsFilename(filename, statsInfo, filters)
Expand Down
Loading