Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct issue with corpus loading process #69

Merged
merged 3 commits into from
Oct 28, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ include requirements.txt
include database.db

recursive-include tests *
recursive-include corpus *.json

recursive-exclude * *.pyc
recursive-exclude * *.py~
3 changes: 2 additions & 1 deletion chatterbot/adapters/storage/jsondatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,5 +123,6 @@ def drop(self):
"""
import os

os.remove(self.database.path)
if os.path.exists(self.database.path):
os.remove(self.database.path)

2 changes: 2 additions & 0 deletions chatterbot/corpus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .corpus import Corpus

61 changes: 61 additions & 0 deletions chatterbot/corpus/corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os, json


class Corpus(object):
    """
    Locate and load JSON corpus files addressed by dotted paths.

    Dotted paths such as ``chatterbot.corpus.english.greetings`` are
    resolved to files or directories beneath ``data_directory``.
    """

    def __init__(self):
        # Resolve against the absolute module location so the data
        # directory stays valid if the working directory changes later.
        current_directory = os.path.dirname(os.path.abspath(__file__))
        self.data_directory = os.path.join(current_directory, 'data')

    def get_file_path(self, dotted_path):
        """
        Reads a dotted file path and returns the file path.

        When the first component is ``chatterbot`` it is dropped and the
        component that follows it is replaced by ``data_directory``.
        If a file named ``<path>.json`` exists, that file path is
        returned; otherwise the joined path is returned unchanged.
        """
        parts = dotted_path.split(".")
        if parts[0] == 'chatterbot':
            parts.pop(0)
            parts[0] = self.data_directory

        corpus_path = os.path.join(*parts)

        if os.path.exists(corpus_path + ".json"):
            corpus_path += ".json"

        return corpus_path

    def read_corpus(self, file_name):
        """
        Read and return the data from a corpus json file.

        The file is decoded as UTF-8 rather than with the platform's
        default encoding, so non-ASCII corpus text loads the same way
        on every system.
        """
        # io.open accepts an encoding on both Python 2 and Python 3.
        import io

        with io.open(file_name, encoding='utf-8') as data_file:
            return json.load(data_file)

    def load_corpus(self, dotted_path):
        """
        Return the data contained within a specified corpus.

        If the dotted path resolves to a directory, every ``.json`` file
        beneath it is read (in sorted order); otherwise the single
        resolved file is read. The result is a list holding each
        top-level value of each file that was read.
        """
        corpus_path = self.get_file_path(dotted_path)

        if os.path.isdir(corpus_path):
            file_names = self._list_corpus_files(corpus_path)
        else:
            file_names = [corpus_path]

        corpora = []
        for file_name in file_names:
            corpora.extend(self.read_corpus(file_name).values())

        return corpora

    def _list_corpus_files(self, directory):
        """
        Return the paths of all ``.json`` files beneath a directory,
        ordered by name so results do not depend on the file system.
        """
        paths = []
        for dirname, dirnames, filenames in os.walk(directory):
            # Sorting in place fixes the order os.walk descends in.
            dirnames.sort()
            for datafile in sorted(filenames):
                if datafile.endswith(".json"):
                    paths.append(os.path.join(dirname, datafile))
        return paths

21 changes: 0 additions & 21 deletions chatterbot/corpus/english/__init__.py

This file was deleted.

28 changes: 0 additions & 28 deletions chatterbot/corpus/utils.py

This file was deleted.

5 changes: 3 additions & 2 deletions chatterbot/training.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from .corpus.utils import load_corpus
from .conversation import Statement
from .corpus import Corpus


class Trainer(object):

def __init__(self, chatbot, **kwargs):
self.chatbot = chatbot
self.corpus = Corpus()

def train_from_list(self, conversation):

Expand All @@ -30,7 +31,7 @@ def train_from_list(self, conversation):

def train_from_corpora(self, corpora):
for corpus in corpora:
corpus_data = load_corpus(corpus)
corpus_data = self.corpus.load_corpus(corpus)
for data in corpus_data:
for pair in data:
self.train_from_list(pair)
Expand Down
74 changes: 37 additions & 37 deletions tests/base_case.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,46 @@
from unittest import TestCase
from chatterbot import ChatBot
import os


class ChatBotTestCase(TestCase):
class UntrainedChatBotTestCase(TestCase):
    """
    Base test case that gives each test a chat bot backed by its own
    database file inside a scratch directory. No training is performed
    on the chat bot here.
    """

    def setUp(self):
        """
        Create the scratch directory if needed and build a chat bot
        whose database is a randomly named file inside it.
        """
        self.test_data_directory = 'test_data'
        self.test_database_name = self.random_string() + ".db"

        if not os.path.exists(self.test_data_directory):
            os.makedirs(self.test_data_directory)

        # Let os.path choose the separator instead of hard-coding '/'.
        database_path = os.path.join(
            self.test_data_directory,
            self.test_database_name
        )

        self.chatbot = ChatBot("Test Bot", database=database_path)

    def random_string(self, start=0, end=9000):
        """
        Generate a string based on a random number.

        The number is drawn from the inclusive range ``start``..``end``.
        """
        from random import randint
        return str(randint(start, end))

    def remove_test_data(self):
        """
        Delete the scratch directory and everything in it, if it exists.
        """
        import shutil

        if os.path.exists(self.test_data_directory):
            shutil.rmtree(self.test_data_directory)

    def tearDown(self):
        """
        Drop the chat bot's storage, then remove the scratch directory.
        """
        self.chatbot.storage.drop()
        self.remove_test_data()


class ChatBotTestCase(UntrainedChatBotTestCase):

def setUp(self):
super(ChatBotTestCase, self).setUp()
"""
Set up a database for testing.
"""
Expand All @@ -27,43 +63,7 @@ def setUp(self):
"Blue."
]

self.chatbot = ChatBot("Test Bot", database="test-database.db")

self.chatbot.train(data1)
self.chatbot.train(data2)
self.chatbot.train(data3)

def tearDown(self):
"""
Remove the test database.
"""
self.chatbot.storage.drop()


class UntrainedChatBotTestCase(TestCase):
"""
This is a test case for use when the
chat bot should not start with any
prior training.
"""

def setUp(self):
"""
Set up a database for testing.
"""
test_db = self.random_string() + ".db"
self.chatbot = ChatBot("Test Bot", database=test_db)

def random_string(self, start=0, end=9000):
"""
Generate a string based on a random number.
"""
from random import randint
return str(randint(start, end))

def tearDown(self):
"""
Remove the test database.
"""
self.chatbot.storage.drop()

31 changes: 24 additions & 7 deletions tests/corpus_tests/test_corpus.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,40 @@
from unittest import TestCase
from chatterbot.corpus.utils import read_corpus, load_corpus
from chatterbot.corpus import Corpus
import os


class CorpusUtilsTestCase(TestCase):
    """
    Tests for resolving, reading and loading corpus data through a
    ``Corpus`` instance.
    """

    def setUp(self):
        """
        Create the ``Corpus`` instance used by each test.
        """
        self.corpus = Corpus()

    def test_get_file_path(self):
        """
        Test that a dotted path is properly converted to a file address.
        """
        path = self.corpus.get_file_path("chatterbot.corpus.english")
        self.assertIn(
            os.path.join("chatterbot", "corpus", "data", "english"),
            path
        )

    def test_read_corpus(self):
        """
        Test that reading a corpus file returns its top-level keys.
        """
        corpus_path = os.path.join(
            self.corpus.data_directory,
            "english", "conversations.json"
        )
        data = self.corpus.read_corpus(corpus_path)
        self.assertIn("conversations", data)

    def test_load_corpus(self):
        """
        Test that loading a single corpus file yields one corpus.
        """
        corpus = self.corpus.load_corpus("chatterbot.corpus.english.greetings")

        self.assertEqual(len(corpus), 1)
        self.assertIn(["Hi", "Hello"], corpus[0])

    def test_load_corpus_general(self):
        """
        Test that loading a corpus directory yields every corpus in it.
        """
        corpus = self.corpus.load_corpus("chatterbot.corpus.english")

        self.assertEqual(len(corpus), 2)
        self.assertIn(["Hi", "Hello"], corpus[0])
        self.assertIn(["Hi", "Hello"], corpus[1])