diff --git a/configs/text2text/babi.yaml b/configs/text2text/babi.yaml
new file mode 100644
index 00000000..50a7eb46
--- /dev/null
+++ b/configs/text2text/babi.yaml
@@ -0,0 +1,59 @@
+training:
+    problem:
+        name: &name BABI
+        batch_size: &b 1
+        data_type: train
+        embedding_type: glove.6B.100d
+        embedding_size: 50
+        use_mask: false
+        joint_all: true
+        one_hot_embedding: true
+        tasks: [1, 2, 3]
+        ten_thousand_examples: true
+        truncation_length: 50
+        directory: ./
+
+
+    gradient_clipping: 20
+
+    seed_numpy: 847055145
+    seed_torch: 697881609
+
+    optimizer:
+        name: RMSprop
+        lr: 0.0001
+
+    # Settings parameters.
+    terminal_conditions:
+        loss_stop: 1.0e-2
+        epoch_limit: 100
+
+validation:
+    problem:
+        name: *name
+        batch_size: *b
+        data_type: valid
+        embedding_type: glove.6B.100d
+        joint_all: true
+        one_hot_embedding: true
+        tasks: [1, 2, 3]
+        ten_thousand_examples: true
+        truncation_length: 50
+
+testing:
+    problem:
+        name: *name
+        batch_size: *b
+        data_type: test
+        embedding_type: glove.6B.100d
+        joint_all: true
+        one_hot_embedding: true
+        tasks: [1, 2, 3]
+        ten_thousand_examples: true
+        truncation_length: 50
+
+model:
+    name: LSTM
+    # Hidden state.
+    hidden_state_size: 256
+    num_layers: 1
diff --git a/miprometheus/problems/question_context_to_class/__init__.py b/miprometheus/problems/question_context_to_class/__init__.py
new file mode 100644
index 00000000..cf4893a7
--- /dev/null
+++ b/miprometheus/problems/question_context_to_class/__init__.py
@@ -0,0 +1,4 @@
+from .babiqa_dataset_single_question import bAbIQASingleQuestion as BABI
+
+
+__all__ = ['BABI']
diff --git a/miprometheus/problems/question_context_to_class/babiqa_dataset_single_question.py b/miprometheus/problems/question_context_to_class/babiqa_dataset_single_question.py
new file mode 100644
index 00000000..d1bdc649
--- /dev/null
+++ b/miprometheus/problems/question_context_to_class/babiqa_dataset_single_question.py
@@ -0,0 +1,515 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) IBM Corporation 2018
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+babiqa_dataset_single_question.py: contains code for loading the bAbI QA dataset
+(based on the parsing used in torchtext).
+"""
+__author__ = "Vincent Albouy, Ryan L. McAvoy"
+
+import os
+
+import gzip
+import shutil
+import tarfile
+import zipfile
+
+import requests
+import torch
+import torch.utils.data
+from tqdm import tqdm
+
+from miprometheus.utils.app_state import AppState
+from miprometheus.utils.loss.masked_cross_entropy_loss import MaskedCrossEntropyLoss
+from miprometheus.utils.problems_utils.language import Language
+from miprometheus.problems.seq_to_seq.text2text.text_to_text_problem import TextToTextProblem
+
+
+class bAbIQASingleQuestion(TextToTextProblem):
+    """
+    Problem class for loading the bAbI QA dataset (one question per sample) using torchtext-style parsing.
+
+    Inherits from TextToTextProblem.
+    """
+
+    def __init__(self, params):
+        """
+        Initializes the bAbI QA problem: calls the base class initialization and sets the properties
+        using the provided parameters.
+
+        :param params: Dictionary of parameters (read from the configuration file).
+        """
+        super(bAbIQASingleQuestion, self).__init__(params)
+
+        self.directory = '~/data/babi/'
+
+        # Data split to load: 'train', 'valid' or 'test'.
+        self.data_type = params['data_type']
+
+        self.use_batches = params['batch_size']
+
+        # Task number(s) to train on.
+        self.tasks = params['tasks']
+
+        self.loss_function = MaskedCrossEntropyLoss()
+
+        self.tenK = params['ten_thousand_examples']
+
+        self.one_hot_embedding = params['one_hot_embedding']
+
+        self.batch_size = params['batch_size']
+
+        self.embedding_type = params['embedding_type']
+
+        self.embedding_size = params['embedding_size']
+
+        self.init_token = '<sos>'
+
+        self.pad_token = '<pad>'
+
+        self.eos_token = '<eos>'
+
+        self.use_mask = False
+
+        self.urls = ['http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz']
+
+        self.name = 'BABIDataset'
+
+        self.dirname = ''
+
+        self.data = self.load_data(tasks=self.tasks, tenK=self.tenK, add_punctuation=True, data_type=self.data_type)
+
+        # Create an object of the Language class - it will be used to create the word embeddings.
+        self.language = Language('lang')
+
+        self.default_values = {'input_item_size': self.embedding_size, 'output_item_size': self.embedding_size}
+
+        self.data_definitions = {'sequences': {'size': [-1, -1, self.embedding_size], 'type': [torch.Tensor]},
+                                 'targets': {'size': [-1], 'type': [torch.Tensor]},
+                                 'current_question': {'size': [-1, 1], 'type': [list, str]},
+                                 'masks': {'size': [-1], 'type': [torch.Tensor]},
+                                 }
+
+        # Build the embeddings.
+        if self.one_hot_embedding:
+            self.dictionaries, self.itos_dict = self.build_dictionaries_one_hot()
+        else:
+            self.dictionaries, self.itos_dict = self.build_dictionaries()
+
+    def __len__(self):
+        """Returns the number of questions in the dataset."""
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        """
+        Getter method to access the dataset and return a sample.
+
+        :param idx: index of the sample to return.
+        :return: DataDict({'sequences', 'targets', 'current_question', 'masks'})
+        """
+        # Get the question with index idx.
+        current_question = self.data[idx]
+
+        written_story, written_answers = current_question
+
+        current_question = [" ".join(written_story), " ".join(written_answers)]
+
+        story = self.embed_batch(written_story)
+
+        answer = self.to_dictionary_indexes(self.dictionaries, written_answers)
+
+        mask = torch.zeros((story.shape[0])).type(AppState().ByteTensor)
+
+        target = torch.zeros((story.shape[0])).type(AppState().LongTensor)
+
+        # Mark every '_' placeholder in the story and store the corresponding answer index.
+        k = 0
+        for i, word in enumerate(current_question[0].split(' ')):
+            if word == '_':
+                mask[i] = 1
+                target[i] = answer[k]
+                k = k + 1
+
+        # Make a dictionary with all the outputs.
+        data_dict = self.create_data_dict()
+        data_dict['sequences'] = story
+        data_dict['targets'] = target
+        data_dict['current_question'] = current_question
+        data_dict['masks'] = mask
+
+        # Return the final DataDict.
+        return data_dict
+
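+    # Illustration (a made-up two-sentence story, not an actual sample from the data files): for
+    #     story  = "Mary moved to the bathroom . Where is Mary ? _"
+    #     answer = "bathroom"
+    # 'sequences' holds one embedded vector per story word, 'masks' is zero everywhere except at the
+    # position of the '_' placeholder, and 'targets' stores the dictionary index of 'bathroom' there.
+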
+    def collate_babi(self, batch):
+        """
+        Collate method that creates a batch from individual samples.
+
+        :param batch: list of individual samples (DataDicts) returned by __getitem__.
+        :return: DataDict({'sequences', 'targets', 'current_question', 'masks'}) with batched, padded tensors.
+        """
+        # Get sizes.
+        context_length = max(d["sequences"].shape[0] for d in batch)
+        answer_length = max(d["targets"].shape[0] for d in batch)
+        batch_size = len(batch)
+        word_size = batch[0]["sequences"].shape[-1]
+
+        # Create placeholders.
+        sequences = torch.zeros((batch_size, context_length, word_size)).type(AppState().dtype)
+        targets = torch.zeros((batch_size, answer_length)).type(AppState().LongTensor)
+        mask = torch.zeros((batch_size, answer_length)).type(AppState().ByteTensor)
+
+        # Copy each sample into the (zero-padded) placeholders.
+        current_question = []
+        for i, d in enumerate(batch):
+            c_shape = d["sequences"].shape
+            a_shape = d["targets"].shape
+            sequences[i, :c_shape[0], :c_shape[1]] = d["sequences"]
+            targets[i, :a_shape[0]] = d["targets"]
+            mask[i, :a_shape[0]] = d["masks"]
+            current_question.append(d["current_question"])
+
+        # Make a dictionary with all the outputs.
+        data_dict = self.create_data_dict()
+        data_dict['sequences'] = sequences
+        data_dict['targets'] = targets
+        data_dict['current_question'] = current_question
+        data_dict['masks'] = mask
+
+        # Return the final DataDict.
+        return data_dict
+
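+    # Illustration (hypothetical batch of two samples with story lengths 11 and 8): 'sequences'
+    # becomes a [2, 11, word_size] tensor and 'targets' / 'masks' become [2, 11] tensors; the
+    # shorter sample is simply zero-padded on the right.
+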
+    def evaluate_loss(self, data_dict, logits):
+        """
+        Calculates the (masked) cross entropy loss for a given batch.
+
+        WARNING: applies the mask to both logits and targets!
+
+        :param data_dict: DataDict({'sequences', 'targets', 'current_question', 'masks'}).
+        :param logits: Predictions produced by the model.
+        """
+        # Check if the mask should be used - if so, use the masked loss function.
+        if self.use_mask:
+            loss = self.loss_function(
+                logits, data_dict['targets'], data_dict['masks'])
+        else:
+            pred = logits.transpose(1, 2)
+            loss = self.loss_function(pred, data_dict['targets'], data_dict['masks'])
+
+        return loss
+
+    def build_dictionaries_one_hot(self):
+        """
+        Creates the word dictionaries for bAbI QA using one-hot vectors:
+
+        - 1. Collects all words appearing in the train, valid and test splits.
+        - 2. Assigns each word an index; this index is the non-zero dimension of its one-hot vector.
+        """
+        # Load the data from all splits.
+        data = self.load_data(tasks=self.tasks, tenK=self.tenK, add_punctuation=True, data_type='train',
+                              outmod="one_hot")
+        data = data + self.load_data(tasks=self.tasks, tenK=self.tenK, add_punctuation=True, data_type='valid',
+                                     outmod="one_hot")
+        data = data + self.load_data(tasks=self.tasks, tenK=self.tenK, add_punctuation=True, data_type='test',
+                                     outmod="one_hot")
+
+        # Make placeholder dictionaries with the special characters.
+        answ_to_ix = {".": 0, "?": 1, "_": 2}
+        itos_d = [".", "?", "_"]
+
+        # Display a progress bar while going through the data.
+        self.fix_length = 0
+        for q in tqdm(data):
+            story, answers = q
+            self.fix_length = max(self.fix_length, len(story))
+
+            # Go through all the words of the story.
+            for answer in story:
+                a = answer.lower()
+                if a not in answ_to_ix:
+                    ix = len(answ_to_ix)
+                    answ_to_ix[a] = ix
+                    itos_d.append(a)
+
+            # Go through all the answers.
+            for answer in answers:
+                a = answer.lower()
+                if a not in answ_to_ix:
+                    ix = len(answ_to_ix)
+                    answ_to_ix[a] = ix
+                    itos_d.append(a)
+
+        # Return the corresponding dictionaries.
+        return answ_to_ix, itos_d
+
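+    # Illustration: the returned word-to-index dictionary always starts as {'.': 0, '?': 1, '_': 2};
+    # every other word receives the next free index in order of first appearance across the three
+    # splits, and itos_d is the inverse (index -> word) list used to decode predictions.
+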
+    def build_dictionaries(self):
+        """
+        Creates the word dictionaries for bAbI QA using pretrained embeddings:
+
+        - 1. Collects all words appearing in the train, valid and test splits.
+        - 2. Uses a Language object to create the embeddings.
+
+        If this is the first time you run this code, it will take longer, as the embeddings are
+        downloaded through torchtext.
+        """
+        print(' ---> Constructing the dictionaries with word embeddings, may take some time ')
+
+        # Empty list meant to store all the words of the dataset.
+        text = []
+        tasks = self.tasks
+
+        data = self.load_data(tasks=tasks, tenK=self.tenK, add_punctuation=True, data_type='train',
+                              outmod="embedding")
+        data = data + self.load_data(tasks=tasks, tenK=self.tenK, add_punctuation=True, data_type='valid',
+                                     outmod="embedding")
+        data = data + self.load_data(tasks=tasks, tenK=self.tenK, add_punctuation=True, data_type='test',
+                                     outmod="embedding")
+
+        # Make placeholder dictionaries with the special characters.
+        answ_to_ix = {".": 0, "?": 1, "_": 2}
+        itos_d = [".", "?", "_"]
+
+        # Load all the words of the dataset into the text list.
+        self.fix_length = 0
+        for q in tqdm(data):
+            # Display a progress bar.
+            story, answers = q
+            self.fix_length = max(self.fix_length, len(story))
+            for word in story:
+                text.extend([word.lower()])
+
+            # Go through all the words of the story.
+            for answer in story:
+                a = answer.lower()
+                if a not in answ_to_ix:
+                    ix = len(answ_to_ix)
+                    answ_to_ix[a] = ix
+                    itos_d.append(a)
+
+            # Go through all the answers.
+            for answer in answers:
+                a = answer.lower()
+                if a not in answ_to_ix:
+                    ix = len(answ_to_ix)
+                    answ_to_ix[a] = ix
+                    itos_d.append(a)
+
+        # Build the embeddings from the chosen database, e.g. glove.6B.100d.
+        self.language.build_pretrained_vocab(text, vectors=self.embedding_type, tokenize=self.tokenize)
+
+        # Return the corresponding dictionaries.
+        return answ_to_ix, itos_d
+
+    def download_from_url(self, url, path):
+        """Downloads a file, with extra logic (from tensor2tensor) for Google Drive."""
+        # Get the url and write the file to path.
+        if 'drive.google.com' not in url:
+            r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
+            with open(path, "wb") as file:
+                file.write(r.content)
+            return
+        print('downloading from Google Drive; may take a few minutes')
+        confirm_token = None
+        session = requests.Session()
+        response = session.get(url, stream=True)
+        for k, v in response.cookies.items():
+            if k.startswith("download_warning"):
+                confirm_token = v
+
+        if confirm_token:
+            url = url + "&confirm=" + confirm_token
+            response = session.get(url, stream=True)
+
+        # Open the file and write the chunks.
+        chunk_size = 16 * 1024
+        with open(path, "wb") as f:
+            for chunk in response.iter_content(chunk_size):
+                if chunk:
+                    f.write(chunk)
+
+    def load_data(self, path=None, root='data', tasks=[1], tenK=False, add_punctuation=True, data_type='train',
+                  outmod=''):
+        """
+        Combines all requested tasks into a single file and then parses that combined file.
+        """
+        if tenK:
+            self.dirname = os.path.join('tasks_1-20_v1-2', 'en-valid-10k')
+        else:
+            self.dirname = os.path.join('tasks_1-20_v1-2', 'en-valid')
+
+        if path is None:
+            path = self.download(root)
+
+        file_data = os.path.join(path, 'collected_' + data_type + outmod + '.txt')
+        with open(file_data, 'w') as tf:
+            for task in tasks:
+                with open(
+                        os.path.join(path,
+                                     'qa' + str(task) + '_' + data_type + '.txt')) as f:
+                    tf.write(f.read())
+        return self.parse(file_data, add_punctuation)
+
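+    # Illustration (assuming tenK=True and tasks=[1, 2]): the method concatenates
+    # .../en-valid-10k/qa1_train.txt and .../en-valid-10k/qa2_train.txt into a single
+    # 'collected_' + data_type + outmod + '.txt' file before handing it to parse().
+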
+    def download(self, root, check=None):
+        """
+        Downloads and unzips an online archive (.zip, .gz, or .tgz).
+
+        Arguments:
+            root (str): Folder to download data to.
+            check (str or None): Folder whose existence indicates
+                that the dataset has already been downloaded, or
+                None to check the existence of root/{self.name}.
+
+        Returns:
+            str: Path to extracted dataset.
+        """
+        # Get the path.
+        path = os.path.join(root, self.name)
+        check = path if check is None else check
+
+        # Download the data.
+        if not os.path.isdir(check):
+            for url in self.urls:
+                if isinstance(url, tuple):
+                    url, filename = url
+                else:
+                    filename = os.path.basename(url)
+                zpath = os.path.join(path, filename)
+                if not os.path.isfile(zpath):
+                    if not os.path.exists(os.path.dirname(zpath)):
+                        os.makedirs(os.path.dirname(zpath))
+                    print('downloading {}'.format(filename))
+                    self.download_from_url(url, zpath)
+                zroot, ext = os.path.splitext(zpath)
+                _, ext_inner = os.path.splitext(zroot)
+
+                # Unzip the data.
+                if ext == '.zip':
+                    with zipfile.ZipFile(zpath, 'r') as zfile:
+                        print('extracting')
+                        zfile.extractall(path)
+
+                # tarfile cannot handle bare .gz files.
+                elif ext == '.tgz' or ext == '.gz' and ext_inner == '.tar':
+                    with tarfile.open(zpath, 'r:gz') as tar:
+                        dirs = [member for member in tar.getmembers()]
+                        tar.extractall(path=path, members=dirs)
+
+                # In case it is a bare .gz file.
+                elif ext == '.gz':
+                    with gzip.open(zpath, 'rb') as gz:
+                        with open(zroot, 'wb') as uncompressed:
+                            shutil.copyfileobj(gz, uncompressed)
+
+        # Return the path to the extracted dataset.
+        return os.path.join(path, self.dirname)
+
+    def parse(self, file_data, add_punctuation):
+        """
+        Parses the collected bAbI file into (story, answers) pairs - one pair per question.
+
+        :param file_data: data file to be parsed.
+        :param add_punctuation: boolean deciding whether punctuation is added to the stories.
+        :return: data: parsed data.
+        """
+        # Make empty lists.
+        data, story, story2 = [], [], []
+
+        # Open the file.
+        with open(file_data, 'r') as f:
+            for line in f:
+                tid, text = line.rstrip('\n').split(' ', 1)
+                # A line id of '1' marks the beginning of a new story.
+                if tid == '1':
+                    story = []
+                    story2 = []
+                    answers = []
+                # Statement line - don't delete the period.
+                if text.endswith('.'):
+                    for a in text[:-1].split():
+                        assert not isinstance(a, list)
+                        story.append(a)
+                    if add_punctuation:
+                        story.append('.')
+                else:
+                    # Question line - remove any leading or trailing whitespace after splitting.
+                    query, answer, supporting = (x.strip() for x in text.split('\t'))
+
+                    for a in query[:-1].split(' '):
+                        story2.append(a)
+                    if add_punctuation:
+                        story2.append('?')
+                    for a in answer.split(','):
+                        answers.append(a)
+                    # Add the '_' placeholder that the answer will be predicted at.
+                    story2.extend(['_'])
+
+                    # Emit one sample: the story so far followed by the question.
+                    story_f = list(story)
+                    story_f.extend(story2)
+                    if story_f:
+                        data.append((story_f, answers))
+
+                    # Set answers and question back to empty lists.
+                    answers = []
+                    story2 = []
+
+        return data
+
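+    # Illustration (a made-up two-line task file, tabs written as <TAB>):
+    #     1 Mary moved to the bathroom.
+    #     2 Where is Mary?<TAB>bathroom<TAB>1
+    # parses into the single pair
+    #     (['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'Where', 'is', 'Mary', '?', '_'], ['bathroom']).
+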
+
+if __name__ == "__main__":
+
+    """Unit test that generates a batch and displays a sample."""
+
+    babi_tasks = list(range(1, 21))
+
+    params = {'directory': '/', 'tasks': babi_tasks, 'data_type': 'train', 'batch_size': 10,
+              'embedding_type': 'glove.6B.100d', 'embedding_size': 38, 'ten_thousand_examples': True,
+              'one_hot_embedding': True, 'truncation_length': 50}
+
+    babi = bAbIQASingleQuestion(params)
+    sample = babi[12]
+    print(sample)
+    print('__getitem__ works.')
+
+    # Wrap a DataLoader on top of this Dataset subclass.
+    from torch.utils.data.dataloader import DataLoader
+
+    batch_size = 1
+    dataloader = DataLoader(dataset=babi, collate_fn=babi.collate_babi,
+                            batch_size=batch_size, shuffle=True, num_workers=0)
+
+    # Try to see if there is a speed up when generating batches with multiple workers.
+    import time
+    s = time.time()
+    # for i, batch in enumerate(dataloader):
+    #     print('Batch # {} - {}'.format(i, type(batch)))
+    # print('Number of workers: {}'.format(dataloader.num_workers))
+    # print('time taken to exhaust the dataset for a batch size of {}: {}s'.format(batch_size, time.time() - s))
+
+    batch = next(iter(dataloader))
+    print(batch)
+
+    print('Unit test completed')
+    exit()
diff --git a/miprometheus/problems/seq_to_seq/seq_to_seq_problem.py b/miprometheus/problems/seq_to_seq/seq_to_seq_problem.py
index 953359fd..25c7d734 100644
--- a/miprometheus/problems/seq_to_seq/seq_to_seq_problem.py
+++ b/miprometheus/problems/seq_to_seq/seq_to_seq_problem.py
@@ -20,12 +20,14 @@
 
 """
-__author__ = "Tomasz Kornuta & Vincent Marois"
+__author__ = "Tomasz Kornuta & Vincent Marois & Vincent Albouy"
 
 from miprometheus.problems.problem import Problem
 import torch
 
+
+
 class SeqToSeqProblem(Problem):
     """
     Class representing base class for all sequential problems.
@@ -75,6 +77,9 @@ def evaluate_loss(self, data_dict, logits):
 
         return loss
 
+
+
+
 if __name__ == '__main__':
 
     from miprometheus.utils.param_interface import ParamInterface
diff --git a/miprometheus/problems/seq_to_seq/text2text/text_to_text_problem.py b/miprometheus/problems/seq_to_seq/text2text/text_to_text_problem.py
index 555bd333..0b0542df 100644
--- a/miprometheus/problems/seq_to_seq/text2text/text_to_text_problem.py
+++ b/miprometheus/problems/seq_to_seq/text2text/text_to_text_problem.py
@@ -51,6 +51,7 @@
 import torch
 import torch.nn as nn
 from miprometheus.problems.seq_to_seq.seq_to_seq_problem import SeqToSeqProblem
+from miprometheus.utils.app_state import AppState
 
 # global tokens
 PAD_token = 0
@@ -326,6 +327,85 @@ def tensors_from_pairs(self, pairs, input_lang, output_lang):
 
         """
         return [self.tensors_from_pair(pair, input_lang, output_lang) for pair in pairs]
+
+    def to_dictionary_indexes(self, dictionary, sentence):
+        """
+        Outputs the dictionary indexes corresponding to the words in the sequence.
+        Case insensitive.
+        """
+        idxs = torch.tensor([dictionary[w.lower()] for w in sentence]).type(AppState().LongTensor)
+        return idxs
+
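+    # Illustration (hypothetical tiny dictionary): with
+    #     dictionary = {'.': 0, '?': 1, '_': 2, 'bathroom': 3}
+    # to_dictionary_indexes(dictionary, ['Bathroom']) returns tensor([3]) - every word is
+    # lower-cased before the lookup.
+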
+    def indices_to_words(self, int_sentence):
+        """
+        Maps a tensor of dictionary indexes back to the corresponding words (uses self.itos_dict,
+        which is expected to be defined by the derived problem).
+        """
+        sentences = []
+        for ind in int_sentence[0, :]:
+            sentences.append(self.itos_dict[ind])
+        return sentences
+
+    def embed_sentence_one_hot(self, sentence):
+        """
+        Embeds an entire sentence with one-hot vectors built from the problem's dictionary.
+
+        :param sentence: A string containing the words to embed.
+        :returns: FloatTensor of embedded vectors [sentence_length, dictionary size].
+        """
+        size_hot = len(self.dictionaries)
+        outsentence = torch.zeros((len(sentence.split(" ")), size_hot))
+
+        # Embed one word at a time.
+        for i, word in enumerate(sentence.split(" ")):
+            if not word.lower() == self.pad_token:
+                index = self.dictionaries[word.lower()]
+                outsentence[i, index] = 1
+
+        return outsentence
+
+    # TODO: consider renaming this method to embed_sentence.
+    def embed_batch(self, minibatch):
+        """
+        Embeds a tokenized story (list of words), either with one-hot vectors or with the
+        pretrained embeddings of the Language object, depending on self.one_hot_embedding.
+        """
+        ex = minibatch
+        sentence = " ".join(ex)
+
+        if self.one_hot_embedding:
+            sent_embed = self.embed_sentence_one_hot(sentence)
+        else:
+            sent_embed = self.language.embed_sentence(sentence)
+
+        return sent_embed
+
+    def tokenize(self, sentence):
+        """Splits a sentence (string) into a list of words."""
+        return sentence.split(' ')
+
+    def detokenize_story(self, minibatch):
+        """Converts a story from lists of tokens back to strings (list to string)."""
+        a = []
+        for ex in minibatch:
+            b = []
+            for sentence in ex:
+                b.append(" ".join(sentence))
+            a.append(b)
+        return a
+
+    def tokenize_story(self, minibatch):
+        """Converts a story from strings to lists of tokens (string to list)."""
+        a = []
+        for ex in minibatch:
+            b = []
+            for sentence in ex:
+                b.append(self.tokenize(sentence))
+            a.append(b)
+        return a
+
 
 class Lang(object):
     """
@@ -395,3 +475,5 @@ def add_word(self, word):
         else:
             # this word has been seen before, simply update its occurrence
             self.word2count[word] += 1
+
+