# generator.py

#!/usr/bin/env python
"""

Neural SPARQL Machines - Generator module

'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017
https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html
https://arxiv.org/abs/1708.07624

Version 0.1.0-akaha

"""
import argparse
import collections
import datetime
import json
import logging
import operator
import os
import random
import re
import sys
import traceback

from generator_utils import log_statistics, save_cache, query_dbpedia, strip_brackets, encode, read_template_file , do_replacements 


# Classes describing well-known people. Not referenced by the code visible in
# this module; kept because other modules may import it.
CELEBRITY_LIST = [
    'dbo:Royalty',
    '<http://dbpedia.org/class/yago/Wikicat21st-centuryActors>',
    '<http://dbpedia.org/class/yago/WikicatEnglishMusicians>',
    '<http://dbpedia.org/class/yago/Wikicat20th-centuryNovelists>',
    '<http://dbpedia.org/class/yago/Honoree110183757>',
]

# Maps a target class to the classes that replace it in the generator query:
# prepare_generator_query() puts the listed classes into a VALUES clause when
# the normalized target class is a key of this dict. Every entry must be a
# valid SPARQL term (prefixed name or <IRI>) because it is inserted verbatim.
SPECIAL_CLASSES = {
    'dbo:Person': [
        '<http://dbpedia.org/class/yago/Wikicat21st-centuryActors>',
        '<http://dbpedia.org/class/yago/WikicatEnglishMusicians>',
        '<http://dbpedia.org/class/yago/Wikicat20th-centuryNovelists>',
        '<http://dbpedia.org/class/yago/Honoree110183757>',
        'dbo:LacrossePlayer',
    ],
    'dbo:Athlete': ['dbo:LacrossePlayer'],
    # was 'dboBasketballTeam' (missing ':'), which is not a valid prefixed name
    'dbo:SportsTeam': ['dbo:BasketballTeam'],
}

# Maximum number of bindings kept per template (see extract_bindings); a third
# of it is the threshold for "enough results" in get_results_of_generator_query.
EXAMPLES_PER_TEMPLATE = 600

def extract_bindings( data, template ):
    """Select up to EXAMPLES_PER_TEMPLATE query matches and turn them into bindings.

    Args:
        data: iterable of matches; each match is indexed by variable name and
            by ``"l" + variable``, and each of those entries has a ``"value"``.
        template: object exposing ``id`` and ``variables`` attributes.

    Returns:
        ``None`` when ``data`` is empty, otherwise a list with one dict per
        selected match mapping each variable to ``{'uri': ..., 'label': ...}``.

    Side effects:
        Shuffles the matches with ``random.shuffle`` and counts every selected
        resource in the module-level ``used_resources`` counter.
    """
    matches = list(data)
    random.shuffle(matches)
    logging.debug('{} matches for {}'.format(len(matches), getattr(template, 'id')))

    if not matches:
        return None

    if len(matches) <= EXAMPLES_PER_TEMPLATE:
        best_matches = matches
    else:
        # Too many candidates: rank them by how often their resources were
        # already used. The previous `sorted(matches, template)` call raised a
        # TypeError on Python 3. The ranking is materialised with list() so
        # that slicing works even if sort_matches returns a lazy iterator.
        best_matches = list(sort_matches(matches, template))[:EXAMPLES_PER_TEMPLATE]

    variables = getattr(template, 'variables')
    bindings = []

    for match in best_matches:
        binding = {}
        for variable in variables:
            resource = match[variable]["value"]
            # the label is bound under the variable name prefixed with "l"
            label = match["l" + variable]["value"]
            binding[variable] = {'uri': resource, 'label': label}
            used_resources.update([resource])
        bindings.append(binding)

    return bindings


def sort_matches( matches, template ):
    """Return ``matches`` as a list ordered by the usage priority of their resources.

    For every match the current usage count of each template variable's
    resource is looked up in the module-level ``used_resources`` counter; the
    counts are ranked by ``prioritize_usage`` (smaller rank first). The sort
    is stable, so matches with equal rank keep their incoming order.

    Args:
        matches: iterable of matches indexed by variable name, each entry
            having a ``"value"``.
        template: object exposing a ``variables`` attribute.

    Returns:
        A real list (not a lazy ``map`` object), so callers can slice it and
        ``prioritize_usage`` can take ``len()`` of the usage counts.
    """
    variables = getattr(template, 'variables')

    matches_with_usages = [
        {
            'usages': [used_resources[match[variable]["value"]] for variable in variables],
            'match': match,
        }
        for match in matches
    ]
    matches_with_usages.sort(key=prioritize_usage)

    return [entry['match'] for entry in matches_with_usages]

def prioritize_usage ( match ):
    """Return the sort rank for one match, dispatching on the number of usage counts.

    Args:
        match: dict whose ``'usages'`` entry holds the usage counts of the
            match's resources (one count per template variable).

    Returns:
        The value of ``prioritize_single_match`` for exactly one count,
        ``prioritize_couple_match`` for exactly two, and
        ``prioritize_triple_match`` for any other number of counts.
    """
    # Materialise first: on Python 3 the counts may arrive as a lazy `map`
    # iterator, which supports neither len() nor indexing.
    usages = list(match['usages'])
    if len(usages) == 1:
        return prioritize_single_match(usages[0])
    if len(usages) == 2:
        return prioritize_couple_match(usages)
    return prioritize_triple_match(usages)

def prioritize_single_match( usage ):
    """Rank a single usage count; a smaller result means a higher priority.

    Ranks by range of ``usage``:
        (10, 20] -> 0, (20, 30] -> 1, (0, 10] -> 2, (30, 50] -> 3, 0 -> 4.
    Any other value is returned unchanged.
    """
    # (exclusive lower bound, inclusive upper bound), listed from best to worst
    ranked_ranges = ((10, 20), (20, 30), (0, 10), (30, 50))
    for rank, (lower, upper) in enumerate(ranked_ranges):
        if lower < usage <= upper:
            return rank
    if usage == 0:
        return 4
    return usage


def prioritize_couple_match( usages ):
    """Rank a pair of usage counts; a smaller result means a higher priority.

    A count is "in range" when it is strictly between 0 and 30.
    Returns 0 when both counts are in range, 1 when at least one is, 2 when
    both counts are zero, and otherwise the sum of the two counts.
    Raises ValueError (from unpacking) unless exactly two counts are given.
    """
    first, second = usages
    in_range = [0 < count < 30 for count in usages]

    if all(in_range):
        return 0
    if any(in_range):
        return 1
    if first == 0 and second == 0:
        return 2
    return sum(usages)

def prioritize_triple_match( usages ):
    """Rank three (or any other number of) usage counts; smaller means higher priority.

    A count is "in range" when it is strictly between 0 and 30.

    Returns:
        0 when every count is in range (also for an empty sequence),
        1 when at least two counts are in range,
        2 when at least one count is in range,
        otherwise the sum of all counts.
    """
    usages = list(usages)
    # Count the in-range values. The previous `filter(...) >= 2` compared a
    # filter object with an int and raised TypeError on Python 3.
    in_range_count = sum(1 for value in usages if 0 < value < 30)

    if in_range_count == len(usages):
        return 0
    if in_range_count >= 2:
        return 1
    if in_range_count >= 1:
        return 2

    return sum(usages)


def build_dataset_pair(binding, template, flag):
    """Instantiate a template's question and query with one binding.

    Each variable's placeholder is the upper-cased variable name in angle
    brackets. In the question it is replaced by ``do_replacements(uri)``, in
    the query by the URI itself; the query is finally passed through
    ``encode``. Progress is traced with ``print``.

    Args:
        binding: mapping of variable name to a dict with a ``'uri'`` entry.
        template: object exposing ``question`` and ``query`` attributes.
        flag: currently unused. A disabled variant (kept in the original as a
            bare string literal) used ``binding[variable]['label']`` as the
            label when ``flag`` was set, and ``strip_brackets(label)`` in the
            question otherwise.

    Returns:
        ``{'english': <question>, 'sparql': <encoded query>}``.
    """
    print('\n build_dataset_pair(binding, template):')
    question = template.question
    query = template.query

    for variable in binding:
        uri = binding[variable]['uri']
        print(" uri = binding[variable]['uri']")
        print('\n building dataset, the URI: ', uri)

        label = do_replacements(uri)
        placeholder = '<' + variable.upper() + '>'

        if placeholder in question and label is not None:
            print('\n building dataset, the label: ', label)
            question = question.replace(placeholder, label)
            print('\n building dataset, the strip_brackets(label): ', strip_brackets(label))
            print('\n building dataset, the placeholder: ', placeholder)
            print('\n building dataset, the english: ', question)

        if placeholder in query and uri is not None:
            query = query.replace(placeholder, uri)

    return {'english': question, 'sparql': encode(query)}


def _restore_attribute_parentheses(sparql):
    """Turn attr_open/attr_close tokens inside the outermost bracket pair into parentheses.

    The text between the first ``brack_open`` and the last ``brack_close`` has
    ``' attr_open '`` replaced by ``'('`` and ``' attr_close'`` replaced by
    ``')'``; the text outside that pair is left alone. All whitespace runs are
    collapsed to single spaces.

    Raises:
        ValueError: if the query has no ``brack_open ... brack_close`` pair.
    """
    sparql = " ".join(sparql.split())
    head, opener, remainder = sparql.partition('brack_open')
    body, closer, tail = remainder.rpartition('brack_close')
    if not (opener and closer):
        raise ValueError('no brack_open ... brack_close pair in query: {}'.format(sparql))
    body = body.replace(' attr_open ', '(').replace(' attr_close', ')')
    return " ".join((head + ' brack_open ' + body + ' brack_close ' + tail).split())


def generate_dataset(templates, output_dir, file_mode, flag):
    """Write the parallel files ``data.en`` and ``data.sparql`` for all templates.

    For each template the generator query results are fetched
    (``get_results_of_generator_query``), reduced to bindings
    (``extract_bindings``) and expanded into question/query pairs
    (``build_dataset_pair``). Templates without bindings are counted in the
    module-level ``not_instanced_templates`` counter and skipped.

    Args:
        templates: iterable of template objects (``id``, ``question``, ... attributes).
        output_dir: directory for the two output files; created if missing.
        file_mode: mode passed to ``open`` for both files (the script uses
            ``'w'`` or ``'a'``).
        flag: forwarded unchanged to ``build_dataset_pair``.

    Raises:
        Exception: when processing a template fails; the failure is logged
            and chained as ``__cause__``.
    """
    cache = dict()
    os.makedirs(output_dir, exist_ok=True)

    with open(output_dir + '/data.en', file_mode) as english_questions, open(output_dir + '/data.sparql', file_mode) as sparql_queries:
        for it, template in enumerate(templates, start=1):
            print( "for {}th template".format(it) )
            try:
                results = get_results_of_generator_query(cache, template)
                print(it, ' ---> ', results, '\n' )
                bindings = extract_bindings(results["results"]["bindings"], template)
                print( bindings)
                if bindings is None:
                    id_or_question = getattr(template, 'id') or getattr(template, 'question')
                    logging.debug("no data for {}".format(id_or_question))
                    not_instanced_templates.update([id_or_question])
                    continue

                for binding in bindings:
                    dataset_pair = build_dataset_pair(binding, template, flag )
                    print( "\n dataset_pair---> ", dataset_pair)
                    if not dataset_pair:
                        continue
                    english = " ".join(dataset_pair['english'].split())
                    sparql = _restore_attribute_parentheses(dataset_pair['sparql'])
                    # Write only after both sides are ready, so a failure in
                    # the SPARQL post-processing cannot leave the two files
                    # with a different number of lines.
                    english_questions.write("{}\n".format(english))
                    sparql_queries.write("{}\n".format(sparql))
            except Exception as error:
                exception = traceback.format_exc()
                logging.error('template {} caused exception {}'.format(getattr(template, 'id'), exception))
                logging.info('1. fix problem\n2. remove templates until the exception template in the template file\n3. restart with `--continue` parameter')
                raise Exception('dataset generation failed for template {}'.format(getattr(template, 'id'))) from error


def get_results_of_generator_query( cache, template ):
    """Fetch generator-query results for a template, relaxing the query if needed.

    Up to three query variants are built with ``prepare_generator_query``:
    the default one, one without special-class replacement, and one without
    type requirements. A variant already present in ``cache`` is answered
    from there. Otherwise it is sent through ``query_dbpedia``; a result with
    at least ``EXAMPLES_PER_TEMPLATE/3`` bindings is cached and returned.
    If no variant yields that many, the result of the last one is returned
    (and not cached).

    Args:
        cache: dict mapping query strings to results; updated in place.
        template: template object handed to ``prepare_generator_query``.
    """
    attempt_options = (
        {},
        {'do_special_class_replacement': False},
        {'add_type_requirements': False},
    )

    attempt = 0
    for options in attempt_options:
        attempt += 1
        query = prepare_generator_query(template, **options)
        if query in cache:
            return cache[query]

        logging.debug('{}. attempt generator_query: {}'.format(attempt, query))
        results = query_dbpedia(query)
        if len(results["results"]["bindings"]) >= EXAMPLES_PER_TEMPLATE/3:
            cache[query] = results
            return results

    return results

# str.format templates substituted for the " where { " opening of a generator
# query (see add_requirement / prepare_generator_query). Doubled braces are
# literal braces after formatting, so each result again contains " where { ".

# Projects the label of ?{variable} as ?l{variable} and restricts it to English.
LABEL_REPLACEMENT = (
    "  (str(?lab{variable}) as ?l{variable})"
    " where {{ ?{variable} rdfs:label ?lab{variable} ."
    " FILTER(lang(?lab{variable}) = 'en') . "
)

# Requires ?{variable} to be an instance of {ontology_class}.
CLASS_REPLACEMENT = (
    " where {{"
    " ?{variable} a {ontology_class} . "
)

# Requires ?{variable} to be an instance of one of the {classes} VALUES rows.
CLASSES_REPLACEMENT = (
    " where {{"
    " ?{variable} a ?t ."
    " VALUES (?t) {{ {classes} }} . "
)

# Requires ?{variable} to be a subclass of {ontology_class}.
SUBCLASS_REPLACEMENT = (
    " where {{"
    " ?{variable} rdfs:subClassOf {ontology_class} . "
)


def variable_is_subclass ( query, variable ):
    """Tell whether ``?variable`` appears as the object of ``a`` / ``rdf:type`` in ``query``.

    The check is a regular-expression search for whitespace, ``rdf:type`` or
    ``a``, whitespace, and then ``?`` followed by ``variable``. The variable
    name is inserted into the pattern as-is.
    """
    type_object_pattern = re.compile(r'\s+?(rdf:type|a)\s+?\?' + variable)
    return type_object_pattern.search(query) is not None


def add_requirement( query, where_replacement ):
    """Return ``query`` with every literal ``" where { "`` replaced by ``where_replacement``.

    The match is case-sensitive and includes the surrounding spaces; a query
    without that exact text is returned unchanged.
    """
    where_opening = " where { "
    rewritten_query = query.replace(where_opening, where_replacement)
    return rewritten_query


def prepare_generator_query( template, add_type_requirements=True, do_special_class_replacement=True):
    """Extend a template's generator query with label and type requirements.

    For each variable (in order) the label requirement is added; then, if the
    variable has a non-empty target class at the same index and
    ``add_type_requirements`` is set, one type requirement is added:
      * a subclass requirement when ``variable_is_subclass`` matches,
      * a VALUES list of ``SPECIAL_CLASSES`` entries when the normalized
        class is a key there and ``do_special_class_replacement`` is set,
      * otherwise a plain class requirement.

    Args:
        template: object exposing ``generator_query``, ``target_classes`` and
            ``variables`` attributes.
        add_type_requirements: when false, only label requirements are added.
        do_special_class_replacement: when false, ``SPECIAL_CLASSES`` is ignored.

    Returns:
        The extended query string.
    """
    query = template.generator_query
    target_classes = template.target_classes

    for position, variable in enumerate(template.variables):
        query = add_requirement(query, LABEL_REPLACEMENT.format(variable=variable))

        has_type = len(target_classes) > position and target_classes[position]
        if not (has_type and add_type_requirements):
            continue

        target_class = normalize(target_classes[position])
        if variable_is_subclass(query, variable):
            requirement = SUBCLASS_REPLACEMENT.format(variable=variable, ontology_class=target_class)
        elif target_class in SPECIAL_CLASSES and do_special_class_replacement:
            # one parenthesised VALUES row per substitute class
            classes = ' '.join('({})'.format(special) for special in SPECIAL_CLASSES[target_class])
            requirement = CLASSES_REPLACEMENT.format(variable=variable, classes=classes)
        else:
            requirement = CLASS_REPLACEMENT.format(variable=variable, ontology_class=target_class)
        query = add_requirement(query, requirement)

    return query


def normalize (ontology_class):
    """Return ``ontology_class`` in the form used inside SPARQL queries.

    * Starts with the DBpedia ontology namespace: every occurrence of the
      namespace is replaced by the ``dbo:`` prefix.
    * Starts with ``http`` otherwise: wrapped in angle brackets.
    * Anything else is returned unchanged.
    """
    dbo_namespace = 'http://dbpedia.org/ontology/'

    if ontology_class.startswith(dbo_namespace):
        return ontology_class.replace(dbo_namespace, 'dbo:')
    if ontology_class.startswith('http'):
        return '<' + ontology_class + '>'
    return ontology_class


def parse_bool_argument(value):
    """Convert a command-line token into a bool.

    ``argparse`` with ``type=bool`` treats every non-empty string, including
    ``'False'``, as true; this parser maps the usual spellings explicitly.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--continue', dest='continue_generation', action='store_true', help='Continue after exception')
    # Accepts `--transformer`, `--transformer True` and `--transformer False`.
    parser.add_argument('--transformer', type=parse_bool_argument, nargs='?', const=True, default=False)
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('--templates', dest='templates', metavar='templateFile', help='templates', required=True)
    requiredNamed.add_argument('--output', dest='output', metavar='outputDirectory', help='dataset directory', required=True)
    args = parser.parse_args()
    flag = args.transformer

    template_file = args.templates
    output_dir = args.output
    use_resources_dump = args.continue_generation

    # The log file is created inside the output directory, so the directory
    # has to exist before logging is configured.
    os.makedirs(output_dir, exist_ok=True)

    time = datetime.datetime.today()
    logging.basicConfig(filename='{}/generator_{:%Y-%m-%d-%H-%M}.log'.format(output_dir, time), level=logging.DEBUG)
    resource_dump_file = output_dir + '/resource_dump.json'
    resource_dump_exists = os.path.exists(resource_dump_file)

    # A resource dump is only written when a previous run failed (see the
    # except branch below); refuse to overwrite its state unless --continue.
    if resource_dump_exists and not use_resources_dump:
        warning_message = 'Warning: The file {} exists which indicates an error. Remove file or continue generation after fixing with --continue'.format(
            resource_dump_file)
        logging.warning(warning_message)
        # prints the message to stderr and exits with status 1
        sys.exit(warning_message)

    # Module-level counters read and updated by extract_bindings,
    # sort_matches and generate_dataset.
    not_instanced_templates = collections.Counter()
    used_resources = collections.Counter()
    if use_resources_dump:
        if resource_dump_exists:
            with open(resource_dump_file) as resource_dump:
                used_resources = collections.Counter(json.load(resource_dump))
        else:
            logging.warning('--continue given but {} does not exist; starting with empty usage counts'.format(resource_dump_file))

    # append to the existing dataset files when continuing a previous run
    file_mode = 'a' if use_resources_dump else 'w'
    templates = read_template_file(template_file)
    print(len(templates))

    generation_failed = False
    try:
        generate_dataset(templates, output_dir, file_mode, flag )
    except (Exception, KeyboardInterrupt):
        # Keep the usage counts so the run can be resumed with --continue.
        generation_failed = True
        logging.exception('dataset generation aborted')
        print('exception occurred, look for error in log file', file=sys.stderr)
        save_cache(resource_dump_file, used_resources)
    else:
        save_cache('{}/used_resources_{:%Y-%m-%d-%H-%M}.json'.format(output_dir, time), used_resources)
    finally:
        log_statistics(used_resources, SPECIAL_CLASSES, not_instanced_templates)

    if generation_failed:
        sys.exit(1)