# tests/test_word_generator.py

# -*- coding: utf-8 -*-
import sys
from os.path import dirname, abspath
sys.path.append(dirname(dirname(abspath(__file__))))
from nose.tools import raises
from torchmoji.word_generator import WordGenerator

# True when the tests run under a Python 2 interpreter. The major version is
# read from the numeric sys.version_info tuple instead of the first character
# of the human-readable sys.version string, which would misreport any major
# version with more than one digit.
IS_PYTHON2 = sys.version_info[0] == 2

@raises(ValueError)
def test_only_unicode_accepted():
    """ Feeding a non-Unicode string to WordGenerator raises a ValueError.

        Only meaningful on Python 2. On Python 3 every string literal is
        already Unicode, so the ValueError is raised by hand to satisfy the
        decorator.
    """
    if IS_PYTHON2:
        mixed_sentences = [
            u'Hello world',
            u'I am unicode',
            'I am not unicode',
        ]
        # Build the generator and drain it; the ValueError is expected
        # somewhere along the way.
        for _ in WordGenerator(mixed_sentences):
            pass
    else:
        raise ValueError("You are using python 3 so this test should always pass")


def test_unicode_sentences_ignored_if_set():
    """ A sentence with Unicode characters yields no words when allow_unicode_text is False.
    """
    czech_text = u'Dobrý den, jak se máš?'
    generator = WordGenerator([czech_text], allow_unicode_text=False)
    tokens = generator.get_words(czech_text)
    assert tokens == []


def test_check_ascii():
    """ check_ascii accepts a plain ASCII word and rejects non-ASCII input.
        Only exercised on Python 2; on Python 3 the test does nothing.
    """
    if IS_PYTHON2:
        checker = WordGenerator([])
        assert checker.check_ascii('ASCII')
        for non_ascii in ('ščřžýá', '❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢'):
            assert not checker.check_ascii(non_ascii)


def test_convert_unicode_word():
    """ convert_unicode_word returns (True, word) for a Unicode word when Unicode text is allowed.
    """
    generator = WordGenerator([], allow_unicode_text=True)
    expected = (True, u'\u010d')

    result = generator.convert_unicode_word(u'č')
    # On failure, show the value that was actually returned.
    assert result == expected, '{}'.format(result)


def test_convert_unicode_word_ignores_if_set():
    """ convert_unicode_word returns (False, '') for a Unicode word when Unicode text is not allowed.
    """
    generator = WordGenerator([], allow_unicode_text=False)
    expected = (False, '')

    result = generator.convert_unicode_word(u'č')
    # On failure, show the value that was actually returned.
    assert result == expected, '{}'.format(result)


def test_convert_unicode_chars():
    """ convert_unicode_word keeps every accented character of a word intact.
    """
    generator = WordGenerator([], allow_unicode_text=True)
    # Escaped code points for the accented characters passed in below.
    expected = (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9')

    result = generator.convert_unicode_word(u'ěščřžýáíé')
    assert result == expected, '{}'.format(result)