Skip to content

Commit

Permalink
Add poetry and linters (#1)
Browse files Browse the repository at this point in the history
* Add poetry and linters

* Style

* Fix pip version

* Fix travis

* Add numerical range support

* Fix commas
  • Loading branch information
chomechome authored Nov 26, 2019
1 parent d6e5f0e commit c6fd216
Show file tree
Hide file tree
Showing 82 changed files with 423 additions and 600 deletions.
16 changes: 8 additions & 8 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,21 @@ python:
- 3.6

install:
- "pip install pipenv --upgrade"
- "pipenv install --dev"
- "pip install pip==18.1"
- "pip install poetry==0.12.10"
- "poetry install --no-interaction"

jobs:
include:
- stage: test
- stage: codestyle
python: 3.6
script:
- "pipenv run pytest -n 8 --boxed"
- stage: codestyle
- "make lint"
- stage: test
python: 3.6
script:
- "pipenv run flake8"
- "make test"
- stage: coverage
python: 3.6
script:
- "pipenv run pytest --cov=maru"
- "pipenv run codecov"
- "make coverage"
22 changes: 22 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Source trees that the formatters, linters and type checker operate on.
CODE = maru tests
# Prefix used to run every tool through Poetry.
PYTHON = poetry run

# None of these targets produces a file named after itself. Declaring them
# phony keeps make from skipping one if a file or directory with the same
# name ever appears (`coverage` was previously missing from this list).
.PHONY: pretty lint test coverage

# Reformat the code in place: black, then isort, then unify.
pretty:
	$(PYTHON) black --target-version py36 --skip-string-normalization $(CODE)
	$(PYTHON) isort --apply --recursive $(CODE)
	$(PYTHON) unify --in-place --recursive $(CODE)

# Read-only checks: black in --check mode, flake8, pylint and mypy.
# make stops at the first command that exits non-zero.
lint:
	$(PYTHON) black --target-version py36 --check --skip-string-normalization $(CODE)
	$(PYTHON) flake8 --jobs 4 --statistics $(CODE)
	$(PYTHON) pylint --jobs 4 --rcfile=setup.cfg $(CODE)
	$(PYTHON) mypy $(CODE)

# Run the test suite on 8 workers (-n 8) with --boxed.
test:
	$(PYTHON) pytest -n 8 --boxed tests

# Run pytest with coverage measured for the maru package, then run codecov.
coverage:
	$(PYTHON) pytest --cov=maru
	$(PYTHON) codecov
19 changes: 0 additions & 19 deletions Pipfile

This file was deleted.

20 changes: 19 additions & 1 deletion maru/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
from maru.factory import get_analyzer # noqa: F401
from maru.factory import get_analyzer
from maru.grammeme import (
Animacy,
Aspect,
Case,
Degree,
Gender,
Mood,
Number,
NumericalForm,
PartOfSpeech,
Person,
Tense,
Variant,
VerbForm,
Voice,
)
from maru.morph import Morph
from maru.tag import Tag

__version__ = '0.1.1'
11 changes: 5 additions & 6 deletions maru/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Sequence, Iterable, Dict
from typing import Dict, Iterable, Sequence

from maru.grammeme import PartOfSpeech
from maru.lemmatizer import ILemmatizer
Expand All @@ -11,18 +11,17 @@


class Analyzer:
def __init__(self,
taggers: Sequence[ITagger],
lemmatizer: ILemmatizer,
):
def __init__(
self, taggers: Sequence[ITagger], lemmatizer: ILemmatizer,
):
self._taggers = taggers
self._lemmatizer = lemmatizer

def analyze(self, text: Text) -> Iterable[Morph]:
tags: Dict[Index, Tag] = {}

length = len(text)
indices = range(length)
indices: Sequence[int] = range(length)
for tagger in self._taggers:
tags.update(tagger.tag(text, indices))
indices = [index for index in indices if index not in tags]
Expand Down
6 changes: 3 additions & 3 deletions maru/factory/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .analyzer import get_analyzer # noqa: F401
from .lemmatizer import get_lemmatizer # noqa: F401
from .tagger import get_tagger # noqa: F401
from .analyzer import get_analyzer
from .lemmatizer import get_lemmatizer
from .tagger import get_tagger
7 changes: 3 additions & 4 deletions maru/factory/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
from maru.tagger import NumericalTagger, PunctuationTagger


def get_analyzer(tagger: str = 'linear',
lemmatizer: str = 'pymorphy',
cache_size: int = 15000,
):
def get_analyzer(
tagger: str = 'linear', lemmatizer: str = 'pymorphy', cache_size: int = 15000,
):
return Analyzer(
taggers=[
PunctuationTagger(),
Expand Down
12 changes: 7 additions & 5 deletions maru/factory/lemmatizer.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from typing import Iterable

from maru.lemmatizer import (
ILemmatizer,
Cache,
DummyLemmatizer,
ILemmatizer,
PymorphyLemmatizer,
Cache,
)


def get_lemmatizer(name: str, cache_size: int) -> ILemmatizer:
lemmatizers = {
'dummy': lambda: DummyLemmatizer(),
'dummy': DummyLemmatizer,
'pymorphy': lambda: Cache(PymorphyLemmatizer(), size=cache_size),
}
if name not in lemmatizers:
Expand All @@ -21,5 +21,7 @@ def get_lemmatizer(name: str, cache_size: int) -> ILemmatizer:

class InvalidLemmatizerError(Exception):
    """Error whose message names an invalid lemmatizer and lists the valid ones."""

    def __init__(self, lemmatizer: str, expected: Iterable[str]):
        """Build the error message.

        Args:
            lemmatizer: The lemmatizer name that was rejected.
            expected: The valid lemmatizer names; they are joined with ", ".
        """
        valid_names = ', '.join(expected)
        message = (
            f'Invalid lemmatizer {lemmatizer}. '
            f'Valid lemmatizers are: {valid_names}.'
        )
        super().__init__(message)
11 changes: 6 additions & 5 deletions maru/factory/tagger.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
from typing import Iterable

from maru.tagger import (
CRFTagger,
ITagger,
PymorphyTagger,
LinearTagger,
CRFTagger,
PymorphyTagger,
RNNTagger,
)


def get_tagger(name: str, cache_size: int) -> ITagger:
taggers = {
'pymorphy': lambda: PymorphyTagger(),
'pymorphy': PymorphyTagger,
'linear': lambda: LinearTagger(cache_size=cache_size),
'crf': lambda: CRFTagger(cache_size=cache_size),
'rnn': lambda: RNNTagger(cache_size=cache_size),
Expand All @@ -24,5 +24,6 @@ def get_tagger(name: str, cache_size: int) -> ITagger:

class InvalidTaggerError(Exception):
    """Error whose message names an invalid tagger and lists the valid ones."""

    def __init__(self, tagger: str, expected: Iterable[str]):
        """Build the error message.

        Args:
            tagger: The tagger name that was rejected.
            expected: The valid tagger names; they are joined with ", ".
        """
        # One explicit f-string instead of two adjacent literals implicitly
        # concatenated on the same line; the resulting text is unchanged.
        valid_names = ', '.join(expected)
        super().__init__(
            f'Invalid tagger {tagger}. Valid taggers are: {valid_names}.'
        )
2 changes: 1 addition & 1 deletion maru/feature/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from . import extractor # noqa: F401
from . import extractor
14 changes: 7 additions & 7 deletions maru/feature/extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .abstract import IFeatureExtractor # noqa: F401
from .cache import Cache # noqa: F401
from .pymorphy import PymorphyExtractor # noqa: F401
from .pipeline import Pipeline # noqa: F401
from .stem import StemExtractor # noqa: F401
from .suffix import SuffixExtractor # noqa: F401
from .word import WordExtractor # noqa: F401
from .abstract import IFeatureExtractor
from .cache import Cache
from .pipeline import Pipeline
from .pymorphy import PymorphyExtractor
from .stem import StemExtractor
from .suffix import SuffixExtractor
from .word import WordExtractor
2 changes: 1 addition & 1 deletion maru/feature/extractor/abstract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import abc

from maru.types import Word, FeatureVector
from maru.types import FeatureVector, Word


class IFeatureExtractor(metaclass=abc.ABCMeta):
Expand Down
5 changes: 3 additions & 2 deletions maru/feature/extractor/cache.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import lru
from typing import Tuple

import lru

from maru.feature.extractor.abstract import IFeatureExtractor
from maru.types import Word, FeatureVector
from maru.types import FeatureVector, Word


class Cache(IFeatureExtractor):
Expand Down
2 changes: 1 addition & 1 deletion maru/feature/extractor/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Sequence

from maru.feature.extractor.abstract import IFeatureExtractor
from maru.types import Word, FeatureVector
from maru.types import FeatureVector, Word


class Pipeline(IFeatureExtractor):
Expand Down
4 changes: 2 additions & 2 deletions maru/feature/extractor/pymorphy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from maru import pymorphy
from maru.feature.extractor.abstract import IFeatureExtractor
from maru.types import Word, FeatureVector
from maru.types import FeatureVector, Word


class PymorphyExtractor(IFeatureExtractor):
Expand All @@ -12,7 +12,7 @@ def __init__(self, hypotheses: int = 3):
def extract(self, word: Word) -> FeatureVector:
seen: Set[str] = set()

parses = pymorphy.analyze(word)[:self._hypotheses]
parses = pymorphy.analyze(word)[: self._hypotheses]
for index, parse in enumerate(parses):
new = [gram for gram in parse.tag.grammemes if gram not in seen]
if new:
Expand Down
2 changes: 1 addition & 1 deletion maru/feature/extractor/stem.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Sequence

from maru.feature.extractor import IFeatureExtractor
from maru.types import Word, FeatureVector
from maru.types import FeatureVector, Word


class StemExtractor(IFeatureExtractor):
Expand Down
2 changes: 1 addition & 1 deletion maru/feature/extractor/suffix.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Sequence

from maru.feature.extractor.abstract import IFeatureExtractor
from maru.types import Word, FeatureVector
from maru.types import FeatureVector, Word


class SuffixExtractor(IFeatureExtractor):
Expand Down
2 changes: 1 addition & 1 deletion maru/feature/extractor/word.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from maru.feature.extractor.abstract import IFeatureExtractor
from maru.types import Word, FeatureVector
from maru.types import FeatureVector, Word


class WordExtractor(IFeatureExtractor):
Expand Down
36 changes: 19 additions & 17 deletions maru/feature/vocabulary.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import collections
from typing import Dict, Iterable
from typing import Counter, Dict, Iterable, List

from maru.feature.window import FeatureWindow
from maru.types import Index, FeatureName, FeatureVector
from maru.types import FeatureName, FeatureVector, Index, Offset


class FeatureVocabulary(Dict[FeatureName, Index]):
@classmethod
def train(cls, features: Iterable[FeatureVector], min_count: int = 2):
counts = collections.Counter()
counts: Counter[str] = collections.Counter()
for vector in features:
counts.update(name for name, _ in vector)

Expand All @@ -18,27 +18,29 @@ def train(cls, features: Iterable[FeatureVector], min_count: int = 2):
return cls(vocabulary)


class PositionalFeatureVocabulary(Dict[Index, FeatureVocabulary]):
class PositionalFeatureVocabulary(Dict[Offset, FeatureVocabulary]):
@classmethod
def train(cls, windows: Iterable[FeatureWindow], min_count: int = 2):
    """Train one FeatureVocabulary per window offset, with disjoint indices.

    Feature vectors are grouped by the offset they appear at inside each
    window. A FeatureVocabulary is trained for every offset, and its indices
    are shifted past the highest index assigned so far, so that vocabularies
    of different offsets do not share indices.

    Args:
        windows: Iterable of windows; each window yields
            (offset, feature vector) pairs.
        min_count: Passed through to FeatureVocabulary.train.

    Returns:
        An instance of cls mapping each offset to its shifted vocabulary.
    """
    features_by_offset: Dict[Offset, List[FeatureVector]] = collections.defaultdict(
        list
    )
    for window in windows:
        for offset, feature_vector in window:
            features_by_offset[offset].append(feature_vector)

    vocabulary_by_offset: Dict[Offset, FeatureVocabulary] = {}
    # First index not yet used by any vocabulary trained so far. Tracking it
    # as a running value avoids calling max() over the (initially empty)
    # collection of trained vocabularies, which raises ValueError.
    feature_count = 0
    for offset, feature_matrix in features_by_offset.items():
        vocabulary = FeatureVocabulary.train(feature_matrix, min_count)
        # Only values are reassigned, so iterating the dict here is safe.
        for feature in vocabulary:
            vocabulary[feature] += feature_count

        vocabulary_by_offset[offset] = vocabulary

        # An empty vocabulary has no maximum and consumes no indices.
        if vocabulary:
            feature_count = max(vocabulary.values()) + 1

    return cls(vocabulary_by_offset)

def get_feature_count(self):
max_index = max(max(vocab.values()) for vocab in self.values())
Expand Down
6 changes: 2 additions & 4 deletions maru/feature/window.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Iterable, Mapping

from maru.feature.extractor import IFeatureExtractor
from maru.types import Index, Indices, Offset, Text, FeatureWindow
from maru.types import FeatureWindow, Index, Indices, Offset, Text


class FeatureWindowGenerator:
Expand All @@ -16,8 +16,6 @@ def _get_window(self, text: Text, center: Index) -> FeatureWindow:
features = extractor.extract(text[index])
yield (offset, features)

def generate(self,
text: Text,
indices: Indices) -> Iterable[FeatureWindow]:
def generate(self, text: Text, indices: Indices) -> Iterable[FeatureWindow]:
for center in indices:
yield self._get_window(text, center)
Loading

0 comments on commit c6fd216

Please sign in to comment.