Add poetry and linters (#1)

* Add poetry and linters
* Style
* Fix pip version
* Fix travis
* Add numerical range support
* Fix commas
chomechome · Nov 26, 2019 · c6fd216 · c6fd216
1 parent d6e5f0e
commit c6fd216
Show file tree

Hide file tree

Showing 82 changed files with 423 additions and 600 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,21 +4,21 @@ python:
   - 3.6
 
 install:
-  - "pip install pipenv --upgrade"
-  - "pipenv install --dev"
+  - "pip install pip==18.1"
+  - "pip install poetry==0.12.10"
+  - "poetry install --no-interaction"
 
 jobs:
   include:
-    - stage: test
+    - stage: codestyle
       python: 3.6
       script:
-        - "pipenv run pytest -n 8 --boxed"
-    - stage: codestyle
+        - "make lint"
+    - stage: test
       python: 3.6
       script:
-        - "pipenv run flake8"
+        - "make test"
     - stage: coverage
       python: 3.6
       script:
-        - "pipenv run pytest --cov=maru"
-        - "pipenv run codecov"
+        - "make coverage"
diff --git a/Makefile b/Makefile
@@ -0,0 +1,28 @@
+# Every tool below is run through `poetry run`.
+CODE = maru tests
+PYTHON = poetry run
+
+# None of these targets produces a file of the same name, so all are phony.
+.PHONY: pretty lint test coverage
+
+# Rewrite the sources in place with black, isort and unify.
+pretty:
+	$(PYTHON) black --target-version py36 --skip-string-normalization $(CODE)
+	$(PYTHON) isort --apply --recursive $(CODE)
+	$(PYTHON) unify --in-place --recursive $(CODE)
+
+# Check only: black runs with --check, followed by flake8, pylint and mypy.
+lint:
+	$(PYTHON) black --target-version py36 --check --skip-string-normalization $(CODE)
+	$(PYTHON) flake8 --jobs 4 --statistics $(CODE)
+	$(PYTHON) pylint --jobs 4 --rcfile=setup.cfg $(CODE)
+	$(PYTHON) mypy $(CODE)
+
+# Run pytest on the tests directory.
+test:
+	$(PYTHON) pytest -n 8 --boxed tests
+
+# Run pytest with coverage for the maru package, then run codecov.
+coverage:
+	$(PYTHON) pytest --cov=maru
+	$(PYTHON) codecov
diff --git a/Pipfile b/Pipfile
diff --git a/maru/__init__.py b/maru/__init__.py
@@ -1,3 +1,21 @@
-from maru.factory import get_analyzer  # noqa: F401
+from maru.factory import get_analyzer
+from maru.grammeme import (
+    Animacy,
+    Aspect,
+    Case,
+    Degree,
+    Gender,
+    Mood,
+    Number,
+    NumericalForm,
+    PartOfSpeech,
+    Person,
+    Tense,
+    Variant,
+    VerbForm,
+    Voice,
+)
+from maru.morph import Morph
+from maru.tag import Tag
 
 __version__ = '0.1.1'
diff --git a/maru/analyzer.py b/maru/analyzer.py
@@ -1,4 +1,4 @@
-from typing import Sequence, Iterable, Dict
+from typing import Dict, Iterable, Sequence
 
 from maru.grammeme import PartOfSpeech
 from maru.lemmatizer import ILemmatizer
@@ -11,18 +11,17 @@
 
 
 class Analyzer:
-    def __init__(self,
-                 taggers: Sequence[ITagger],
-                 lemmatizer: ILemmatizer,
-                 ):
+    def __init__(
+        self, taggers: Sequence[ITagger], lemmatizer: ILemmatizer,
+    ):
         self._taggers = taggers
         self._lemmatizer = lemmatizer
 
     def analyze(self, text: Text) -> Iterable[Morph]:
         tags: Dict[Index, Tag] = {}
 
         length = len(text)
-        indices = range(length)
+        indices: Sequence[int] = range(length)
         for tagger in self._taggers:
             tags.update(tagger.tag(text, indices))
             indices = [index for index in indices if index not in tags]

diff --git a/maru/factory/__init__.py b/maru/factory/__init__.py
@@ -1,3 +1,3 @@
-from .analyzer import get_analyzer  # noqa: F401
-from .lemmatizer import get_lemmatizer  # noqa: F401
-from .tagger import get_tagger  # noqa: F401
+from .analyzer import get_analyzer
+from .lemmatizer import get_lemmatizer
+from .tagger import get_tagger
diff --git a/maru/factory/analyzer.py b/maru/factory/analyzer.py
@@ -4,10 +4,9 @@
 from maru.tagger import NumericalTagger, PunctuationTagger
 
 
-def get_analyzer(tagger: str = 'linear',
-                 lemmatizer: str = 'pymorphy',
-                 cache_size: int = 15000,
-                 ):
+def get_analyzer(
+    tagger: str = 'linear', lemmatizer: str = 'pymorphy', cache_size: int = 15000,
+):
     return Analyzer(
         taggers=[
             PunctuationTagger(),

diff --git a/maru/factory/lemmatizer.py b/maru/factory/lemmatizer.py
@@ -1,16 +1,16 @@
 from typing import Iterable
 
 from maru.lemmatizer import (
-    ILemmatizer,
+    Cache,
     DummyLemmatizer,
+    ILemmatizer,
     PymorphyLemmatizer,
-    Cache,
 )
 
 
 def get_lemmatizer(name: str, cache_size: int) -> ILemmatizer:
     lemmatizers = {
-        'dummy': lambda: DummyLemmatizer(),
+        'dummy': DummyLemmatizer,
         'pymorphy': lambda: Cache(PymorphyLemmatizer(), size=cache_size),
     }
     if name not in lemmatizers:
@@ -21,5 +21,10 @@ def get_lemmatizer(name: str, cache_size: int) -> ILemmatizer:
 
 class InvalidLemmatizerError(Exception):
+    """Error reporting an invalid lemmatizer name together with the valid ones."""
+
     def __init__(self, lemmatizer: str, expected: Iterable[str]):
-        super().__init__(f'Invalid lemmatizer {lemmatizer}. '
-                         f'Valid lemmatizers are: {", ".join(expected)}.')
+        valid_names = ', '.join(expected)
+        super().__init__(
+            f'Invalid lemmatizer {lemmatizer}. '
+            f'Valid lemmatizers are: {valid_names}.'
+        )
diff --git a/maru/factory/tagger.py b/maru/factory/tagger.py
@@ -1,17 +1,17 @@
 from typing import Iterable
 
 from maru.tagger import (
+    CRFTagger,
     ITagger,
-    PymorphyTagger,
     LinearTagger,
-    CRFTagger,
+    PymorphyTagger,
     RNNTagger,
 )
 
 
 def get_tagger(name: str, cache_size: int) -> ITagger:
     taggers = {
-        'pymorphy': lambda: PymorphyTagger(),
+        'pymorphy': PymorphyTagger,
         'linear': lambda: LinearTagger(cache_size=cache_size),
         'crf': lambda: CRFTagger(cache_size=cache_size),
         'rnn': lambda: RNNTagger(cache_size=cache_size),
@@ -24,5 +24,9 @@ def get_tagger(name: str, cache_size: int) -> ITagger:
 
 class InvalidTaggerError(Exception):
+    """Error reporting an invalid tagger name together with the valid ones."""
+
     def __init__(self, tagger: str, expected: Iterable[str]):
-        super().__init__(f'Invalid tagger {tagger}. '
-                         f'Valid taggers are: {", ".join(expected)}.')
+        valid_names = ', '.join(expected)
+        super().__init__(
+            f'Invalid tagger {tagger}. Valid taggers are: {valid_names}.'
+        )
diff --git a/maru/feature/__init__.py b/maru/feature/__init__.py
@@ -1 +1 @@
-from . import extractor  # noqa: F401
+from . import extractor
diff --git a/maru/feature/extractor/__init__.py b/maru/feature/extractor/__init__.py
@@ -1,7 +1,7 @@
-from .abstract import IFeatureExtractor  # noqa: F401
-from .cache import Cache  # noqa: F401
-from .pymorphy import PymorphyExtractor  # noqa: F401
-from .pipeline import Pipeline  # noqa: F401
-from .stem import StemExtractor  # noqa: F401
-from .suffix import SuffixExtractor  # noqa: F401
-from .word import WordExtractor  # noqa: F401
+from .abstract import IFeatureExtractor
+from .cache import Cache
+from .pipeline import Pipeline
+from .pymorphy import PymorphyExtractor
+from .stem import StemExtractor
+from .suffix import SuffixExtractor
+from .word import WordExtractor
diff --git a/maru/feature/extractor/abstract.py b/maru/feature/extractor/abstract.py
@@ -1,6 +1,6 @@
 import abc
 
-from maru.types import Word, FeatureVector
+from maru.types import FeatureVector, Word
 
 
 class IFeatureExtractor(metaclass=abc.ABCMeta):

diff --git a/maru/feature/extractor/cache.py b/maru/feature/extractor/cache.py
@@ -1,8 +1,9 @@
-import lru
 from typing import Tuple
 
+import lru
+
 from maru.feature.extractor.abstract import IFeatureExtractor
-from maru.types import Word, FeatureVector
+from maru.types import FeatureVector, Word
 
 
 class Cache(IFeatureExtractor):

diff --git a/maru/feature/extractor/pipeline.py b/maru/feature/extractor/pipeline.py
@@ -1,7 +1,7 @@
 from typing import Sequence
 
 from maru.feature.extractor.abstract import IFeatureExtractor
-from maru.types import Word, FeatureVector
+from maru.types import FeatureVector, Word
 
 
 class Pipeline(IFeatureExtractor):

diff --git a/maru/feature/extractor/pymorphy.py b/maru/feature/extractor/pymorphy.py
@@ -2,7 +2,7 @@
 
 from maru import pymorphy
 from maru.feature.extractor.abstract import IFeatureExtractor
-from maru.types import Word, FeatureVector
+from maru.types import FeatureVector, Word
 
 
 class PymorphyExtractor(IFeatureExtractor):
@@ -12,7 +12,7 @@ def __init__(self, hypotheses: int = 3):
     def extract(self, word: Word) -> FeatureVector:
         seen: Set[str] = set()
 
-        parses = pymorphy.analyze(word)[:self._hypotheses]
+        parses = pymorphy.analyze(word)[: self._hypotheses]
         for index, parse in enumerate(parses):
             new = [gram for gram in parse.tag.grammemes if gram not in seen]
             if new:

diff --git a/maru/feature/extractor/stem.py b/maru/feature/extractor/stem.py
@@ -1,7 +1,7 @@
 from typing import Sequence
 
 from maru.feature.extractor import IFeatureExtractor
-from maru.types import Word, FeatureVector
+from maru.types import FeatureVector, Word
 
 
 class StemExtractor(IFeatureExtractor):

diff --git a/maru/feature/extractor/suffix.py b/maru/feature/extractor/suffix.py
@@ -1,7 +1,7 @@
 from typing import Sequence
 
 from maru.feature.extractor.abstract import IFeatureExtractor
-from maru.types import Word, FeatureVector
+from maru.types import FeatureVector, Word
 
 
 class SuffixExtractor(IFeatureExtractor):

diff --git a/maru/feature/extractor/word.py b/maru/feature/extractor/word.py
@@ -1,5 +1,5 @@
 from maru.feature.extractor.abstract import IFeatureExtractor
-from maru.types import Word, FeatureVector
+from maru.types import FeatureVector, Word
 
 
 class WordExtractor(IFeatureExtractor):

diff --git a/maru/feature/vocabulary.py b/maru/feature/vocabulary.py
@@ -1,14 +1,14 @@
 import collections
-from typing import Dict, Iterable
+from typing import Counter, Dict, Iterable, List
 
 from maru.feature.window import FeatureWindow
-from maru.types import Index, FeatureName, FeatureVector
+from maru.types import FeatureName, FeatureVector, Index, Offset
 
 
 class FeatureVocabulary(Dict[FeatureName, Index]):
     @classmethod
     def train(cls, features: Iterable[FeatureVector], min_count: int = 2):
-        counts = collections.Counter()
+        counts: Counter[str] = collections.Counter()
         for vector in features:
             counts.update(name for name, _ in vector)
 
@@ -18,27 +18,43 @@ def train(cls, features: Iterable[FeatureVector], min_count: int = 2):
         return cls(vocabulary)
 
 
-class PositionalFeatureVocabulary(Dict[Index, FeatureVocabulary]):
+class PositionalFeatureVocabulary(Dict[Offset, FeatureVocabulary]):
     @classmethod
     def train(cls, windows: Iterable[FeatureWindow], min_count: int = 2):
-        vocabularies: Dict[Index, FeatureVocabulary] = {}
-
-        positions = collections.defaultdict(list)
+        """Train one vocabulary per window offset and shift their indices apart.
+
+        Feature vectors are grouped by their offset within the window and a
+        ``FeatureVocabulary`` is trained on each group.  Every vocabulary's
+        indices are then shifted by a running feature count, so that (assuming
+        each freshly trained vocabulary numbers its features from 0 -- TODO
+        confirm) vocabularies for different offsets do not share indices.
+
+        ``min_count`` is passed through to ``FeatureVocabulary.train``.
+        """
+        features_by_offset: Dict[Offset, List[FeatureVector]] = collections.defaultdict(
+            list
+        )
         for window in windows:
-            for position, features in window:
-                positions[position].append(features)
+            for offset, feature_vector in window:
+                features_by_offset[offset].append(feature_vector)
 
-        offset = 0
-        for position, features in positions.items():
-            vocabulary = FeatureVocabulary.train(features, min_count)
-            for feature in vocabulary.keys():
-                vocabulary[feature] += offset
+        vocabulary_by_offset: Dict[Offset, FeatureVocabulary] = {}
+        # Amount added to the indices of the next vocabulary.  Kept as a running
+        # counter so that it is defined (0) before any vocabulary has been built.
+        feature_count = 0
+        for offset, feature_matrix in features_by_offset.items():
+            vocabulary = FeatureVocabulary.train(feature_matrix, min_count)
+            for feature in vocabulary.keys():
+                vocabulary[feature] += feature_count
 
-            vocabularies[position] = vocabulary
+            vocabulary_by_offset[offset] = vocabulary
 
-            offset = max(vocabulary.values()) + 1
+            # An empty vocabulary takes no indices, and max() of no values
+            # would raise ValueError, so the counter is left untouched.
+            if vocabulary:
+                feature_count = max(vocabulary.values()) + 1
 
-        return cls(vocabularies)
+        return cls(vocabulary_by_offset)
 
     def get_feature_count(self):
         max_index = max(max(vocab.values()) for vocab in self.values())

diff --git a/maru/feature/window.py b/maru/feature/window.py
@@ -1,7 +1,7 @@
 from typing import Iterable, Mapping
 
 from maru.feature.extractor import IFeatureExtractor
-from maru.types import Index, Indices, Offset, Text, FeatureWindow
+from maru.types import FeatureWindow, Index, Indices, Offset, Text
 
 
 class FeatureWindowGenerator:
@@ -16,8 +16,7 @@ def _get_window(self, text: Text, center: Index) -> FeatureWindow:
                 features = extractor.extract(text[index])
                 yield (offset, features)
 
-    def generate(self,
-                 text: Text,
-                 indices: Indices) -> Iterable[FeatureWindow]:
-        for center in indices:
-            yield self._get_window(text, center)
+    def generate(self, text: Text, indices: Indices) -> Iterable[FeatureWindow]:
+        """Yield ``_get_window(text, index)`` for each index in ``indices``."""
+        windows = (self._get_window(text, position) for position in indices)
+        yield from windows
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-from . import extractor  # noqa: F401
		+from . import extractor