diff --git a/.appveyor.yml b/.appveyor.yml
index 250cd0e336c4..696aedccca11 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -28,7 +28,6 @@ install:
   - set CONDA_ENV="test-env"
   - ps: >-
       switch ($env:PYTHON_VERSION) {
-          "2.7" {$env:MINICONDA = "C:\Miniconda-x64"}
           "3.6" {$env:MINICONDA = "C:\Miniconda36-x64"}
           "3.7" {$env:MINICONDA = "C:\Miniconda37-x64"}
           default {$env:MINICONDA = "C:\Miniconda37-x64"}
diff --git a/.ci/test.sh b/.ci/test.sh
index 96f917775044..4cd181790ad7 100755
--- a/.ci/test.sh
+++ b/.ci/test.sh
@@ -100,15 +100,15 @@ if [[ $TASK == "sdist" ]]; then
     exit 0
 elif [[ $TASK == "bdist" ]]; then
     if [[ $OS_NAME == "macos" ]]; then
-        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --plat-name=macosx --universal || exit -1
-        mv dist/lightgbm-$LGB_VER-py2.py3-none-macosx.whl dist/lightgbm-$LGB_VER-py2.py3-none-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl
+        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --plat-name=macosx --python-tag py3 || exit -1
+        mv dist/lightgbm-$LGB_VER-py3-none-macosx.whl dist/lightgbm-$LGB_VER-py3-none-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl
         if [[ $AZURE == "true" ]]; then
-            cp dist/lightgbm-$LGB_VER-py2.py3-none-macosx*.whl $BUILD_ARTIFACTSTAGINGDIRECTORY
+            cp dist/lightgbm-$LGB_VER-py3-none-macosx*.whl $BUILD_ARTIFACTSTAGINGDIRECTORY
         fi
     else
-        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --plat-name=manylinux1_x86_64 --universal || exit -1
+        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --plat-name=manylinux1_x86_64 --python-tag py3 || exit -1
         if [[ $AZURE == "true" ]]; then
-            cp dist/lightgbm-$LGB_VER-py2.py3-none-manylinux1_x86_64.whl $BUILD_ARTIFACTSTAGINGDIRECTORY
+            cp dist/lightgbm-$LGB_VER-py3-none-manylinux1_x86_64.whl $BUILD_ARTIFACTSTAGINGDIRECTORY
         fi
     fi
     pip install --user $BUILD_DIRECTORY/python-package/dist/*.whl || exit -1
diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1
index 267c553bfb30..950b2463955d 100644
--- a/.ci/test_windows.ps1
+++ b/.ci/test_windows.ps1
@@ -49,7 +49,7 @@ elseif ($env:TASK -eq "sdist") {
 }
 elseif ($env:TASK -eq "bdist") {
   cd $env:BUILD_SOURCESDIRECTORY/python-package
-  python setup.py bdist_wheel --plat-name=win-amd64 --universal ; Check-Output $?
+  python setup.py bdist_wheel --plat-name=win-amd64 --python-tag py3 ; Check-Output $?
   cd dist; pip install @(Get-ChildItem *.whl) ; Check-Output $?
   cp @(Get-ChildItem *.whl) $env:BUILD_ARTIFACTSTAGINGDIRECTORY
 } elseif (($env:APPVEYOR -eq "true") -and ($env:TASK -eq "python")) {
diff --git a/.travis.yml b/.travis.yml
index 50d2eeabf6e9..930cc4d0f19e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,7 +17,7 @@ env:
     - PYTHON_VERSION=3.8
   matrix:
     - TASK=regular PYTHON_VERSION=3.6
-    - TASK=sdist PYTHON_VERSION=2.7
+    - TASK=sdist
     - TASK=bdist
     - TASK=if-else
     - TASK=lint
diff --git a/.vsts-ci.yml b/.vsts-ci.yml
index a5c0af13f522..9341134b6bcd 100644
--- a/.vsts-ci.yml
+++ b/.vsts-ci.yml
@@ -117,7 +117,6 @@ jobs:
         PYTHON_VERSION: 3.6
       sdist:
         TASK: sdist
-        PYTHON_VERSION: 2.7
       bdist:
         TASK: bdist
   steps:
diff --git a/docker/gpu/README.md b/docker/gpu/README.md
index 4baa6302bc93..13ff2bc686c9 100644
--- a/docker/gpu/README.md
+++ b/docker/gpu/README.md
@@ -13,13 +13,12 @@
 # Dockerfile for LightGBM GPU Version with Python
 
 `dockerfile.gpu` - A docker file with LightGBM utilizing nvidia-docker. The file is based on the `nvidia/cuda:8.0-cudnn5-devel` image.
-LightGBM can be utilized in GPU and CPU modes and via Python (2.7 & 3.6).
+LightGBM can be utilized in GPU and CPU modes and via Python.
 
 ## Contents
 
 - LightGBM (cpu + gpu)
-- Python 2.7 (conda) + scikit-learn, notebooks, pandas, matplotlib
-- Python 3.6 (conda) + scikit-learn, notebooks, pandas, matplotlib
+- Python 3.8 (conda) + scikit-learn, notebooks, pandas, matplotlib
 
 Running the container starts a Jupyter Notebook at `localhost:8888`.
 
diff --git a/docker/gpu/dockerfile.gpu b/docker/gpu/dockerfile.gpu
index 08c243a57bce..1b930b7b99c6 100644
--- a/docker/gpu/dockerfile.gpu
+++ b/docker/gpu/dockerfile.gpu
@@ -75,8 +75,7 @@ RUN echo "export PATH=$CONDA_DIR/bin:"'$PATH' > /etc/profile.d/conda.sh && \
     rm ~/miniconda.sh
 
 RUN conda config --set always_yes yes --set changeps1 no && \
-    conda create -y -q -n py2 python=2.7 mkl numpy scipy scikit-learn jupyter notebook ipython pandas matplotlib && \
-    conda create -y -q -n py3 python=3.6 mkl numpy scipy scikit-learn jupyter notebook ipython pandas matplotlib
+    conda create -y -q -n py3 python=3.8 mkl numpy scipy scikit-learn jupyter notebook ipython pandas matplotlib
 
 #################################################################################################################
 #           LightGBM
@@ -90,7 +89,6 @@ RUN cd /usr/local/src && mkdir lightgbm && cd lightgbm && \
 
 ENV PATH /usr/local/src/lightgbm/LightGBM:${PATH}
 
-RUN /bin/bash -c "source activate py2 && cd /usr/local/src/lightgbm/LightGBM/python-package && python setup.py install --precompile && source deactivate"
 RUN /bin/bash -c "source activate py3 && cd /usr/local/src/lightgbm/LightGBM/python-package && python setup.py install --precompile && source deactivate"
 
 #################################################################################################################
diff --git a/docs/conf.py b/docs/conf.py
index 103e3f51fa16..c0008be029b6 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -26,17 +26,13 @@
 from docutils.parsers.rst import Directive
 from sphinx.errors import VersionRequirementError
 from subprocess import PIPE, Popen
+from unittest.mock import Mock
 
 CURR_PATH = os.path.abspath(os.path.dirname(__file__))
 LIB_PATH = os.path.join(CURR_PATH, os.path.pardir, 'python-package')
 sys.path.insert(0, LIB_PATH)
 
 # -- mock out modules
-try:
-    from unittest.mock import Mock  # Python 3.x
-except ImportError:
-    from mock import Mock  # Python 2.x
-
 MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse',
                 'sklearn', 'matplotlib', 'pandas', 'graphviz']
 for mod_name in MOCK_MODULES:
@@ -208,9 +204,7 @@ def generate_doxygen_xml(app):
         "WARN_AS_ERROR=YES",
     ]
     doxygen_input = '\n'.join(doxygen_args)
-    is_py3 = sys.version[0] == "3"
-    if is_py3:
-        doxygen_input = bytes(doxygen_input, "utf-8")
+    doxygen_input = bytes(doxygen_input, "utf-8")
     if not os.path.exists(os.path.join(CURR_PATH, 'doxyoutput')):
         os.makedirs(os.path.join(CURR_PATH, 'doxyoutput'))
     try:
@@ -221,8 +215,7 @@ def generate_doxygen_xml(app):
         process = Popen(["doxygen", "-"],
                         stdin=PIPE, stdout=PIPE, stderr=PIPE)
         stdout, stderr = process.communicate(doxygen_input)
-        output = '\n'.join([i.decode('utf-8') if is_py3 else i
-                            for i in (stdout, stderr) if i is not None])
+        output = '\n'.join([i.decode('utf-8') for i in (stdout, stderr) if i is not None])
         if process.returncode != 0:
             raise RuntimeError(output)
         else:
diff --git a/docs/requirements_base.txt b/docs/requirements_base.txt
index 9c3dfc2a5b90..d9f0bfb8c916 100644
--- a/docs/requirements_base.txt
+++ b/docs/requirements_base.txt
@@ -1,3 +1,2 @@
 sphinx
 sphinx_rtd_theme >= 0.3
-mock; python_version < '3'
diff --git a/examples/python-guide/advanced_example.py b/examples/python-guide/advanced_example.py
index c38c6e469ff3..382497e6ff89 100644
--- a/examples/python-guide/advanced_example.py
+++ b/examples/python-guide/advanced_example.py
@@ -1,14 +1,11 @@
 # coding: utf-8
 import json
+import pickle
 import lightgbm as lgb
 import pandas as pd
 import numpy as np
 from sklearn.metrics import mean_squared_error
 
-try:
-    import cPickle as pickle
-except BaseException:
-    import pickle
 
 print('Loading data...')
 # load or create your dataset
diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py
index 786693f82412..f8d3bce3078f 100644
--- a/python-package/lightgbm/__init__.py
+++ b/python-package/lightgbm/__init__.py
@@ -3,16 +3,12 @@
 
 Contributors: https://github.com/microsoft/LightGBM/graphs/contributors.
 """
-from __future__ import absolute_import
-
 from .basic import Booster, Dataset
 from .callback import (early_stopping, print_evaluation, record_evaluation,
                        reset_parameter)
 from .engine import cv, train, CVBooster
 
 import os
-import sys
-import warnings
 
 try:
     from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
@@ -36,8 +32,3 @@
            'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
            'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping',
            'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph']
-
-# REMOVEME: remove warning after 3.1.0 version release
-if sys.version_info[0] == 2:
-    warnings.warn("LightGBM 3.1 version is the last version that supports Python 2.\n"
-                  "Next release will drop the support.", UserWarning)
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 60f6b70f0881..7bea3a5fec44 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -1,9 +1,8 @@
 # coding: utf-8
 """Wrapper for C API of LightGBM."""
-from __future__ import absolute_import, print_function
-
 import copy
 import ctypes
+import json
 import os
 import warnings
 from tempfile import NamedTemporaryFile
@@ -12,18 +11,13 @@
 import numpy as np
 import scipy.sparse
 
-from .compat import (PANDAS_INSTALLED, DataFrame, Series, is_dtype_sparse,
-                     DataTable,
-                     decode_string, string_type,
-                     integer_types, numeric_types,
-                     json, json_default_with_numpy,
-                     range_, zip_)
+from .compat import PANDAS_INSTALLED, DataFrame, Series, is_dtype_sparse, DataTable
 from .libpath import find_lib_path
 
 
 def _log_callback(msg):
     """Redirect logs from native library into Python console."""
-    print("{0:s}".format(decode_string(msg)), end='')
+    print("{0:s}".format(msg.decode('utf-8')), end='')
 
 
 def _load_lib():
@@ -36,13 +30,16 @@ def _load_lib():
     callback = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
     lib.callback = callback(_log_callback)
     if lib.LGBM_RegisterLogCallback(lib.callback) != 0:
-        raise LightGBMError(decode_string(lib.LGBM_GetLastError()))
+        raise LightGBMError(lib.LGBM_GetLastError().decode('utf-8'))
     return lib
 
 
 _LIB = _load_lib()
 
 
+NUMERIC_TYPES = (int, float, bool)
+
+
 def _safe_call(ret):
     """Check the return value from C API call.
 
@@ -52,7 +49,7 @@ def _safe_call(ret):
         The return value from C API calls.
     """
     if ret != 0:
-        raise LightGBMError(decode_string(_LIB.LGBM_GetLastError()))
+        raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8'))
 
 
 def is_numeric(obj):
@@ -136,6 +133,16 @@ def c_array(ctype, values):
     return (ctype * len(values))(*values)
 
 
+def json_default_with_numpy(obj):
+    """Convert numpy scalars (integer, floating, bool) and arrays to JSON serializable objects."""
+    if isinstance(obj, (np.integer, np.floating, np.bool_)):
+        return obj.item()
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    else:
+        return obj  # any other type is handed back unchanged
+
+
 def param_dict_to_str(data):
     """Convert Python dictionary to string, which is passed to C API."""
     if data is None or not data:
@@ -149,7 +156,7 @@ def to_string(x):
                 else:
                     return str(x)
             pairs.append(str(key) + '=' + ','.join(map(to_string, val)))
-        elif isinstance(val, string_type) or isinstance(val, numeric_types) or is_numeric(val):
+        elif isinstance(val, (str, NUMERIC_TYPES)) or is_numeric(val):
             pairs.append(str(key) + '=' + str(val))
         elif val is not None:
             raise TypeError('Unknown type of parameter:%s, got:%s'
@@ -157,7 +164,7 @@ def to_string(x):
     return ' '.join(pairs)
 
 
-class _TempFile(object):
+class _TempFile:
     def __enter__(self):
         with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
             self.name = f.name
@@ -183,7 +190,14 @@ class LightGBMError(Exception):
     pass
 
 
-class _ConfigAliases(object):
+# DeprecationWarning is not shown by default, so we derive our own warning from UserWarning, which is shown
+class LGBMDeprecationWarning(UserWarning):
+    """Custom deprecation warning derived from UserWarning."""
+
+    pass
+
+
+class _ConfigAliases:
     aliases = {"bin_construct_sample_cnt": {"bin_construct_sample_cnt",
                                             "subsample_for_bin"},
                "boosting": {"boosting",
@@ -375,7 +389,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
         else:
             if len(cat_cols) != len(pandas_categorical):
                 raise ValueError('train and valid dataset categorical_feature do not match.')
-            for col, category in zip_(cat_cols, pandas_categorical):
+            for col, category in zip(cat_cols, pandas_categorical):
                 if list(data[col].cat.categories) != list(category):
                     data[col] = data[col].cat.set_categories(category)
         if len(cat_cols):  # cat_cols is list
@@ -440,9 +454,9 @@ def _load_pandas_categorical(file_name=None, model_str=None):
                 if len(lines) >= 2:
                     break
                 offset *= 2
-        last_line = decode_string(lines[-1]).strip()
+        last_line = lines[-1].decode('utf-8').strip()
         if not last_line.startswith(pandas_key):
-            last_line = decode_string(lines[-2]).strip()
+            last_line = lines[-2].decode('utf-8').strip()
     elif model_str is not None:
         idx = model_str.rfind('\n', 0, offset)
         last_line = model_str[idx:].strip()
@@ -452,7 +466,7 @@ def _load_pandas_categorical(file_name=None, model_str=None):
         return None
 
 
-class _InnerPredictor(object):
+class _InnerPredictor:
     """_InnerPredictor of LightGBM.
 
     Not exposed to user.
@@ -563,7 +577,7 @@ def predict(self, data, start_iteration=0, num_iteration=-1,
             predict_type = C_API_PREDICT_CONTRIB
         int_data_has_header = 1 if data_has_header else 0
 
-        if isinstance(data, string_type):
+        if isinstance(data, str):
             with _TempFile() as f:
                 _safe_call(_LIB.LGBM_BoosterPredictForFile(
                     self.handle,
@@ -668,8 +682,8 @@ def inner_predict(mat, start_iteration, num_iteration, predict_type, preds=None)
             n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
             n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
             preds = np.zeros(sum(n_preds), dtype=np.float64)
-            for chunk, (start_idx_pred, end_idx_pred) in zip_(np.array_split(mat, sections),
-                                                              zip_(n_preds_sections, n_preds_sections[1:])):
+            for chunk, (start_idx_pred, end_idx_pred) in zip(np.array_split(mat, sections),
+                                                             zip(n_preds_sections, n_preds_sections[1:])):
                 # avoid memory consumption by arrays concatenation operations
                 inner_predict(chunk, start_iteration, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
             return preds, nrow
@@ -807,8 +821,8 @@ def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type):
             n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)]
             n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
             preds = np.zeros(sum(n_preds), dtype=np.float64)
-            for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip_(zip_(sections, sections[1:]),
-                                                                             zip_(n_preds_sections, n_preds_sections[1:])):
+            for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip(zip(sections, sections[1:]),
+                                                                            zip(n_preds_sections, n_preds_sections[1:])):
                 # avoid memory consumption by arrays concatenation operations
                 inner_predict(csr[start_idx:end_idx], start_iteration, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
             return preds, nrow
@@ -906,7 +920,7 @@ def current_iteration(self):
         return out_cur_iter.value
 
 
-class Dataset(object):
+class Dataset:
     """Dataset in LightGBM."""
 
     def __init__(self, data, label=None, reference=None,
@@ -1018,7 +1032,7 @@ def _free_handle(self):
 
     def _set_init_score_by_predictor(self, predictor, data, used_indices=None):
         data_has_header = False
-        if isinstance(data, string_type):
+        if isinstance(data, str):
             # check data has header or not
             data_has_header = any(self.params.get(alias, False) for alias in _ConfigAliases.get("header"))
         num_data = self.num_data()
@@ -1029,18 +1043,18 @@ def _set_init_score_by_predictor(self, predictor, data, used_indices=None):
                                            is_reshape=False)
             if used_indices is not None:
                 assert not self.need_slice
-                if isinstance(data, string_type):
+                if isinstance(data, str):
                     sub_init_score = np.zeros(num_data * predictor.num_class, dtype=np.float32)
                     assert num_data == len(used_indices)
-                    for i in range_(len(used_indices)):
-                        for j in range_(predictor.num_class):
+                    for i in range(len(used_indices)):
+                        for j in range(predictor.num_class):
                             sub_init_score[i * predictor.num_class + j] = init_score[used_indices[i] * predictor.num_class + j]
                     init_score = sub_init_score
             if predictor.num_class > 1:
                 # need to regroup init_score
                 new_init_score = np.zeros(init_score.size, dtype=np.float32)
-                for i in range_(num_data):
-                    for j in range_(predictor.num_class):
+                for i in range(num_data):
+                    for j in range(predictor.num_class):
                         new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j]
                 init_score = new_init_score
         elif self.init_score is not None:
@@ -1085,9 +1099,9 @@ def _lazy_init(self, data, label=None, reference=None,
             if feature_name is not None:
                 feature_dict = {name: i for i, name in enumerate(feature_name)}
             for name in categorical_feature:
-                if isinstance(name, string_type) and name in feature_dict:
+                if isinstance(name, str) and name in feature_dict:
                     categorical_indices.add(feature_dict[name])
-                elif isinstance(name, integer_types):
+                elif isinstance(name, int):
                     categorical_indices.add(name)
                 else:
                     raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
@@ -1108,7 +1122,7 @@ def _lazy_init(self, data, label=None, reference=None,
         elif reference is not None:
             raise TypeError('Reference dataset should be None or dataset instance')
         # start construct data
-        if isinstance(data, string_type):
+        if isinstance(data, str):
             self.handle = ctypes.c_void_p()
             _safe_call(_LIB.LGBM_DatasetCreateFromFile(
                 c_str(data),
@@ -1297,7 +1311,7 @@ def construct(self):
                     assert used_indices.flags.c_contiguous
                     if self.reference.group is not None:
                         group_info = np.array(self.reference.group).astype(np.int32, copy=False)
-                        _, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices],
+                        _, self.group = np.unique(np.repeat(range(len(group_info)), repeats=group_info)[self.used_indices],
                                                   return_counts=True)
                     self.handle = ctypes.c_void_p()
                     params_str = param_dict_to_str(self.params)
@@ -1433,7 +1447,7 @@ def update():
                     update()
                     self._free_handle()
                 else:
-                    raise LightGBMError(decode_string(_LIB.LGBM_GetLastError()))
+                    raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8'))
         return self
 
     def _reverse_update_params(self):
@@ -1727,7 +1741,7 @@ def get_feature_name(self):
         tmp_out_len = ctypes.c_int(0)
         reserved_string_buffer_size = 255
         required_string_buffer_size = ctypes.c_size_t(0)
-        string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for i in range_(num_feature)]
+        string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for i in range(num_feature)]
         ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))
         _safe_call(_LIB.LGBM_DatasetGetFeatureNames(
             self.handle,
@@ -1743,7 +1757,7 @@ def get_feature_name(self):
                 "Allocated feature name buffer size ({}) was inferior to the needed size ({})."
                 .format(reserved_string_buffer_size, required_string_buffer_size.value)
             )
-        return [string_buffers[i].value.decode('utf-8') for i in range_(num_feature)]
+        return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)]
 
     def get_label(self):
         """Get the label of the Dataset.
@@ -1997,7 +2011,7 @@ def _dump_text(self, filename):
         return self
 
 
-class Booster(object):
+class Booster:
     """Booster in LightGBM."""
 
     def __init__(self, params=None, train_set=None, model_file=None, model_str=None, silent=False):
@@ -2037,7 +2051,7 @@ def __init__(self, params=None, train_set=None, model_file=None, model_str=None,
             for alias in _ConfigAliases.get("machines"):
                 if alias in params:
                     machines = params[alias]
-                    if isinstance(machines, string_type):
+                    if isinstance(machines, str):
                         num_machines = len(machines.split(','))
                     elif isinstance(machines, (list, set)):
                         num_machines = len(machines)
@@ -2458,7 +2472,7 @@ def update(self, train_set=None, fobj=None):
             _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
                 self.handle,
                 ctypes.byref(is_finished)))
-            self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
+            self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
             return is_finished.value == 1
         else:
             if not self.__set_objective_to_none:
@@ -2501,7 +2515,7 @@ def __boost(self, grad, hess):
             grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
             hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
             ctypes.byref(is_finished)))
-        self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
+        self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
         return is_finished.value == 1
 
     def rollback_one_iter(self):
@@ -2514,7 +2528,7 @@ def rollback_one_iter(self):
         """
         _safe_call(_LIB.LGBM_BoosterRollbackOneIter(
             self.handle))
-        self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
+        self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
         return self
 
     def current_iteration(self):
@@ -2627,7 +2641,7 @@ def eval(self, data, name, feval=None):
         if data is self.train_set:
             data_idx = 0
         else:
-            for i in range_(len(self.valid_sets)):
+            for i in range(len(self.valid_sets)):
                 if data is self.valid_sets[i]:
                     data_idx = i + 1
                     break
@@ -2700,7 +2714,7 @@ def eval_valid(self, feval=None):
         result : list
             List with evaluation results.
         """
-        return [item for i in range_(1, self.__num_dataset)
+        return [item for i in range(1, self.__num_dataset)
                 for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]
 
     def save_model(self, filename, num_iteration=None, start_iteration=0, importance_type='split'):
@@ -3060,7 +3074,7 @@ def feature_name(self):
         tmp_out_len = ctypes.c_int(0)
         reserved_string_buffer_size = 255
         required_string_buffer_size = ctypes.c_size_t(0)
-        string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for i in range_(num_feature)]
+        string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for i in range(num_feature)]
         ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))
         _safe_call(_LIB.LGBM_BoosterGetFeatureNames(
             self.handle,
@@ -3076,7 +3090,7 @@ def feature_name(self):
                 "Allocated feature name buffer size ({}) was inferior to the needed size ({})."
                 .format(reserved_string_buffer_size, required_string_buffer_size.value)
             )
-        return [string_buffers[i].value.decode('utf-8') for i in range_(num_feature)]
+        return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)]
 
     def feature_importance(self, importance_type='split', iteration=None):
         """Get feature importances.
@@ -3147,12 +3161,12 @@ def get_split_value_histogram(self, feature, bins=None, xgboost_style=False):
         def add(root):
             """Recursively add thresholds."""
             if 'split_index' in root:  # non-leaf
-                if feature_names is not None and isinstance(feature, string_type):
+                if feature_names is not None and isinstance(feature, str):
                     split_feature = feature_names[root['split_feature']]
                 else:
                     split_feature = root['split_feature']
                 if split_feature == feature:
-                    if isinstance(root['threshold'], string_type):
+                    if isinstance(root['threshold'], str):
                         raise LightGBMError('Cannot compute split value histogram for the categorical feature')
                     else:
                         values.append(root['threshold'])
@@ -3166,7 +3180,7 @@ def add(root):
         for tree_info in tree_infos:
             add(tree_info['tree_structure'])
 
-        if bins is None or isinstance(bins, integer_types) and xgboost_style:
+        if bins is None or isinstance(bins, int) and xgboost_style:
             n_unique = len(np.unique(values))
             bins = max(min(n_unique, bins) if bins is not None else n_unique, 1)
         hist, bin_edges = np.histogram(values, bins=bins)
@@ -3196,7 +3210,7 @@ def __inner_eval(self, data_name, data_idx, feval=None):
                 result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
             if tmp_out_len.value != self.__num_inner_eval:
                 raise ValueError("Wrong length of eval results")
-            for i in range_(self.__num_inner_eval):
+            for i in range(self.__num_inner_eval):
                 ret.append((data_name, self.__name_inner_eval[i],
                             result[i], self.__higher_better_inner_eval[i]))
         if callable(feval):
@@ -3258,7 +3272,7 @@ def __get_eval_info(self):
                 reserved_string_buffer_size = 255
                 required_string_buffer_size = ctypes.c_size_t(0)
                 string_buffers = [
-                    ctypes.create_string_buffer(reserved_string_buffer_size) for i in range_(self.__num_inner_eval)
+                    ctypes.create_string_buffer(reserved_string_buffer_size) for i in range(self.__num_inner_eval)
                 ]
                 ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
                 _safe_call(_LIB.LGBM_BoosterGetEvalNames(
@@ -3276,7 +3290,7 @@ def __get_eval_info(self):
                         .format(reserved_string_buffer_size, required_string_buffer_size.value)
                     )
                 self.__name_inner_eval = \
-                    [string_buffers[i].value.decode('utf-8') for i in range_(self.__num_inner_eval)]
+                    [string_buffers[i].value.decode('utf-8') for i in range(self.__num_inner_eval)]
                 self.__higher_better_inner_eval = \
                     [name.startswith(('auc', 'ndcg@', 'map@')) for name in self.__name_inner_eval]
 
@@ -3312,7 +3326,7 @@ def set_attr(self, **kwargs):
         """
         for key, value in kwargs.items():
             if value is not None:
-                if not isinstance(value, string_type):
+                if not isinstance(value, str):
                     raise ValueError("Only string values are accepted")
                 self.__attr[key] = value
             else:
diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py
index 5d12539177f8..9140127c846b 100644
--- a/python-package/lightgbm/callback.py
+++ b/python-package/lightgbm/callback.py
@@ -1,13 +1,10 @@
 # coding: utf-8
 """Callbacks library."""
-from __future__ import absolute_import
-
 import collections
 import warnings
 from operator import gt, lt
 
 from .basic import _ConfigAliases
-from .compat import range_
 
 
 class EarlyStopException(Exception):
@@ -23,7 +20,7 @@ def __init__(self, best_iteration, best_score):
         best_score : float
             The score of the best iteration.
         """
-        super(EarlyStopException, self).__init__()
+        super().__init__()
         self.best_iteration = best_iteration
         self.best_score = best_score
 
@@ -219,7 +216,7 @@ def _callback(env):
             _init(env)
         if not enabled[0]:
             return
-        for i in range_(len(env.evaluation_result_list)):
+        for i in range(len(env.evaluation_result_list)):
             score = env.evaluation_result_list[i][2]
             if best_score_list[i] is None or cmp_op[i](score, best_score[i]):
                 best_score[i] = score
diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 5d951a56800a..fa12ae2c975a 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -1,62 +1,5 @@
 # coding: utf-8
 """Compatibility library."""
-from __future__ import absolute_import
-
-import inspect
-import sys
-
-import numpy as np
-
-is_py3 = (sys.version_info[0] == 3)
-
-"""Compatibility between Python2 and Python3"""
-if is_py3:
-    zip_ = zip
-    string_type = str
-    numeric_types = (int, float, bool)
-    integer_types = (int, )
-    range_ = range
-
-    def argc_(func):
-        """Count the number of arguments of a function."""
-        return len(inspect.signature(func).parameters)
-
-    def decode_string(bytestring):
-        """Decode C bytestring to ordinary string."""
-        return bytestring.decode('utf-8')
-else:
-    from itertools import izip as zip_
-    string_type = basestring
-    numeric_types = (int, long, float, bool)
-    integer_types = (int, long)
-    range_ = xrange
-
-    def argc_(func):
-        """Count the number of arguments of a function."""
-        return len(inspect.getargspec(func).args)
-
-    def decode_string(bytestring):
-        """Decode C bytestring to ordinary string."""
-        return bytestring
-
-"""json"""
-try:
-    import simplejson as json
-except (ImportError, SyntaxError):
-    # simplejson does not support Python 3.2, it throws a SyntaxError
-    # because of u'...' Unicode literals.
-    import json
-
-
-def json_default_with_numpy(obj):
-    """Convert numpy classes to JSON serializable objects."""
-    if isinstance(obj, (np.integer, np.floating, np.bool_)):
-        return obj.item()
-    elif isinstance(obj, np.ndarray):
-        return obj.tolist()
-    else:
-        return obj
-
 
 """pandas"""
 try:
@@ -66,12 +9,12 @@ def json_default_with_numpy(obj):
 except ImportError:
     PANDAS_INSTALLED = False
 
-    class Series(object):
+    class Series:
         """Dummy class for pandas.Series."""
 
         pass
 
-    class DataFrame(object):
+    class DataFrame:
         """Dummy class for pandas.DataFrame."""
 
         pass
@@ -103,7 +46,7 @@ class DataFrame(object):
 except ImportError:
     DATATABLE_INSTALLED = False
 
-    class DataTable(object):
+    class DataTable:
         """Dummy class for DataTable."""
 
         pass
@@ -162,10 +105,3 @@ def _check_sample_weight(sample_weight, X, dtype=None):
     _LGBMAssertAllFinite = None
     _LGBMCheckClassificationTargets = None
     _LGBMComputeSampleWeight = None
-
-
-# DeprecationWarning is not shown by default, so let's create our own with higher level
-class LGBMDeprecationWarning(UserWarning):
-    """Custom deprecation warning."""
-
-    pass
diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py
index 601b52cf1030..ee41a1b903b8 100644
--- a/python-package/lightgbm/engine.py
+++ b/python-package/lightgbm/engine.py
@@ -1,7 +1,5 @@
 # coding: utf-8
 """Library with training routines of LightGBM."""
-from __future__ import absolute_import
-
 import collections
 import copy
 import warnings
@@ -11,8 +9,7 @@
 
 from . import callback
 from .basic import Booster, Dataset, LightGBMError, _ConfigAliases, _InnerPredictor
-from .compat import (SKLEARN_INSTALLED, _LGBMGroupKFold, _LGBMStratifiedKFold,
-                     string_type, integer_types, range_, zip_)
+from .compat import SKLEARN_INSTALLED, _LGBMGroupKFold, _LGBMStratifiedKFold
 
 
 def train(params, train_set, num_boost_round=100,
@@ -159,7 +156,7 @@ def train(params, train_set, num_boost_round=100,
 
     if num_boost_round <= 0:
         raise ValueError("num_boost_round should be greater than zero.")
-    if isinstance(init_model, string_type):
+    if isinstance(init_model, str):
         predictor = _InnerPredictor(model_file=init_model, pred_parameter=params)
     elif isinstance(init_model, Booster):
         predictor = init_model._to_predictor(dict(init_model.params, **params))
@@ -182,7 +179,7 @@ def train(params, train_set, num_boost_round=100,
     if valid_sets is not None:
         if isinstance(valid_sets, Dataset):
             valid_sets = [valid_sets]
-        if isinstance(valid_names, string_type):
+        if isinstance(valid_names, str):
             valid_names = [valid_names]
         for i, valid_data in enumerate(valid_sets):
             # reduce cost for prediction training data
@@ -209,7 +206,7 @@ def train(params, train_set, num_boost_round=100,
     # Most of legacy advanced options becomes callbacks
     if verbose_eval is True:
         callbacks.add(callback.print_evaluation())
-    elif isinstance(verbose_eval, integer_types):
+    elif isinstance(verbose_eval, int):
         callbacks.add(callback.print_evaluation(verbose_eval))
 
     if early_stopping_rounds is not None and early_stopping_rounds > 0:
@@ -231,7 +228,7 @@ def train(params, train_set, num_boost_round=100,
         booster = Booster(params=params, train_set=train_set)
         if is_valid_contain_train:
             booster.set_train_data_name(train_data_name)
-        for valid_set, name_valid_set in zip_(reduced_valid_sets, name_valid_sets):
+        for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
             booster.add_valid(valid_set, name_valid_set)
     finally:
         train_set._reverse_update_params()
@@ -240,7 +237,7 @@ def train(params, train_set, num_boost_round=100,
     booster.best_iteration = 0
 
     # start training
-    for i in range_(init_iteration, init_iteration + num_boost_round):
+    for i in range(init_iteration, init_iteration + num_boost_round):
         for cb in callbacks_before_iter:
             cb(callback.CallbackEnv(model=booster,
                                     params=params,
@@ -277,7 +274,7 @@ def train(params, train_set, num_boost_round=100,
     return booster
 
 
-class CVBooster(object):
+class CVBooster:
     """CVBooster in LightGBM.
 
     Auxiliary data structure to hold and redirect all boosters of ``cv`` function.
@@ -328,7 +325,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
             group_info = full_data.get_group()
             if group_info is not None:
                 group_info = np.array(group_info, dtype=np.int32, copy=False)
-                flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
+                flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
             else:
                 flatted_group = np.zeros(num_data, dtype=np.int32)
             folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
@@ -340,7 +337,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
                 raise LightGBMError('Scikit-learn is required for ranking cv.')
             # ranking task, split according to groups
             group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
-            flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
+            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
             group_kfold = _LGBMGroupKFold(n_splits=nfold)
             folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)
         elif stratified:
@@ -354,9 +351,9 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
             else:
                 randidx = np.arange(num_data)
             kstep = int(num_data / nfold)
-            test_id = [randidx[i: i + kstep] for i in range_(0, num_data, kstep)]
-            train_id = [np.concatenate([test_id[i] for i in range_(nfold) if k != i]) for k in range_(nfold)]
-            folds = zip_(train_id, test_id)
+            test_id = [randidx[i: i + kstep] for i in range(0, num_data, kstep)]
+            train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
+            folds = zip(train_id, test_id)
 
     ret = CVBooster()
     for train_idx, test_idx in folds:
@@ -539,7 +536,7 @@ def cv(params, train_set, num_boost_round=100,
 
     if num_boost_round <= 0:
         raise ValueError("num_boost_round should be greater than zero.")
-    if isinstance(init_model, string_type):
+    if isinstance(init_model, str):
         predictor = _InnerPredictor(model_file=init_model, pred_parameter=params)
     elif isinstance(init_model, Booster):
         predictor = init_model._to_predictor(dict(init_model.params, **params))
@@ -573,7 +570,7 @@ def cv(params, train_set, num_boost_round=100,
         callbacks.add(callback.early_stopping(early_stopping_rounds, first_metric_only, verbose=False))
     if verbose_eval is True:
         callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
-    elif isinstance(verbose_eval, integer_types):
+    elif isinstance(verbose_eval, int):
         callbacks.add(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
 
     callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)}
@@ -581,7 +578,7 @@ def cv(params, train_set, num_boost_round=100,
     callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
     callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))
 
-    for i in range_(num_boost_round):
+    for i in range(num_boost_round):
         for cb in callbacks_before_iter:
             cb(callback.CallbackEnv(model=cvfolds,
                                     params=params,
diff --git a/python-package/lightgbm/plotting.py b/python-package/lightgbm/plotting.py
index ae57a449cb74..03dbc1e86818 100644
--- a/python-package/lightgbm/plotting.py
+++ b/python-package/lightgbm/plotting.py
@@ -1,7 +1,5 @@
 # coding: utf-8
 """Plotting library."""
-from __future__ import absolute_import, division
-
 import warnings
 from copy import deepcopy
 from io import BytesIO
@@ -9,8 +7,7 @@
 import numpy as np
 
 from .basic import Booster
-from .compat import (MATPLOTLIB_INSTALLED, GRAPHVIZ_INSTALLED,
-                     range_, zip_, string_type)
+from .compat import MATPLOTLIB_INSTALLED, GRAPHVIZ_INSTALLED
 from .sklearn import LGBMModel
 
 
@@ -22,7 +19,7 @@ def _check_not_tuple_of_2_elements(obj, obj_name='obj'):
 
 def _float2str(value, precision=None):
     return ("{0:.{1}f}".format(value, precision)
-            if precision is not None and not isinstance(value, string_type)
+            if precision is not None and not isinstance(value, str)
             else str(value))
 
 
@@ -97,12 +94,12 @@ def plot_importance(booster, ax=None, height=0.2,
     if not len(importance):
         raise ValueError("Booster's feature_importance is empty.")
 
-    tuples = sorted(zip_(feature_name, importance), key=lambda x: x[1])
+    tuples = sorted(zip(feature_name, importance), key=lambda x: x[1])
     if ignore_zero:
         tuples = [x for x in tuples if x[1] > 0]
     if max_num_features is not None and max_num_features > 0:
         tuples = tuples[-max_num_features:]
-    labels, values = zip_(*tuples)
+    labels, values = zip(*tuples)
 
     if ax is None:
         if figsize is not None:
@@ -112,7 +109,7 @@ def plot_importance(booster, ax=None, height=0.2,
     ylocs = np.arange(len(values))
     ax.barh(ylocs, values, align='center', height=height, **kwargs)
 
-    for x, y in zip_(values, ylocs):
+    for x, y in zip(values, ylocs):
         ax.text(x + 1, y,
                 _float2str(x, precision) if importance_type == 'gain' else x,
                 va='center')
@@ -238,7 +235,7 @@ def plot_split_value_histogram(booster, feature, bins=None, ax=None, width_coef=
 
     if title is not None:
         title = title.replace('@feature@', str(feature))
-        title = title.replace('@index/name@', ('name' if isinstance(feature, string_type) else 'index'))
+        title = title.replace('@index/name@', ('name' if isinstance(feature, str) else 'index'))
         ax.set_title(title)
     if xlabel is not None:
         ax.set_xlabel(xlabel)
@@ -337,7 +334,7 @@ def plot_metric(booster, metric=None, dataset_names=None,
             raise KeyError('No given metric in eval results.')
         results = metrics_for_one[metric]
     num_iteration, max_result, min_result = len(results), max(results), min(results)
-    x_ = range_(num_iteration)
+    x_ = range(num_iteration)
     ax.plot(x_, results, label=name)
 
     for name in dataset_names:
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index ed3ccea6c238..45c3d04e77f6 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1,10 +1,10 @@
 # coding: utf-8
 """Scikit-learn wrapper interface for LightGBM."""
-from __future__ import absolute_import
-
 import copy
 import warnings
 
+from inspect import signature
+
 import numpy as np
 
 from .basic import Dataset, LightGBMError, _ConfigAliases
@@ -12,11 +12,11 @@
                      LGBMNotFittedError, _LGBMLabelEncoder, _LGBMModelBase,
                      _LGBMRegressorBase, _LGBMCheckXY, _LGBMCheckArray, _LGBMCheckSampleWeight,
                      _LGBMAssertAllFinite, _LGBMCheckClassificationTargets, _LGBMComputeSampleWeight,
-                     argc_, range_, zip_, string_type, DataFrame, DataTable)
+                     DataFrame, DataTable)
 from .engine import train
 
 
-class _ObjectiveFunctionWrapper(object):
+class _ObjectiveFunctionWrapper:
     """Proxy class for objective function."""
 
     def __init__(self, func):
@@ -69,7 +69,7 @@ def __call__(self, preds, dataset):
             The value of the second order derivative (Hessian) for each sample point.
         """
         labels = dataset.get_label()
-        argc = argc_(self.func)
+        argc = len(signature(self.func).parameters)
         if argc == 2:
             grad, hess = self.func(labels, preds)
         elif argc == 3:
@@ -88,15 +88,15 @@ def __call__(self, preds, dataset):
                 num_class = len(grad) // num_data
                 if num_class * num_data != len(grad):
                     raise ValueError("Length of grad and hess should equal to num_class * num_data")
-                for k in range_(num_class):
-                    for i in range_(num_data):
+                for k in range(num_class):
+                    for i in range(num_data):
                         idx = k * num_data + i
                         grad[idx] *= weight[i]
                         hess[idx] *= weight[i]
         return grad, hess
 
 
-class _EvalFunctionWrapper(object):
+class _EvalFunctionWrapper:
     """Proxy class for evaluation function."""
 
     def __init__(self, func):
@@ -158,7 +158,7 @@ def __call__(self, preds, dataset):
             Is eval result higher better, e.g. AUC is ``is_higher_better``.
         """
         labels = dataset.get_label()
-        argc = argc_(self.func)
+        argc = len(signature(self.func).parameters)
         if argc == 2:
             return self.func(labels, preds)
         elif argc == 3:
@@ -340,7 +340,7 @@ def get_params(self, deep=True):
         params : dict
             Parameter names mapped to their values.
         """
-        params = super(LGBMModel, self).get_params(deep=deep)
+        params = super().get_params(deep=deep)
         params.update(self._other_params)
         return params
 
@@ -518,10 +518,10 @@ def fit(self, X, y,
 
         # Separate built-in from callable evaluation metrics
         eval_metrics_callable = [_EvalFunctionWrapper(f) for f in eval_metric_list if callable(f)]
-        eval_metrics_builtin = [m for m in eval_metric_list if isinstance(m, string_type)]
+        eval_metrics_builtin = [m for m in eval_metric_list if isinstance(m, str)]
 
         # register default metric for consistency with callable eval_metric case
-        original_metric = self._objective if isinstance(self._objective, string_type) else None
+        original_metric = self._objective if isinstance(self._objective, str) else None
         if original_metric is None:
             # try to deduce from class instance
             if isinstance(self, LGBMRegressor):
@@ -537,7 +537,7 @@ def fit(self, X, y,
                 original_metric = params.pop(metric_alias)
 
         # concatenate metric from params (or default if not provided in params) and eval_metric
-        original_metric = [original_metric] if isinstance(original_metric, (string_type, type(None))) else original_metric
+        original_metric = [original_metric] if isinstance(original_metric, (str, type(None))) else original_metric
         params['metric'] = [e for e in eval_metrics_builtin if e not in original_metric] + original_metric
         params['metric'] = [metric for metric in params['metric'] if metric is not None]
 
@@ -767,16 +767,11 @@ def fit(self, X, y,
             verbose=True, feature_name='auto', categorical_feature='auto',
             callbacks=None, init_model=None):
         """Docstring is inherited from the LGBMModel."""
-        super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
-                                       init_score=init_score, eval_set=eval_set,
-                                       eval_names=eval_names,
-                                       eval_sample_weight=eval_sample_weight,
-                                       eval_init_score=eval_init_score,
-                                       eval_metric=eval_metric,
-                                       early_stopping_rounds=early_stopping_rounds,
-                                       verbose=verbose, feature_name=feature_name,
-                                       categorical_feature=categorical_feature,
-                                       callbacks=callbacks, init_model=init_model)
+        super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
+                    eval_set=eval_set, eval_names=eval_names, eval_sample_weight=eval_sample_weight,
+                    eval_init_score=eval_init_score, eval_metric=eval_metric,
+                    early_stopping_rounds=early_stopping_rounds, verbose=verbose, feature_name=feature_name,
+                    categorical_feature=categorical_feature, callbacks=callbacks, init_model=init_model)
         return self
 
     _base_doc = LGBMModel.fit.__doc__
@@ -803,7 +798,7 @@ def fit(self, X, y,
         _LGBMCheckClassificationTargets(y)
         self._le = _LGBMLabelEncoder().fit(y)
         _y = self._le.transform(y)
-        self._class_map = dict(zip_(self._le.classes_, self._le.transform(self._le.classes_)))
+        self._class_map = dict(zip(self._le.classes_, self._le.transform(self._le.classes_)))
         if isinstance(self.class_weight, dict):
             self._class_weight = {self._class_map[k]: v for k, v in self.class_weight.items()}
 
@@ -817,7 +812,7 @@ def fit(self, X, y,
                 self._objective = "multiclass"
 
         if not callable(eval_metric):
-            if isinstance(eval_metric, (string_type, type(None))):
+            if isinstance(eval_metric, (str, type(None))):
                 eval_metric = [eval_metric]
             if self._n_classes > 2:
                 for index, metric in enumerate(eval_metric):
@@ -844,17 +839,12 @@ def fit(self, X, y,
                 else:
                     valid_sets[i] = (valid_x, self._le.transform(valid_y))
 
-        super(LGBMClassifier, self).fit(X, _y, sample_weight=sample_weight,
-                                        init_score=init_score, eval_set=valid_sets,
-                                        eval_names=eval_names,
-                                        eval_sample_weight=eval_sample_weight,
-                                        eval_class_weight=eval_class_weight,
-                                        eval_init_score=eval_init_score,
-                                        eval_metric=eval_metric,
-                                        early_stopping_rounds=early_stopping_rounds,
-                                        verbose=verbose, feature_name=feature_name,
-                                        categorical_feature=categorical_feature,
-                                        callbacks=callbacks, init_model=init_model)
+        super().fit(X, _y, sample_weight=sample_weight, init_score=init_score, eval_set=valid_sets,
+                    eval_names=eval_names, eval_sample_weight=eval_sample_weight,
+                    eval_class_weight=eval_class_weight, eval_init_score=eval_init_score,
+                    eval_metric=eval_metric, early_stopping_rounds=early_stopping_rounds,
+                    verbose=verbose, feature_name=feature_name, categorical_feature=categorical_feature,
+                    callbacks=callbacks, init_model=init_model)
         return self
 
     _base_doc = LGBMModel.fit.__doc__
@@ -919,8 +909,7 @@ def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=Non
         X_SHAP_values : array-like of shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects
             If ``pred_contrib=True``, the feature contributions for each sample.
         """
-        result = super(LGBMClassifier, self).predict(X, raw_score, start_iteration, num_iteration,
-                                                     pred_leaf, pred_contrib, **kwargs)
+        result = super().predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, **kwargs)
         if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib):
             warnings.warn("Cannot compute class probabilities or labels "
                           "due to the usage of customized objective function.\n"
@@ -967,23 +956,18 @@ def fit(self, X, y,
             elif len(eval_group) != len(eval_set):
                 raise ValueError("Length of eval_group should be equal to eval_set")
             elif (isinstance(eval_group, dict)
-                  and any(i not in eval_group or eval_group[i] is None for i in range_(len(eval_group)))
+                  and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))
                   or isinstance(eval_group, list)
                   and any(group is None for group in eval_group)):
                 raise ValueError("Should set group for all eval datasets for ranking task; "
                                  "if you use dict, the index should start from 0")
 
         self._eval_at = eval_at
-        super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
-                                    init_score=init_score, group=group,
-                                    eval_set=eval_set, eval_names=eval_names,
-                                    eval_sample_weight=eval_sample_weight,
-                                    eval_init_score=eval_init_score, eval_group=eval_group,
-                                    eval_metric=eval_metric,
-                                    early_stopping_rounds=early_stopping_rounds,
-                                    verbose=verbose, feature_name=feature_name,
-                                    categorical_feature=categorical_feature,
-                                    callbacks=callbacks, init_model=init_model)
+        super().fit(X, y, sample_weight=sample_weight, init_score=init_score, group=group,
+                    eval_set=eval_set, eval_names=eval_names, eval_sample_weight=eval_sample_weight,
+                    eval_init_score=eval_init_score, eval_group=eval_group, eval_metric=eval_metric,
+                    early_stopping_rounds=early_stopping_rounds, verbose=verbose, feature_name=feature_name,
+                    categorical_feature=categorical_feature, callbacks=callbacks, init_model=init_model)
         return self
 
     _base_doc = LGBMModel.fit.__doc__
diff --git a/python-package/setup.py b/python-package/setup.py
index bc05628fe3e5..140c679bddd3 100644
--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -1,8 +1,5 @@
 # coding: utf-8
 """Setup lightgbm package."""
-from __future__ import absolute_import
-
-import io
 import logging
 import os
 import struct
@@ -329,8 +326,8 @@ def run(self):
         copy_file(os.path.join(CURRENT_DIR, os.path.pardir, 'VERSION.txt'),
                   os.path.join(CURRENT_DIR, 'lightgbm', 'VERSION.txt'),
                   verbose=0)
-    version = io.open(os.path.join(CURRENT_DIR, 'lightgbm', 'VERSION.txt'), encoding='utf-8').read().strip()
-    readme = io.open(os.path.join(CURRENT_DIR, 'README.rst'), encoding='utf-8').read()
+    version = open(os.path.join(CURRENT_DIR, 'lightgbm', 'VERSION.txt'), encoding='utf-8').read().strip()
+    readme = open(os.path.join(CURRENT_DIR, 'README.rst'), encoding='utf-8').read()
 
     sys.path.insert(0, CURRENT_DIR)
 
@@ -368,8 +365,6 @@ def run(self):
                        'Operating System :: Microsoft :: Windows',
                        'Operating System :: POSIX',
                        'Operating System :: Unix',
-                       'Programming Language :: Python :: 2',
-                       'Programming Language :: Python :: 2.7',
                        'Programming Language :: Python :: 3',
                        'Programming Language :: Python :: 3.6',
                        'Programming Language :: Python :: 3.7',
diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py
index 63a5834cf619..782dac4368e3 100644
--- a/tests/python_package_test/test_consistency.py
+++ b/tests/python_package_test/test_consistency.py
@@ -7,7 +7,7 @@
 from sklearn.datasets import load_svmlight_file
 
 
-class FileLoader(object):
+class FileLoader:
 
     def __init__(self, directory, prefix, config_file='train.conf'):
         directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), directory)
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index de8689fd3ea5..84f5a8cc1071 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -3,6 +3,7 @@
 import itertools
 import math
 import os
+import pickle
 import psutil
 import random
 import unittest
@@ -14,11 +15,6 @@
 from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error, roc_auc_score, average_precision_score
 from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold
 
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
 from .utils import load_boston, load_breast_cancer, load_digits, load_iris
 
 
diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py
index f0b160d60dfb..758f34d6e76f 100644
--- a/tests/python_package_test/utils.py
+++ b/tests/python_package_test/utils.py
@@ -1,23 +1,7 @@
 # coding: utf-8
-import sklearn.datasets
+from functools import lru_cache
 
-try:
-    from functools import lru_cache
-except ImportError:
-    import warnings
-    warnings.warn("Could not import functools.lru_cache", RuntimeWarning)
-
-    def lru_cache(maxsize=None):
-        cache = {}
-
-        def _lru_wrapper(user_function):
-            def wrapper(*args, **kwargs):
-                arg_key = (args, tuple(kwargs.items()))
-                if arg_key not in cache:
-                    cache[arg_key] = user_function(*args, **kwargs)
-                return cache[arg_key]
-            return wrapper
-        return _lru_wrapper
+import sklearn.datasets
 
 
 @lru_cache(maxsize=None)