diff --git a/pytorch_lightning/trainer/evaluate_loop.py b/pytorch_lightning/trainer/evaluate_loop.py
index ddeed31dec735..824cd365125cb 100644
--- a/pytorch_lightning/trainer/evaluate_loop.py
+++ b/pytorch_lightning/trainer/evaluate_loop.py
@@ -141,9 +141,14 @@ def evaluation_epoch_end(self, num_dataloaders):
         eval_results = self.__run_eval_epoch_end(num_dataloaders, using_eval_result)
         return eval_results
 
-    def log_epoch_metrics(self, eval_results):
+    def log_epoch_metrics(self, eval_results, test_mode):
         using_eval_result = self.is_using_eval_results()
-        self.trainer.logger_connector.on_evaluation_epoch_end(eval_results, using_eval_result)
+        eval_loop_results = self.trainer.logger_connector.on_evaluation_epoch_end(
+            eval_results,
+            using_eval_result,
+            test_mode
+        )
+        return eval_loop_results
 
     def __run_eval_epoch_end(self, num_dataloaders, using_eval_result):
         model = self.trainer.get_model()
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index caba7defe6602..c8b5ea3312b6f 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -124,33 +124,16 @@
 """
 
 from abc import ABC, abstractmethod
-from pprint import pprint
-from typing import Callable, List, Union
+from typing import Callable, List
 
 import torch
 from torch.utils.data import DataLoader
 
 from pytorch_lightning.core.lightning import LightningModule
-from pytorch_lightning.utilities import rank_zero_warn, flatten_dict, AMPType
-from pytorch_lightning.core.step_result import EvalResult, Result
+from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.trainer.evaluate_loop import EvaluationLoop
 from pytorch_lightning.trainer.logger_connector import LoggerConnector
 
-try:
-    import torch_xla.distributed.parallel_loader as xla_pl
-    import torch_xla.core.xla_model as xm
-except ImportError:
-    XLA_AVAILABLE = False
-else:
-    XLA_AVAILABLE = True
-
-try:
-    import horovod.torch as hvd
-except (ModuleNotFoundError, ImportError):
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
-
 
 class TrainerEvaluationLoopMixin(ABC):
@@ -265,15 +248,12 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None):
         eval_results = self.evaluation_loop.evaluation_epoch_end(num_dataloaders=len(dataloaders))
 
         # bookkeeping
-        self.evaluation_loop.log_epoch_metrics(eval_results)
+        eval_loop_results = self.evaluation_loop.log_epoch_metrics(eval_results, test_mode)
         self.evaluation_loop.predictions.to_disk()
 
         # hook
         self.evaluation_loop.on_evaluation_epoch_end()
 
-        # log the final eval loop metrics
-        eval_loop_results = self.__log_evaluation_epoch_metrics(eval_results, test_mode)
-
         # enable train mode again
         model.train()
         torch.set_grad_enabled(True)
@@ -282,51 +262,3 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None):
         self.evaluation_loop.on_evaluation_end()
 
         return eval_loop_results, eval_results
-
-    def __log_evaluation_epoch_metrics(self, eval_results, test_mode):
-        if self.running_sanity_check:
-            return
-
-        eval_loop_results = []
-        if eval_results is not None and len(eval_results) > 0:
-
-            # in eval, the user may return something at every validation step without final reduction
-            if not isinstance(eval_results, list):
-                eval_results = [eval_results]
-
-            for result_idx, result in enumerate(eval_results):
-                if isinstance(result, EvalResult):
-                    prog_bar_metrics = result.epoch_pbar_metrics
-                    log_metrics = result.epoch_log_metrics
-                    callback_metrics = result.callback_metrics
-
-                    # in testing we don't need the callback metrics
-                    if test_mode:
-                        callback_metrics = {}
-                else:
-                    _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.process_output(result)
-
-                # eval loop returns all metrics
-                dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics}
-
-                # add metrics to prog bar
-                self.logger_connector.add_progress_bar_metrics(prog_bar_metrics)
-
-                # log metrics
-                self.logger_connector.log_metrics(log_metrics, {})
-
-                # track metrics for callbacks
-                self.logger_connector.callback_metrics.update(callback_metrics)
-
-                if len(dataloader_result_metrics) > 0:
-                    eval_loop_results.append(dataloader_result_metrics)
-
-        # log results of test
-        if test_mode and self.is_global_zero and self.verbose_test:
-            print('-' * 80)
-            for result_idx, results in enumerate(eval_loop_results):
-                print(f'DATALOADER:{result_idx} TEST RESULTS')
-                pprint(results)
-                print('-' * 80)
-
-        return eval_loop_results
diff --git a/pytorch_lightning/trainer/logger_connector.py b/pytorch_lightning/trainer/logger_connector.py
index 3148f4ac59dad..883f344869d24 100644
--- a/pytorch_lightning/trainer/logger_connector.py
+++ b/pytorch_lightning/trainer/logger_connector.py
@@ -15,7 +15,8 @@
 from pytorch_lightning.core import memory
 from pytorch_lightning.utilities import flatten_dict
 from pytorch_lightning.utilities.model_utils import is_overridden
-from pytorch_lightning.core.step_result import Result
+from pytorch_lightning.core.step_result import EvalResult, Result
+from pprint import pprint
 
 
 class LoggerConnector:
@@ -73,7 +74,12 @@ def add_progress_bar_metrics(self, metrics):
 
         self.trainer.dev_debugger.track_pbar_metrics_history(metrics)
 
-    def on_evaluation_epoch_end(self, eval_results, using_eval_result):
+    def on_evaluation_epoch_end(self, eval_results, using_eval_result, test_mode):
+        # TODO: merge both functions?
+        self._log_on_evaluation_epoch_end_metrics(eval_results, using_eval_result)
+        return self.__log_evaluation_epoch_metrics_2(eval_results, test_mode)
+
+    def _log_on_evaluation_epoch_end_metrics(self, eval_results, using_eval_result):
         if using_eval_result:
             if isinstance(eval_results, list):
                 for eval_result in eval_results:
@@ -97,6 +103,54 @@ def on_evaluation_epoch_end(self, eval_results, using_eval_result):
             flat = flatten_dict(eval_results)
             self.trainer.logger_connector.callback_metrics.update(flat)
 
+    def __log_evaluation_epoch_metrics_2(self, eval_results, test_mode):
+        if self.trainer.running_sanity_check:
+            return
+
+        eval_loop_results = []
+        if eval_results is not None and len(eval_results) > 0:
+
+            # in eval, the user may return something at every validation step without final reduction
+            if not isinstance(eval_results, list):
+                eval_results = [eval_results]
+
+            for result_idx, result in enumerate(eval_results):
+                if isinstance(result, EvalResult):
+                    prog_bar_metrics = result.epoch_pbar_metrics
+                    log_metrics = result.epoch_log_metrics
+                    callback_metrics = result.callback_metrics
+
+                    # in testing we don't need the callback metrics
+                    if test_mode:
+                        callback_metrics = {}
+                else:
+                    _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_output(result)
+
+                # eval loop returns all metrics
+                dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics}
+
+                # add metrics to prog bar
+                self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics)
+
+                # log metrics
+                self.trainer.logger_connector.log_metrics(log_metrics, {})
+
+                # track metrics for callbacks
+                self.trainer.logger_connector.callback_metrics.update(callback_metrics)
+
+                if len(dataloader_result_metrics) > 0:
+                    eval_loop_results.append(dataloader_result_metrics)
+
+        # log results of test
+        if test_mode and self.trainer.is_global_zero and self.trainer.verbose_test:
+            print('-' * 80)
+            for result_idx, results in enumerate(eval_loop_results):
+                print(f'DATALOADER:{result_idx} TEST RESULTS')
+                pprint(results)
+                print('-' * 80)
+
+        return eval_loop_results
+
     def on_train_epoch_end(self, epoch_output, checkpoint_accumulator, early_stopping_accumulator, num_optimizers):
         self.log_train_epoch_end_metrics(epoch_output, checkpoint_accumulator,
                                          early_stopping_accumulator, num_optimizers)
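Not part of the patch: a minimal standalone Python sketch of the call flow this diff introduces (Trainer.run_evaluation -> EvaluationLoop.log_epoch_metrics -> LoggerConnector.on_evaluation_epoch_end). The class bodies are simplified stand-ins for illustration only; just the method names mirror the diff.

# Simplified sketch, not the Lightning source: shows how the loop now returns
# whatever the LoggerConnector computes instead of the trainer re-deriving it.
class LoggerConnector:
    def on_evaluation_epoch_end(self, eval_results, using_eval_result, test_mode):
        # logging side effects happen first, then the per-dataloader metrics are returned
        self._log_on_evaluation_epoch_end_metrics(eval_results, using_eval_result)
        return self.__log_evaluation_epoch_metrics_2(eval_results, test_mode)

    def _log_on_evaluation_epoch_end_metrics(self, eval_results, using_eval_result):
        pass  # progress-bar / logger bookkeeping elided in this sketch

    def __log_evaluation_epoch_metrics_2(self, eval_results, test_mode):
        results = eval_results if isinstance(eval_results, list) else [eval_results]
        return [r for r in results if r]  # one metrics dict per dataloader


class EvaluationLoop:
    def __init__(self, connector):
        self.connector = connector

    def log_epoch_metrics(self, eval_results, test_mode):
        # the loop forwards test_mode and hands the connector's result back to the trainer
        return self.connector.on_evaluation_epoch_end(
            eval_results, using_eval_result=True, test_mode=test_mode
        )


if __name__ == "__main__":
    loop = EvaluationLoop(LoggerConnector())
    print(loop.log_epoch_metrics([{"val_loss": 0.1}], test_mode=False))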