diff --git a/CHANGELOG.md b/CHANGELOG.md
index c948e22e7b553..2e388c1c40762 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for `IterableDataset` in validation and testing ([#1104](https://github.com/PyTorchLightning/pytorch-lightning/pull/1104))
 - Added support for non-primitive types in `hparams` for `TensorboardLogger` ([#1130](https://github.com/PyTorchLightning/pytorch-lightning/pull/1130))
 - Added a check that stops the training when loss or weights contain `NaN` or `inf` values. ([#1097](https://github.com/PyTorchLightning/pytorch-lightning/pull/1097))
+- Updated references to self.forward() to instead use the `__call__` interface. ([#1211](https://github.com/PyTorchLightning/pytorch-lightning/pull/1211))

 ### Changed
diff --git a/README.md b/README.md
index 314ca70d632c7..51173a0d23d2c 100644
--- a/README.md
+++ b/README.md
@@ -200,7 +200,7 @@ def validation_step(self, batch, batch_idx):
     x, y = batch

     # or as basic as a CNN classification
-    out = self.forward(x)
+    out = self(x)
     loss = my_loss(out, y)
     return {'loss': loss}
 ```
diff --git a/docs/source/child_modules.rst b/docs/source/child_modules.rst
index 6ea0c59951f9a..49fe6f463c373 100644
--- a/docs/source/child_modules.rst
+++ b/docs/source/child_modules.rst
@@ -24,7 +24,7 @@ that change in the `Autoencoder` model are the init, forward, training, validati
         x, _ = batch

         representation = self.encoder(x)
-        x_hat = self.forward(representation)
+        x_hat = self(representation)

         loss = MSE(x, x_hat)
         return loss
@@ -38,7 +38,7 @@ that change in the `Autoencoder` model are the init, forward, training, validati
     def _shared_eval(self, batch, batch_idx, prefix):
         x, y = batch
         representation = self.encoder(x)
-        x_hat = self.forward(representation)
+        x_hat = self(representation)

         loss = F.nll_loss(logits, y)
         return {f'{prefix}_loss': loss}
diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst
index c0453839519c5..6defcaa026f53 100644
--- a/docs/source/introduction_guide.rst
+++ b/docs/source/introduction_guide.rst
@@ -319,7 +319,7 @@ in the LightningModule

     def training_step(self, batch, batch_idx):
         x, y = batch
-        logits = self.forward(x)
+        logits = self(x)
         loss = F.nll_loss(logits, y)
         return {'loss': loss}
         # return loss (also works)
@@ -371,7 +371,7 @@ For clarity, we'll recall that the full LightningModule now looks like this.

     def training_step(self, batch, batch_idx):
         x, y = batch
-        logits = self.forward(x)
+        logits = self(x)
         loss = F.nll_loss(logits, y)

         # add logging
@@ -684,7 +684,7 @@ sample split in the `train_dataloader` method.
     class LitMNIST(pl.LightningModule):
         def validation_step(self, batch, batch_idx):
             x, y = batch
-            logits = self.forward(x)
+            logits = self(x)
             loss = F.nll_loss(logits, y)
             return {'val_loss': loss}
@@ -740,7 +740,7 @@ Just like the validation loop, we define exactly the same steps for testing:
     class LitMNIST(pl.LightningModule):
         def test_step(self, batch, batch_idx):
             x, y = batch
-            logits = self.forward(x)
+            logits = self(x)
             loss = F.nll_loss(logits, y)
             return {'val_loss': loss}
@@ -827,7 +827,7 @@ within it.

     def training_step(self, batch, batch_idx):
         x, y = batch
-        logits = self.forward(x)
+        logits = self(x)
         loss = F.nll_loss(logits, y)
         return loss
@@ -855,7 +855,7 @@ In this case, we've set this LightningModel to predict logits. But we could also

     def training_step(self, batch, batch_idx):
         x, y = batch
-        out, l1_feats, l2_feats, l3_feats = self.forward(x)
+        out, l1_feats, l2_feats, l3_feats = self(x)
         logits = torch.log_softmax(out, dim=1)
         ce_loss = F.nll_loss(logits, y)
         loss = perceptual_loss(l1_feats, l2_feats, l3_feats) + ce_loss
@@ -880,7 +880,7 @@ Or maybe we have a model that we use to do generation
     def training_step(self, batch, batch_idx):
         x, y = batch
         representation = self.encoder(x)
-        imgs = self.forward(representation)
+        imgs = self(representation)

         loss = perceptual_loss(imgs, x)
         return loss
diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index 0f51a654f0d68..6b8f15b736443 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -207,7 +207,7 @@ to illustrate why this is needed, let's look at dataparallel

     def training_step(self, batch, batch_idx):
         x, y = batch
-        y_hat = self.forward(batch)
+        y_hat = self(batch)

         # on dp or ddp2 if we did softmax now it would be wrong
         # because batch is actually a piece of the full batch
diff --git a/pl_examples/basic_examples/lightning_module_template.py b/pl_examples/basic_examples/lightning_module_template.py
index effd750de5fb9..d508c5f1e9399 100644
--- a/pl_examples/basic_examples/lightning_module_template.py
+++ b/pl_examples/basic_examples/lightning_module_template.py
@@ -106,7 +106,7 @@ def training_step(self, batch, batch_idx):
         x, y = batch
         x = x.view(x.size(0), -1)

-        y_hat = self.forward(x)
+        y_hat = self(x)

         # calculate loss
         loss_val = self.loss(y, y_hat)
@@ -133,7 +133,7 @@ def validation_step(self, batch, batch_idx):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_val = self.loss(y, y_hat)
diff --git a/pl_examples/domain_templates/gan.py b/pl_examples/domain_templates/gan.py
index 0d7f7834b6faf..5010026b28d17 100644
--- a/pl_examples/domain_templates/gan.py
+++ b/pl_examples/domain_templates/gan.py
@@ -105,7 +105,7 @@ def training_step(self, batch, batch_idx, optimizer_idx):
                 z = z.cuda(imgs.device.index)

             # generate images
-            self.generated_imgs = self.forward(z)
+            self.generated_imgs = self(z)

             # log sampled images
             # sample_imgs = self.generated_imgs[:6]
@@ -179,7 +179,7 @@ def on_epoch_end(self):
             z = z.cuda(self.last_imgs.device.index)

         # log sampled images
-        sample_imgs = self.forward(z)
+        sample_imgs = self(z)
         grid = torchvision.utils.make_grid(sample_imgs)
         self.logger.experiment.add_image(f'generated_images', grid, self.current_epoch)
diff --git a/pl_examples/full_examples/imagenet/imagenet_example.py b/pl_examples/full_examples/imagenet/imagenet_example.py
index 646d092ddb54d..159ba16ae960f 100644
--- a/pl_examples/full_examples/imagenet/imagenet_example.py
+++ b/pl_examples/full_examples/imagenet/imagenet_example.py
@@ -42,7 +42,7 @@ def forward(self, x):

     def training_step(self, batch, batch_idx):
         images, target = batch
-        output = self.forward(images)
+        output = self(images)
         loss_val = F.cross_entropy(output, target)
         acc1, acc5 = self.__accuracy(output, target, topk=(1, 5))
@@ -65,7 +65,7 @@ def training_step(self, batch, batch_idx):

     def validation_step(self, batch, batch_idx):
         images, target = batch
-        output = self.forward(images)
+        output = self(images)
         loss_val = F.cross_entropy(output, target)
         acc1, acc5 = self.__accuracy(output, target, topk=(1, 5))
diff --git a/pl_examples/full_examples/semantic_segmentation/semseg.py b/pl_examples/full_examples/semantic_segmentation/semseg.py
index 8f25243cffb96..3b8c6dbfccc6d 100644
--- a/pl_examples/full_examples/semantic_segmentation/semseg.py
+++ b/pl_examples/full_examples/semantic_segmentation/semseg.py
@@ -143,7 +143,7 @@ def training_step(self, batch, batch_nb):
         img, mask = batch
         img = img.float()
         mask = mask.long()
-        out = self.forward(img)
+        out = self(img)
         loss_val = F.cross_entropy(out, mask, ignore_index=250)

         return {'loss': loss_val}
diff --git a/pytorch_lightning/core/__init__.py b/pytorch_lightning/core/__init__.py
index ff03a2d32ee61..231ef7d597264 100644
--- a/pytorch_lightning/core/__init__.py
+++ b/pytorch_lightning/core/__init__.py
@@ -82,7 +82,7 @@ def forward(self, x):

         def training_step(self, batch, batch_idx):
             x, y = batch
-            y_hat = self.forward(x)
+            y_hat = self(x)
             return {'loss': F.cross_entropy(y_hat, y)}

         def train_dataloader(self):
@@ -159,7 +159,7 @@ def configure_optimizers(self):
     class LitModel(pl.LightningModule):
         def validation_step(self, batch, batch_idx):
             x, y = batch
-            y_hat = self.forward(x)
+            y_hat = self(x)
             return {'val_loss': F.cross_entropy(y_hat, y)}

         def validation_epoch_end(self, outputs):
@@ -178,7 +178,7 @@ def val_dataloader(self):
     class LitModel(pl.LightningModule):
         def test_step(self, batch, batch_idx):
             x, y = batch
-            y_hat = self.forward(x)
+            y_hat = self(x)
             return {'test_loss': F.cross_entropy(y_hat, y)}

         def test_epoch_end(self, outputs):
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 2a67d32748c42..c7d4bf8df21d0 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -97,7 +97,7 @@ def forward(self, *args, **kwargs):
         Same as torch.nn.Module.forward(), however in Lightning you want this to define
         the operations you want to use for prediction (ie: on a server or as a feature extractor).

-        Normally you'd call self.forward() from your training_step() method.
+        Normally you'd call self() from your training_step() method.
         This makes it easy to write a complex system for training with the outputs
         you'd want in a prediction setting.
@@ -117,7 +117,7 @@ def forward(self, x):

            def training_step(self, batch, batch_idx):
                x, y = batch
-                feature_maps = self.forward(x)
+                feature_maps = self(x)
                logits = self.classifier(feature_maps)

                # ...
@@ -171,7 +171,7 @@ def training_step(self, batch, batch_idx):
                x, y, z = batch

                # implement your own
-                out = self.forward(x)
+                out = self(x)
                loss = self.loss(out, x)

                logger_logs = {'training_loss': loss} # optional (MUST ALL BE TENSORS)
@@ -266,7 +266,7 @@ def training_step(self, batch, batch_idx):
                # batch is 1/num_gpus big
                x, y = batch

-                out = self.forward(x)
+                out = self(x)
                loss = self.softmax(out)
                loss = nce_loss(loss)
                return {'loss': loss}
@@ -277,7 +277,7 @@ def training_step(self, batch, batch_idx):
                # batch is 1/num_gpus big
                x, y = batch

-                out = self.forward(x)
+                out = self(x)
                return {'out': out}

            def training_step_end(self, outputs):
@@ -342,7 +342,7 @@ def validation_step(self, batch, batch_idx):
                x, y = batch

                # implement your own
-                out = self.forward(x)
+                out = self(x)
                loss = self.loss(out, y)

                # log 6 example images
@@ -413,7 +413,7 @@ def validation_step(self, batch, batch_idx):
                # batch is 1/num_gpus big
                x, y = batch

-                out = self.forward(x)
+                out = self(x)
                loss = self.softmax(out)
                loss = nce_loss(loss)
                return {'loss': loss}
@@ -424,7 +424,7 @@ def validation_step(self, batch, batch_idx):
                # batch is 1/num_gpus big
                x, y = batch

-                out = self.forward(x)
+                out = self(x)
                return {'out': out}

            def validation_epoch_end(self, outputs):
@@ -564,7 +564,7 @@ def test_step(self, batch, batch_idx):
                x, y = batch

                # implement your own
-                out = self.forward(x)
+                out = self(x)
                loss = self.loss(out, y)

                # log 6 example images
@@ -636,7 +636,7 @@ def test_step(self, batch, batch_idx):
                # batch is 1/num_gpus big
                x, y = batch

-                out = self.forward(x)
+                out = self(x)
                loss = self.softmax(out)
                loss = nce_loss(loss)
                return {'loss': loss}
@@ -647,7 +647,7 @@ def test_step(self, batch, batch_idx):
                # batch is 1/num_gpus big
                x, y = batch

-                out = self.forward(x)
+                out = self(x)
                return {'out': out}

            def test_step_end(self, outputs):
diff --git a/tests/base/debug.py b/tests/base/debug.py
index 64f8067e274ed..a3ee833506dba 100644
--- a/tests/base/debug.py
+++ b/tests/base/debug.py
@@ -26,12 +26,12 @@ def my_loss(self, y_hat, y):

     def training_step(self, batch, batch_idx):
         x, y = batch
-        y_hat = self.forward(x)
+        y_hat = self(x)
         return {'training_loss': self.my_loss(y_hat, y)}

     def validation_step(self, batch, batch_idx):
         x, y = batch
-        y_hat = self.forward(x)
+        y_hat = self(x)
         return {'val_loss': self.my_loss(y_hat, y)}

     def validation_epoch_end(self, outputs):
diff --git a/tests/base/mixins.py b/tests/base/mixins.py
index 0be691726e209..1a05049f44f5f 100644
--- a/tests/base/mixins.py
+++ b/tests/base/mixins.py
@@ -21,7 +21,7 @@ def validation_step(self, batch, batch_idx, *args, **kwargs):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_val = self.loss(y, y_hat)
@@ -114,7 +114,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx, **kwargs):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_val = self.loss(y, y_hat)
@@ -273,7 +273,7 @@ def test_step(self, batch, batch_idx, *args, **kwargs):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_test = self.loss(y, y_hat)
@@ -360,7 +360,7 @@ def test_step(self, batch, batch_idx, dataloader_idx, **kwargs):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_test = self.loss(y, y_hat)
@@ -413,7 +413,7 @@ def test_step(self, batch, batch_idx, *args, **kwargs):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_test = self.loss(y, y_hat)
@@ -460,7 +460,7 @@ def test_step(self, batch, batch_idx, dataloader_idx, **kwargs):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_test = self.loss(y, y_hat)
@@ -512,7 +512,7 @@ def validation_step(self, batch, batch_idx, *args, **kwargs):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_val = self.loss(y, y_hat)
@@ -558,7 +558,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx, **kwargs):
         """
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self.forward(x)
+        y_hat = self(x)

         loss_val = self.loss(y, y_hat)
diff --git a/tests/base/models.py b/tests/base/models.py
index 2b9fc27fb0715..e9605ca7645ab 100644
--- a/tests/base/models.py
+++ b/tests/base/models.py
@@ -54,7 +54,7 @@ def forward(self, x):

     def training_step(self, batch, batch_idx):
         x, y = batch
-        y_hat = self.forward(x)
+        y_hat = self(x)
         return {'loss': F.cross_entropy(y_hat, y)}

     def configure_optimizers(self):
@@ -140,7 +140,7 @@ def training_step(self, batch, batch_idx, optimizer_idx=None):
        x, y = batch
        x = x.view(x.size(0), -1)

-        y_hat = self.forward(x)
+        y_hat = self(x)

        # calculate loss
        loss_val = self.loss(y, y_hat)
diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index f5f5095d33a6d..75ce058a44b0c 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -295,7 +295,7 @@ def training_step(self, batch, batch_idx, hiddens):
            y_tensor = torch.tensor(y_list, dtype=x_tensor.dtype)
            assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed"

-            pred = self.forward(x_tensor.view(batch_size, truncated_bptt_steps))
+            pred = self(x_tensor.view(batch_size, truncated_bptt_steps))
            loss_val = torch.nn.functional.mse_loss(
                pred, y_tensor.view(batch_size, truncated_bptt_steps))
            return {
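The behavioral motivation behind this patch: `self(x)` routes through `torch.nn.Module.__call__`, which runs any registered forward hooks around `forward()`, whereas calling `self.forward(x)` directly bypasses them. Below is a minimal runnable sketch of that difference; the `TinyNet` module and its hook are hypothetical illustrations, not code from this repository.

```python
import torch
from torch import nn


class TinyNet(nn.Module):
    """Hypothetical module used only to contrast __call__ with .forward()."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(4, 2)

    def forward(self, x):
        return self.layer(x)


net = TinyNet()
# nn.Module.__call__ invokes this hook after forward() returns.
net.register_forward_hook(lambda module, inputs, output: print('hook fired'))

x = torch.randn(1, 4)
net(x)          # goes through __call__: prints 'hook fired'
net.forward(x)  # bypasses __call__: the hook never runs
```

The patch itself is purely mechanical: every call site keeps the same inputs and outputs, and only the dispatch path changes.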