Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update README, ebay-kleinanzeigen crawler and immowelt crawler #4

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
# Python Flathunter-Helper

## Disclaimer
This script crawls websites and looks for new offers. Any changes to the webpages can break this script immediately, use with caution.
This script crawls websites and looks for new flat offers. If there is a new offer, a Telegram Bot will notify you.

Currently the script supports the following websites:
- wg-gesucht.de
- ebay-kleinanzeigen.de
- immobilienscout24.de
- immowelt.de


## Setup
Expand All @@ -15,7 +21,9 @@ git clone https://github.com/GerRudi/flathunter.git
apt install python3-pip
cp config.yaml.dist config.yaml
nano config.yaml
-> Do your edits to config file
```
Now make your edits to the config file in the nano editor
```
apt install python3-setuptools
apt install python3-wheel

Expand All @@ -39,7 +47,7 @@ to Telegram User
optional arguments:
-h, --help show this help message and exit
--config CONFIG, -c CONFIG
Config file to use. If not set, try to use
Config file to use, usually 'config.yaml'. If not set, try to use
'~git-clone-dir/config.yaml'

```
Expand Down Expand Up @@ -73,5 +81,8 @@ Since this feature is not free, I "disabled" it. Read line 62 in hunter.py to re
- [@tschuehly](https://github.com/tschuehly)
- [@Cugu](https://github.com/Cugu)
- [@GerRudi](https://github.com/GerRudi)
- [@calbec](https://github.com/calbec)


## License
[![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0)
4 changes: 2 additions & 2 deletions config.yaml.dist
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ loop:
# List the URLs in the following format:
# urls:
# - https://www.immobilienscout24.de/Suche/...
# - https://www.wg-gesucht.de/...
# - https://www.wg-gesucht.de/... # Use list search in wg-gesucht
# - https://www.immowelt.de/...
urls:

Expand All @@ -26,7 +26,7 @@ urls:
#
# The example configuration below includes a place for
# "John", located at the main train station of munich.
# Two kinds of travel (bicycle and transit) are requested,
# Three kinds of travel (bicycle, walking and transit) are requested,
# each with a different label. Furthermore a place for
# "Jane" is included, located at the given destination and
# with the same kinds of travel.
Expand Down
6 changes: 3 additions & 3 deletions flathunter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from flathunter.idmaintainer import IdMaintainer
from flathunter.hunter import Hunter
from flathunter.crawl_ebaykleinanzeigen import CrawlEbayKleinanzeigen
from flathunter.crawl_immowelt import CrawlImmowelt
from flathunter.crawl_immowelt import CrawlImmoWelt

__author__ = "Jan Harrie"
__version__ = "1.0"
Expand All @@ -38,7 +38,7 @@


def launch_flat_hunt(config):
searchers = [CrawlImmobilienscout(), CrawlWgGesucht(),CrawlEbayKleinanzeigen(),CrawlImmowelt()]
searchers = [CrawlImmobilienscout(), CrawlWgGesucht(),CrawlEbayKleinanzeigen(),CrawlImmoWelt()]
id_watch = IdMaintainer('%s/processed_ids.db' % os.path.dirname(os.path.abspath(__file__)))

hunter = Hunter()
Expand All @@ -64,7 +64,7 @@ def main():
# load config
config_handle = args.config
__log__.info("Using config %s" % config_handle.name)
config = yaml.load(config_handle.read())
config = yaml.load(config_handle.read(), Loader=yaml.FullLoader)

# check config
if not config.get('telegram', dict()).get('bot_token'):
Expand Down
3 changes: 2 additions & 1 deletion flathunter/crawl_ebaykleinanzeigen.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def get_results(self, search_url):
return entries

def get_page(self, search_url):
    """Fetch one ebay-kleinanzeigen search-result page.

    :param search_url: full search URL to request.
    :return: BeautifulSoup of the response body. A non-200 status is
             only logged; the body is parsed and returned regardless.
    """
    # A browser User-Agent is required — without it the site rejects
    # the request (bot detection).
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/50.0.2661.102 Safari/537.36'}
    # timeout keeps the hunting loop from hanging forever on a stalled
    # connection (requests has no default timeout).
    resp = requests.get(search_url, headers=headers, timeout=30)  # TODO add page_no in url
    if resp.status_code != 200:
        self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
    return BeautifulSoup(resp.content, 'html.parser')
Expand Down
88 changes: 21 additions & 67 deletions flathunter/crawl_immowelt.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import logging
import requests
import re
# coding= UTF-8
import logging, requests, re
from bs4 import BeautifulSoup


class CrawlImmowelt:
class CrawlImmoWelt:
__log__ = logging.getLogger(__name__)
URL_PATTERN = re.compile(r'https://www\.immowelt\.de')

Expand All @@ -30,77 +29,32 @@ def get_page(self, search_url):

def extract_data(self, soup):
    """Extract flat listings from an Immowelt search-result page.

    :param soup: BeautifulSoup of the whole search-result page.
    :return: list of dicts, one per listing, with keys
             id, url, title, price, size, rooms and address.
    """
    entries = []

    # Listings live inside the "iw_list_content" container; each result
    # element carries a data-estateid attribute. Promoted/ad elements
    # additionally carry data-action and are filtered out here.
    container = soup.find('div', class_="iw_list_content")
    if container is None:
        # Layout changed or empty result page: return no entries
        # instead of raising AttributeError.
        return entries
    results = container.find_all(
        lambda e: e.has_attr('data-estateid') and not e.has_attr('data-action'))

    for listing in results:
        href = listing.find('a').get('href')
        # The expose id is the base-36 token between "expose/" and "?".
        raw_id = href.split('expose/', 1)[1].split('?', 1)[0].strip()
        expose_id = int(raw_id, base=36)  # avoid shadowing builtin id()
        url = "https://www.immowelt.de" + href

        price = listing.find('div', class_="hardfact price_rent").find("strong").text.strip()
        # Size text ends with "... ca.) <size>" -> keep the trailing part.
        size = listing.find('div', class_="hardfact ").text.split('ca.)', 1)[1].strip()
        # Rooms text ends with "... Zimmer <n>" -> keep the trailing part.
        rooms = listing.find('div', class_="hardfact rooms").text.split('Zimmer', 1)[1].strip()
        address = listing.find('div', class_="listlocation ellipsis relative").text.strip()
        title = listing.find('h2').text.strip()

        entries.append({
            'id': expose_id,
            'url': url,
            'title': title,
            'price': price,
            'size': size,
            'rooms': rooms,
            'address': address,
        })

    self.__log__.debug('extracted: ' + str(entries))
    return entries

def load_address(self, url):
    """Load the address of a single expose from its detail page.

    The search-result list only shows a partial location, so the full
    address is scraped from the expose page itself.

    :param url: URL of the expose page.
    :return: address string "<locality> <street>"; either part is empty
             when the corresponding element is missing on the page.
    """
    # timeout keeps the hunting loop from hanging forever on a stalled
    # connection (requests has no default timeout).
    expose_html = requests.get(url, timeout=30).content
    expose_soup = BeautifulSoup(expose_html, 'html.parser')
    try:
        street_raw = expose_soup.find(id="street-address").text
    except AttributeError:
        # Element missing -> no street published for this expose.
        street_raw = ""
    try:
        address_raw = expose_soup.find(id="viewad-locality").text
    except AttributeError:
        address_raw = ""
    return address_raw.strip().replace("\n", "") + " " + street_raw.strip()
2 changes: 1 addition & 1 deletion flathunter/crawl_wggesucht.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get_page(self, search_url, page_no):
def extract_data(self, soup):
entries = []

findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('ad--'))
findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('ad-'))
existingFindings = list(
filter(lambda e: e.has_attr('class') and not 'listenansicht-inactive' in e['class'], findings))

Expand Down