Added option to scrape X amount of pages
augustobottelli committed Apr 3, 2020
1 parent 89726b1 commit dd96fa3
Showing 2 changed files with 12 additions and 4 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -22,6 +22,11 @@ $ pip3 install -r requirements.txt
```
$ python3 restaurants_scraper.py --city "Buenos Aires"
```
- If you wish to scrape only X pages instead of the whole catalog, you can add:
```
$ python3 restaurants_scraper.py --city "Buenos Aires" --max_pages X
```

It currently works for these cities:
- Buenos Aires
- Panama City
@@ -39,4 +44,4 @@ More cities can be added by including their city code and name from the Tripadvisor URL
## Disclaimer
As mentioned before, the program is a web scraper and its correctness relies on Tripadvisor's HTML structure. If the page suffers changes, the program will break.

As of today **2019/01/07 the program still works**
As of today **2020/04/03 the program still works**
9 changes: 6 additions & 3 deletions restaurants_scraper.py
@@ -108,7 +108,8 @@ def get_restaurant_info(restaurant_tag):

def _set_cli():
parser = argparse.ArgumentParser()
parser.add_argument("--city", type=str)
parser.add_argument("--city", type=str, required=True, help="Need to specify a city")
parser.add_argument("--max_pages", type=int)
args, unknown = parser.parse_known_args()
if unknown:
logging.warning(f"Unknown parameter {unknown}")
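The hunk above makes `--city` mandatory and adds the optional `--max_pages` flag, using `parse_known_args` so stray flags produce a warning rather than a hard exit. A minimal standalone sketch of that behavior (`set_cli` here is a hypothetical stand-in for the script's `_set_cli`, which reads `sys.argv`):

```python
import argparse
import logging

def set_cli(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--city", type=str, required=True, help="City to scrape")
    parser.add_argument("--max_pages", type=int)  # stays None when omitted
    # parse_known_args collects unrecognized flags instead of exiting
    args, unknown = parser.parse_known_args(argv)
    if unknown:
        logging.warning(f"Unknown parameter {unknown}")
    return args

args = set_cli(["--city", "Buenos Aires", "--max_pages", "3", "--typo"])
print(args.city, args.max_pages)  # Buenos Aires 3 (and a warning for --typo)
```

Because `--max_pages` defaults to `None`, the later `if not args.max_pages:` branch falls back to scraping the whole catalog when the flag is omitted.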
@@ -135,7 +136,10 @@ def _make_csv(restaurants_lists, city, date):
page_offset = 0
full_url = BASE_URL + f"/Restaurants-{city_code}-oa{page_offset}-{city_name}"
first_page = get_html_and_parse(full_url)
last_page_offset = _get_last_page_offset(first_page)
if not args.max_pages:
last_page_offset = _get_last_page_offset(first_page)
else:
last_page_offset = (args.max_pages - 1) * PAGE_OFFSET_INTERVAL
last_page = (last_page_offset / PAGE_OFFSET_INTERVAL) + 1

logging.info(f"Scraping page 1 of {int(last_page)}")
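The pagination math in the hunk above maps a page count to a Tripadvisor URL offset and back. A sketch of that arithmetic, assuming `PAGE_OFFSET_INTERVAL = 30` (a hypothetical value for illustration; the real constant is defined elsewhere in restaurants_scraper.py):

```python
# Assumed step between consecutive listing pages (hypothetical value).
PAGE_OFFSET_INTERVAL = 30

def last_page_offset_for(max_pages):
    # Page N starts at URL offset (N - 1) * PAGE_OFFSET_INTERVAL, so
    # limiting the run to max_pages pages means stopping at this offset.
    return (max_pages - 1) * PAGE_OFFSET_INTERVAL

def page_count(last_page_offset):
    # Inverse mapping, as used by the logging line in the diff.
    return int(last_page_offset / PAGE_OFFSET_INTERVAL) + 1

print(last_page_offset_for(3))  # 60
print(page_count(60))           # 3
```

With `--max_pages 1` the offset is 0, so only the first listing page is fetched.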
@@ -149,5 +153,4 @@ def _make_csv(restaurants_lists, city, date):
restaurants_information = get_restaurants_info(
restaurants_data, page_html, thread_pool
)

_make_csv(restaurants_data, args.city, DATE)

2 comments on commit dd96fa3

@aquinto92

Hi, I am quite new to Python, so sorry in advance if this seems too easy! First, thanks for the code, I think it's great. Unfortunately I get a lot of repeated restaurants in the responses. Any idea why that might be?

Thanks!

@augustobottelli
Owner Author


Hi aquinto! Thank you for your words!
Yes, I was told that this scraper became useless when Tripadvisor switched to a dynamic website structure a few months ago, so there is no way to paginate with a static URL. He pointed it out in this issue: #3. A solution to this problem is to use Selenium, but that means a total refactor of the project, which I don't have the time to do at the moment. Feel free to ask any other questions.
