Merge pull request #8 from algo7/fix-request-interception

Fix request interception
algo7 · Nov 23, 2022 · c49db4e · c49db4e
2 parents 8eea673 + 80e466b
commit c49db4e
Show file tree

Hide file tree

Showing 10 changed files with 140 additions and 48 deletions.
diff --git a/README.md b/README.md
@@ -17,15 +17,18 @@
 5. Edit the `SCRAPE_MODE` (RESTO for restaurants, HOTEL for hotel) variable in the `docker-compose-prod.yml` file to scrape either restaurant or hotel reviews.
 6. Edit the `CONCURRENCY` variable in the `docker-compose-prod.yml` file to set the number of concurrent requests.
    - A high concurrency number might cause the program to hang depending on the internet connection and the resource availability of your computer.
-7. Run `docker-compose -f docker-compose-prod.yml up` to start the container.
-8. Once the scraping process is finished, check the `reviews` folder for the results.
-9. Samples of the results are included in the `samples` folder.
-10. Please remember to empty the `reviews` folder before running the scraper again.
+7. Edit the `LANGUAGE` variable in the `docker-compose-prod.yml` file to the language of the reviews you want to scrape.
+   - This option is only supported in RESTO mode.
+   - Available options are `fr` (French reviews only) and `en`, which actually scrapes all the reviews regardless of language.
+8. Run `docker-compose -f docker-compose-prod.yml up` to start the container.
+9. Once the scraping process is finished, check the `reviews` folder for the results.
+10. Samples of the results are included in the `samples` folder.
+11. Please remember to empty the `reviews` folder before running the scraper again.
 
 ## Docker CLI 
 1. Download the repository.
-2. Replace the `-e SCRAP_MODE` and `-e CONCURRENCY` with custom values.
-3. Run `docker run --mount type=bind,src="$(pwd)"/reviews,target=/puppeteer/reviews --mount type=bind,src="$(pwd)"/source,target=/puppeteer/source -e SCRAPE_MODE=HOTEL -e CONCURRENCY=5 ghcr.io/algo7/tripadvisor-review-scraper/scrap:latest` in the terminal at the root directory of the project.
+2. Replace the values of `-e SCRAPE_MODE`, `-e CONCURRENCY`, and `-e LANGUAGE` with custom values.
+3. Run `docker run --mount type=bind,src="$(pwd)"/reviews,target=/puppeteer/reviews --mount type=bind,src="$(pwd)"/source,target=/puppeteer/source -e SCRAPE_MODE=HOTEL -e CONCURRENCY=5 -e LANGUAGE=en ghcr.io/algo7/tripadvisor-review-scraper/scrap:latest` in the terminal at the root directory of the project.
 
 ## If you are lazy
 1. Download the repository.
@@ -55,4 +58,4 @@ latest: Pulling from algo7/tripadvisor-review-scraper/scrap
 
 ## Known Issues
 1. The hotel scraper works for English reviews only.
-2. [Unstable] The restaurant scraper will scrape all the reviews (you can't choose the language).
+2. The restaurant scraper can only scrape all the reviews together or the French reviews alone.
diff --git a/app.js b/app.js
@@ -21,13 +21,15 @@ const dataDir = join(__dirname, './reviews/');
 const sourceDir = join(__dirname, './source/');
 
 // Environment variables
-let { SCRAPE_MODE, CONCURRENCY } = process.env;
+let { SCRAPE_MODE, CONCURRENCY, LANGUAGE } = process.env;
 CONCURRENCY = parseInt(CONCURRENCY);
 if (!CONCURRENCY) CONCURRENCY = 2;
+if (!LANGUAGE || LANGUAGE !== 'fr') LANGUAGE = 'en';
 
 
 console.log(chalk.bold.blue(`The Scraper is Running in ${chalk.bold.magenta(SCRAPE_MODE)} Mode`));
 console.log(chalk.bold.blue(`Concurrency Setting ${chalk.bold.magenta(CONCURRENCY || 2)}`));
+console.log(chalk.bold.blue(`Review Language ${chalk.bold.magenta(LANGUAGE)}`));
 
 // Check if the required directories exist, otherwise create them
 if (!fileExists(dataDir)) mkdirSync(dataDir);
@@ -162,7 +164,7 @@ const restoScraperInit = async () => {
             const { webUrl: restoUrl, name: restoName, id: restoId, } = item;
 
             processQueue.push(restoScraper(restoUrl, restoName,
-                restoId, index, browserInstance))
+                restoId, index, LANGUAGE, browserInstance))
         }
 
         // Resolve processes the left over in the process queue

diff --git a/builds/setup-darwin-amd64.bin b/builds/setup-darwin-amd64.bin
diff --git a/builds/setup-linux-amd64.bin b/builds/setup-linux-amd64.bin
diff --git a/builds/setup-windows-amd64.exe b/builds/setup-windows-amd64.exe
diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml
@@ -4,14 +4,14 @@ services:
       # It's either RESTO or HOTEL
       SCRAPE_MODE: RESTO
       CONCURRENCY: 10
+      LANGUAGE: en
     # Dont touch anything below if you don't know what you are doing
     image: ghcr.io/algo7/tripadvisor-review-scraper/scrap:latest
     volumes:
-    - source: ./reviews
-      target: /puppeteer/reviews
-      type: bind
-    - source: ./source
-      target: /puppeteer/source
-      type: bind
+      - source: ./reviews
+        target: /puppeteer/reviews
+        type: bind
+      - source: ./source
+        target: /puppeteer/source
+        type: bind
 version: '3.9'
-
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,12 +1,13 @@
 version: '3.9'
 services:
   # Service name
-  scraper: 
+  scraper:
     # Build directory
     build: .
     environment:
       SCRAPE_MODE: RESTO
       CONCURRENCY: 4
+      LANGUAGE: en
     # Image name
     image: ghcr.io/algo7/tripadvisor-review-scraper/scrap:latest
     volumes:

diff --git a/libs/browser.js b/libs/browser.js
@@ -2,9 +2,7 @@
 import puppeteer from 'puppeteer-extra'
 import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';
 import blockResourcesPlugin from 'puppeteer-extra-plugin-block-resources';
-puppeteer
-    .use(blockResourcesPlugin({ blockedTypes: new Set(['stylesheet', 'image', 'font', 'media', 'other']) }))
-    .use(AdblockerPlugin({ blockTrackers: true }))
+
 
 // Environments variables
 let { CONCURRENCY } = process.env;
@@ -86,6 +84,9 @@ class Browser {
      * @returns {Promise<puppeteer.Browser>}
      */
     async launch() {
+        puppeteer
+            .use(blockResourcesPlugin({ blockedTypes: new Set(['stylesheet', 'image', 'font', 'media', 'other']) }))
+            .use(AdblockerPlugin({ blockTrackers: true }))
         this.browser = await puppeteer.launch(this.config)
         return this.browser
     }

diff --git a/scrapers/resto.js b/scrapers/resto.js
@@ -6,12 +6,14 @@ import chalk from 'chalk';
  * Extract the review page urls, total review count, and total review page count
  * @param {String} restoUrl - The url of the restaurant page
  * @param {Number} position - The index of the restaurant page in the list
+ * @param {String} language - The language of the reviews that you want to scrape
  * @param {Object} browser - A browser instance
  * @returns {Promise<Object | Error>} - The object containing the review count, page count, and the review page urls
  */
-const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
+const extractAllReviewPageUrls = async (restoUrl, position, language, browser) => {
     try {
 
+
         // Open a new page
         const page = await browser.getNewPage()
         await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3419.0 Safari/537.36');
@@ -22,34 +24,36 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
         // Wait for the content to load
         await page.waitForSelector('body');
 
-        const [reviewExpandable, reviewExists] = await Promise.all([
-            page.evaluate(() => {
-                if (document.querySelector('.taLnk.ulBlueLinks')) return true
+
+        // Check if the restaurant has reviews
+        let reviewExists = false;
+
+
+        if (language === 'fr') {
+            reviewExists = await page.evaluate(() => {
+                if (document.querySelector('[id=filters_detail_language_filterLang_fr]')) return true
                 return false
-            }),
-            page.evaluate(() => {
+            })
+        }
+
+        if (language === 'en') {
+            reviewExists = await page.evaluate(() => {
                 if (document.querySelector('[id=filters_detail_language_filterLang_ALL]')) return true
                 return false
             })
-        ])
-
-        if (!reviewExists) {
-            return browser.handBack(page);
         }
 
-        // Select all language
-        await page.click('[id=filters_detail_language_filterLang_ALL]');
 
-        await page.waitForTimeout(1000);
+        if (!reviewExists) return browser.handBack(page);
 
-        if (reviewExpandable) {
 
-            // Expand the reviews
-            await page.click('.taLnk.ulBlueLinks');
+        // Select specified language
+        let filterString = 'ALL'
+        if (language === 'fr') filterString = 'fr'
+        await page.click(`[id=filters_detail_language_filterLang_${filterString}]`);
 
-            // Wait for the reviews to load
-            await page.waitForFunction('document.querySelector("body").innerText.includes("Show less")');
-        }
+
+        await page.waitForTimeout(1000);
 
         // Determine current URL
         const currentURL = page.url();
@@ -60,7 +64,9 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
          * In browser code:
          * Extract the review page url
          */
-        const getReviewPageUrls = await page.evaluate(() => {
+        let getReviewPageUrls = null
+
+        if (language === 'en') getReviewPageUrls = await page.evaluate(() => {
 
             // Get the total review count
             const totalReviewCount = parseInt(document
@@ -69,6 +75,46 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
                 .split(')')[0]
                 .replace(',', ''));
 
+
+            // Default review page count
+            let noReviewPages = totalReviewCount / 15;
+
+            // Calculate the last review page
+            if (totalReviewCount % 15 !== 0) {
+                noReviewPages = ((totalReviewCount - totalReviewCount % 15) / 15) + 1;
+            }
+
+            // Get the url of the 2nd page of review. The 1st page is the input link
+            let url = false;
+
+            // If there is more than 1 review page
+            if (document.getElementsByClassName('pageNum').length > 0) {
+                url = document.getElementsByClassName('pageNum')[1].href;
+            }
+
+            return {
+                noReviewPages,
+                url,
+                totalReviewCount,
+            };
+        });
+
+        if (language === 'fr') getReviewPageUrls = await page.evaluate(() => {
+
+            // Get the total review count
+            // const totalReviewCount = parseInt(document
+            //     .getElementsByClassName('reviews_header_count')[0]
+            //     .innerText.split('(')[1]
+            //     .split(')')[0]
+            //     .replace(',', ''));
+            const reviewEelement = document.getElementsByClassName('count')
+            let totalReviewCount = 0
+            for (let index = 0; index < reviewEelement.length; index++) {
+                if (reviewEelement[index].parentElement.innerText.split('(')[0].split(' ')[0] === 'French') {
+                    totalReviewCount = parseInt(document.getElementsByClassName('count')[index].innerText.split('(')[1].split(')')[0])
+                }
+            }
+
             // Default review page count
             let noReviewPages = totalReviewCount / 15;
 
@@ -92,6 +138,8 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
             };
         });
 
+
+
         // Destructure function outputs
         let { noReviewPages, url, totalReviewCount, } = getReviewPageUrls;
 
@@ -119,7 +167,7 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
             pageCount: reviewPageUrls.length,
             urls: reviewPageUrls,
         };
-
+        console.log(data)
         // Hand back the page so it's available again
         browser.handBack(page);
 
@@ -137,10 +185,11 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
  * @param {Number} position - The index of the restaurant page in the list
  * @param {String} restoName - The name of the restaurant
  * @param {String} restoId - The id of the restaurant
+ * @param {String} language - The language of the reviews that you want to scrape
  * @param {Object} browser - A browser instance
  * @returns {Promise<Object | Error>} - The final data
  */
-const scrape = async (totalReviewCount, reviewPageUrls, position, restoName, restoId, browser) => {
+const scrape = async (totalReviewCount, reviewPageUrls, position, restoName, restoId, language, browser) => {
     try {
 
         // Open a new page
@@ -159,11 +208,30 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, restoName, res
             // Wait for the content to load
             await page.waitForSelector('body');
 
-            // Select all language
-            await page.click('[id=filters_detail_language_filterLang_ALL]');
+
+            // Select specified language
+            let filterString = 'ALL'
+            if (language === 'fr') filterString = 'fr'
+            await page.click(`[id=filters_detail_language_filterLang_${filterString}]`);
 
             await page.waitForTimeout(1000);
 
+            const reviewExpandable = await page.evaluate(() => {
+                if (document.querySelector('.taLnk.ulBlueLinks')) return true
+                return false
+            })
+
+            if (reviewExpandable) {
+
+                // Expand the reviews
+                await page.click('.taLnk.ulBlueLinks');
+
+                // Wait for the reviews to load
+                await page.waitForFunction('document.querySelector("body").innerText.includes("Show less")');
+            }
+
+
+
             // Determine current URL
             const currentURL = page.url();
 
@@ -236,13 +304,14 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, restoName, res
  * @param {String} restoName - The name of the restaurant
  * @param {String} restoId - The id of the restaurant
  * @param {Number} position - The index of the restaurant page in the list
+ * @param {String} language - The language of the reviews to scrape
  * @param {Object} browser - A browser instance
  * @returns {Promise<Object | Error>} - The final data
  */
-const start = async (restoUrl, restoName, restoId, position, browser) => {
+const start = async (restoUrl, restoName, restoId, position, language, browser) => {
     try {
 
-        const extracted = await extractAllReviewPageUrls(restoUrl, position, browser);
+        const extracted = await extractAllReviewPageUrls(restoUrl, position, language, browser);
 
         // If the resto has no reviews
         if (!extracted) return {
@@ -263,7 +332,7 @@ const start = async (restoUrl, restoName, restoId, position, browser) => {
 
         const { urls, count, } = extracted
 
-        const results = await scrape(count, urls, position, restoName, restoId, browser);
+        const results = await scrape(count, urls, position, restoName, restoId, language, browser);
 
         return results;
 

diff --git a/setup/main.go b/setup/main.go
@@ -29,6 +29,7 @@ var (
 	errMissingSourceFiles       = errors.New("MISSING SOURCE FILES")
 	errInputScrapMode           = errors.New("INVALID SCRAP MODE")
 	errInputConcurrency         = errors.New("INVALID CONCURRENCY VALUE")
+	errInputLanguage            = errors.New("INVALID LANGUAGE VALUE") // returned when the language input cannot be read or is not "en"/"fr"
 	errDockerComposeYmlNotFound = errors.New("DOCKER-COMPSE-PROD.YML NOT FOUND")
 	errValueReplace             = errors.New("FAILED TO REPLACE VALUE")
 )
@@ -156,6 +157,16 @@ func userInputs(path string) error {
 		return errInputScrapMode
 	}
 
+	// Get review language
+	fmt.Println("Enter the language of the reviews (en or fr):")
+	var lang string
+	_, err = fmt.Scanf("%s\n", &lang)
+
+	// Input validation
+	if err != nil || (lang != "en" && lang != "fr") {
+		return errInputLanguage
+	}
+
 	// Get concurrency value
 	fmt.Println("Enter the concurrency value (ex: 10):")
 	var i int
@@ -169,6 +180,7 @@ func userInputs(path string) error {
 	// Print the user output
 	fmt.Println("Scrap mode:", mode)
 	fmt.Println("Concurrency value:", i)
+	fmt.Println("Review language:", lang)
 
 	// Read the docker-compose-prod.yml file
 	dockerComposeFilePath := filepath.Join(path, "Project_Files/docker-compose-prod.yml")
@@ -182,11 +194,15 @@ func userInputs(path string) error {
 	scrapModeRegex := regexp.MustCompile("SCRAPE_MODE:(.*)")
 	// Regex to match the concurrency value
 	concurrencyRegex := regexp.MustCompile("CONCURRENCY:(.*)")
+	// Regex to match the review language option
+	reviewLaguageRegex := regexp.MustCompile("LANGUAGE:(.*)")
 
 	// Replace the scrap mode with the input
 	scrapModeChanged := scrapModeRegex.ReplaceAllString(string(content), "SCRAPE_MODE: "+mode)
 	// Replace the concurrency value with the input
 	concurrencyChanged := concurrencyRegex.ReplaceAllString(scrapModeChanged, "CONCURRENCY: "+strconv.Itoa(i))
+	// Replace the review language with the input
+	reviewLanguageChanged := reviewLaguageRegex.ReplaceAllString(concurrencyChanged, "LANGUAGE: "+lang)
 
 	f, err := os.OpenFile(dockerComposeFilePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.ModePerm)
 
@@ -198,7 +214,7 @@ func userInputs(path string) error {
 	defer f.Close()
 
 	// Write the new content to the file
-	_, err = f.WriteString(concurrencyChanged)
+	_, err = f.WriteString(reviewLanguageChanged)
 
 	if err != nil {
 		return errValueReplace