Skip to content

Commit

Permalink
Merge pull request #8 from algo7/fix-request-interception
Browse files Browse the repository at this point in the history
Fix request interception
  • Loading branch information
algo7 authored Nov 23, 2022
2 parents 8eea673 + 80e466b commit c49db4e
Show file tree
Hide file tree
Showing 10 changed files with 140 additions and 48 deletions.
17 changes: 10 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,18 @@
5. Edit the `SCRAPE_MODE` (RESTO for restaurants, HOTEL for hotel) variable in the `docker-compose-prod.yml` file to scrape either restaurant or hotel reviews.
6. Edit the `CONCURRENCY` variable in the `docker-compose-prod.yml` file to set the number of concurrent requests.
- A high concurrency number might cause the program to hang depending on the internet connection and the resource availability of your computer.
7. Run `docker-compose -f docker-compose-prod.yml up` to start the container.
8. Once the scraping process is finished, check the `reviews` folder for the results.
9. Samples of the results are included in the `samples` folder.
10. Please remember to empty the `reviews` folder before running the scraper again.
7. Edit the `LANGUAGE` variable in the `docker-compose-prod.yml` file to the language of the reviews you want to scrape.
- This option is only supported in RESTO mode.
- Available options are `fr` (French reviews only) and `en` (which will actually scrape the reviews in all languages).
8. Run `docker-compose -f docker-compose-prod.yml up` to start the container.
9. Once the scraping process is finished, check the `reviews` folder for the results.
10. Samples of the results are included in the `samples` folder.
11. Please remember to empty the `reviews` folder before running the scraper again.

## Docker CLI
1. Download the repository.
2. Replace the `-e SCRAP_MODE` and `-e CONCURRENCY` with custom values.
3. Run `docker run --mount type=bind,src="$(pwd)"/reviews,target=/puppeteer/reviews --mount type=bind,src="$(pwd)"/source,target=/puppeteer/source -e SCRAPE_MODE=HOTEL -e CONCURRENCY=5 ghcr.io/algo7/tripadvisor-review-scraper/scrap:latest` in the terminal at the root directory of the project.
2. Replace the values of `-e SCRAPE_MODE`, `-e CONCURRENCY`, and `-e LANGUAGE` with custom values.
3. Run `docker run --mount type=bind,src="$(pwd)"/reviews,target=/puppeteer/reviews --mount type=bind,src="$(pwd)"/source,target=/puppeteer/source -e SCRAPE_MODE=HOTEL -e CONCURRENCY=5 -e LANGUAGE=en ghcr.io/algo7/tripadvisor-review-scraper/scrap:latest` in the terminal at the root directory of the project.

## If you are lazy
1. Download the repository.
Expand Down Expand Up @@ -55,4 +58,4 @@ latest: Pulling from algo7/tripadvisor-review-scraper/scrap

## Known Issues
1. The hotel scraper works for English reviews only.
2. [Unstable] The restaurant scraper will scrape all the reviews (you can't choose the language).
2. The restaurant scraper can only scrape all the reviews together or the French reviews alone.
6 changes: 4 additions & 2 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@ const dataDir = join(__dirname, './reviews/');
const sourceDir = join(__dirname, './source/');

// Environment variables
let { SCRAPE_MODE, CONCURRENCY } = process.env;
let { SCRAPE_MODE, CONCURRENCY, LANGUAGE } = process.env;
CONCURRENCY = parseInt(CONCURRENCY);
if (!CONCURRENCY) CONCURRENCY = 2;
if (!LANGUAGE || LANGUAGE !== 'fr') LANGUAGE = 'en';


console.log(chalk.bold.blue(`The Scraper is Running in ${chalk.bold.magenta(SCRAPE_MODE)} Mode`));
console.log(chalk.bold.blue(`Concurrency Setting ${chalk.bold.magenta(CONCURRENCY || 2)}`));
console.log(chalk.bold.blue(`Review Language ${chalk.bold.magenta(LANGUAGE)}`));

// Check if the required directories exist, otherwise create them
if (!fileExists(dataDir)) mkdirSync(dataDir);
Expand Down Expand Up @@ -162,7 +164,7 @@ const restoScraperInit = async () => {
const { webUrl: restoUrl, name: restoName, id: restoId, } = item;

processQueue.push(restoScraper(restoUrl, restoName,
restoId, index, browserInstance))
restoId, index, LANGUAGE, browserInstance))
}

// Resolve processes the left over in the process queue
Expand Down
Binary file modified builds/setup-darwin-amd64.bin
Binary file not shown.
Binary file modified builds/setup-linux-amd64.bin
Binary file not shown.
Binary file modified builds/setup-windows-amd64.exe
Binary file not shown.
14 changes: 7 additions & 7 deletions docker-compose-prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ services:
# It's either RESTO or HOTEL
SCRAPE_MODE: RESTO
CONCURRENCY: 10
LANGUAGE: en
# Don't touch anything below if you don't know what you are doing
image: ghcr.io/algo7/tripadvisor-review-scraper/scrap:latest
volumes:
- source: ./reviews
target: /puppeteer/reviews
type: bind
- source: ./source
target: /puppeteer/source
type: bind
- source: ./reviews
target: /puppeteer/reviews
type: bind
- source: ./source
target: /puppeteer/source
type: bind
version: '3.9'

3 changes: 2 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
version: '3.9'
services:
# Service name
scraper:
scraper:
# Build directory
build: .
environment:
SCRAPE_MODE: RESTO
CONCURRENCY: 4
LANGUAGE: en
# Image name
image: ghcr.io/algo7/tripadvisor-review-scraper/scrap:latest
volumes:
Expand Down
7 changes: 4 additions & 3 deletions libs/browser.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
import puppeteer from 'puppeteer-extra'
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';
import blockResourcesPlugin from 'puppeteer-extra-plugin-block-resources';
puppeteer
.use(blockResourcesPlugin({ blockedTypes: new Set(['stylesheet', 'image', 'font', 'media', 'other']) }))
.use(AdblockerPlugin({ blockTrackers: true }))


// Environments variables
let { CONCURRENCY } = process.env;
Expand Down Expand Up @@ -86,6 +84,9 @@ class Browser {
* @returns {Promise<puppeteer.Browser>}
*/
async launch() {
puppeteer
.use(blockResourcesPlugin({ blockedTypes: new Set(['stylesheet', 'image', 'font', 'media', 'other']) }))
.use(AdblockerPlugin({ blockTrackers: true }))
this.browser = await puppeteer.launch(this.config)
return this.browser
}
Expand Down
123 changes: 96 additions & 27 deletions scrapers/resto.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ import chalk from 'chalk';
* Extract the review page urls, total review count, and total review page count
* @param {String} restoUrl - The url of the restaurant page
* @param {Number} position - The index of the restaurant page in the list
* @param {String} language - The language of the reviews that you want to scrape
* @param {Object} browser - A browser instance
* @returns {Promise<Object | Error>} - The object containing the review count, page count, and the review page urls
*/
const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
const extractAllReviewPageUrls = async (restoUrl, position, language, browser) => {
try {


// Open a new page
const page = await browser.getNewPage()
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3419.0 Safari/537.36');
Expand All @@ -22,34 +24,36 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
// Wait for the content to load
await page.waitForSelector('body');

const [reviewExpandable, reviewExists] = await Promise.all([
page.evaluate(() => {
if (document.querySelector('.taLnk.ulBlueLinks')) return true

// Check if the restaurant has reviews
let reviewExists = false;


if (language === 'fr') {
reviewExists = await page.evaluate(() => {
if (document.querySelector('[id=filters_detail_language_filterLang_fr]')) return true
return false
}),
page.evaluate(() => {
})
}

if (language === 'en') {
reviewExists = await page.evaluate(() => {
if (document.querySelector('[id=filters_detail_language_filterLang_ALL]')) return true
return false
})
])

if (!reviewExists) {
return browser.handBack(page);
}

// Select all language
await page.click('[id=filters_detail_language_filterLang_ALL]');

await page.waitForTimeout(1000);
if (!reviewExists) return browser.handBack(page);

if (reviewExpandable) {

// Expand the reviews
await page.click('.taLnk.ulBlueLinks');
// Select specified language
let filterString = 'ALL'
if (language === 'fr') filterString = 'fr'
await page.click(`[id=filters_detail_language_filterLang_${filterString}]`);

// Wait for the reviews to load
await page.waitForFunction('document.querySelector("body").innerText.includes("Show less")');
}

await page.waitForTimeout(1000);

// Determin current URL
const currentURL = page.url();
Expand All @@ -60,7 +64,9 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
* In browser code:
* Extract the review page url
*/
const getReviewPageUrls = await page.evaluate(() => {
let getReviewPageUrls = null

if (language === 'en') getReviewPageUrls = await page.evaluate(() => {

// Get the total review count
const totalReviewCount = parseInt(document
Expand All @@ -69,6 +75,46 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
.split(')')[0]
.replace(',', ''));


// Default review page count
let noReviewPages = totalReviewCount / 15;

// Calculate the last review page
if (totalReviewCount % 15 !== 0) {
noReviewPages = ((totalReviewCount - totalReviewCount % 15) / 15) + 1;
}

// Get the url of the 2nd page of review. The 1st page is the input link
let url = false;

// If there is more than 1 review page
if (document.getElementsByClassName('pageNum').length > 0) {
url = document.getElementsByClassName('pageNum')[1].href;
}

return {
noReviewPages,
url,
totalReviewCount,
};
});

if (language === 'fr') getReviewPageUrls = await page.evaluate(() => {

// Get the total review count
// const totalReviewCount = parseInt(document
// .getElementsByClassName('reviews_header_count')[0]
// .innerText.split('(')[1]
// .split(')')[0]
// .replace(',', ''));
const reviewEelement = document.getElementsByClassName('count')
let totalReviewCount = 0
for (let index = 0; index < reviewEelement.length; index++) {
if (reviewEelement[index].parentElement.innerText.split('(')[0].split(' ')[0] === 'French') {
totalReviewCount = parseInt(document.getElementsByClassName('count')[index].innerText.split('(')[1].split(')')[0])
}
}

// Default review page count
let noReviewPages = totalReviewCount / 15;

Expand All @@ -92,6 +138,8 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
};
});



// Destructure function outputs
let { noReviewPages, url, totalReviewCount, } = getReviewPageUrls;

Expand Down Expand Up @@ -119,7 +167,7 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
pageCount: reviewPageUrls.length,
urls: reviewPageUrls,
};

console.log(data)
// Hand back the page so it's available again
browser.handBack(page);

Expand All @@ -137,10 +185,11 @@ const extractAllReviewPageUrls = async (restoUrl, position, browser) => {
* @param {Number} position - The index of the restaurant page in the list
* @param {String} restoName - The name of the restaurant
* @param {String} restoId - The id of the restaurant
* @param {String} language - The language of the reviews that you want to scrape
* @param {Object} browser - A browser instance
* @returns {Promise<Object | Error>} - The final data
*/
const scrape = async (totalReviewCount, reviewPageUrls, position, restoName, restoId, browser) => {
const scrape = async (totalReviewCount, reviewPageUrls, position, restoName, restoId, language, browser) => {
try {

// Open a new page
Expand All @@ -159,11 +208,30 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, restoName, res
// Wait for the content to load
await page.waitForSelector('body');

// Select all language
await page.click('[id=filters_detail_language_filterLang_ALL]');

// Select specified language
let filterString = 'ALL'
if (language === 'fr') filterString = 'fr'
await page.click(`[id=filters_detail_language_filterLang_${filterString}]`);

await page.waitForTimeout(1000);

const reviewExpandable = await page.evaluate(() => {
if (document.querySelector('.taLnk.ulBlueLinks')) return true
return false
})

if (reviewExpandable) {

// Expand the reviews
await page.click('.taLnk.ulBlueLinks');

// Wait for the reviews to load
await page.waitForFunction('document.querySelector("body").innerText.includes("Show less")');
}



// Determine current URL
const currentURL = page.url();

Expand Down Expand Up @@ -236,13 +304,14 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, restoName, res
* @param {String} restoName - The name of the restaurant
* @param {String} restoId - The id of the restaurant
* @param {Number} position - The index of the restaurant page in the list
* @param {String} language - The language of the reviews to scrape
* @param {Object} browser - A browser instance
* @returns {Promise<Object | Error>} - The final data
*/
const start = async (restoUrl, restoName, restoId, position, browser) => {
const start = async (restoUrl, restoName, restoId, position, language, browser) => {
try {

const extracted = await extractAllReviewPageUrls(restoUrl, position, browser);
const extracted = await extractAllReviewPageUrls(restoUrl, position, language, browser);

// If the resto has no reviews
if (!extracted) return {
Expand All @@ -263,7 +332,7 @@ const start = async (restoUrl, restoName, restoId, position, browser) => {

const { urls, count, } = extracted

const results = await scrape(count, urls, position, restoName, restoId, browser);
const results = await scrape(count, urls, position, restoName, restoId, language, browser);

return results;

Expand Down
18 changes: 17 additions & 1 deletion setup/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ var (
errMissingSourceFiles = errors.New("MISSING SOURCE FILES")
errInputScrapMode = errors.New("INVALID SCRAP MODE")
errInputConcurrency = errors.New("INVALID CONCURRENCY VALUE")
errInputLanguage = errors.New("INVALID CONCURRENCY VALUE")
errDockerComposeYmlNotFound = errors.New("DOCKER-COMPSE-PROD.YML NOT FOUND")
errValueReplace = errors.New("FAILED TO REPLACE VALUE")
)
Expand Down Expand Up @@ -156,6 +157,16 @@ func userInputs(path string) error {
return errInputScrapMode
}

// Get review language
fmt.Println("Enter the language of the reviews (en or fr):")
var lang string
_, err = fmt.Scanf("%s\n", &lang)

// Input validation
if err != nil || (lang != "en" && lang != "fr") {
return errInputLanguage
}

// Get concurrency value
fmt.Println("Enter the concurrency value (ex: 10):")
var i int
Expand All @@ -169,6 +180,7 @@ func userInputs(path string) error {
// Print the user output
fmt.Println("Scrap mode:", mode)
fmt.Println("Concurrency value:", i)
fmt.Println("Review language:", lang)

// Read the docker-compose-prod.yml file
dockerComposeFilePath := filepath.Join(path, "Project_Files/docker-compose-prod.yml")
Expand All @@ -182,11 +194,15 @@ func userInputs(path string) error {
scrapModeRegex := regexp.MustCompile("SCRAPE_MODE:(.*)")
// Regex to match the concurrency value
concurrencyRegex := regexp.MustCompile("CONCURRENCY:(.*)")
// Regex to match the review language option
reviewLaguageRegex := regexp.MustCompile("LANGUAGE:(.*)")

// Replace the scrap mode with the input
scrapModeChanged := scrapModeRegex.ReplaceAllString(string(content), "SCRAPE_MODE: "+mode)
// Replace the concurrency value with the input
concurrencyChanged := concurrencyRegex.ReplaceAllString(scrapModeChanged, "CONCURRENCY: "+strconv.Itoa(i))
// Replace the review language with the input
reviewLanguageChanged := reviewLaguageRegex.ReplaceAllString(concurrencyChanged, "LANGUAGE: "+lang)

f, err := os.OpenFile(dockerComposeFilePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.ModePerm)

Expand All @@ -198,7 +214,7 @@ func userInputs(path string) error {
defer f.Close()

// Write the new content to the file
_, err = f.WriteString(concurrencyChanged)
_, err = f.WriteString(reviewLanguageChanged)

if err != nil {
return errValueReplace
Expand Down

0 comments on commit c49db4e

Please sign in to comment.