From 6290c1e805a2a551cc2f21904966a9612a2e35c4 Mon Sep 17 00:00:00 2001 From: algo7 <11154774+algo7@users.noreply.github.com> Date: Thu, 4 May 2023 22:34:38 +0200 Subject: [PATCH 1/2] use data of review instead of date of stay as it's not always available. modify month string to num func --- libs/utils.js | 20 ++++++++++++++++++++ scrapers/hotel.js | 39 ++++++++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/libs/utils.js b/libs/utils.js index 955900d..8b0f4a1 100644 --- a/libs/utils.js +++ b/libs/utils.js @@ -46,6 +46,26 @@ const monthStringToNumber = (monthString) => { return 10; case 'November': return 11; + case 'Jan': + return 1; + case 'Feb': + return 2; + case 'Mar': + return 3; + case 'Apr': + return 4; + case 'Jun': + return 6; + case 'Jul': + return 7; + case 'Aug': + return 8; + case 'Sep': + return 9; + case 'Oct': + return 10; + case 'Nov': + return 11; default: return 12; } diff --git a/scrapers/hotel.js b/scrapers/hotel.js index 3363bc9..bfec63d 100644 --- a/scrapers/hotel.js +++ b/scrapers/hotel.js @@ -193,24 +193,38 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, hotelName, hot }); // Extract date of stay - const commentDateOfStay = await page.evaluate(async () => { + const commentDateOfReview = await page.evaluate(async () => { - const commentDateOfStayBlocks = document.getElementsByClassName('teHYY') + // const commentDateOfStayBlocks = document.getElementsByClassName('teHYY') + const commentDateBlocks = document.getElementsByClassName("cRVSd") - const dates = []; + // const datesOfStay = []; + const datesOfReview = []; - for (let index = 0; index < commentDateOfStayBlocks.length; index++) { - // Split the date of stay text block into an array - const splitted = commentDateOfStayBlocks[index].innerText.split(' ') + // for (let index = 0; index < commentDateOfStayBlocks.length; index++) { - dates.push({ - month: splitted[3], - year: splitted[4], + // // Split the date of stay text 
block into an array + // const splitted = commentDateOfStayBlocks[index].innerText.split(' ') + + // datesOfStay.push({ + // month: splitted[3], + // year: splitted[4], + // }); + // } + + for (let index = 0; index < commentDateBlocks.length; index++) { + + // Split the date of comment text block into an array + const splitted = commentDateBlocks[index].children[0].innerText.split('review').pop().split(' ') + + datesOfReview.push({ + month: splitted[1], + year: splitted[2], }); } - return dates; + return datesOfReview; }); // Extract comments text @@ -230,12 +244,11 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, hotelName, hot // Format (for CSV processing) the reviews so each review of each page is in an object const formatted = commentContent.map((comment, index) => { - return { title: commentTitle[index], content: comment, - month: monthStringToNumber(commentDateOfStay[index].month), - year: commentDateOfStay[index].year, + month: monthStringToNumber(commentDateOfReview[index].month), + year: commentDateOfReview[index].year, rating: commentRatingStringToNumber(commentRating[index]), }; }); From 47bf8fac44191e892fd6849d8f59c335d7318336 Mon Sep 17 00:00:00 2001 From: algo7 <11154774+algo7@users.noreply.github.com> Date: Thu, 4 May 2023 22:35:48 +0200 Subject: [PATCH 2/2] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 30bf855..bd90d1b 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,7 @@ latest: Pulling from algo7/tripadvisor-review-scraper/scrap ## Known Issues 1. The hotel scraper works for English reviews only. 2. The restaurant scraper can only scrap english reivews or french reviews. +3. The hotel scraper uses date of review instead of date of stay as the date because the date of stay is not always available. # Container Provisioner Container Provisioner is a tool written in [Go](https://go.dev/) that provides a UI for the users to interact with the scraper. 
It uses [Docker API](https://docs.docker.com/engine/api/) to provision the containers and run the scraper. The UI is written in raw HTML and JavaScript while the backend web framework is [Fiber](https://docs.gofiber.io/).