Skip to content

Commit

Permalink
Merge pull request #25 from algo7/feature/scrape_comment_date_and_rating
Browse files Browse the repository at this point in the history
Feature/scrape comment date and rating
  • Loading branch information
algo7 committed May 4, 2023
2 parents 206617a + 2d133b5 commit 7d3278f
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 6 deletions.
77 changes: 71 additions & 6 deletions libs/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,71 @@ const fileExists = (filePath) => {
}
};

/**
 * Lookup table from lower-cased English month name to its 1-based month number.
 * A Map is used (instead of a plain object) so that strings such as
 * 'constructor' can never resolve to an inherited property.
 */
const MONTH_NAME_TO_NUMBER = new Map([
    ['january', 1],
    ['february', 2],
    ['march', 3],
    ['april', 4],
    ['may', 5],
    ['june', 6],
    ['july', 7],
    ['august', 8],
    ['september', 9],
    ['october', 10],
    ['november', 11],
    ['december', 12],
]);

/**
 * Convert month string to number
 *
 * Matching ignores letter case and surrounding whitespace. Unrecognized input
 * yields NaN instead of silently being reported as December, so malformed or
 * missing month text does not end up as wrong data.
 * @param {String} monthString - The full English month name (e.g. 'January')
 * @returns {Number} - The month number (1-12), or NaN if the month is not recognized
 */
const monthStringToNumber = (monthString) => {
    // Missing or non-string input (e.g. undefined) cannot be a month name
    if (typeof monthString !== 'string') return NaN;

    const normalized = monthString.trim().toLowerCase();

    return MONTH_NAME_TO_NUMBER.has(normalized)
        ? MONTH_NAME_TO_NUMBER.get(normalized)
        : NaN;
};

/**
 * Lookup table from the rating element's class name to the numeric rating.
 * A Map is used (instead of a plain object) so that strings such as
 * 'constructor' can never resolve to an inherited property.
 */
const RATING_CLASS_TO_NUMBER = new Map([
    ['bubble_10', 1],
    ['bubble_15', 1.5],
    ['bubble_20', 2],
    ['bubble_25', 2.5],
    ['bubble_30', 3],
    ['bubble_35', 3.5],
    ['bubble_40', 4],
    ['bubble_45', 4.5],
    ['bubble_50', 5],
]);

/**
 * Convert comment rating string (the class name of the rating element) to number (the actual rating)
 *
 * Unrecognized or missing class names yield NaN instead of silently being
 * reported as the top rating of 5, so a failed extraction does not inflate
 * the recorded ratings.
 * @param {String} ratingString - The class name of the rating element (e.g. 'bubble_45')
 * @returns {Number} - The actual rating (1 to 5 in steps of 0.5), or NaN if the class name is not recognized
 */
const commentRatingStringToNumber = (ratingString) => {
    return RATING_CLASS_TO_NUMBER.has(ratingString)
        ? RATING_CLASS_TO_NUMBER.get(ratingString)
        : NaN;
};

/**
* Combine all JSON files in the data directory into a JSON array of objects
* @param {String} scrapeMode - Restaurant or hotel
Expand All @@ -30,7 +95,7 @@ const combine = (scrapeMode, dataDir) => {

const extracted = allFiles
// Filter out JSON files
.filter(fileName => fileName.includes('.json'))
.filter(fileName => fileName.includes('.json') && fileName !== 'All.json')
// Load each file and extract the information
.map(fileName => {
const fileContent = JSON.parse(readFileSync(`${dataDir}${fileName}`));
Expand All @@ -42,7 +107,7 @@ const combine = (scrapeMode, dataDir) => {
const { hotelName, hotelId, position, allReviews, } = fileContent;
return { hotelName, hotelId, position, allReviews, };
})
// Sort the extracted data by the index
// Sort the extracted data by the index so all reviews of the same hotel are together
.sort((a, b) => a.position - b.position)
// Append the name, id, and index to each review
.map(item => {
Expand Down Expand Up @@ -75,11 +140,11 @@ const combine = (scrapeMode, dataDir) => {
rating, dateOfVist, ratingDate,
};
}
const { hotelName, hotelId, title, content, } = review;
const { hotelName, hotelId, title, content, month, year, rating } = review;

// Check if the hotel ID is supplied
if (!hotelId) return { hotelName, title, content, };
return { hotelName, hotelId, title, content, };
if (!hotelId) return { hotelName, title, content, month, year, rating };
return { hotelName, hotelId, title, content, month, year, rating };

});

Expand Down Expand Up @@ -181,7 +246,7 @@ const noBs = async (page) => {
}
};

export { fileExists, combine, reviewJSONToCsv, csvToJSON, dataProcessor, noBs };
export { fileExists, combine, reviewJSONToCsv, csvToJSON, dataProcessor, noBs, monthStringToNumber, commentRatingStringToNumber };


// const cookiesAvailable = await fileExists('./data/cookies.json');
Expand Down
39 changes: 39 additions & 0 deletions scrapers/hotel.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// Dependencies
import { Chalk } from 'chalk';

import { monthStringToNumber, commentRatingStringToNumber } from '../libs/utils.js';

// Environment variables
let { IS_PROVISIONER } = process.env;

Expand Down Expand Up @@ -178,6 +180,39 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, hotelName, hot
return titles;
});

// Extract comment rating
const commentRating = await page.evaluate(async () => {
const commentRatingBlocks = document.getElementsByClassName("Hlmiy")
const ratings = [];

for (let index = 0; index < commentRatingBlocks.length; index++) {
ratings.push(commentRatingBlocks[index].children[0].classList[1]);
}

return ratings;
});

// Extract date of stay
const commentDateOfStay = await page.evaluate(async () => {

const commentDateOfStayBlocks = document.getElementsByClassName('teHYY')

const dates = [];

for (let index = 0; index < commentDateOfStayBlocks.length; index++) {

// Split the date of stay text block into an array
const splitted = commentDateOfStayBlocks[index].innerText.split(' ')

dates.push({
month: splitted[3],
year: splitted[4],
});
}

return dates;
});

// Extract comments text
const commentContent = await page.evaluate(async () => {

Expand All @@ -195,9 +230,13 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, hotelName, hot

// Format (for CSV processing) the reviews so each review of each page is in an object
const formatted = commentContent.map((comment, index) => {

return {
title: commentTitle[index],
content: comment,
month: monthStringToNumber(commentDateOfStay[index].month),
year: commentDateOfStay[index].year,
rating: commentRatingStringToNumber(commentRating[index]),
};
});

Expand Down

0 comments on commit 7d3278f

Please sign in to comment.