Skip to content

Commit

Permalink
Refactoring/main (#224)
Browse files Browse the repository at this point in the history
* add config package

* Refactor scraper main.go file

* Remove unused init function

* fix error message typo

* update comments

* Refactor file type handling in config and main.go

* move ip check function to utils package

* Remove unused variables and comments

* Refactor JSON file writing and review sorting

* comments

* comments

* formatting
  • Loading branch information
algo7 committed Mar 15, 2024
1 parent 0820aba commit deaf9cb
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 105 deletions.
4 changes: 3 additions & 1 deletion scraper/go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
module github.com/algo7/TripAdvisor-Review-Scraper/scraper

go 1.21
go 1.21.1

toolchain go1.21.4
Empty file added scraper/go.sum
Empty file.
54 changes: 54 additions & 0 deletions scraper/internal/config/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package config

import (
"fmt"
"os"
"strings"
)

// Config is a struct that represents the configuration for the scraper
type Config struct {
	LocationURL string   // TripAdvisor location URL to scrape (from LOCATION_URL; required)
	Languages   []string // review language codes (from LANGUAGES, "|"-separated; defaults to ["en"])
	FileType    string   // output format, "csv" or "json" (from FILETYPE, lowercased; defaults to "csv")
	ProxyHost   string   // proxy address (from PROXY_HOST; empty means no proxy is used)
}

// NewConfig builds a Config for the scraper from environment variables.
//
// Environment variables read:
//
//	LOCATION_URL - TripAdvisor location URL to scrape (required).
//	LANGUAGES    - "|"-separated language codes; defaults to ["en"].
//	FILETYPE     - output format, "csv" or "json" (case-insensitive);
//	               defaults to "csv".
//	PROXY_HOST   - optional proxy address; empty means no proxy.
//
// It returns an error when LOCATION_URL is unset or FILETYPE is neither
// "csv" nor "json".
func NewConfig() (*Config, error) {
	// LOCATION_URL is the only mandatory setting.
	locationURL := os.Getenv("LOCATION_URL")
	if locationURL == "" {
		return nil, fmt.Errorf("LOCATION_URL not set")
	}

	// Languages default to English unless overridden, e.g. "en|fr|de".
	languages := []string{"en"}
	if envLang := os.Getenv("LANGUAGES"); envLang != "" {
		languages = strings.Split(envLang, "|")
	}

	// File type is normalized to lower case so "CSV" and "csv" are
	// treated the same; an empty value falls back to csv.
	fileType := strings.ToLower(os.Getenv("FILETYPE"))
	if fileType == "" {
		fileType = "csv"
	}
	switch fileType {
	case "csv", "json":
		// supported output formats
	default:
		return nil, fmt.Errorf("invalid file type %q: use csv or json", fileType)
	}

	// Proxy is optional; empty means a direct connection.
	proxyHost := os.Getenv("PROXY_HOST")

	return &Config{
		LocationURL: locationURL,
		Languages:   languages,
		FileType:    fileType,
		ProxyHost:   proxyHost,
	}, nil
}
107 changes: 27 additions & 80 deletions scraper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,63 +2,38 @@ package main

import (
"encoding/csv"
"encoding/json"
"fmt"
"log"
"math/rand"
"net/http"
"os"
"sort"
"strconv"
"strings"
"time"

"github.com/algo7/TripAdvisor-Review-Scraper/scraper/internal/config"
"github.com/algo7/TripAdvisor-Review-Scraper/scraper/pkg/tripadvisor"
)

var (
// LANGUAGES is a slice of languages to be used for scraping, default is English
// var LANGUAGES = []string{"en", "fr", "pt", "es", "de", "it", "ru", "ja", "zh", "ko", "nl", "sv", "da", "fi", "no", "pl", "hu", "cs", "el", "tr", "th", "ar", "he", "id", "ms", "vi", "tl", "uk", "ro", "bg", "hr", "sr", "sk", "sl", "et", "lv", "lt", "sq", "mk", "hi", "bn", "pa", "gu", "ta", "te", "kn", "ml", "mr", "ur", "fa", "ne", "si", "my", "km", "lo", "am", "ka", "hy", "az", "uz", "tk", "ky", "tg", "mn", "bo", "sd", "ps", "ku", "gl", "eu", "ca", "is", "af", "xh", "zu", "ny", "st", "tn", "sn", "sw", "rw", "so", "mg", "eo", "cy", "gd", "gv", "ga", "mi", "sm", "to", "haw", "id", "jw"}
LANGUAGES = []string{"en"}

// FILETYPE is the type of file to be saved, default is csv
FILETYPE = "csv"
"github.com/algo7/TripAdvisor-Review-Scraper/scraper/pkg/utils"
)

func main() {
// Scraper variables
var allReviews []tripadvisor.Review
var location tripadvisor.Location

// Get the location URL from the environment variable
locationURL := os.Getenv("LOCATION_URL")
log.Printf("Location URL: %s", locationURL)

// Get the languages from the environment variable of use "en" as default
languages := LANGUAGES
if os.Getenv("LANGUAGES") != "" {
languages = strings.Split(os.Getenv("LANGUAGES"), "|")
}
log.Printf("Languages: %v", languages)

// Get the file type from the environment variable or use "csv" as default
fileType := FILETYPE
if os.Getenv("FILETYPE") != "" {
fileType = os.Getenv("FILETYPE")
}
if fileType != "csv" && fileType != "json" {
log.Fatal("Invalid file type. Use csv or json")
config, err := config.NewConfig()
if err != nil {
log.Fatalf("Error creating scrape config: %v", err)
}
log.Printf("File Type: %s", fileType)

// Get the query type from the URL
queryType := tripadvisor.GetURLType(locationURL)
queryType := tripadvisor.GetURLType(config.LocationURL)
if queryType == "" {
log.Fatal("Invalid URL")
}
log.Printf("Location Type: %s", queryType)

// Parse the location ID and location name from the URL
locationID, locationName, err := tripadvisor.ParseURL(locationURL, queryType)
locationID, locationName, err := tripadvisor.ParseURL(config.LocationURL, queryType)
if err != nil {
log.Fatalf("Error parsing URL: %v", err)
}
Expand All @@ -68,34 +43,31 @@ func main() {
// Get the query ID for the given query type.
queryID := tripadvisor.GetQueryID(queryType)
if err != nil {
log.Fatal("The location ID must be an positive integer")
log.Fatal("The location ID must be a positive integer")
}

// Get the proxy host if set
proxyHost := os.Getenv("PROXY_HOST")

// The default HTTP client
client := &http.Client{}

// If the proxy host is set, use the proxy client
if proxyHost != "" {
if config.ProxyHost != "" {

// Get the HTTP client with the proxy
client, err = tripadvisor.GetHTTPClientWithProxy(proxyHost)
client, err = tripadvisor.GetHTTPClientWithProxy(config.ProxyHost)
if err != nil {
log.Fatalf("Error creating HTTP client with the give proxy %s: %v", proxyHost, err)
log.Fatalf("Error creating HTTP client with the give proxy %s: %v", config.ProxyHost, err)
}

// Check IP
ip, err := tripadvisor.CheckIP(client)
ip, err := utils.CheckIP(client)
if err != nil {
log.Fatalf("Error checking IP: %v", err)
}
log.Printf("Proxy IP: %s", ip)
}

// Fetch the review count for the given location ID
reviewCount, err := tripadvisor.FetchReviewCount(client, locationID, queryType, languages)
reviewCount, err := tripadvisor.FetchReviewCount(client, locationID, queryType, config.Languages)
if err != nil {
log.Fatalf("Error fetching review count: %v", err)
}
Expand All @@ -105,7 +77,7 @@ func main() {
log.Printf("Review count: %d", reviewCount)

// Create a file to save the reviews data
fileName := "reviews." + fileType
fileName := fmt.Sprintf("reviews.%s", config.FileType)
fileHandle, err := os.Create(fileName)
if err != nil {
log.Fatalf("Error creating file %s: %v", fileName, err)
Expand All @@ -131,7 +103,7 @@ func main() {
offset := tripadvisor.CalculateOffset(i)

// Make the request to the TripAdvisor GraphQL endpoint
resp, err := tripadvisor.MakeRequest(client, queryID, languages, locationID, offset, 20)
resp, err := tripadvisor.MakeRequest(client, queryID, config.Languages, locationID, offset, 20)
if err != nil {
log.Fatalf("Error making request at iteration %d: %v", i, err)
}
Expand All @@ -156,7 +128,7 @@ func main() {
// Store the location data
location = response[0].Data.Locations[0].Location

if fileType == "csv" {
if config.FileType == "csv" {
// Iterating over the reviews
for _, row := range reviews {
row := []string{
Expand All @@ -177,7 +149,8 @@ func main() {
}

}
if fileType == "csv" {

if config.FileType == "csv" {
// Create a new csv writer. We are using writeAll so defer writer.Flush() is not required
writer := csv.NewWriter(fileHandle)

Expand All @@ -192,45 +165,19 @@ func main() {
if err != nil {
log.Fatalf("Error writing data to csv: %v", err)
}
} else {
// Write the data to the JSON file
const layout = "2006-01-02"

sort.Slice(allReviews, func(i, j int) bool {
iTime, err := time.Parse(layout, allReviews[i].CreatedDate)
if err != nil {
log.Fatalf("Error parsing time: %v", err)
}

jTime, err := time.Parse(layout, allReviews[j].CreatedDate)
if err != nil {
log.Fatalf("Error parsing time: %v", err)
}
}

return jTime.After(iTime)
})
// If the file type is JSON, write the data to the file
if config.FileType == "json" {
// Sort the reviews by date
tripadvisor.SortReviewsByDate(allReviews)

feedback := tripadvisor.Feedback{
Location: location,
Reviews: allReviews,
}
data, err := json.Marshal(feedback)
if err != nil {
log.Fatalf("Could not marshal data: %v", err)
}
_, err = fileHandle.Write(data)
// Write the data to the JSON file
err := tripadvisor.WriteReviewsToJSONFile(allReviews, location, fileHandle)
if err != nil {
log.Fatalf("Could not write data: %v", err)
log.Fatalf("Error writing data to JSON file: %v", err)
}
}

log.Printf("Data written to %s", fileName)
log.Println("Scrapping completed")
}

func init() {
// Check if the environment variables are set
if os.Getenv("LOCATION_URL") == "" {
log.Fatal("LOCATION_URL not set")
}
}
52 changes: 28 additions & 24 deletions scraper/pkg/tripadvisor/tripadvisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ import (
"log"
"net/http"
"os"
"sort"
"strconv"
"strings"
"time"
)

// MakeRequest is a function that sends a POST request to the TripAdvisor GraphQL endpoint
Expand Down Expand Up @@ -143,30 +145,6 @@ func FetchReviewCount(client *http.Client, locationID uint32, queryType string,
return 0, fmt.Errorf("no reviews found for location ID %d", locationID)
}

// CheckIP takes in a http client and calls ipinfo.io/ip to check the current IP address
func CheckIP(client *http.Client) (ip string, err error) {

// Make the request to ipinfo.io/ip
resp, err := client.Get("https://ipinfo.io/ip")
if err != nil {
return "", fmt.Errorf("error getting IP address: %w", err)
}
defer resp.Body.Close()

// Check the response status code
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("error response status code: %d", resp.StatusCode)
}

// Read the response body
responseBody, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("error reading response body: %w", err)
}

return string(responseBody), nil
}

// CalculateIterations is a function that calculates the number of iterations required to fetch all reviews
func CalculateIterations(reviewCount uint32) (iterations uint32) {

Expand Down Expand Up @@ -249,3 +227,29 @@ func ParseURL(url string, locationType string) (locationID uint32, locationName
return 0, "", fmt.Errorf("invalid location type: %s", locationType)
}
}

// WriteReviewsToJSONFile wraps the given reviews and location in a single
// Feedback document, marshals it to JSON, and writes the result to
// fileHandle. The caller owns fileHandle and is responsible for closing it.
// It returns a wrapped error if marshalling or the write fails.
func WriteReviewsToJSONFile(reviews []Review, location Location, fileHandle *os.File) error {
	feedback := Feedback{
		Location: location,
		Reviews:  reviews,
	}
	data, err := json.Marshal(feedback)
	if err != nil {
		return fmt.Errorf("could not marshal data: %w", err)
	}
	if _, err := fileHandle.Write(data); err != nil {
		return fmt.Errorf("could not write data to file: %w", err)
	}
	return nil
}

// SortReviewsByDate sorts the reviews in place so that the most recent
// review (by CreatedDate, layout "2006-01-02") comes first.
//
// Dates that fail to parse are treated as the zero time and therefore sort
// to the end. The previous code silently discarded time.Parse errors inside
// the comparator; the fallback is now explicit and each distinct date
// string is parsed only once instead of on every comparison.
//
// NOTE(review): this orders newest-first, while the pre-refactor inline
// sort in main.go ordered oldest-first — confirm the reversal is intended.
func SortReviewsByDate(reviews []Review) {
	const layout = "2006-01-02"

	// Parse each distinct date string once up front; parsing inside the
	// sort comparator would re-parse dates O(n log n) times.
	parsed := make(map[string]time.Time, len(reviews))
	for _, review := range reviews {
		if _, ok := parsed[review.CreatedDate]; ok {
			continue
		}
		if t, err := time.Parse(layout, review.CreatedDate); err == nil {
			parsed[review.CreatedDate] = t
		}
		// On parse failure the map lookup below yields the zero time.
	}

	sort.Slice(reviews, func(i, j int) bool {
		return parsed[reviews[i].CreatedDate].After(parsed[reviews[j].CreatedDate])
	})
}
31 changes: 31 additions & 0 deletions scraper/pkg/utils/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package utils

import (
"fmt"
"io"
"net/http"
)

// CheckIP reports the public IP address currently seen for the given HTTP
// client by querying ipinfo.io/ip. It is useful for verifying that traffic
// is routed through a configured proxy.
func CheckIP(client *http.Client) (ip string, err error) {
	resp, err := client.Get("https://ipinfo.io/ip")
	if err != nil {
		return "", fmt.Errorf("error getting IP address: %w", err)
	}
	defer resp.Body.Close()

	// Anything other than 200 OK means the service did not return an IP.
	if code := resp.StatusCode; code != http.StatusOK {
		return "", fmt.Errorf("error response status code: %d", code)
	}

	// The body is the bare IP address as plain text.
	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("error reading response body: %w", err)
	}
	return string(raw), nil
}

0 comments on commit deaf9cb

Please sign in to comment.