-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
jalarcon
committed
Jul 12, 2022
1 parent
a6e99c3
commit edbc73e
Showing
14 changed files
with
4,699 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,3 +13,6 @@ | |
|
||
# Dependency directories (remove the comment below to include it) | ||
# vendor/ | ||
|
||
# Visual Studio Code | ||
.vscode |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
package cmd | ||
|
||
import ( | ||
"encoding/json" | ||
"flag" | ||
"fmt" | ||
"log" | ||
"os" | ||
"strconv" | ||
"text/template" | ||
|
||
"github.com/PuerkitoBio/goquery" | ||
"github.com/midir99/rastreadora/mpp" | ||
"github.com/midir99/rastreadora/ws" | ||
) | ||
|
||
// Scraper is the command-line name of a scraper. SelectScraperFuncs maps each
// value to the pair of functions that build a page URL and extract the posters
// from that page.
type Scraper string

// Names accepted for the <scraper> command-line argument. The constants carry
// the Scraper type so they cannot be silently mixed with arbitrary strings.
const (
	ScraperGroAlba   Scraper = "gro-alba"
	ScraperGroAmber  Scraper = "gro-amber"
	ScraperMorAmber  Scraper = "mor-amber"
	ScraperMorCustom Scraper = "mor-custom"
)
|
||
func ScrapersAvailable() []Scraper { | ||
return []Scraper{ | ||
ScraperGroAlba, | ||
ScraperGroAmber, | ||
ScraperMorAmber, | ||
ScraperMorCustom, | ||
} | ||
} | ||
|
||
// usageTemplate is the text/template source rendered by Usage. Its only
// placeholder, .Scrapers, is ranged over to print one scraper name per line.
var usageTemplate = `rastreadora is a tool for scraping missing person posters data. | ||
Usage: | ||
rastreadora [-o output] <scraper> <from> [until] | ||
Arguments: | ||
scraper (string): the scraper that will be used to extract data, available values:{{range .Scrapers}} | ||
- {{.}}{{end}} | ||
from (number): the page number to start scraping missing person posters data. | ||
until (number): the page number to stop scraping missing person posters data, if omitted | ||
the program will only scrap data from the page number specified by the | ||
<from> argument. | ||
Flags: | ||
-o (string): the filename where the data will be stored, if omitted the data will be | ||
dumped in STDOUT. | ||
-V (bool): print the version of the program. | ||
-h (bool): print this usage message. | ||
` | ||
|
||
func Usage() { | ||
templateData := struct { | ||
Scrapers []Scraper | ||
}{Scrapers: ScrapersAvailable()} | ||
tmpl := template.Must(template.New("usage").Parse(usageTemplate)) | ||
err := tmpl.Execute(flag.CommandLine.Output(), templateData) | ||
if err != nil { | ||
fmt.Fprint(flag.CommandLine.Output(), "unable to print help") | ||
} | ||
} | ||
|
||
type Args struct { | ||
Scraper Scraper | ||
PageFrom uint64 | ||
PageUntil uint64 | ||
Out string | ||
PrintVersion bool | ||
} | ||
|
||
func ParseArgs() (*Args, error) { | ||
args := Args{} | ||
flag.StringVar(&args.Out, "o", "", "the filename to dump the missing-person posters data, if not present data is dumped into stdout") | ||
flag.BoolVar(&args.PrintVersion, "V", false, "print the version of the program") | ||
flag.Usage = Usage | ||
flag.Parse() | ||
if args.PrintVersion { | ||
return &args, nil | ||
} | ||
// Validate the "scraper" argument | ||
args.Scraper = Scraper(flag.Arg(0)) | ||
if args.Scraper == "" { | ||
return nil, fmt.Errorf("<scraper> argument cannot be empty") | ||
} | ||
scraperIsValid := false | ||
for _, s := range ScrapersAvailable() { | ||
if args.Scraper == s { | ||
scraperIsValid = true | ||
break | ||
} | ||
} | ||
if !scraperIsValid { | ||
return nil, fmt.Errorf("\"%s\" is not a valid choice for <scraper>", args.Scraper) | ||
} | ||
// Validate the "from" argument | ||
if flag.Arg(1) == "" { | ||
return nil, fmt.Errorf("<from> argument cannot be empty") | ||
} | ||
pF, err := strconv.ParseUint(flag.Arg(1), 10, 0) | ||
if err != nil { | ||
return nil, fmt.Errorf("\"%s\" is not a valid number for <from>", flag.Arg(1)) | ||
} | ||
args.PageFrom = pF | ||
// Validate the "until" argument | ||
if flag.Arg(2) == "" { | ||
args.PageUntil = args.PageFrom | ||
} else { | ||
pU, err := strconv.ParseUint(flag.Arg(2), 10, 0) | ||
if err != nil { | ||
return nil, fmt.Errorf("\"%s\" is not a valid number for [until]", flag.Arg(2)) | ||
} | ||
args.PageUntil = pU | ||
} | ||
// Validate "from" value is lower or equal to "until" value | ||
if args.PageFrom > args.PageUntil { | ||
return nil, fmt.Errorf("<from> value must be lower or equal to [until] value") | ||
} | ||
return &args, nil | ||
} | ||
|
||
// PrintVersion writes the program name and version, followed by a newline, to
// standard output.
func PrintVersion() {
	fmt.Fprintln(os.Stdout, "rastreadora v0.0.1")
}
|
||
func SelectScraperFuncs(scraper Scraper) (func(*goquery.Document) []mpp.MissingPersonPoster, func(uint64) string, error) { | ||
switch scraper { | ||
case ScraperGroAlba: | ||
return ws.ScrapeGroAlbaAlerts, ws.MakeGroAlbaUrl, nil | ||
case ScraperGroAmber: | ||
return ws.ScrapeGroAmberAlerts, ws.MakeGroAmberUrl, nil | ||
case ScraperMorAmber: | ||
return ws.ScrapeMorAmberAlerts, ws.MakeMorAmberUrl, nil | ||
case ScraperMorCustom: | ||
return ws.ScrapeMorCustomAlerts, ws.MakeMorCustomUrl, nil | ||
default: | ||
return nil, nil, fmt.Errorf("invalid scraper %v", scraper) | ||
} | ||
} | ||
|
||
func Execute(args *Args) { | ||
if args.PrintVersion { | ||
PrintVersion() | ||
os.Exit(0) | ||
} | ||
scraper, makeUrl, err := SelectScraperFuncs(args.Scraper) | ||
if err != nil { | ||
log.Fatalf("Error: %s", err) | ||
} | ||
ch := make(chan []mpp.MissingPersonPoster) | ||
for pageNum := args.PageFrom; pageNum <= args.PageUntil; pageNum++ { | ||
pageUrl := makeUrl(pageNum) | ||
log.Printf("Processing %s ...\n", pageUrl) | ||
go ws.Scrape(pageUrl, scraper, ch) | ||
} | ||
mpps := []mpp.MissingPersonPoster{} | ||
pagesCount := args.PageUntil - args.PageFrom + 1 | ||
for curPage := uint64(1); curPage <= pagesCount; curPage++ { | ||
mpps = append(mpps, <-ch...) | ||
log.Printf("%d out of %d page(s) have been scraped\n", curPage, pagesCount) | ||
} | ||
output, err := json.Marshal(mpps) | ||
if err != nil { | ||
log.Fatal("Error: ", err) | ||
} | ||
if args.Out != "" { | ||
if os.WriteFile(args.Out, output, 0664) != nil { | ||
log.Fatalf("Error: %s", err) | ||
} | ||
} else { | ||
_, err := os.Stdout.Write(output) | ||
if err != nil { | ||
log.Fatalf("Error: %s", err) | ||
} | ||
} | ||
log.Printf("%d missing person poster(s) were processed\n", len(mpps)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
module github.com/midir99/rastreadora | ||
|
||
go 1.18 | ||
|
||
require ( | ||
github.com/PuerkitoBio/goquery v1.8.0 | ||
github.com/andybalholm/cascadia v1.3.1 // indirect | ||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= | ||
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= | ||
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= | ||
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= | ||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk= | ||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= | ||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= | ||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= | ||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= | ||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= | ||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package main | ||
|
||
import ( | ||
"flag" | ||
"fmt" | ||
"os" | ||
|
||
"github.com/midir99/rastreadora/cmd" | ||
) | ||
|
||
func main() { | ||
args, err := cmd.ParseArgs() | ||
if err != nil { | ||
fmt.Fprint( | ||
flag.CommandLine.Output(), | ||
"Error: ", | ||
err, | ||
"\nTry using the -h flag to get some help.\n", | ||
) | ||
os.Exit(1) | ||
} | ||
cmd.Execute(args) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
package mpp | ||
|
||
// State identifies a Mexican federal entity by a short code.
// NOTE(review): the values look like ISO 3166-2:MX subdivision codes; confirm
// before relying on that.
type State string

// One constant per federal entity, named after the entity it stands for.
const (
	StateCiudadDeMexico             State = "MX-CMX"
	StateAguascalientes             State = "MX-AGU"
	StateBajaCalifornia             State = "MX-BCN"
	StateBajaCaliforniaSur          State = "MX-BCS"
	StateCampeche                   State = "MX-CAM"
	StateCoahuilaDeZaragoza         State = "MX-COA"
	StateColima                     State = "MX-COL"
	StateChiapas                    State = "MX-CHP"
	StateChihuahua                  State = "MX-CHH"
	StateDurango                    State = "MX-DUR"
	StateGuanajuato                 State = "MX-GUA"
	StateGuerrero                   State = "MX-GRO"
	StateHidalgo                    State = "MX-HID"
	StateJalisco                    State = "MX-JAL"
	StateMexico                     State = "MX-MEX"
	StateMichoacanDeOcampo          State = "MX-MIC"
	StateMorelos                    State = "MX-MOR"
	StateNayarit                    State = "MX-NAY"
	StateNuevoLeon                  State = "MX-NLE"
	StateOaxaca                     State = "MX-OAX"
	StatePuebla                     State = "MX-PUE"
	StateQueretaro                  State = "MX-QUE"
	StateQuintanaRoo                State = "MX-ROO"
	StateSanLuisPotosi              State = "MX-SLP"
	StateSinaloa                    State = "MX-SIN"
	StateSonora                     State = "MX-SON"
	StateTabasco                    State = "MX-TAB"
	StateTamaulipas                 State = "MX-TAM"
	StateTlaxcala                   State = "MX-TLA"
	StateVeracruzDeIgnacioDeLaLlave State = "MX-VER"
	StateYucatan                    State = "MX-YUC"
	StateZacatecas                  State = "MX-ZAC"
)
|
||
// PhysicalBuild is a one-letter code describing a person's body build.
type PhysicalBuild string

// Supported physical builds.
const (
	PhysicalBuildSlim    PhysicalBuild = "S"
	PhysicalBuildRegular PhysicalBuild = "R"
	PhysicalBuildHeavy   PhysicalBuild = "H"
)
|
||
// Complexion is a short code describing a person's skin tone.
type Complexion string

// Supported complexions, ordered from lightest to darkest.
const (
	ComplexionVeryLight         Complexion = "VL"
	ComplexionLight             Complexion = "L"
	ComplexionLightIntermediate Complexion = "LI"
	ComplexionDarkIntermediate  Complexion = "DI"
	ComplexionDark              Complexion = "D"
	ComplexionVeryDark          Complexion = "VD"
)
|
||
// Sex is a one-letter code for a person's sex.
type Sex string

// Supported sexes.
const (
	SexFemale Sex = "F"
	SexMale   Sex = "M"
)
|
||
// AlertType is a two-letter code for the kind of alert a poster belongs to.
type AlertType string

// Supported alert types.
const (
	AlertTypeAlba      AlertType = "AL"
	AlertTypeAmber     AlertType = "AM"
	AlertTypeHasVistoA AlertType = "HV"
	AlertTypeOdisea    AlertType = "OD"
)
|
||
type MissingPersonPoster struct { | ||
MpName string `json:"mp_name"` | ||
MpHeight uint `json:"mp_height,omitempty"` | ||
MpWeight uint `json:"mp_weight,omitempty"` | ||
MpPhysicalBuild PhysicalBuild `json:"mp_physical_build,omitempty"` | ||
MpComplexion Complexion `json:"mp_complexion,omitempty"` | ||
MpSex Sex `json:"mp_sex,omitempty"` | ||
MpDob string `json:"mp_dob,omitempty"` | ||
MpAgeWhenDisappeared uint `json:"mp_age_when_disappeared,omitempty"` | ||
MpEyesDescription string `json:"mp_eyes_description,omitempty"` | ||
MpHairDescription string `json:"mp_hair_description,omitempty"` | ||
MpOutfitDescription string `json:"mp_outfit_description,omitempty"` | ||
MpIdentifyingCharacteristics string `json:"mp_identifying_characteristics,omitempty"` | ||
CircumstancesBehindDissapearance string `json:"circumstances_behind_dissapearance,omitempty"` | ||
MissingFrom string `json:"missing_from,omitempty"` | ||
MissingDate string `json:"missing_date,omitempty"` | ||
Found bool `json:"found,omitempty"` | ||
AlertType AlertType `json:"alert_type,omitempty"` | ||
PoState State `json:"po_state"` | ||
PoPostUrl string `json:"po_post_url,omitempty"` | ||
PoPostPublicationDate string `json:"po_post_publication_date,omitempty"` | ||
PoPosterUrl string `json:"po_poster_url,omitempty"` | ||
IsMultiple bool `json:"is_multiple,omitempty"` | ||
} |
Oops, something went wrong.