Skip to content

Commit

Permalink
Add all the functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
jalarcon committed Jul 12, 2022
1 parent a6e99c3 commit edbc73e
Show file tree
Hide file tree
Showing 14 changed files with 4,699 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@

# Dependency directories (remove the comment below to include it)
# vendor/

# Visual Studio Code
.vscode
182 changes: 182 additions & 0 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
package cmd

import (
"encoding/json"
"flag"
"fmt"
"log"
"os"
"strconv"
"text/template"

"github.com/PuerkitoBio/goquery"
"github.com/midir99/rastreadora/mpp"
"github.com/midir99/rastreadora/ws"
)

// Scraper is the name of a scraper that can be chosen through the <scraper>
// command-line argument.
type Scraper string

// Scraper names accepted on the command line. They are declared as untyped
// string constants, so they are assignable to Scraper as well as to string.
// SelectScraperFuncs maps each of them to its scraping and URL-building
// functions.
const (
ScraperGroAlba = "gro-alba"
ScraperGroAmber = "gro-amber"
ScraperMorAmber = "mor-amber"
ScraperMorCustom = "mor-custom"
)

// ScrapersAvailable returns a freshly allocated slice holding every scraper
// name the program knows about, in declaration order.
func ScrapersAvailable() []Scraper {
	scrapers := make([]Scraper, 0, 4)
	scrapers = append(scrapers,
		ScraperGroAlba,
		ScraperGroAmber,
		ScraperMorAmber,
		ScraperMorCustom,
	)
	return scrapers
}

// usageTemplate is the text/template source of the help message printed by
// Usage. It is executed with a value exposing a Scrapers field, whose elements
// are listed one per line under the "scraper" argument.
var usageTemplate = `rastreadora is a tool for scraping missing person posters data.
Usage:
rastreadora [-o output] <scraper> <from> [until]
Arguments:
scraper (string): the scraper that will be used to extract data, available values:{{range .Scrapers}}
- {{.}}{{end}}
from (number): the page number to start scraping missing person posters data.
until (number): the page number to stop scraping missing person posters data, if omitted
the program will only scrap data from the page number specified by the
<from> argument.
Flags:
-o (string): the filename where the data will be stored, if omitted the data will be
dumped in STDOUT.
-V (bool): print the version of the program.
-h (bool): print this usage message.
`

func Usage() {
templateData := struct {
Scrapers []Scraper
}{Scrapers: ScrapersAvailable()}
tmpl := template.Must(template.New("usage").Parse(usageTemplate))
err := tmpl.Execute(flag.CommandLine.Output(), templateData)
if err != nil {
fmt.Fprint(flag.CommandLine.Output(), "unable to print help")
}
}

// Args holds the command-line options and positional arguments of the program
// as produced by ParseArgs.
type Args struct {
// Scraper is the value of the <scraper> positional argument.
Scraper Scraper
// PageFrom is the value of the <from> positional argument.
PageFrom uint64
// PageUntil is the value of the [until] positional argument; ParseArgs sets
// it to PageFrom when the argument is omitted.
PageUntil uint64
// Out is the value of the -o flag; Execute writes to stdout when it is empty.
Out string
// PrintVersion is the value of the -V flag.
PrintVersion bool
}

// ParseArgs registers the -o and -V flags on the global flag set, parses the
// process command line and validates the positional arguments <scraper>,
// <from> and [until].
//
// When -V is given the positional arguments are not inspected and the
// returned Args only carries the flag values. Otherwise the scraper must be
// one of ScrapersAvailable, <from> and [until] must be base-10 unsigned
// integers that fit in 64 bits, and <from> must not exceed [until]; an omitted
// [until] defaults to <from>. On any validation failure a nil *Args and a
// descriptive error are returned.
//
// Because the flags are registered on flag.CommandLine, calling ParseArgs
// more than once in the same process panics with a "flag redefined" error.
func ParseArgs() (*Args, error) {
	args := Args{}
	flag.StringVar(&args.Out, "o", "", "the filename to dump the missing-person posters data, if not present data is dumped into stdout")
	flag.BoolVar(&args.PrintVersion, "V", false, "print the version of the program")
	flag.Usage = Usage
	flag.Parse()
	if args.PrintVersion {
		return &args, nil
	}

	// Validate the "scraper" argument.
	args.Scraper = Scraper(flag.Arg(0))
	if args.Scraper == "" {
		return nil, fmt.Errorf("<scraper> argument cannot be empty")
	}
	scraperIsValid := false
	for _, s := range ScrapersAvailable() {
		if args.Scraper == s {
			scraperIsValid = true
			break
		}
	}
	if !scraperIsValid {
		return nil, fmt.Errorf("\"%s\" is not a valid choice for <scraper>", args.Scraper)
	}

	// Validate the "from" argument. The bit size is 64 to match the uint64
	// fields of Args; a bit size of 0 would limit the range to the platform's
	// uint width (32 bits on 32-bit builds).
	if flag.Arg(1) == "" {
		return nil, fmt.Errorf("<from> argument cannot be empty")
	}
	pF, err := strconv.ParseUint(flag.Arg(1), 10, 64)
	if err != nil {
		return nil, fmt.Errorf("\"%s\" is not a valid number for <from>", flag.Arg(1))
	}
	args.PageFrom = pF

	// Validate the "until" argument; when omitted only the <from> page is
	// scraped.
	if flag.Arg(2) == "" {
		args.PageUntil = args.PageFrom
	} else {
		pU, err := strconv.ParseUint(flag.Arg(2), 10, 64)
		if err != nil {
			return nil, fmt.Errorf("\"%s\" is not a valid number for [until]", flag.Arg(2))
		}
		args.PageUntil = pU
	}

	// Validate "from" value is lower or equal to "until" value.
	if args.PageFrom > args.PageUntil {
		return nil, fmt.Errorf("<from> value must be lower or equal to [until] value")
	}
	return &args, nil
}

// PrintVersion writes the program name and version, followed by a newline, to
// standard output.
func PrintVersion() {
	const banner = "rastreadora v0.0.1"
	fmt.Println(banner)
}

// SelectScraperFuncs resolves a scraper name to the pair of functions from
// package ws that implement it: the first extracts posters from a parsed
// document, the second builds the URL for a given page number. An unknown
// name yields two nil functions and an error.
func SelectScraperFuncs(scraper Scraper) (func(*goquery.Document) []mpp.MissingPersonPoster, func(uint64) string, error) {
	if scraper == ScraperGroAlba {
		return ws.ScrapeGroAlbaAlerts, ws.MakeGroAlbaUrl, nil
	}
	if scraper == ScraperGroAmber {
		return ws.ScrapeGroAmberAlerts, ws.MakeGroAmberUrl, nil
	}
	if scraper == ScraperMorAmber {
		return ws.ScrapeMorAmberAlerts, ws.MakeMorAmberUrl, nil
	}
	if scraper == ScraperMorCustom {
		return ws.ScrapeMorCustomAlerts, ws.MakeMorCustomUrl, nil
	}
	return nil, nil, fmt.Errorf("invalid scraper %v", scraper)
}

// Execute runs the program with already-parsed arguments.
//
// When args.PrintVersion is set it prints the version and exits with status 0.
// Otherwise it starts one ws.Scrape goroutine per page number in
// [args.PageFrom, args.PageUntil], collects the posters received on a shared
// channel, encodes them as a single JSON array and writes it to the file named
// by args.Out (mode 0664) or to standard output when args.Out is empty.
//
// Any failure (unknown scraper, JSON encoding error, write error) is reported
// through log.Fatal/log.Fatalf, which terminates the process.
func Execute(args *Args) {
	if args.PrintVersion {
		PrintVersion()
		os.Exit(0)
	}
	scraper, makeUrl, err := SelectScraperFuncs(args.Scraper)
	if err != nil {
		log.Fatalf("Error: %s", err)
	}

	// Fan out: one goroutine per page, all reporting on the same channel.
	ch := make(chan []mpp.MissingPersonPoster)
	for pageNum := args.PageFrom; pageNum <= args.PageUntil; pageNum++ {
		pageUrl := makeUrl(pageNum)
		log.Printf("Processing %s ...\n", pageUrl)
		go ws.Scrape(pageUrl, scraper, ch)
	}

	// Fan in: receive once per page started above.
	// NOTE(review): this assumes every ws.Scrape call sends exactly one slice
	// on ch, even on failure; otherwise this loop blocks forever. Confirm in
	// package ws.
	// The slice starts non-nil so that an empty result encodes as "[]" rather
	// than "null".
	mpps := []mpp.MissingPersonPoster{}
	pagesCount := args.PageUntil - args.PageFrom + 1
	for curPage := uint64(1); curPage <= pagesCount; curPage++ {
		mpps = append(mpps, <-ch...)
		log.Printf("%d out of %d page(s) have been scraped\n", curPage, pagesCount)
	}

	output, err := json.Marshal(mpps)
	if err != nil {
		log.Fatal("Error: ", err)
	}
	if args.Out != "" {
		// Capture the write error itself; the outer err is the (nil) result
		// of json.Marshal and must not be reported here.
		if err := os.WriteFile(args.Out, output, 0664); err != nil {
			log.Fatalf("Error: %s", err)
		}
	} else {
		if _, err := os.Stdout.Write(output); err != nil {
			log.Fatalf("Error: %s", err)
		}
	}
	log.Printf("%d missing person poster(s) were processed\n", len(mpps))
}
9 changes: 9 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module github.com/midir99/rastreadora

go 1.18

require (
github.com/PuerkitoBio/goquery v1.8.0
github.com/andybalholm/cascadia v1.3.1 // indirect
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect
)
11 changes: 11 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
23 changes: 23 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package main

import (
"flag"
"fmt"
"os"

"github.com/midir99/rastreadora/cmd"
)

// main parses the command line and hands the result to cmd.Execute. If the
// arguments are invalid it reports the error on the flag package's output
// writer, points the user at the -h flag and exits with status 1.
func main() {
	args, err := cmd.ParseArgs()
	if err == nil {
		cmd.Execute(args)
		return
	}
	out := flag.CommandLine.Output()
	fmt.Fprint(out, "Error: ", err, "\nTry using the -h flag to get some help.\n")
	os.Exit(1)
}
98 changes: 98 additions & 0 deletions mpp/mpp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package mpp

// State is the code of a Mexican federal entity, stored in the PoState field
// of MissingPersonPoster.
type State string

// Codes for Mexico City and the Mexican states. Every value has the form
// "MX-" followed by three letters.
// NOTE(review): these look like ISO 3166-2:MX subdivision codes; confirm
// before relying on that.
const (
StateCiudadDeMexico State = "MX-CMX"
StateAguascalientes State = "MX-AGU"
StateBajaCalifornia State = "MX-BCN"
StateBajaCaliforniaSur State = "MX-BCS"
StateCampeche State = "MX-CAM"
StateCoahuilaDeZaragoza State = "MX-COA"
StateColima State = "MX-COL"
StateChiapas State = "MX-CHP"
StateChihuahua State = "MX-CHH"
StateDurango State = "MX-DUR"
StateGuanajuato State = "MX-GUA"
StateGuerrero State = "MX-GRO"
StateHidalgo State = "MX-HID"
StateJalisco State = "MX-JAL"
StateMexico State = "MX-MEX"
StateMichoacanDeOcampo State = "MX-MIC"
StateMorelos State = "MX-MOR"
StateNayarit State = "MX-NAY"
StateNuevoLeon State = "MX-NLE"
StateOaxaca State = "MX-OAX"
StatePuebla State = "MX-PUE"
StateQueretaro State = "MX-QUE"
StateQuintanaRoo State = "MX-ROO"
StateSanLuisPotosi State = "MX-SLP"
StateSinaloa State = "MX-SIN"
StateSonora State = "MX-SON"
StateTabasco State = "MX-TAB"
StateTamaulipas State = "MX-TAM"
StateTlaxcala State = "MX-TLA"
StateVeracruzDeIgnacioDeLaLlave State = "MX-VER"
StateYucatan State = "MX-YUC"
StateZacatecas State = "MX-ZAC"
)

// PhysicalBuild is a one-letter code describing the build of a missing
// person, stored in the MpPhysicalBuild field of MissingPersonPoster.
type PhysicalBuild string

// Known PhysicalBuild codes.
const (
PhysicalBuildSlim PhysicalBuild = "S"
PhysicalBuildRegular PhysicalBuild = "R"
PhysicalBuildHeavy PhysicalBuild = "H"
)

// Complexion is a short code describing the skin tone of a missing person,
// stored in the MpComplexion field of MissingPersonPoster.
type Complexion string

// Known Complexion codes, declared from lightest to darkest.
const (
ComplexionVeryLight Complexion = "VL"
ComplexionLight Complexion = "L"
ComplexionLightIntermediate Complexion = "LI"
ComplexionDarkIntermediate Complexion = "DI"
ComplexionDark Complexion = "D"
ComplexionVeryDark Complexion = "VD"
)

// Sex is a one-letter code for the sex of a missing person, stored in the
// MpSex field of MissingPersonPoster.
type Sex string

// Known Sex codes.
const (
SexFemale Sex = "F"
SexMale Sex = "M"
)

// AlertType is a two-letter code for the kind of alert a poster belongs to,
// stored in the AlertType field of MissingPersonPoster.
type AlertType string

// Known AlertType codes.
const (
AlertTypeAlba AlertType = "AL"
AlertTypeAmber AlertType = "AM"
AlertTypeHasVistoA AlertType = "HV"
AlertTypeOdisea AlertType = "OD"
)

// MissingPersonPoster is the record produced for each scraped poster and the
// element type of the JSON array written by the program.
//
// Only mp_name and po_state are always present in the JSON output; every
// other field is tagged omitempty, so zero values (empty strings, 0, false)
// are left out.
//
// NOTE(review): the "Mp" prefix presumably stands for "missing person" and
// "Po" for the post/poster the data came from; confirm with the scrapers in
// package ws. Units for MpHeight and MpWeight are not established here.
type MissingPersonPoster struct {
MpName string `json:"mp_name"`
MpHeight uint `json:"mp_height,omitempty"`
MpWeight uint `json:"mp_weight,omitempty"`
MpPhysicalBuild PhysicalBuild `json:"mp_physical_build,omitempty"`
MpComplexion Complexion `json:"mp_complexion,omitempty"`
MpSex Sex `json:"mp_sex,omitempty"`
// Dates are carried as strings; the expected layout is not defined in this
// package.
MpDob string `json:"mp_dob,omitempty"`
MpAgeWhenDisappeared uint `json:"mp_age_when_disappeared,omitempty"`
MpEyesDescription string `json:"mp_eyes_description,omitempty"`
MpHairDescription string `json:"mp_hair_description,omitempty"`
MpOutfitDescription string `json:"mp_outfit_description,omitempty"`
MpIdentifyingCharacteristics string `json:"mp_identifying_characteristics,omitempty"`
// NOTE(review): "Dissapearance" is misspelled, but both the field name and
// the JSON key are part of the public contract; renaming would break
// consumers.
CircumstancesBehindDissapearance string `json:"circumstances_behind_dissapearance,omitempty"`
MissingFrom string `json:"missing_from,omitempty"`
MissingDate string `json:"missing_date,omitempty"`
// Because of omitempty, a false value is omitted from the JSON rather than
// written as "found": false.
Found bool `json:"found,omitempty"`
AlertType AlertType `json:"alert_type,omitempty"`
PoState State `json:"po_state"`
PoPostUrl string `json:"po_post_url,omitempty"`
PoPostPublicationDate string `json:"po_post_publication_date,omitempty"`
PoPosterUrl string `json:"po_poster_url,omitempty"`
IsMultiple bool `json:"is_multiple,omitempty"`
}
Loading

0 comments on commit edbc73e

Please sign in to comment.