Skip to content

Commit

Permalink
updated ingest script with better error checking of inputs; added new…
Browse files Browse the repository at this point in the history
… data fields from scoutbook; added YPT date checking option
  • Loading branch information
jbrown123 committed Feb 26, 2021
1 parent 7245af4 commit c9df2b3
Showing 1 changed file with 156 additions and 33 deletions.
189 changes: 156 additions & 33 deletions ingest.pl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# script to merge ScoutNET and ScoutBook data and generate JSON data.
use Getopt::Std;

getopts('e:D'); # warn about excessive MB, Debug output
getopts('e:y:D'); # warn about excessive MB; warn about YPT expiry; Debug output

Usage() unless ($#ARGV == 1);

Expand All @@ -12,45 +12,106 @@

warn "Loading badges from $badges\n\n";

# these fields are required
@badgesRequired = (
'BSAMemberID',
'First Name',
'Last Name',
'YPTExpiryDate',
'Units',
'Districts',
'ListingPreference',
'Availability',
'Merit Badges'
);

open(BADGES, '<', $badges) or die $!;
# BSA Member ID First Name Last Name Merit Badges
# 0 1 2 3
# UserID BSAMemberID First Name Last Name Email YPTExpiryDate Units Districts ListingPreference Availability Merit Badges
# 0 1 2 3 4 5 6 7 8 9 10
while (<BADGES>)
{
chomp;
next if (/^\s*$/); # skip blank lines

s/"//g; # remove redundant quotes from all fields

@f = split(/\t/);
if ($. == 1)
{
die "Improper badges file format $_\n\n" if ($#f != 3 || $f[0] !~ /BSA Member ID/i || $f[3] !~ /Merit Badges/i);
# map field names to offsets
$i = 0;
for $f (@f)
{
$badgeFields{$f} = $i++;
}

die "Improper badges file format $_\nRequired: " . join(', ', @badgesRequired) . "\n" unless (Contains(\@f, \@badgesRequired));
next;
}

if ($f[0] !~ /\d+/)
if ($f[$badgeFields{'BSAMemberID'}] !~ /\d+/)
{
warn "Invalid badges entry $_\n\n";
$errors{"Invalid badges entry"}++;
next;
}

if (exists $badges{$f[0]})
if (exists $badges{$f[$badgeFields{'BSAMemberID'}]})
{
warn "Duplicate badges entry for $_\n\tprev: $badges{$f[0]}\n\n";
warn "Duplicate badges entry for $_\n\tprev: $badges{$f[$badgeFields{'BSAMemberID'}]}\n\n";
$errors{"Duplicate badges entry"}++;
}

$badges{$f[0]} = $_;
if ($f[$badgeFields{'Merit Badges'}] =~ /^\s*$/)
{
warn "Empty badges list for $_\n\tprev: $badges{$f[$badgeFields{'BSAMemberID'}]}\n\n";
$errors{"Empty badges list"}++;
}

if ($opt_e && scalar(split(/\t/, $f[3])) > $opt_e)
$badges{$f[$badgeFields{'BSAMemberID'}]} = $_;

if ($opt_e && scalar(split(/\s*,\s*/, $f[$badgeFields{'Merit Badges'}])) > $opt_e)
{
warn "Excessive badges for $_\n\n";
$errors{"Excessive badges"}++;
}


# -y <days>: flag rows whose YPT date has already passed, or will pass within
# <days> days. The date column is located by name through the %badgeFields
# hash (curly-brace subscript); a square-bracket subscript here would index an
# unrelated array and end up testing column 0 instead of the date column.
# Rows whose date field is blank are skipped rather than handed to DateToDays.
if ($opt_y && $f[$badgeFields{'YPTExpiryDate'}] =~ /\S/)
{
    my $expiry = DateToDays($f[$badgeFields{'YPTExpiryDate'}]);

    # no argument = today's date
    my $now = DateToDays();

    if ($expiry < $now)
    {
        warn "YPT expired $_\n\n";
        $errors{"YPT expired"}++;
    }
    elsif ($expiry - $now <= $opt_y)
    {
        warn "YPT expiring $_\n\n";
        $errors{"YPT expiring"}++;
    }
}
}
close(BADGES);


@peopleRequired = (
'Person ID',
'First Name',
'Middle Name',
'Last Name',
'Address 1',
'City',
'State',
'Zip Code',
'Other Reg District Name',
'Phone Type',
'Phone No',
'Registrant Home E-Mail'
);

warn '=' x 20 . "\nLoading people from $people\n";
open(PEOPLE, '<', $people) or die $!;
Expand All @@ -64,67 +125,86 @@
@f = split(/\t/);
if ($. == 1)
{
die "Improper people file format $_\n\n" if ($#f != 14 || $f[0] !~ /Person ID/i);
# map field names to offsets
$i = 0;
for $f (@f)
{
$f =~ s/\s*$//;
$peopleFields{$f} = $i++;
}

die "Improper people file format $_\nRequired: " . join(', ', @peopleRequired) . "\n" unless (Contains(\@f, \@peopleRequired));
next;
}

if ($f[0] !~ /\d+/)
if ($f[$peopleFields{'Person ID'}] !~ /\d+/)
{
warn "Invalid person entry $_\n\n";
$errors{"Invalid person entry"}++;
next;
}

if (exists($people{$f[0]}))
if (exists($people{$f[$peopleFields{'Person ID'}]}))
{
warn "Duplicate person entry for $_\n\tprev: $people{$f[0]}\n\n";
warn "Duplicate person entry for $_\n\tprev: $people{$f[$peopleFields{'Person ID'}]}\n\n";
$errors{"Duplicate person entry"}++;
next;
}

$people{$f[0]} = $_;
$people{$f[$peopleFields{'Person ID'}]} = $_;
}
close(PEOPLE);

warn '=' x 20 . "\nMerging data\n";

for (values %people)
{
@f = split(/\t/);
@p = split(/\t/);

if (!exists $badges{$f[0]})
if (!exists $badges{$p[$peopleFields{'Person ID'}]})
{
warn "Missing badges entry for $_\n\n";
$errors{"Missing badges entry"}++;
next;
}

@b = split(/\t/, $badges{$f[0]});
@b = split(/\t/, $badges{$p[$peopleFields{'Person ID'}]});

$b[3] =~ s/"//g; # remove surrounding quotes
$b[$badgeFields{'Merit Badges'}] =~ s/"//g; # remove surrounding quotes

# deal with the ScoutBook braintrust using commas to separate data with embedded commas
$b[3] =~ s/Signs, Signals, and Codes/Signs Signals and Codes/ig; # remove bogus commas
$b[3] =~ s/\s*,\s*/\t/g; # replace comma with tab (as it should have been anyway)
$b[3] =~ s/Signs Signals and Codes/Signs, Signals, and Codes/ig; # restore commas
$b[$badgeFields{'Merit Badges'}] =~ s/Signs, Signals, and Codes/Signs Signals and Codes/ig; # remove bogus commas
$b[$badgeFields{'Merit Badges'}] =~ s/\s*,\s*/\t/g; # replace comma with tab (as it should have been anyway)
$b[$badgeFields{'Merit Badges'}] =~ s/Signs Signals and Codes/Signs, Signals, and Codes/ig; # restore commas

$mbs = '"' . join('","', split(/\t/, $b[3])) . '"';
$mbs = '"' . join('","', split(/\t/, $b[$badgeFields{'Merit Badges'}])) . '"';

$name = ($p[$peopleFields{'Middle Name'}] =~ /^\s*$/) ?
"$p[$peopleFields{'First Name'}] $p[$peopleFields{'Last Name'}]" :
"$p[$peopleFields{'First Name'}] $p[$peopleFields{'Middle Name'}] $p[$peopleFields{'Last Name'}]";

$workwith = $b[$badgeFields{'ListingPreference'}] .
(($b[$badgeFields{'ListingPreference'}] eq 'Unit') ? " $b[$badgeFields{'Units'}]" :
($b[$badgeFields{'ListingPreference'}] eq 'District') ? " $b[$badgeFields{'Districts'}]" : '');

print <<EOS;
{
name: "${\( ($f[2] =~ /^\s*$/) ? "$f[1] $f[3]" : "$f[1] $f[2] $f[3]" )}",
address: "$f[4]",
city: "$f[5]",
state: "$f[6]",
zip: "$f[7]",
phone: [{type: "$f[12]", number: "$f[13]"}],
email: "$f[14]",
district: "$f[10]",
bsaid: "$f[0]",
name: "$b[$badgeFields{'First Name'}] $b[$badgeFields{'Last Name'}]",
address: "$p[$peopleFields{'Address 1'}]",
city: "$p[$peopleFields{'City'}]",
state: "$p[$peopleFields{'State'}]",
zip: "$p[$peopleFields{'Zip Code'}]",
phone: [{type: "$p[$peopleFields{'Phone Type'}]", number: "$p[$peopleFields{'Phone No'}]"}],
email: "$p[$peopleFields{'Registrant Home E-Mail'}]",
district: "$p[$peopleFields{'Other Reg District Name'}]",
bsaid: "$p[$peopleFields{'Person ID'}]",
meritbadges: [$mbs],
workwith: "unknown"
workwith: "$workwith",
availability: "$b[$badgeFields{'Availability'}]",
yptexpiry: "$b[$badgeFields{'YPTExpiryDate'}]"
},
EOS
$outputCount++;
}

# check for badges entry without a corresponding people entry
Expand All @@ -137,12 +217,54 @@
}
}

warn '=' x 20 . "\nError counts\n";
warn '=' x 20 . "\nTotal records: $outputCount\nError counts\n";
for (sort keys %errors)
{
warn "\t$_: $errors{$_}\n";
}



# Contains(\@haystack, \@needle)
# Answers "does @haystack hold every element of @needle?"
# Elements are compared as hash keys (i.e. as strings).
# Returns 1 when nothing from @needle is missing (including when @needle is
# empty), otherwise 0.
sub Contains
{
    my ($haystack, $needle) = @_;

    # index the haystack once so each membership test is a hash lookup
    my %present;
    @present{ @{$haystack} } = ();

    # collect whatever the haystack lacks
    my @missing = grep { !exists $present{$_} } @{$needle};

    return @missing ? 0 : 1;
}

# A highly simplistic date parser.
# Expects m/d/yyyy; the separator may be '/', '-' or '\'. Only the leading
# digits of each field are used, so trailing text after the year is ignored.
# With no argument (or a blank string) today's local date is used.
# Returns the approximate number of days since 1/1/1970: every year counts as
# 365 days and February as 28, so leap days are ignored. Values are meant to
# be compared with, or subtracted from, other values from this routine.
# Dies when the date does not have three numeric fields, the year is below
# 100, the month is outside 1-12, or the day is outside 1-31.
sub DateToDays
{
    my ($date) = @_;

    # no params = now / today
    # Definedness is tested explicitly: a numeric "== undef" comparison is
    # true for any string that numifies to 0 (e.g. "N/A" or "0/10/2021"),
    # which would silently turn bad input into today's date.
    if (!defined($date) || $date !~ /\S/)
    {
        my ($sec,$min,$hour,$mday,$mon,$year) = localtime;
        $date = ($mon+1) . '/' . $mday . '/' . ($year+1900);
    }

    # cumulative days before the first of each month (non-leap year);
    # index 0 is a placeholder so months can be used 1-based
    #                    0 J F  M  A  M   J   J   A   S   O   N   D
    my @daysByMonth = qw/0 0 31 59 90 120 151 181 212 243 273 304 334/;

    my @f = split(/[\/\-\\]/, $date);

    # keep only the leading integer of each of the first three fields;
    # a field without leading digits becomes undef and is rejected below
    my ($m, $d, $y) = map { (defined($_) && /^\s*(\d+)/) ? $1 : undef } @f[0..2];

    die "Can't understand date $date as @f"
        if ($#f < 2
            || !defined($m) || !defined($d) || !defined($y)
            || $y < 100
            || $m < 1 || $m > 12
            || $d < 1 || $d > 31);

    return (($y-1970) * 365 + $daysByMonth[$m] + $d-1);
}

sub Usage
{
die <<EOS;
Expand All @@ -152,6 +274,7 @@ sub Usage
options:
-e <n> : warn about excessive merit badges, more than <n>
-y <d> : warn about YPT expiring in 'd' days (default: 45)
-D : turn on debug messages
EOS
}

0 comments on commit c9df2b3

Please sign in to comment.