check-incoming-links.sh (forked from hashicorp/terraform-website)
#!/usr/bin/env bash
# Check the most popular inbound links from the web at large, and make sure
# they either work as expected or get redirected somewhere else. Takes the site
# URL (e.g. https://www.terraform.io) as an argument, so it can also be run
# against a local server... but we don't generally do that, since this check is
# only really useful against the production site.
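# Example invocation against the production site:
#   ./check-incoming-links.sh https://www.terraform.io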
set -e
URL="$1"
redirects_file="$(mktemp)"
# Clean up the temp file on exit, even if a later command fails under set -e.
trap 'rm -f "$redirects_file"' EXIT
# This pipeline parses redirects.txt to get a plain list of paths that got
# redirected, which we later use to filter the incoming links list.
# the awk: Only grab the first field (the "old" path).
# the grep: Throw out any comments or empty lines.
# Strictly speaking, all this work to ignore redirected links isn't necessary,
# because wget treats redirected URLs as successes. But it saves us a few
# HTTP requests.
awk -F' ' '{print $1}' content/redirects.txt \
| grep -v -E -e '^#' -e '^[[:space:]]*$' \
| sort -u \
> "$redirects_file"
echo "Checking known incoming links..."
# the grep: print all the lines in incoming-links.txt that DON'T match ANY of
# the lines in $redirects_file.
# the awk: prepend the site URL to the paths.
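# the wget: --spider just checks that each URL resolves instead of saving the
# page, and --input-file - reads the list of URLs to check from stdin. So a
# path like "/docs/providers" (a hypothetical example) gets checked as
# "https://www.terraform.io/docs/providers".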
grep -v -x -f "$redirects_file" content/scripts/testdata/incoming-links.txt \
| awk -v url="$URL" '{print url $0}' \
| wget \
--tries=120 \
--delete-after \
--no-directories \
--no-host-directories \
--no-verbose \
--spider \
--waitretry=120 \
--input-file -
rm "$redirects_file"