Add solr restarter service to autoheal solr #5989

Merged · 4 commits · Jan 10, 2022
Changes from 3 commits
19 changes: 19 additions & 0 deletions docker-compose.production.yml
@@ -43,6 +43,25 @@ services:
max-size: "512m"
max-file: "4"

solr_restarter:
profiles: ["ol-solr0"]
build: scripts/solr_restarter
restart: unless-stopped
environment:
- TEST_URL=http://openlibrary.org/search.json?q=hello&mode=everything&limit=0
- CONTAINER_NAMES=openlibrary_solr_1 openlibrary_solr_haproxy_1
- SEND_SLACK_MESSAGE=true
env_file:
- ../olsystem/etc/solr_restarter.env
volumes:
# Forward the docker socket, since this needs to be able to
# run docker commands
- "/var/run/docker.sock:/var/run/docker.sock"
logging:
options:
max-size: "512m"
max-file: "4"

covers:
profiles: ["ol-covers0"]
image: "${OLIMAGE:-openlibrary/olbase:latest}"
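
For reference, a sketch of bringing the new service up by hand on ol-solr0, assuming the standard production checkout; the compose files, profile, and flags here mirror the deploy scripts below:

cd /opt/openlibrary
COMPOSE_FILE=docker-compose.yml:docker-compose.production.yml \
    docker-compose --profile ol-solr0 up --build --no-deps -d solr_restarter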
2 changes: 1 addition & 1 deletion scripts/deployment/are_servers_in_sync.sh
@@ -1,6 +1,6 @@
#!/bin/bash

SERVERS="ol-home0 ol-covers0 ol-web1 ol-web2 ol-www0"
SERVERS="ol-home0 ol-covers0 ol-web1 ol-web2 ol-www0 ol-solr0"
REPO_DIRS="/opt/olsystem /opt/openlibrary /opt/openlibrary/vendor/infogami /opt/booklending_utils"

for REPO_DIR in $REPO_DIRS; do
2 changes: 1 addition & 1 deletion scripts/deployment/deploy.sh
@@ -3,7 +3,7 @@
set -o xtrace

# See https://github.com/internetarchive/openlibrary/wiki/Deployment-Scratchpad
SERVERS="ol-home0 ol-covers0 ol-web1 ol-web2 ol-www0"
SERVERS="ol-home0 ol-covers0 ol-web1 ol-web2 ol-www0 ol-solr0"
COMPOSE_FILE="docker-compose.yml:docker-compose.production.yml"

# This script must be run on ol-home0 to start a new deployment.
2 changes: 1 addition & 1 deletion scripts/deployment/restart_servers.sh
@@ -24,5 +24,5 @@ fi

for SERVER in $SERVERS; do
HOSTNAME=$(host $SERVER | cut -d " " -f 1)
ssh $SERVER "cd /opt/openlibrary; COMPOSE_FILE=$PRODUCTION HOSTNAME=$HOSTNAME OLIMAGE=$OLIMAGE docker-compose --profile $(echo $SERVER | cut -f1 -d '.') up --no-deps -d"
ssh $SERVER "cd /opt/openlibrary; COMPOSE_FILE=$PRODUCTION HOSTNAME=$HOSTNAME OLIMAGE=$OLIMAGE docker-compose --profile $(echo $SERVER | cut -f1 -d '.') up --build --no-deps -d"
done
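
The --build flag added here presumably ensures that services built from a local context, like the new solr_restarter above, are rebuilt on restart instead of reusing a stale image.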
16 changes: 16 additions & 0 deletions scripts/solr_restarter/Dockerfile
@@ -0,0 +1,16 @@
FROM docker:latest

# Install Node
RUN apk add --update nodejs npm

# Install deps globally for this tiny image; don't create a node_modules folder
RUN npm install -g node-fetch@2
ENV NODE_PATH="/usr/local/lib/node_modules:$NODE_PATH"

COPY . /app

# Override the default entrypoint: the docker image defines its own entrypoint,
# which would otherwise prevent us from running node as the CMD.
ENTRYPOINT []

CMD ["node", "/app/index.js"]
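
To exercise the image outside of compose, a hedged sketch: the solr-restarter tag is made up, and the environment values are copied from the compose service above; SEND_SLACK_MESSAGE=false keeps alerts on stdout rather than Slack:

docker build -t solr-restarter scripts/solr_restarter
docker run --rm \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -e TEST_URL='http://openlibrary.org/search.json?q=hello&mode=everything&limit=0' \
    -e CONTAINER_NAMES='openlibrary_solr_1 openlibrary_solr_haproxy_1' \
    -e SEND_SLACK_MESSAGE=false \
    solr-restarter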
137 changes: 137 additions & 0 deletions scripts/solr_restarter/index.js
@@ -0,0 +1,137 @@
// @ts-check
/**
 * Util script to restart solr. This should probably use something built into
 * docker-compose, but that doesn't appear to be available for docker-compose;
 * it might be docker swarm only. It's unclear.
 *
 * This script doesn't prevent solr from occasionally going down inexplicably,
 * but it forces a restart after ~3 minutes of being "unhealthy".
 */
const { execSync } = require('child_process');
const fetch = require('node-fetch');


/**
* @param {number} ms
*/
async function sleep(ms) {
return new Promise(res => setTimeout(() => res(), ms));
}

class SolrRestarter {
/** Don't restart twice in 10 minutes */
MAX_RESTART_WIN = 10*60*1000;
/** Must be unhealthy for this long to trigger a restart */
UNHEALTHY_DURATION = 2*60*1000;
/** Check every minute */
CHECK_FREQ = 60*1000;
/** How many times we're allowed to try restarting without going healthy before giving up */
MAX_RESTARTS = 3;
/** Number of restarts we've done without transitioning to healthy */
restartsRun = 0;
/** timestamp in ms */
lastRestart = 0;
/** @type {'healthy' | 'unhealthy'} */
state = 'healthy';
/** timestamp in ms */
lastStateChange = 0;
/** Number of consecutive health checks that haven't failed or succeeded, but errored. */
healthCheckErrorRun = 0;

/** The URL to fetch in our healthcheck */
TEST_URL = process.env.TEST_URL ?? 'http://openlibrary.org/search.json?q=hello&mode=everything&limit=0';

/** Whether we should send slack messages, or just console.log */
SEND_SLACK_MESSAGE = process.env.SEND_SLACK_MESSAGE == 'true';

/** The containers to restart */
CONTAINER_NAMES = process.env.CONTAINER_NAMES;

async checkHealth() {
console.log(this.TEST_URL);
const resp = await Promise.race([fetch(this.TEST_URL), sleep(3000).then(() => 'timeout')]);

if (resp == 'timeout') return false;

// Read the body as text up front: if JSON parsing fails, the body has already
// been consumed, so it couldn't be re-read for the error message below.
const text = await resp.text();
try {
    const json = JSON.parse(text);
    return !json.error && json.numFound;
} catch (err) {
    throw `Invalid response: ${text}`;
}
}

/**
* @param {string} text
*/
async sendSlackMessage(text) {
if (this.SEND_SLACK_MESSAGE) {
await fetch('https://slack.com/api/chat.postMessage', {
method: 'POST',
headers: {
Authorization: `Bearer ${process.env.SLACK_TOKEN}`,
"Content-Type": "application/json; charset=utf-8",
},
body: JSON.stringify({
text,
channel: process.env.SLACK_CHANNEL_ID,
})
}).then(r => r.text());
} else {
console.log(text);
}
}

async loop() {
while (true) {
let isHealthy = true;
try {
isHealthy = await this.checkHealth();
} catch (err) {
this.healthCheckErrorRun++;
if (this.healthCheckErrorRun > 3) {
// This is an unexpected error; likely means OL is down for other reasons.
await this.sendSlackMessage(`Health check errored 3+ times with ${err}; skipping?`);
}
await sleep(this.CHECK_FREQ);
continue;
}
this.healthCheckErrorRun = 0;
const newState = isHealthy ? 'healthy' : 'unhealthy';
if (this.state != newState) {
this.lastStateChange = Date.now();
}
this.state = newState;
console.log(`State: ${this.state}`);

if (!isHealthy) {
if (Date.now() - this.lastStateChange > this.UNHEALTHY_DURATION) {
const canRestart = Date.now() - this.lastRestart > this.MAX_RESTART_WIN;
if (canRestart) {
if (this.restartsRun >= this.MAX_RESTARTS) {
await this.sendSlackMessage("Hit max restarts. we're clearly not helping. Exiting.");
throw new Error("MAX_RESTARTS exceeded");
}
await this.sendSlackMessage(`solr-restarter: Unhealthy for a few minutes; Restarting solr`);
execSync(`docker restart ${this.CONTAINER_NAMES}`, { stdio: "inherit" });
this.restartsRun++;
this.lastRestart = Date.now();
} else {
console.log('Cannot restart; too soon since last restart');
}
}
} else {
// Send a message if we recently tried to restart
if (this.restartsRun) {
await this.sendSlackMessage(`solr-restarter: solr state now ${this.state} :success-kid:`);
}
this.restartsRun = 0;
}
await sleep(this.CHECK_FREQ);
}
}
}

process.on('unhandledRejection', err => { throw err });
new SolrRestarter().loop();
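
When SEND_SLACK_MESSAGE is true, the script also needs SLACK_TOKEN and SLACK_CHANNEL_ID in its environment; these presumably come from the olsystem env_file wired into the compose service above. A hypothetical shape for that file (olsystem is private, so the real contents aren't shown):

# ../olsystem/etc/solr_restarter.env (hypothetical shape)
SLACK_TOKEN=xoxb-...
SLACK_CHANNEL_ID=C0XXXXXXX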