Skip to content

Commit

Permalink
worker: add a job to check for potential typosquatting
Browse files Browse the repository at this point in the history
This only fires when new crates are published: updates to existing
crates will not cause this job to run.

On a technical level, the major impact here is that the background
worker will keep an in-memory cache of the top 3000 crates and their
owners. I don't expect the impact of this to be significant in practice.

As this is an experiment at present, configuration is hardcoded into the
new worker job module. If this becomes a longer-term arrangement, it would
be split out into our normal configuration system for easier management.
  • Loading branch information
LawnGnome committed Oct 13, 2023
1 parent 9c5bf7e commit 3f1d514
Show file tree
Hide file tree
Showing 11 changed files with 689 additions and 0 deletions.
19 changes: 19 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ tower = "=0.4.13"
tower-http = { version = "=0.4.4", features = ["fs", "catch-panic"] }
tracing = "=0.1.37"
tracing-subscriber = { version = "=0.3.17", features = ["env-filter"] }
typomania = { version = "=0.1.0", default-features = false }
url = "=2.4.1"

[dev-dependencies]
Expand Down
13 changes: 13 additions & 0 deletions src/background_jobs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ macro_rules! job_variant_from_value {

jobs! {
pub enum Job {
CheckTyposquat(CheckTyposquatJob),
DailyDbMaintenance,
DumpDb(DumpDbJob),
NormalizeIndex(NormalizeIndexJob),
Expand Down Expand Up @@ -166,6 +167,12 @@ impl Job {
Ok(())
}

/// Builds a `CheckTyposquat` job for the given crate.
///
/// `krate` can be any value implementing [`ToString`]; its string form is stored as the job's
/// `krate` payload field.
pub fn check_typosquat<T: ToString>(krate: T) -> Self {
    let krate = krate.to_string();
    Self::CheckTyposquat(CheckTyposquatJob { krate })
}

/// Returns the `DailyDbMaintenance` job variant, which carries no payload.
pub fn daily_db_maintenance() -> Self {
Self::DailyDbMaintenance
}
Expand Down Expand Up @@ -250,6 +257,7 @@ impl Job {
.as_ref()
.expect("Application should configure a background runner environment");
match self {
Job::CheckTyposquat(args) => worker::check_typosquat(env, conn, &args.krate),
Job::DailyDbMaintenance => {
worker::perform_daily_db_maintenance(&mut *fresh_connection(pool)?)
}
Expand Down Expand Up @@ -298,6 +306,11 @@ pub struct AddCrateJob {
pub(super) krate: crates_io_index::Crate,
}

/// Serializable payload for the `CheckTyposquat` job variant.
#[derive(Serialize, Deserialize)]
pub struct CheckTyposquatJob {
/// Name of the crate to check. It is filled in by `Job::check_typosquat` and handed to
/// `worker::check_typosquat` when the job is performed.
pub(super) krate: String,
}

#[derive(Serialize, Deserialize)]
pub struct UpdateCrateIndexJob {
pub(super) crate_name: String,
Expand Down
5 changes: 5 additions & 0 deletions src/controllers/krate/publish.rs
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,11 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra

Job::enqueue_sync_to_index(&krate.name, conn)?;

// Experiment: check new crates for potential typosquatting.
if existing_crate.is_none() {
Job::check_typosquat(&krate.name).enqueue(conn)?;
}

// The `other` field on `PublishWarnings` was introduced to handle a temporary warning
// that is no longer needed. As such, crates.io currently does not return any `other`
// warnings at this time, but if we need to, the field is available.
Expand Down
26 changes: 26 additions & 0 deletions src/email.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,32 @@ or go to https://{domain}/me/pending-invites to manage all of your crate ownersh
self.send(email, subject, &body)
}

/// Attempts to send a notification that a new crate may be typosquatting another crate.
///
/// The message names `crate_name`, links to `https://{domain}/crates/{crate_name}` (with the
/// domain taken from `crate::config::domain_name()`), and lists every entry of `squats` as a
/// `- ` bullet rendered through its `Display` implementation.
///
/// # Errors
///
/// Returns whatever error `self.send` reports.
pub fn send_possible_typosquat_notification(
    &self,
    email: &str,
    crate_name: &str,
    squats: &[typomania::checks::Squat],
) -> AppResult<()> {
    let subject = "Possible typosquatting in new crate";
    let domain = crate::config::domain_name();

    // One bullet per squat; the first bullet's "- " prefix lives in the template below.
    let squats = squats
        .iter()
        .map(ToString::to_string)
        .collect::<Vec<_>>()
        .join("\n- ");

    // Each line ends in `\n\`: the escape emits exactly one line break, and the trailing
    // backslash swallows the literal newline and the next line's leading whitespace. Without
    // it, every line break in the e-mail is doubled.
    let body = format!(
        "New crate {crate_name} may be typosquatting one or more other crates.\n\
         Visit https://{domain}/crates/{crate_name} to see the offending crate.\n\
         \n\
         Specific squat checks that triggered:\n\
         \n\
         - {squats}\n",
    );

    self.send(email, subject, &body)
}

/// Attempts to send an API token exposure notification email
pub fn send_token_exposed_notification(
&self,
Expand Down
2 changes: 2 additions & 0 deletions src/worker/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pub mod dump_db;
pub mod fastly;
mod git;
mod readmes;
mod typosquat;
mod update_downloads;

pub(crate) use daily_db_maintenance::perform_daily_db_maintenance;
Expand All @@ -17,4 +18,5 @@ pub(crate) use git::{
perform_index_squash, perform_normalize_index, sync_to_git_index, sync_to_sparse_index,
};
pub(crate) use readmes::perform_render_and_upload_readme;
pub(crate) use typosquat::check_typosquat;
pub(crate) use update_downloads::perform_update_downloads;
99 changes: 99 additions & 0 deletions src/worker/typosquat.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
use diesel::PgConnection;
use typomania::Package;

use crate::{background_jobs::Environment, swirl::PerformError, Emails};

use self::types::OwnedCrate;

mod cache;
mod config;
mod types;

#[cfg(test)]
mod test_util;

/// Background job entry point: checks the crate called `name` for potential typosquatting.
///
/// Takes the e-mail sender from `env` and hands the work to `check_typosquat_inner`, whose
/// result is returned unchanged.
#[instrument(skip_all, fields(krate.name = ?name))]
pub fn check_typosquat(
    env: &Environment,
    conn: &mut PgConnection,
    name: &str,
) -> Result<(), PerformError> {
    let emails = env.emails();
    check_typosquat_inner(emails, conn, name)
}

fn check_typosquat_inner(
emails: &Emails,
conn: &mut PgConnection,
name: &str,
) -> Result<(), PerformError> {
info!("Checking new crate for potential typosquatting");

let krate: Box<dyn Package> = Box::new(OwnedCrate::from_name(conn, name)?);
let squats = cache::get_harness(conn)?.check_package(name, krate)?;
if !squats.is_empty() {
// Well, well, well. For now, the only action we'll take is to e-mail people who hopefully
// care to check into things more closely.
info!(?squats, "Found potential typosquatting");

for email in config::NOTIFY_EMAILS.iter() {
if let Err(e) = emails.send_possible_typosquat_notification(email, name, &squats) {
error!(?e, ?email, "sending possible typosquat notification");
}
}
}

Ok(())
}

#[cfg(test)]
mod tests {
use crate::test_util::pg_connection;

use super::test_util::Faker;
use super::*;

/// End-to-end check of `check_typosquat_inner`: a new crate named `innocent-crate` sends no
/// e-mail, while a new crate named `mycrate` (with `my-crate` already in the cache) does.
#[test]
fn integration() -> Result<(), PerformError> {
// E-mails are collected in memory so the assertions below can inspect them.
let emails = Emails::new_in_memory();
let mut faker = Faker::new(pg_connection());

// Set up a user and a crate to match against.
// NOTE(review): the last argument (100 here, 0 for the later crates) is presumably a download
// count — confirm against `Faker::crate_and_version`.
let user = faker.user("a")?;
faker.crate_and_version("my-crate", "It's awesome", &user, 100)?;

// Prime the cache so it only includes the crate we just created.
//
// Note that there's theoretical flakiness here if the test takes longer to run than the
// cache TTL. Of course, since the cache TTL is currently set to 12 hours, that would
// probably indicate bigger problems.
let _harness = super::cache::get_harness(faker.borrow_conn())?;

// Now we'll create new crates: one problematic, one not so.
let other_user = faker.user("b")?;
let (angel, _version) = faker.crate_and_version(
"innocent-crate",
"I'm just a simple, innocent crate",
&other_user,
0,
)?;
let (demon, _version) = faker.crate_and_version(
"mycrate",
"I'm even more innocent, obviously",
&other_user,
0,
)?;

// OK, we're done faking stuff.
let mut conn = faker.into_conn();

// Run the check with a crate that shouldn't cause problems.
check_typosquat_inner(&emails, &mut conn, &angel.name)?;
assert!(emails.mails_in_memory().unwrap().is_empty());

// Now run the check with a less innocent crate.
check_typosquat_inner(&emails, &mut conn, &demon.name)?;
assert!(!emails.mails_in_memory().unwrap().is_empty());

Ok(())
}
}
100 changes: 100 additions & 0 deletions src/worker/typosquat/cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
use std::{
sync::{Arc, Mutex},
time::Instant,
};

use diesel::PgConnection;
use typomania::{
checks::{Bitflips, Omitted, SwappedWords, Typos},
Harness,
};

use crate::swirl::PerformError;

use super::{config, types::TopCrates};

/// Gets the typomania harness for the cached top crates, regenerating it if it is out of date.
pub(super) fn get_harness(
conn: &mut PgConnection,
) -> Result<Arc<Harness<TopCrates>>, PerformError> {
HARNESS_CACHE.get(conn)
}

/// The single harness cache used by `get_harness`.
static HARNESS_CACHE: Cache = Cache::new();

/// Mutex-guarded holder for the cached harness and the time it was last rebuilt.
struct Cache(Mutex<Inner>);

impl Cache {
    /// Creates an empty cache. This is a `const fn` so it can initialise a `static`.
    const fn new() -> Self {
        Self(Mutex::new(Inner::new()))
    }

    /// Returns the cached harness if `Inner::get` still yields one; otherwise builds a new
    /// harness with `conn`, stores it, and returns it.
    ///
    /// The mutex guard is held for the whole call, including the rebuild.
    ///
    /// # Errors
    ///
    /// Propagates any error from `new_harness`; the cache is left untouched in that case.
    ///
    /// # Panics
    ///
    /// Panics if the mutex has been poisoned.
    fn get(&self, conn: &mut PgConnection) -> Result<Arc<Harness<TopCrates>>, PerformError> {
        let mut guard = self.0.lock().unwrap();

        if let Some(cached) = guard.get() {
            return Ok(cached);
        }

        let fresh = Arc::new(new_harness(conn)?);
        guard.update(Arc::clone(&fresh));
        Ok(fresh)
    }
}

/// Loads the top crates via `conn` and builds a fresh typomania harness over them.
///
/// The rebuild duration is logged at debug level.
///
/// # Errors
///
/// Propagates any error from `TopCrates::new`.
#[instrument(skip_all)]
fn new_harness(conn: &mut PgConnection) -> Result<Harness<TopCrates>, PerformError> {
    debug!("Rebuilding top crate cache");
    let start = Instant::now();

    let top_crates = TopCrates::new(conn, config::TOP_CRATES)?;

    // This is essentially the standard set of checks that was implemented by typogard-crates.
    let bitflips = Bitflips::new(config::CRATE_NAME_ALPHABET, top_crates.iter_names());
    let omitted = Omitted::new(config::CRATE_NAME_ALPHABET);
    let swapped_words = SwappedWords::new("-_");
    let typos = Typos::new(config::TYPOS.iter().map(|(c, typos)| {
        (*c, typos.iter().map(|ss| ss.to_string()).collect())
    }));

    let harness = Harness::builder()
        .with_check(bitflips)
        .with_check(omitted)
        .with_check(swapped_words)
        .with_check(typos)
        .build(top_crates);

    let elapsed = start.elapsed();
    debug!(?elapsed, "Top crate cache rebuilt");

    Ok(harness)
}

/// Cached harness plus the instant at which it was stored.
struct Inner {
    /// The most recently stored harness, if any.
    harness: Option<Arc<Harness<TopCrates>>>,
    /// When `harness` was stored; `None` until the first `update`.
    last_update: Option<Instant>,
}

impl Inner {
    /// Creates an empty state with no harness and no timestamp.
    const fn new() -> Self {
        Self {
            harness: None,
            last_update: None,
        }
    }

    /// Returns the stored harness if one exists and is no older than `config::CACHE_TTL`.
    ///
    /// Returns `None` when nothing has been stored yet or the stored harness has expired.
    fn get(&self) -> Option<Arc<Harness<TopCrates>>> {
        let harness = self.harness.as_ref()?;
        let when = self.last_update?;

        // Compare the age against the TTL rather than computing `Instant::now() - CACHE_TTL`:
        // subtracting a `Duration` from an `Instant` may panic when the result cannot be
        // represented, whereas `elapsed()` cannot.
        (when.elapsed() <= config::CACHE_TTL).then(|| Arc::clone(harness))
    }

    /// Stores `harness` and records the current instant as its update time.
    fn update(&mut self, harness: Arc<Harness<TopCrates>>) {
        self.harness = Some(harness);
        self.last_update = Some(Instant::now());
    }
}
Loading

0 comments on commit 3f1d514

Please sign in to comment.