Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for pg_bulkload for high-speed data insertion and automatic scan on insert #922

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion .env → .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ POSTGRES_HOST=db
# To make the best use of CONCURRENCY, follow the guide https://rengine.wiki
#
MIN_CONCURRENCY=5
MAX_CONCURRENCY=30
MAX_CONCURRENCY=30
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ local_settings.py
db.sqlite3
db.sqlite3-journal
media
.env

### Django.Python Stack ###
# Byte-compiled / optimized / DLL files
Expand Down Expand Up @@ -55,3 +56,5 @@ secret
/secrets

get-docker.sh

filter_dns_leak.sh
22 changes: 22 additions & 0 deletions db/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Custom Postgres image with the pg_bulkload extension compiled in, used for
# high-speed bulk CSV loading (see db/scripts/bulk_insert_domains.sh).
#
# Pin the base image to major version 15 so the server always matches the
# postgresql-server-dev-15 headers installed below. The original used
# postgres:latest, which breaks the build (and the compiled extension's ABI)
# as soon as "latest" moves past 15.
FROM postgres:15

# Build pg_bulkload from source against the server dev headers, then drop the
# apt package lists to keep the image smaller. (Clone URL fixed to the
# upstream GitHub repository.)
RUN apt-get update && \
    apt-get install -y git postgresql-server-dev-15 libpam0g-dev libreadline-dev build-essential libselinux1-dev libzstd-dev liblz4-dev zlib1g-dev libkrb5-dev && \
    git clone https://github.com/ossc-db/pg_bulkload && \
    cd pg_bulkload && \
    make USE_PGXS=1 && \
    make USE_PGXS=1 install && \
    rm -rf /var/lib/apt/lists/*

# Optional self-test of the extension, left disabled:
# USER postgres
# WORKDIR "/tmp"
# RUN export PATH="/usr/lib/postgresql/15/bin/:$PATH" && \
#     git clone https://github.com/ossc-db/pg_bulkload && \
#     cd pg_bulkload && \
#     make installcheck

# Same entrypoint, exposed port and default command as the stock image.
ENTRYPOINT ["docker-entrypoint.sh"]

EXPOSE 5432
CMD ["postgres"]

Empty file added db/imports/.placeholder
Empty file.
226 changes: 226 additions & 0 deletions db/scripts/bulk_insert_domains.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
#!/bin/bash
# Bulk-import a list of domains into reNgine's Postgres DB via pg_bulkload,
# attach them to an organization, create scan-history rows, and launch scans.
#
# Usage: bulk_insert_domains.sh <domain_file> <organization>
#   domain_file  - text file, one domain per line
#   organization - organization name to create or reuse
#
# Must be run from the repo root: paths of the form ./db/imports/... map to
# the db container's /imports bind mount (see docker-compose.yml).

# Refresh the sudo timestamp up front so later docker-compose calls don't prompt.
sudo -v &>/dev/null

domain_file=$1
organization=$2
# NOTE(review): web_server is assigned but never used below.
web_server=https://localhost
# Scan engine id passed to reNgine.tasks.initiate_scan at the end.
engine_type=1

tput setaf 2;
echo "Bulk Domains Insertion"
# NOTE(review): $domain_file is unquoted here and in the loops below —
# file paths containing spaces will break.
nb_domain_to_insert=$(wc -l $domain_file | awk '{print $1}')

tput setaf 6; echo "Found $nb_domain_to_insert domain(s) in $domain_file !"

echo " "
# NOTE(review): typo "Quering" -> "Querying" (recurs in later messages).
tput setaf 4; echo "Quering last domain ID inserted ..."

# Highest existing domain id; this script assigns ids manually below
# (presumably because pg_bulkload's direct writer bypasses the id
# sequence — TODO confirm against pg_bulkload docs).
last_domain_id=$(sudo docker-compose exec db psql -t -U rengine -d rengine -c 'select max(id) from public.targetapp_domain;' | awk 'NF==1 {print $1}')
if [ -z "$last_domain_id" ]
then
last_domain_id=0
fi
# Remembered so we can detect later whether any row was actually imported.
tmp_domain_id=$last_domain_id

tput setaf 2; echo "Last domain ID inserted = $last_domain_id"

# Container-side paths (/imports/...) vs. host-side paths (./db/imports/...).
timestamp=$(date +%s)
data_fname=/imports/domain_insertion_$timestamp.csv
ldata_fname=./db$data_fname
log_fname=/imports/domain_insertion_$timestamp.log
bad_fname=/imports/domain_insertion_$timestamp.bad
dup_fname=/imports/domain_insertion_$timestamp.dup

echo " "
tput setaf 4; echo "Generating domain data file at '$ldata_fname'..."

# Build the CSV consumed by pg_bulkload: one row per input domain with a
# manually assigned id. The empty fields must line up with the column
# order of public.targetapp_domain — TODO confirm against the model.
insert_date=$(date)
touch $ldata_fname
for domain in $(cat $domain_file)
do
((last_domain_id=last_domain_id+1))
# ${domain,,} lowercases the domain (requires bash 4+).
ldomain="${domain,,}"
echo "$last_domain_id,$ldomain,,,,$insert_date,$insert_date," | tee -a $ldata_fname >/dev/null
done

echo " "
tput setaf 4; echo "Creating pg_bulkload log files ..."
# Pre-create log/bad/dup files world-writable so the postgres user inside
# the container can write to the bind-mounted ./db/imports directory.
touch ./db$log_fname && chmod o+w ./db$log_fname
touch ./db$bad_fname && chmod o+w ./db$bad_fname
touch ./db$dup_fname && chmod o+w ./db$dup_fname

echo " "
tput setaf 4; echo "Creating pg_bulkload extension ..."
# Errors (e.g. "extension already exists" on re-runs) are deliberately discarded.
sudo docker-compose exec db psql -U rengine -d rengine -c "CREATE EXTENSION pg_bulkload" 2>/dev/null

echo " "
# NOTE(review): typo in user-facing message: "instertion" -> "insertion".
tput setaf 4; echo "Start domain instertion using pg_bulkload ..."; tput setaf 6;
# Parallel direct load into targetapp_domain. Unparsable and duplicate rows
# are not fatal (PARSE_ERRORS/DUPLICATE_ERRORS=-1); they are diverted to the
# .bad/.dup files reported below.
sudo docker-compose exec db pg_bulkload \
--infile=$data_fname \
--output=public.targetapp_domain \
--option="WRITER=PARALLEL" \
--option="TYPE=CSV" \
--option="DELIMITER=," \
--option="DUPLICATE_ERRORS=-1" \
--option="PARSE_ERRORS=-1" \
--option="ON_DUPLICATE_KEEP=NEW" \
--option="CHECK_CONSTRAINTS=YES" \
-U rengine \
-d rengine \
--logfile=$log_fname \
--parse-badfile=$bad_fname \
--duplicate-badfile=$dup_fname

echo " "
tput setaf 5; echo "Result log file available at './db$log_fname'"
tput setaf 5; echo "Bad records that cannot be parsed correctly available at './db$bad_fname'"
tput setaf 5; echo "Bad records that conflict with unique constraints available at './db$dup_fname'"

echo " "
tput setaf 4; echo "Creating organization '$organization'..."
# Upsert the organization and capture its id; the no-op "set id=id" makes
# ON CONFLICT still execute an UPDATE so RETURNING yields the existing id.
# NOTE(review): $organization is interpolated into the SQL string unescaped —
# a name containing a single quote breaks the statement (injection risk).
organization_id=$(sudo docker-compose exec db psql -t -U rengine -d rengine -c "insert into public.targetapp_organization(name, insert_date) values('$organization', now()) on conflict (name) do update set id=public.targetapp_organization.id, description=excluded.description returning id;" | awk 'NF==1 {print $1}')

tput setaf 6; echo "$organization created with ID = $organization_id !"


# If max(id) did not move, nothing was imported (all rows bad or duplicate).
end_domain_id=$(sudo docker-compose exec db psql -t -U rengine -d rengine -c 'select max(id) from public.targetapp_domain;' | awk 'NF==1 {print $1}')
if [ $end_domain_id -eq $tmp_domain_id ]
then
tput setaf 1; echo "No new domain imported, exiting ..."
exit
fi

echo " "
tput setaf 4; echo "Quering last Organization <-> Domain relation id inserted ..."

last_relation_id=$(sudo docker-compose exec db psql -t -U rengine -d rengine -c "select max(id) from public.targetapp_organization_domains;" | awk 'NF==1 {print $1}')
if [ -z "$last_relation_id" ]
then
last_relation_id=0
fi

tput setaf 2; echo "Last Organization <-> Domain relation id inserted = $last_relation_id"

# Second CSV: organization<->domain M2M rows for the freshly inserted domains.
timestamp=$(date +%s)
data_fname=/imports/relation_insertion_$timestamp.csv
ldata_fname=./db$data_fname
log_fname=/imports/relation_insertion_$timestamp.log
bad_fname=/imports/relation_insertion_$timestamp.bad
dup_fname=/imports/relation_insertion_$timestamp.dup

echo " "
tput setaf 4; echo "Generating relation data file at '$ldata_fname'..."

touch $ldata_fname
# Assumes the new domains occupy the contiguous id range
# (tmp_domain_id, end_domain_id] — not guaranteed if anything else inserts
# domains concurrently; TODO confirm this is acceptable.
last_domain_id=$(($tmp_domain_id+1))
for domain_id in $(seq $last_domain_id $end_domain_id)
do
((last_relation_id=last_relation_id+1))
echo "$last_relation_id,$organization_id,$domain_id" | tee -a $ldata_fname >/dev/null
done

echo " "
tput setaf 4; echo "Creating pg_bulkload log files ..."
touch ./db$log_fname && chmod o+w ./db$log_fname
touch ./db$bad_fname && chmod o+w ./db$bad_fname
touch ./db$dup_fname && chmod o+w ./db$dup_fname

echo " "
tput setaf 4; echo "Start relation insertion using pg_bulkload ..."; tput setaf 6;
# Same loader settings as the domain load, targeting the M2M join table.
sudo docker-compose exec db pg_bulkload \
--infile=$data_fname \
--output=public.targetapp_organization_domains \
--option="WRITER=PARALLEL" \
--option="TYPE=CSV" \
--option="DELIMITER=," \
--option="DUPLICATE_ERRORS=-1" \
--option="PARSE_ERRORS=-1" \
--option="ON_DUPLICATE_KEEP=NEW" \
--option="CHECK_CONSTRAINTS=YES" \
-U rengine \
-d rengine \
--logfile=$log_fname \
--parse-badfile=$bad_fname \
--duplicate-badfile=$dup_fname


echo " "
tput setaf 4; echo "Quering last scan history id inserted ..."

last_scanhistory_id=$(sudo docker-compose exec db psql -t -U rengine -d rengine -c "select max(id) from public.startscan_scanhistory;" | awk 'NF==1 {print $1}')
if [ -z "$last_scanhistory_id" ]
then
last_scanhistory_id=0
fi
# Remembered so the launch loop below can re-derive each row's id.
tmp_scanhistory_id=$last_scanhistory_id

tput setaf 2; echo "Last scan history id inserted = $last_scanhistory_id"

# Third CSV: one startscan_scanhistory row per new domain.
timestamp=$(date +%s)
data_fname=/imports/scanhistory_insertion_$timestamp.csv
ldata_fname=./db$data_fname
log_fname=/imports/scanhistory_insertion_$timestamp.log
bad_fname=/imports/scanhistory_insertion_$timestamp.bad
dup_fname=/imports/scanhistory_insertion_$timestamp.dup

echo " "
tput setaf 4; echo "Generating scan history data file at '$ldata_fname'..."

touch $ldata_fname
last_domain_id=$(($tmp_domain_id+1))
for domain_id in $(seq $last_domain_id $end_domain_id)
do
((last_scanhistory_id=last_scanhistory_id+1))
# Hard-coded column values (status -1, boolean flags, engine id) must match
# the startscan_scanhistory column order — TODO confirm against the model.
echo "$last_scanhistory_id,$insert_date,-1,'','',False,False,False,False,False,False,True,,True,$domain_id,$engine_type,," | tee -a $ldata_fname >/dev/null
done

echo " "
tput setaf 4; echo "Creating pg_bulkload log files ..."
touch ./db$log_fname && chmod o+w ./db$log_fname
touch ./db$bad_fname && chmod o+w ./db$bad_fname
touch ./db$dup_fname && chmod o+w ./db$dup_fname

echo " "
tput setaf 4; echo "Start scan history insertion using pg_bulkload ..."; tput setaf 6;
sudo docker-compose exec db pg_bulkload \
--infile=$data_fname \
--output=public.startscan_scanhistory \
--option="WRITER=PARALLEL" \
--option="TYPE=CSV" \
--option="DELIMITER=," \
--option="DUPLICATE_ERRORS=-1" \
--option="PARSE_ERRORS=-1" \
--option="ON_DUPLICATE_KEEP=NEW" \
--option="CHECK_CONSTRAINTS=YES" \
-U rengine \
-d rengine \
--logfile=$log_fname \
--parse-badfile=$bad_fname \
--duplicate-badfile=$dup_fname

echo " "
tput setaf 5; echo "Result log file available at './db$log_fname'"
tput setaf 5; echo "Bad records that cannot be parsed correctly available at './db$bad_fname'"
tput setaf 5; echo "Bad records that conflict with unique constraints available at './db$dup_fname'"

echo " "
# NOTE(review): typo "scaning" -> "scanning".
tput setaf 4; echo "Start scaning tasks ..."

# NOTE(review): this touch re-creates the scan-history CSV path and looks
# like copy/paste leftover — nothing is written to it in this loop.
touch $ldata_fname
last_domain_id=$(($tmp_domain_id+1))
last_scanhistory_id=$tmp_scanhistory_id
for domain_id in $(seq $last_domain_id $end_domain_id)
do
# Re-derive the scan-history id for this domain (same order as the CSV above).
((last_scanhistory_id=last_scanhistory_id+1))
tput setaf 4; echo "Starting scan on domain id = $domain_id ..."
# Queue reNgine's initiate_scan Celery task for each new domain and capture
# the output of `celery call` (presumably the task id — TODO confirm format).
celery_id=$(sudo docker-compose exec celery celery -A reNgine -b redis://redis:6379/0 --result-backend redis://redis:6379/0 call reNgine.tasks.initiate_scan -a ["$domain_id","$last_scanhistory_id",0,"$engine_type"])
tput setaf 4; echo "Update scan history with celery task id ('$celery_id')"
# Store the celery task id on the matching scan-history row; output discarded.
sudo docker-compose exec db psql -t -U rengine -d rengine -c "update public.startscan_scanhistory set celery_id ='$celery_id' where id = $last_scanhistory_id;" &>/dev/null
done

12 changes: 7 additions & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@ version: '3.8'
services:
db:
restart: always
image: "postgres:12.3-alpine"
build:
context: ./db
environment:
- POSTGRES_DB=${POSTGRES_DB}
- POSTGRES_USER=${POSTGRES_USER}
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
- POSTGRES_PORT=${POSTGRES_PORT}
ports:
- "5432:5432"
# ports:
# - "5432:5432"
volumes:
- postgres_data:/var/lib/postgresql/data/
- ./db/imports:/imports/
networks:
- rengine_network

Expand Down Expand Up @@ -105,8 +107,8 @@ services:
- nuclei_templates:/root/nuclei-templates
- tool_config:/root/.config
- static_volume:/usr/src/app/staticfiles/
ports:
- "8000:8000"
# ports:
# - "8000:8000"
depends_on:
- db
- celery
Expand Down
28 changes: 28 additions & 0 deletions web/scanEngine/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ class EngineType(models.Model):
yaml_configuration = models.TextField()
default_engine = models.BooleanField(null=True, default=False)

class Meta:
    # Explicit lowercase table name — NOTE(review): presumably pinned so the
    # raw-SQL/pg_bulkload scripts can reference the table verbatim; confirm.
    db_table = "scanengine_enginetype"

def __str__(self) -> str:
    """Return the engine's display name."""
    return self.engine_name

Expand All @@ -37,6 +40,9 @@ class Wordlist(models.Model):
short_name = models.CharField(max_length=50, unique=True)
count = models.IntegerField(default=0)

class Meta:
    # Explicit lowercase table name (see the other models in this module).
    db_table = "scanengine_wordlist"

def __str__(self) -> str:
    """Return the wordlist's display name."""
    return self.name

Expand All @@ -47,6 +53,9 @@ class Configuration(models.Model):
short_name = models.CharField(max_length=50, unique=True)
content = models.TextField()

class Meta:
    # Explicit lowercase table name (see the other models in this module).
    db_table = "scanengine_configuration"

def __str__(self) -> str:
    """Return the configuration's display name."""
    return self.name

Expand All @@ -59,6 +68,9 @@ class InterestingLookupModel(models.Model):
url_lookup = models.BooleanField(default=True)
condition_200_http_lookup = models.BooleanField(default=False)

class Meta:
    # Explicit lowercase table name (see the other models in this module).
    db_table = "scanengine_interestinglookupmodel"


class Notification(models.Model):
id = models.AutoField(primary_key=True)
Expand All @@ -78,12 +90,19 @@ class Notification(models.Model):

send_scan_output_file = models.BooleanField(default=True)

class Meta:
    # Explicit lowercase table name (see the other models in this module).
    db_table = "scanengine_notification"



class Proxy(models.Model):
    """Proxy settings for scans.

    Stores a toggle and a free-text list of proxies. NOTE(review): the
    expected format of ``proxies`` (one endpoint per line?) is not visible
    here — confirm against the code that consumes this model.
    """
    id = models.AutoField(primary_key=True)
    # Master switch for proxy usage.
    use_proxy = models.BooleanField(default=False)
    # Optional free-text proxy list.
    proxies = models.TextField(blank=True, null=True)

    class Meta:
        # Explicit lowercase table name (see the other models in this module).
        db_table = "scanengine_proxy"


class Hackerone(models.Model):
id = models.AutoField(primary_key=True)
Expand All @@ -94,6 +113,9 @@ class Hackerone(models.Model):
send_medium = models.BooleanField(default=False)
report_template = models.TextField(blank=True, null=True)

class Meta:
    # Explicit lowercase table name (see the other models in this module).
    db_table = "scanengine_hackerone"


class VulnerabilityReportSetting(models.Model):
id = models.AutoField(primary_key=True)
Expand All @@ -109,6 +131,9 @@ class VulnerabilityReportSetting(models.Model):
show_footer = models.BooleanField(default=False)
footer_text = models.CharField(max_length=200, null=True, blank=True)

class Meta:
    # Explicit lowercase table name (see the other models in this module).
    db_table = "scanengine_vulnerabilityreportsetting"


class InstalledExternalTool(models.Model):
id = models.AutoField(primary_key=True)
Expand All @@ -127,5 +152,8 @@ class InstalledExternalTool(models.Model):
github_clone_path = models.CharField(max_length=1500, null=True, blank=True)
subdomain_gathering_command = models.CharField(max_length=300, null=True, blank=True)

class Meta:
    # Explicit lowercase table name (see the other models in this module).
    db_table = "scanengine_installedexternaltool"

def __str__(self) -> str:
    """Return the tool's display name."""
    return self.name
Loading