Skip to content

Commit

Permalink
init: Use VACUUM for large, modified SQLite files only (OSGeo#2377)
Browse files Browse the repository at this point in the history
For SQLite database, VACUUM is used when the session is ending. There is a cost in running the task even when the database is small and no vacuuming is needed. This change adds a check for the size of the database and ignores databases under MEMORYMB value or 300MB.

Use MEMORYMB as info for what small file means. If user does not have a problem keeping 300MB or more in memory, there is likely enough space on the disk for many files of that size. If database cleanup is needed, user can always run VACUUM manually or configure SQLite to do automatic vacuums.

Additionally, vacuum runs only if the database was modified since the start of the session because there is no need to vacuum when it is only read or unused. Any edits, including additions, still trigger the vacuum.

Unlike the original implementation, the vacuum task is now synchronous so that there is no running GRASS process after the session has ended. The original code runs relatively quickly, because it just starts the vacuum subprocess which then runs on its own. However, this dangling subprocess may cause unexpected behavior (locked SQLite, process termination on shared hardware). The new code waits for the process to finish, but vacuums only for large, modified database files.

The message is now only verbose and printed in parallel while the task is running.

The original implementation comes from Trac ticket 3697 (https://trac.osgeo.org/grass/ticket/3697) and is best visible in a bulk backport commit f790d68 (https://trac.osgeo.org/grass/changeset/73736).
  • Loading branch information
wenzeslaus authored and ninsbl committed Feb 17, 2023
1 parent f3bc923 commit a74525d
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 29 deletions.
13 changes: 8 additions & 5 deletions lib/init/grass.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import os
import errno
import atexit
import datetime
import gettext
import shutil
import signal
Expand Down Expand Up @@ -2119,11 +2120,11 @@ def clean_temp():
gsetup.clean_temp()


def clean_all():
def clean_all(*, start_time):
from grass.script import setup as gsetup

# clean default sqlite db
gsetup.clean_default_db()
gsetup.clean_default_db(modified_after=start_time)
# remove leftover temp files
clean_temp()
# save 'last used' GISRC after removing variables which shouldn't
Expand Down Expand Up @@ -2658,6 +2659,8 @@ def main():
fatal(e.args[0])
sys.exit(_("Exiting..."))

start_time = datetime.datetime.now(datetime.timezone.utc)

# unlock the mapset which is current at the time of turning off
# in case mapset was changed
atexit.register(lambda: unlock_gisrc_mapset(gisrc, gisrcrc))
Expand All @@ -2677,12 +2680,12 @@ def main():
# only non-error, interactive version continues from here
if params.batch_job:
returncode = run_batch_job(params.batch_job)
clean_all()
clean_all(start_time=start_time)
sys.exit(returncode)
elif params.exit_grass:
# clean always at exit, cleans whatever is current mapset based on
# the GISRC env variable
clean_all()
clean_all(start_time=start_time)
sys.exit(0)
else:
if use_shell:
Expand Down Expand Up @@ -2737,7 +2740,7 @@ def main():
close_gui()

# here we are at the end of grass session
clean_all()
clean_all(start_time=start_time)
mapset_settings = load_gisrc(gisrc, gisrcrc=gisrcrc)
if not params.tmp_location or (
params.tmp_location and mapset_settings.gisdbase != os.environ["TMPDIR"]
Expand Down
74 changes: 50 additions & 24 deletions python/grass/script/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
# then this could even do locking

from pathlib import Path
import datetime
import os
import shutil
import subprocess
Expand Down Expand Up @@ -376,6 +377,7 @@ class SessionHandle:

def __init__(self, active=True):
self._active = active
self._start_time = datetime.datetime.now(datetime.timezone.utc)

@property
def active(self):
Expand Down Expand Up @@ -411,31 +413,52 @@ def finish(self):
if not self.active:
raise ValueError("Attempt to finish an already finished session")
self._active = False
finish()
finish(start_time=self._start_time)


# clean-up functions when terminating a GRASS session
# these fns can only be called within a valid GRASS session
def clean_default_db():
# clean the default db if it is sqlite
from grass.script import core as gcore
from grass.script import db as gdb

conn = gdb.db_connection()
if conn and conn["driver"] == "sqlite":
# check if db exists
gisenv = gcore.gisenv()
database = conn["database"]
database = database.replace("$GISDBASE", gisenv["GISDBASE"])
database = database.replace("$LOCATION_NAME", gisenv["LOCATION_NAME"])
database = database.replace("$MAPSET", gisenv["MAPSET"])
if os.path.exists(database):
gcore.message(_("Cleaning up default sqlite database ..."))
gcore.start_command("db.execute", sql="VACUUM")
# give it some time to start
import time

time.sleep(0.1)


def clean_default_db(*, modified_after=None):
"""Clean (vacuum) the default db if it is SQLite
When *modified_after* is set, database is cleaned only when it was modified
since the *modified_after* time.
"""
# Limiting usage of in other function by lazy-imports.
# pylint: disable=import-outside-toplevel
import grass.script as gs

conn = gs.db_connection()
if not conn or conn["driver"] != "sqlite":
return
# check if db exists
gis_env = gs.gisenv()
database = conn["database"]
database = database.replace("$GISDBASE", gis_env["GISDBASE"])
database = database.replace("$LOCATION_NAME", gis_env["LOCATION_NAME"])
database = database.replace("$MAPSET", gis_env["MAPSET"])
database = Path(database)
if not database.is_file():
return
file_stat = database.stat()
# Small size based on MEMORYMB (MiB) or its default.
small_db_size = int(gis_env.get("MEMORYMB", 300)) * (1 << 20)
if file_stat.st_size <= small_db_size:
return
if modified_after:
modified_time = datetime.datetime.fromtimestamp(
file_stat.st_mtime, tz=datetime.timezone.utc
)
if modified_after >= modified_time:
return
# Start the vacuum process, then show the message in parallel while
# the vacuum is running. Finally, wait for the vacuum process to finish.
# Error handling is the same as errors="ignore".
process = gs.start_command("db.execute", sql="VACUUM")
gs.verbose(_("Cleaning up default SQLite database..."))
process.wait()


def call(cmd, **kwargs):
Expand All @@ -456,7 +479,7 @@ def clean_temp():
call([os.path.join(gisbase, "etc", "clean_temp")], stdout=subprocess.DEVNULL)


def finish():
def finish(*, start_time=None):
"""Terminate the GRASS session and clean up
GRASS commands can no longer be used after this function has been
Expand All @@ -469,9 +492,12 @@ def finish():
The function is not completely symmetrical with :func:`init` because it only
closes the mapset, but doesn't undo the runtime environment setup.
"""
clean_default_db()
When *start_time* is set, it might be used to determine cleaning procedures.
Currently, it is used to do SQLite database vacuum only when database was modified
since the session started.
"""
clean_default_db(modified_after=start_time)
clean_temp()
# TODO: unlock the mapset?
# unset the GISRC and delete the file
Expand Down

0 comments on commit a74525d

Please sign in to comment.