Skip to content

Commit

Permalink
Merge pull request #1312 from dandi/asset-paths-design
Browse files Browse the repository at this point in the history
  • Loading branch information
jjnesbitt authored Oct 19, 2022
2 parents cb5dc7b + 4353b03 commit 1ab45a0
Show file tree
Hide file tree
Showing 16 changed files with 881 additions and 366 deletions.
156 changes: 156 additions & 0 deletions dandiapi/api/asset_paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from __future__ import annotations

import os

from django.db import transaction
from django.db.models import Count, F, QuerySet

from dandiapi.api.models import Asset, AssetPath, AssetPathRelation, Version

####################################################################
# Dandiset and version deletion will cascade to asset path deletion.
# Thus, no explicit action is needed for these.
####################################################################


def extract_paths(path: str) -> list[str]:
nodepaths: list[str] = path.split('/')
for i in range(len(nodepaths))[1:]:
nodepaths[i] = os.path.join(nodepaths[i - 1], nodepaths[i])

return nodepaths


def get_root_paths(version: Version) -> QuerySet[AssetPath]:
"""Return all root paths for a version."""
# Use prefetch_related here instead of select_related,
# as otherwise the resulting join is very large
qs = AssetPath.objects.prefetch_related(
'asset',
'asset__blob',
'asset__embargoed_blob',
'asset__zarr',
)
return (
qs.filter(version=version)
.alias(num_parents=Count('parent_links'))
.filter(num_parents=1)
.order_by('path')
)


def get_path_children(path: AssetPath) -> QuerySet[AssetPath]:
"""Get all direct children from an existing path."""
qs = AssetPath.objects.select_related(
'asset',
'asset__blob',
'asset__embargoed_blob',
'asset__zarr',
)
path_ids = (
AssetPathRelation.objects.filter(parent=path, depth=1)
.values_list('child', flat=True)
.distinct()
)
return qs.filter(id__in=path_ids).order_by('path')


def search_asset_paths(query: str, version: Version) -> QuerySet[AssetPath] | None:
"""Return all direct children of this path, if there are any."""
if not query:
return get_root_paths(version)

# Ensure no trailing slash
fixed_query = query.rstrip('/')

# Retrieve path
path = AssetPath.objects.filter(version=version, path=fixed_query).first()
if path is None:
return None

return get_path_children(path)


# TODO: Make idempotent
@transaction.atomic()
def add_asset_paths(asset: Asset, version: Version):
# Get or create leaf path
leaf, created = AssetPath.objects.get_or_create(path=asset.path, asset=asset, version=version)
if not created:
return

# Create absolute paths (exclude leaf node)
nodepaths = extract_paths(asset.path)[:-1]

# Create nodes
AssetPath.objects.bulk_create(
[AssetPath(path=path, version=version, asset=None) for path in nodepaths],
ignore_conflicts=True,
)

# Retrieve all paths
paths = [*AssetPath.objects.filter(version=version, path__in=nodepaths).order_by('path'), leaf]

# Create relations between paths
links = []
for i in range(len(paths)):
links.extend(
[
AssetPathRelation(parent=paths[i], child=paths[j], depth=j - i)
for j in range(len(paths))[i:]
]
)

# Create objects
AssetPathRelation.objects.bulk_create(links, ignore_conflicts=True)

# Get all relations (including leaf node)
parent_ids = (
AssetPathRelation.objects.filter(child=leaf)
.distinct('parent')
.values_list('parent', flat=True)
)

# Update size + file count
AssetPath.objects.filter(id__in=parent_ids).update(
aggregate_size=F('aggregate_size') + asset.size, aggregate_files=F('aggregate_files') + 1
)


@transaction.atomic()
def delete_asset_paths(asset: Asset, version: Version):
leaf: AssetPath = AssetPath.objects.get(asset=asset, version=version)

# Fetch parents
parent_ids = (
AssetPathRelation.objects.filter(child=leaf)
.distinct('parent')
.values_list('parent', flat=True)
)
parent_paths = AssetPath.objects.filter(id__in=parent_ids)

# Update parents
parent_paths.update(
aggregate_size=F('aggregate_size') - asset.size, aggregate_files=F('aggregate_files') - 1
)

# Ensure integrity
leaf.refresh_from_db()
assert leaf.aggregate_size == 0
assert leaf.aggregate_files == 0

# Delete leaf node and any other paths with no contained files
AssetPath.objects.filter(aggregate_files=0).delete()


@transaction.atomic()
def update_asset_paths(old_asset: Asset, new_asset: Asset, version: Version):
delete_asset_paths(old_asset, version)
add_asset_paths(new_asset, version)


@transaction.atomic()
def add_version_asset_paths(version: Version):
"""Add every asset from a version."""
for asset in version.assets.iterator():
add_asset_paths(asset, version)
12 changes: 12 additions & 0 deletions dandiapi/api/management/commands/ingest_asset_paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import djclick as click

from dandiapi.api.asset_paths import add_version_asset_paths
from dandiapi.api.models import Version


@click.command()
def ingest_asset_paths():
for version in Version.objects.iterator():
print(f'Version: {version}')
print(f'\t {version.assets.count()} assets')
add_version_asset_paths(version)
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Generated by Django 4.1.1 on 2022-10-19 16:07

from django.db import migrations, models
import django.db.models.deletion

import dandiapi.api.models.asset


class Migration(migrations.Migration):

dependencies = [
('api', '0037_alter_version_status'),
]

operations = [
migrations.CreateModel(
name='AssetPath',
fields=[
(
'id',
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name='ID'
),
),
('path', models.CharField(max_length=512)),
('aggregate_files', models.PositiveBigIntegerField(default=0)),
('aggregate_size', models.PositiveBigIntegerField(default=0)),
],
),
migrations.CreateModel(
name='AssetPathRelation',
fields=[
(
'id',
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name='ID'
),
),
('depth', models.PositiveIntegerField()),
],
),
migrations.AlterField(
model_name='asset',
name='path',
field=models.CharField(
max_length=512, validators=[dandiapi.api.models.asset.validate_asset_path]
),
),
migrations.AddConstraint(
model_name='asset',
constraint=models.CheckConstraint(
check=models.Q(
('path__regex', '^([A-z0-9(),&\\s#+~_=-]?\\/?\\.?[A-z0-9(),&\\s#+~_=-])+$')
),
name='asset_path_regex',
),
),
migrations.AddConstraint(
model_name='asset',
constraint=models.CheckConstraint(
check=models.Q(('path__startswith', '/'), _negated=True),
name='asset_path_no_leading_slash',
),
),
migrations.AddField(
model_name='assetpathrelation',
name='child',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='parent_links',
to='api.assetpath',
),
),
migrations.AddField(
model_name='assetpathrelation',
name='parent',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='child_links',
to='api.assetpath',
),
),
migrations.AddField(
model_name='assetpath',
name='asset',
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.PROTECT,
related_name='leaf_paths',
to='api.asset',
),
),
migrations.AddField(
model_name='assetpath',
name='version',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='asset_paths',
to='api.version',
),
),
migrations.AddConstraint(
model_name='assetpathrelation',
constraint=models.UniqueConstraint(
fields=('parent', 'child'), name='unique-relationship'
),
),
migrations.AddConstraint(
model_name='assetpath',
constraint=models.CheckConstraint(
check=models.Q(('path__endswith', '/'), _negated=True), name='consistent-slash'
),
),
migrations.AddConstraint(
model_name='assetpath',
constraint=models.UniqueConstraint(
fields=('asset', 'version'), name='unique-asset-version'
),
),
migrations.AddConstraint(
model_name='assetpath',
constraint=models.UniqueConstraint(
fields=('version', 'path'), name='unique-version-path'
),
),
]
3 changes: 3 additions & 0 deletions dandiapi/api/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .asset import Asset, AssetBlob, EmbargoedAssetBlob
from .asset_paths import AssetPath, AssetPathRelation
from .dandiset import Dandiset
from .oauth import StagingApplication
from .upload import EmbargoedUpload, Upload
Expand All @@ -8,6 +9,8 @@
__all__ = [
'Asset',
'AssetBlob',
'AssetPath',
'AssetPathRelation',
'Dandiset',
'EmbargoedAssetBlob',
'EmbargoedUpload',
Expand Down
21 changes: 19 additions & 2 deletions dandiapi/api/models/asset.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from __future__ import annotations

import datetime
import re
from urllib.parse import urlparse, urlunparse
import uuid

from django.conf import settings
from django.contrib.postgres.indexes import HashIndex
from django.core.exceptions import FieldError
from django.core.exceptions import FieldError, ValidationError
from django.core.validators import RegexValidator
from django.db import models
from django.db.models import Q
Expand All @@ -25,6 +26,18 @@
from .dandiset import Dandiset
from .version import Version

ASSET_CHARS_REGEX = r'[A-z0-9(),&\s#+~_=-]'
ASSET_PATH_REGEX = fr'^({ASSET_CHARS_REGEX}?\/?\.?{ASSET_CHARS_REGEX})+$'


def validate_asset_path(path: str):
if path.startswith('/'):
raise ValidationError('Path must not begin with /')
if not re.match(ASSET_PATH_REGEX, path):
raise ValidationError('Path improperly formatted')

return path


class BaseAssetBlob(TimeStampedModel):
SHA256_REGEX = r'[0-9a-f]{64}'
Expand Down Expand Up @@ -106,7 +119,7 @@ class Status(models.TextChoices):
INVALID = 'Invalid'

asset_id = models.UUIDField(unique=True, default=uuid.uuid4)
path = models.CharField(max_length=512)
path = models.CharField(max_length=512, validators=[validate_asset_path])
blob = models.ForeignKey(
AssetBlob, related_name='assets', on_delete=models.CASCADE, null=True, blank=True
)
Expand Down Expand Up @@ -145,6 +158,10 @@ class Meta:
name='asset_metadata_has_schema_version',
check=Q(metadata__schemaVersion__isnull=False),
),
models.CheckConstraint(name='asset_path_regex', check=Q(path__regex=ASSET_PATH_REGEX)),
models.CheckConstraint(
name='asset_path_no_leading_slash', check=~Q(path__startswith='/')
),
]

@property
Expand Down
Loading

0 comments on commit 1ab45a0

Please sign in to comment.