Skip to content

Commit

Permalink
Merge pull request #328 from HebaruSan/fix/bucket-names
Browse files Browse the repository at this point in the history
Fully sanitize archive.org bucket names
  • Loading branch information
HebaruSan authored Feb 28, 2024
2 parents d96e3b9 + 3328a5e commit 524a588
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 21 deletions.
5 changes: 2 additions & 3 deletions netkan/netkan/download_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
import logging
import re
from pathlib import Path
from importlib.resources import read_text
from string import Template
import urllib.parse
from typing import Dict, Tuple, Any, Optional

import requests

from .utils import repo_file_add_or_changed
from .utils import repo_file_add_or_changed, legacy_read_text
from .repos import CkanMetaRepo
from .metadata import Ckan

Expand All @@ -23,7 +22,7 @@ class GraphQLQuery:
MODULES_PER_GRAPHQL = 20

# The request we send to GitHub, with a parameter for the module specific section
GRAPHQL_TEMPLATE = Template(read_text('netkan', 'downloads_query.graphql'))
GRAPHQL_TEMPLATE = Template(legacy_read_text('netkan', 'downloads_query.graphql'))

# The request string per module, depends on getDownloads fragment existing in
# the main template
Expand Down
44 changes: 31 additions & 13 deletions netkan/netkan/mirrorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import logging
import shutil
from pathlib import Path
from importlib.resources import read_text
from typing import Optional, List, Union, Iterable, BinaryIO, Dict, Any, TYPE_CHECKING
import boto3
import github
Expand All @@ -16,6 +15,7 @@
from .metadata import Ckan
from .repos import CkanMetaRepo
from .common import deletion_msg, download_stream_to_file, USER_AGENT
from .utils import legacy_read_text

if TYPE_CHECKING:
from mypy_boto3_sqs.type_defs import DeleteMessageBatchRequestEntryTypeDef
Expand All @@ -26,15 +26,17 @@
class CkanMirror(Ckan):

DESCRIPTION_TEMPLATE = Template(
read_text('netkan', 'mirror_description_template.jinja2'))
legacy_read_text('netkan', 'mirror_description_template.jinja2'))

BUCKET_EXCLUDE_PATTERN = re.compile(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9._-]')

LICENSE_URLS = {
"Apache" : 'http://www.apache.org/licenses/LICENSE-1.0',
"Apache-1.0" : 'http://www.apache.org/licenses/LICENSE-1.0',
"Apache-2.0" : 'http://www.apache.org/licenses/LICENSE-2.0',
"Artistic" : 'http://www.gnu.org/licenses/license-list.en.html#ArtisticLicense',
"Artistic-1.0" : 'http://www.gnu.org/licenses/license-list.en.html#ArtisticLicense',
"Artistic-2.0" : 'http://www.perlfoundation.org/artistic_license_2_0',
"Artistic" : 'https://directory.fsf.org/wiki/License:Artistic-1.0',
"Artistic-1.0" : 'https://directory.fsf.org/wiki/License:Artistic-1.0',
"Artistic-2.0" : 'https://directory.fsf.org/wiki/License:Artistic-2.0',
"BSD-2-clause" : 'https://opensource.org/licenses/BSD-2-Clause',
"BSD-3-clause" : 'https://opensource.org/licenses/BSD-3-Clause',
"ISC" : 'https://opensource.org/licenses/ISC',
Expand Down Expand Up @@ -99,20 +101,23 @@ class CkanMirror(Ckan):
"Perl" : 'http://dev.perl.org/licenses/',
"Python-2.0" : 'https://www.python.org/download/releases/2.0/license/',
"QPL-1.0" : 'https://opensource.org/licenses/QPL-1.0',
"W3C" : 'https://www.w3.org/Consortium/Legal/2015/copyright-software-and-document',
"W3C" : 'https://www.w3.org/copyright/software-license-2023/',
"Zlib" : 'http://www.zlib.net/zlib_license.html',
"Zope" : 'http://old.zope.org/Resources/License.1',
"Unlicense" : 'https://unlicense.org/UNLICENSE',
}

def __init__(self, collection: str, filename: Optional[Union[str, Path]] = None, contents: Optional[str] = None) -> None:
def __init__(self, collection: str, filename: Optional[Union[str, Path]] = None,
contents: Optional[str] = None) -> None:
Ckan.__init__(self, filename, contents)
self.collection = collection

@property
def can_mirror(self) -> bool:
return (
self.kind == 'package' and getattr(self, 'download_content_type', '') in Ckan.MIME_TO_EXTENSION and self.redistributable
self.kind == 'package'
and getattr(self, 'download_content_type', '') in Ckan.MIME_TO_EXTENSION
and self.redistributable
)

def mirrored(self, iarchive: internetarchive.session.ArchiveSession) -> bool:
Expand All @@ -129,14 +134,25 @@ def license_urls(self) -> List[str]:
for lic in self.licenses() if lic in self.LICENSE_URLS]

def mirror_item(self, with_epoch: bool = True) -> str:
return f'{self.identifier}-{self._format_version(with_epoch)}'
return self._ia_bucket_sanitize(
f'{self.identifier}-{self._format_version(with_epoch)}')

def mirror_source_filename(self, with_epoch: bool = True) -> str:
return f'{self.identifier}-{self._format_version(with_epoch)}.source.zip'
return self._ia_bucket_sanitize(
f'{self.identifier}-{self._format_version(with_epoch)}.source.zip')

def mirror_title(self, with_epoch: bool = True) -> str:
    """Return the display title: the module name, ' - ', and its version.

    ``with_epoch`` is passed straight through to ``_format_version``.
    """
    module_name = self.name
    version_text = self._format_version(with_epoch)
    return f'{module_name} - {version_text}'

# InternetArchive says:
# Bucket names should be valid archive identifiers;
# try someting matching this regular expression:
# ^[a-zA-Z0-9][a-zA-Z0-9_.-]{4,100}$
# (We enforce everything except the minimum length; the pattern above
# requires one leading character plus at least 4 more, i.e. 5 in total)
@classmethod
def _ia_bucket_sanitize(cls, s: str) -> str:
    """Strip everything matched by BUCKET_EXCLUDE_PATTERN, then cap at 100 chars."""
    stripped = cls.BUCKET_EXCLUDE_PATTERN.sub('', s)
    # Truncate only after stripping, so removed characters do not count
    # against the length limit.
    return stripped[:100]

@property
def item_metadata(self) -> Dict[str, Any]:
lic_urls = self.license_urls()
Expand Down Expand Up @@ -229,10 +245,12 @@ def mirror_description(self) -> str:

class Mirrorer:

EPOCH_ID_REGEXP = re.compile('-[0-9]+-')
EPOCH_TITLE_REGEXP = re.compile(' - [0-9]+:')
EPOCH_ID_REGEXP = re.compile(r'-[0-9]+-')
EPOCH_TITLE_REGEXP = re.compile(r' - [0-9]+:')

def __init__(self, ckm_repo: CkanMetaRepo, ia_access: str, ia_secret: str, ia_collection: str, token: Optional[str] = None) -> None:
def __init__(self, ckm_repo: CkanMetaRepo,
ia_access: str, ia_secret: str, ia_collection: str,
token: Optional[str] = None) -> None:
self.ckm_repo = ckm_repo
self.ia_collection = ia_collection
self.ia_access = ia_access
Expand Down
6 changes: 3 additions & 3 deletions netkan/netkan/spacedock_adder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
import re
import io
from importlib.resources import read_text
from string import Template
from collections import defaultdict, deque
import logging
Expand All @@ -17,6 +16,7 @@
from .mod_analyzer import ModAnalyzer
from .queue_handler import BaseMessageHandler, QueueHandler
from .repos import NetkanRepo
from .utils import legacy_read_text

if TYPE_CHECKING:
from mypy_boto3_sqs.service_resource import Message
Expand All @@ -28,8 +28,8 @@

# https://github.com/KSP-SpaceDock/SpaceDock/blob/master/KerbalStuff/ckan.py
class SpaceDockAdder:
COMMIT_TEMPLATE = Template(read_text('netkan', 'sd_adder_commit_template.md'))
PR_BODY_TEMPLATE = Template(read_text('netkan', 'sd_adder_pr_body_template.md'))
COMMIT_TEMPLATE = Template(legacy_read_text('netkan', 'sd_adder_commit_template.md'))
PR_BODY_TEMPLATE = Template(legacy_read_text('netkan', 'sd_adder_pr_body_template.md'))
USER_TEMPLATE = Template('[$username]($user_url)')
TITLE_TEMPLATE = Template('Add $name from $site_name')
GITHUB_PATH_PATTERN = re.compile(r'^/([^/]+)/([^/]+)')
Expand Down
4 changes: 2 additions & 2 deletions netkan/netkan/ticket_closer.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import logging
from datetime import datetime, timedelta, timezone
from importlib.resources import read_text
from collections import defaultdict
from string import Template
import github

from .common import USER_AGENT
from .utils import legacy_read_text


class TicketCloser:

REPO_NAMES = ['CKAN', 'NetKAN']
BODY_TEMPLATE = Template(read_text('netkan', 'ticket_close_template.md'))
BODY_TEMPLATE = Template(legacy_read_text('netkan', 'ticket_close_template.md'))

def __init__(self, token: str, user_name: str) -> None:
self._gh = github.Github(token, user_agent=USER_AGENT)
Expand Down
6 changes: 6 additions & 0 deletions netkan/netkan/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import subprocess
from pathlib import Path
from typing import Union
from importlib.resources import files

from git import Repo


Expand Down Expand Up @@ -42,3 +44,7 @@ def repo_file_add_or_changed(repo: Repo, filename: Union[str, Path]) -> bool:
x.a_path for x in repo.index.diff(None)]:
return True
return False


def legacy_read_text(pkg: str, resource: str, encoding: str = 'utf-8') -> str:
    """Read a text resource bundled inside a package.

    Stand-in for the legacy ``importlib.resources.read_text`` helper,
    built on the ``files()`` API.

    Args:
        pkg: Importable package name that contains the resource.
        resource: File name of the resource within that package.
        encoding: Text encoding used to decode the resource. Defaults to
            UTF-8, matching the legacy helper's default.

    Returns:
        The decoded contents of the resource.
    """
    # Pass the encoding explicitly: without it the resource is decoded with
    # the locale's preferred encoding, which can garble non-ASCII templates
    # on hosts that are not configured for UTF-8.
    return files(pkg).joinpath(resource).read_text(encoding=encoding)

0 comments on commit 524a588

Please sign in to comment.