From 4d8d01cd472e11bea1795a46a82b700c3d7f50a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Laurent=20Ren=C3=A9=20de=20Cotret?= Date: Sun, 19 Jun 2022 11:25:30 -0400 Subject: [PATCH] Use HTTPS to download protein data bank files --- CHANGELOG.rst | 1 + crystals/parsers.py | 29 ++++++++--------------------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c349aa1..851172b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,7 @@ Release 1.5.0 * Added some typing information. * Added the :attr:`Supercell.scaled_lattice_vectors` property and associated documentation (#11). +* Protein Data Bank downloads are now done through HTTPS rather than FTP, which is recommended by the RCSB data bank. * Fixed some documentation formatting. Release 1.4.1 diff --git a/crystals/parsers.py b/crystals/parsers.py index a306641..4bed287 100644 --- a/crystals/parsers.py +++ b/crystals/parsers.py @@ -2,7 +2,6 @@ """ Atomic structure parsers. """ -import gzip import re from typing import Any, Iterable, Optional, Tuple, Union import warnings @@ -10,7 +9,7 @@ from contextlib import AbstractContextManager, suppress from functools import lru_cache from itertools import repeat -from os import PathLike, environ, remove +from os import PathLike, environ from pathlib import Path from platform import system from string import digits, punctuation @@ -147,7 +146,7 @@ def __exit__(self, *args, **kwargs): def download_pdb_file( pdb_code: str, download_dir: Optional[PathLike] = None, - server: str = "ftp://ftp.wwpdb.org", + server: str = "https://files.rcsb.org", overwrite: bool = False, ) -> Path: """ @@ -161,7 +160,7 @@ def download_pdb_file( download_dir : path-like object Directory where to save the PDB file. Default is a local folder in the current directory server : str, optional - Address of the FTP server from which to download the PDB file. Default is the main server. + Root address of the server from which to download the PDB file. Default is the main server. overwrite : bool, optional If True, existing PDB file with the same structure will be overwritten. Default is False. @@ -170,14 +169,6 @@ def download_pdb_file( file : pathlib.Path Pointer to the downloaded file """ - # Get the compressed PDB structure - code = pdb_code.lower() - archive_fn = Path(f"pdb{code}.ent.gz") - pdb_dir = "divided" - url = ( - server + f"/pub/pdb/data/structures/{pdb_dir}/pdb/{code[1:3]}/{archive_fn}" - ) - # Where does the final PDB file get saved? if download_dir is None: path = STRUCTURE_CACHE else: @@ -185,21 +176,17 @@ def download_pdb_file( path.mkdir(exist_ok=True) - filename = path / archive_fn - final_file = path / f"pdb{code}.ent" # (decompressed) + final_file = path / f"pdb{pdb_code.lower()}.ent" # (decompressed) # Skip download if the file already exists if (not overwrite) and (final_file.exists()): return final_file - urlretrieve(url, filename) + resp = requests.get(server + f"/download/{pdb_code.upper()}.pdb") + resp.raise_for_status() - # Uncompress the archive, delete when done - # Can't use context manager with gzip.open until Python 2.7 - with gzip.open(filename, "rb") as gz: - with open(final_file, "wb") as out: - out.writelines(gz) - remove(filename) + with open(final_file, "wb") as out: + out.write(resp.content) return Path(final_file)