Use HTTPS to download protein data bank files

LaurentRDC · Jun 19, 2022 · 4d8d01c · 4d8d01c
1 parent 2d3a570
commit 4d8d01c
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 21 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -9,6 +9,7 @@ Release 1.5.0
 
 * Added some typing information.
 * Added the :attr:`Supercell.scaled_lattice_vectors` property and associated documentation (#11).
+* Protein Data Bank downloads are now done through HTTPS rather than FTP, as recommended by the RCSB Protein Data Bank.
 * Fixed some documentation formatting.
 
 Release 1.4.1

diff --git a/crystals/parsers.py b/crystals/parsers.py
@@ -2,15 +2,14 @@
 """
 Atomic structure parsers.
 """
-import gzip
 import re
 from typing import Any, Iterable, Optional, Tuple, Union
 import warnings
 from abc import abstractmethod
 from contextlib import AbstractContextManager, suppress
 from functools import lru_cache
 from itertools import repeat
-from os import PathLike, environ, remove
+from os import PathLike, environ
 from pathlib import Path
 from platform import system
 from string import digits, punctuation
@@ -147,7 +146,7 @@ def __exit__(self, *args, **kwargs):
     def download_pdb_file(
         pdb_code: str,
         download_dir: Optional[PathLike] = None,
-        server: str = "ftp://ftp.wwpdb.org",
+        server: str = "https://files.rcsb.org",
         overwrite: bool = False,
     ) -> Path:
         """
@@ -161,7 +160,7 @@ def download_pdb_file(
         download_dir : path-like object
             Directory where to save the PDB file. Default is a local folder in the current directory
         server : str, optional
-            Address of the FTP server from which to download the PDB file. Default is the main server.
+            Root address of the server from which to download the PDB file. Default is the main server.
         overwrite : bool, optional
             If True, existing PDB file with the same structure will be overwritten. Default is False.
 
@@ -170,36 +169,24 @@ def download_pdb_file(
         file : pathlib.Path
             Pointer to the downloaded file
         """
-        # Get the compressed PDB structure
-        code = pdb_code.lower()
-        archive_fn = Path(f"pdb{code}.ent.gz")
-        pdb_dir = "divided"
-        url = (
-            server + f"/pub/pdb/data/structures/{pdb_dir}/pdb/{code[1:3]}/{archive_fn}"
-        )
-        # Where does the final PDB file get saved?
         if download_dir is None:
             path = STRUCTURE_CACHE
         else:
             path = Path(download_dir)
 
         path.mkdir(exist_ok=True)
 
-        filename = path / archive_fn
-        final_file = path / f"pdb{code}.ent"  # (decompressed)
+        final_file = path / f"pdb{pdb_code.lower()}.ent"  # (decompressed)
 
         # Skip download if the file already exists
         if (not overwrite) and (final_file.exists()):
             return final_file
 
-        urlretrieve(url, filename)
+        resp = requests.get(server + f"/download/{pdb_code.upper()}.pdb")
+        resp.raise_for_status()
 
-        # Uncompress the archive, delete when done
-        # Can't use context manager with gzip.open until Python 2.7
-        with gzip.open(filename, "rb") as gz:
-            with open(final_file, "wb") as out:
-                out.writelines(gz)
-        remove(filename)
+        with open(final_file, "wb") as out:
+            out.write(resp.content)
 
         return Path(final_file)