diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 33ba92171..1cfd857f7 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -964,25 +964,38 @@ def _set_intra_residue_bonds(array, atom_site): aromatic_flag[i] = aromatic any_mask = bond_array[:, 2] == BondType.ANY - chem_comp_bond = Category() + # Remove already existing residue and atom name combinations + # These appear when the structure contains a residue multiple times + atom_id_1 = array.atom_name[bond_array[:, 0]] + atom_id_2 = array.atom_name[bond_array[:, 1]] # Take the residue name from the first atom index, as the residue # name is the same for both atoms, since we have only intra bonds - chem_comp_bond["comp_id"] = array.res_name[bond_array[:, 0]] - chem_comp_bond["atom_id_1"] = array.atom_name[bond_array[:, 0]] - chem_comp_bond["atom_id_2"] = array.atom_name[bond_array[:, 1]] + comp_id = array.res_name[bond_array[:, 0]] + _, unique_indices = np.unique( + np.stack([comp_id, atom_id_1, atom_id_2], axis=-1), axis=0, return_index=True + ) + unique_indices.sort() + + chem_comp_bond = Category() + n_bonds = len(unique_indices) + chem_comp_bond["pdbx_ordinal"] = np.arange(1, n_bonds + 1, dtype=np.int32) + chem_comp_bond["comp_id"] = comp_id[unique_indices] + chem_comp_bond["atom_id_1"] = atom_id_1[unique_indices] + chem_comp_bond["atom_id_2"] = atom_id_2[unique_indices] chem_comp_bond["value_order"] = Column( - value_order, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) + value_order[unique_indices], + np.where(any_mask[unique_indices], MaskValue.MISSING, MaskValue.PRESENT), ) chem_comp_bond["pdbx_aromatic_flag"] = Column( - aromatic_flag, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) + aromatic_flag[unique_indices], + np.where(any_mask[unique_indices], MaskValue.MISSING, MaskValue.PRESENT), ) # BondList does not contain stereo information # -> all values are missing chem_comp_bond["pdbx_stereo_config"] = Column( - np.zeros(len(bond_array), dtype="U1"), - np.full(len(bond_array), MaskValue.MISSING), + np.zeros(n_bonds, dtype="U1"), + np.full(n_bonds, MaskValue.MISSING), ) - chem_comp_bond["pdbx_ordinal"] = np.arange(1, len(bond_array) + 1, dtype=np.int32) return chem_comp_bond @@ -1007,6 +1020,7 @@ def _set_inter_residue_bonds(array, atom_site): bond_array = _filter_bonds(array, "inter") if len(bond_array) == 0: return None + struct_conn = Category() struct_conn["id"] = np.arange(1, len(bond_array) + 1) struct_conn["conn_type_id"] = np.full(len(bond_array), "covale") diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py index 8d5da7862..a3a699398 100644 --- a/tests/structure/test_pdbx.py +++ b/tests/structure/test_pdbx.py @@ -167,7 +167,7 @@ def test_bond_conversion(tmpdir, format, path): ref_bonds = atoms.bonds pdbx_file = File() - # The import difference to `test_conversion()` is `include_bonds` + # The important difference to `test_conversion()` is `include_bonds` pdbx.set_structure(pdbx_file, atoms, include_bonds=True) file_path = join(tmpdir, f"test.{format}") pdbx_file.write(file_path) @@ -182,6 +182,30 @@ def test_bond_conversion(tmpdir, format, path): assert test_bonds == ref_bonds +def test_bond_sparsity(): + """ + Ensure that only as much intra-residue bonds are written as necessary, + i.e. the created ``chem_comp_bond`` category has at maximum category many rows as + the reference NextGen CIF file. + + Less bonds are allowed, as not all atoms that a residue has in the CCD are actually + present in the structure. + + This tests a previous bug, where duplicate intra-residue bonds were written + (https://github.com/biotite-dev/biotite/issues/652). + """ + path = join(data_dir("structure"), "nextgen", "pdb_00001l2y_xyz-enrich.cif") + ref_pdbx_file = pdbx.CIFFile.read(path) + ref_bond_number = ref_pdbx_file.block["chem_comp_bond"].row_count + + atoms = pdbx.get_structure(ref_pdbx_file, model=1, include_bonds=True) + test_pdbx_file = pdbx.CIFFile() + pdbx.set_structure(test_pdbx_file, atoms, include_bonds=True) + test_bond_number = test_pdbx_file.block["chem_comp_bond"].row_count + + assert test_bond_number <= ref_bond_number + + @pytest.mark.parametrize("format", ["cif", "bcif"]) def test_extra_fields(tmpdir, format): path = join(data_dir("structure"), f"1l2y.{format}")