From 5d76a56e007038f520a66d821361f85a3cf60ab2 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 12 Aug 2024 14:06:28 +0200 Subject: [PATCH 01/83] [nomenclature._sort_all_consensus_labels] API CHANGE return also sorted indices --- mdciao/nomenclature/nomenclature.py | 16 ++++++++++++---- tests/test_nomenclature.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index 63c9bd10..8315528f 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -1549,7 +1549,7 @@ def __init__(self, maps, tops=None): self._residxs["consensus"] = self._residxs.index.values self._residxs=self._residxs[["consensus"]+[key for key in self._residxs.keys() if key !="consensus"]] - sorted_keys = _sort_all_consensus_labels(self._residxs["consensus"], append_diffset=False) + sorted_keys = _sort_all_consensus_labels(self._residxs["consensus"], append_diffset=False)[0] assert len(sorted_keys)==len(self._residxs["consensus"]), (len(sorted_keys), len(self._residxs["consensus"])) self._residxs = self._residxs.sort_values("consensus", key=lambda col: col.map(lambda x: sorted_keys.index(x))) self._residxs.index = _np.arange(len(self._residxs)) @@ -2428,7 +2428,7 @@ def _conslabel2fraglabel(labelres, defrag="@", prefix_GPCR=True): label = _GPCR_num2lett[label] return label -def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN","KLIFS"], ): +def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN","KLIFS"], return_argsort=False): r""" Sort a mix of consensus labels GPCR, CGN, KLIFS @@ -2442,7 +2442,7 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", end of `sorted_labels` unless explicitly deactivated with `append_diffset`. 
- append_diffset : bool, default is True + append_diffset : bool, default is True Append the non-consensus labels at the end of `sorted_labels` order : list @@ -2458,6 +2458,11 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", ------- sorted_labels : list Sorted consensus labels + sorted_indices : 1D _np.ndarray + The indices of `labels` that return + the sorted `soted_labels`. Depending + on `append_diffset` it will contain + (or not) all indices of `labels` """ lambdas = {"GPCR": lambda labels: _sort_GPCR_consensus_labels(labels, append_diffset=False), @@ -2470,7 +2475,10 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", if append_diffset: sorted_labels += [lab for lab in labels if lab not in sorted_labels] - return sorted_labels + sorted_indices = [_np.flatnonzero(lab==_np.array(labels)) for lab in sorted_labels] + sorted_indices = _np.hstack([si for si in sorted_indices if len(si)>0]).squeeze() + + return sorted_labels, sorted_indices _GPCR_num2lett = { "1": "TM1 ", diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index abe7eff2..548f2945 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -857,7 +857,7 @@ def test_KLIFS_dont_append(self): sorted) def test_sort_all_consensus_labels(self): - sorted = nomenclature._sort_all_consensus_labels(self.tosort, append_diffset=False, order = ["CGN","KLIFS", "GPCR"]) + sorted, sorted_indices = nomenclature._sort_all_consensus_labels(self.tosort, append_diffset=False, order = ["CGN","KLIFS", "GPCR"]) _np.testing.assert_array_equal( [ "G.H1.1", "G.H1.10", "H.HA.10", "H.HA.20", @@ -865,8 +865,14 @@ def test_sort_all_consensus_labels(self): "2.50", "3.50", "H8.1", "H8.10"], sorted) + _np.testing.assert_array_equal( + [9, 0, 10, 1, + 4, 11, 2, + 8, 7, 6, 3], + sorted_indices) + def test_sort_all_consensus_labels_append(self): - sorted = nomenclature._sort_all_consensus_labels(self.tosort, append_diffset=True, + 
sorted, sorted_indices = nomenclature._sort_all_consensus_labels(self.tosort, append_diffset=True, order=["CGN", "KLIFS"]) _np.testing.assert_array_equal( [ @@ -876,6 +882,12 @@ def test_sort_all_consensus_labels_append(self): ], sorted) + _np.testing.assert_array_equal( + [9, 0, 10, 1, + 4, 11, 2, + 3, 5, 6, 7, 8], + sorted_indices) + class Test_compatible_consensus_fragments(TestClassSetUpTearDown_CGN_local): def setUp(self): From d70a5eddef1a3b22eaf0c16c90d4afdfaa13b90a Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 12 Aug 2024 14:06:28 +0200 Subject: [PATCH 02/83] [nomenclature._sort_all_consensus_labels] API CHANGE return also sorted indices --- mdciao/nomenclature/nomenclature.py | 16 ++++++++++++---- tests/test_nomenclature.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index 63c9bd10..dacee17f 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -1549,7 +1549,7 @@ def __init__(self, maps, tops=None): self._residxs["consensus"] = self._residxs.index.values self._residxs=self._residxs[["consensus"]+[key for key in self._residxs.keys() if key !="consensus"]] - sorted_keys = _sort_all_consensus_labels(self._residxs["consensus"], append_diffset=False) + sorted_keys = _sort_all_consensus_labels(self._residxs["consensus"], append_diffset=False)[0] assert len(sorted_keys)==len(self._residxs["consensus"]), (len(sorted_keys), len(self._residxs["consensus"])) self._residxs = self._residxs.sort_values("consensus", key=lambda col: col.map(lambda x: sorted_keys.index(x))) self._residxs.index = _np.arange(len(self._residxs)) @@ -2428,7 +2428,7 @@ def _conslabel2fraglabel(labelres, defrag="@", prefix_GPCR=True): label = _GPCR_num2lett[label] return label -def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN","KLIFS"], ): +def _sort_all_consensus_labels(labels, append_diffset=True, 
order=["GPCR","CGN","KLIFS"]): r""" Sort a mix of consensus labels GPCR, CGN, KLIFS @@ -2442,7 +2442,7 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", end of `sorted_labels` unless explicitly deactivated with `append_diffset`. - append_diffset : bool, default is True + append_diffset : bool, default is True Append the non-consensus labels at the end of `sorted_labels` order : list @@ -2458,6 +2458,11 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", ------- sorted_labels : list Sorted consensus labels + sorted_indices : 1D _np.ndarray + The indices of `labels` that return + the sorted `soted_labels`. Depending + on `append_diffset` it will contain + (or not) all indices of `labels` """ lambdas = {"GPCR": lambda labels: _sort_GPCR_consensus_labels(labels, append_diffset=False), @@ -2470,7 +2475,10 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", if append_diffset: sorted_labels += [lab for lab in labels if lab not in sorted_labels] - return sorted_labels + sorted_indices = [_np.flatnonzero(lab==_np.array(labels)) for lab in sorted_labels] + sorted_indices = _np.hstack([si for si in sorted_indices if len(si)>0]).squeeze() + + return sorted_labels, sorted_indices _GPCR_num2lett = { "1": "TM1 ", diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index abe7eff2..548f2945 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -857,7 +857,7 @@ def test_KLIFS_dont_append(self): sorted) def test_sort_all_consensus_labels(self): - sorted = nomenclature._sort_all_consensus_labels(self.tosort, append_diffset=False, order = ["CGN","KLIFS", "GPCR"]) + sorted, sorted_indices = nomenclature._sort_all_consensus_labels(self.tosort, append_diffset=False, order = ["CGN","KLIFS", "GPCR"]) _np.testing.assert_array_equal( [ "G.H1.1", "G.H1.10", "H.HA.10", "H.HA.20", @@ -865,8 +865,14 @@ def test_sort_all_consensus_labels(self): "2.50", "3.50", 
"H8.1", "H8.10"], sorted) + _np.testing.assert_array_equal( + [9, 0, 10, 1, + 4, 11, 2, + 8, 7, 6, 3], + sorted_indices) + def test_sort_all_consensus_labels_append(self): - sorted = nomenclature._sort_all_consensus_labels(self.tosort, append_diffset=True, + sorted, sorted_indices = nomenclature._sort_all_consensus_labels(self.tosort, append_diffset=True, order=["CGN", "KLIFS"]) _np.testing.assert_array_equal( [ @@ -876,6 +882,12 @@ def test_sort_all_consensus_labels_append(self): ], sorted) + _np.testing.assert_array_equal( + [9, 0, 10, 1, + 4, 11, 2, + 3, 5, 6, 7, 8], + sorted_indices) + class Test_compatible_consensus_fragments(TestClassSetUpTearDown_CGN_local): def setUp(self): From 26fbd8eb73087d14027f4b28954867fe512d7eac Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 12 Aug 2024 14:56:37 +0200 Subject: [PATCH 03/83] [nomeclature._sort_consensus_labels] API CHANGE don't de-duplicate, i.e. return 'sorted' duplicates --- mdciao/nomenclature/nomenclature.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index dacee17f..431983aa 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -2310,7 +2310,9 @@ def _sort_consensus_labels(subset, sorted_superset, Parameters ---------- subset : iterable - list with the names (type str) to be ordered + list with the names (type str) to be ordered. + If duplicates are present, they will also appear + as duplicates in `fragnames_out` sorted_superset : iterable list with names in the desired order. Is a superset of :obj:`subset` @@ -2320,7 +2322,11 @@ def _sort_consensus_labels(subset, sorted_superset, Returns ------- - fragnames_out + fragnames_out: list + List with the labels of `subset` sorted according + to some `sorted_superset` and potentially other + labels not contained in the `sorted_superset` appended + at the end. 
""" by_frags = _defdict(dict) @@ -2345,6 +2351,11 @@ def _sort_consensus_labels(subset, sorted_superset, if append_diffset: labs_out += [item for item in subset if item not in labs_out] + # Recover the duplicates + order = [] + for lab in labs_out: + order.extend(_np.flatnonzero(_np.array(subset)==lab)) + labs_out = [subset[oo] for oo in order] return labs_out From 6318535e1a8dd99341839d4656740cdd56eadb5c Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 12 Aug 2024 15:08:01 +0200 Subject: [PATCH 04/83] [nomeclature._sort_consensus_labels] return right indices for duplicates --- mdciao/nomenclature/nomenclature.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index 431983aa..696e2de8 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -2486,8 +2486,14 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", if append_diffset: sorted_labels += [lab for lab in labels if lab not in sorted_labels] - sorted_indices = [_np.flatnonzero(lab==_np.array(labels)) for lab in sorted_labels] - sorted_indices = _np.hstack([si for si in sorted_indices if len(si)>0]).squeeze() + # Handle duplicates by including their indices only once (else _np.flatnonzero returns all indices all the time) + sorted_indices = [] + for lab in sorted_labels: + for ii in _np.flatnonzero(lab == _np.array(labels)): + if ii not in sorted_indices: + sorted_indices.append(ii) + sorted_indices = _np.array(sorted_indices, ndmin=1) + assert sorted_indices.ndim==1 return sorted_labels, sorted_indices From 3a6e18aa226fe8cd2e8628569d84e46d89d0680b Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 12 Aug 2024 16:27:17 +0200 Subject: [PATCH 05/83] [nomeclature._lexsort_consensus_ctc_labels] new method for lexsorting consensus labels, tests --- mdciao/nomenclature/nomenclature.py | 79 ++++++++++++++++++++++++++++- tests/test_nomenclature.py | 70 
++++++++++++++++++++++++- 2 files changed, 147 insertions(+), 2 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index 696e2de8..a70079d6 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -2471,7 +2471,7 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", Sorted consensus labels sorted_indices : 1D _np.ndarray The indices of `labels` that return - the sorted `soted_labels`. Depending + the sorted `sorted_labels`. Depending on `append_diffset` it will contain (or not) all indices of `labels` """ @@ -3807,3 +3807,80 @@ def fragments_as_idxs(self): """ return self._fragments_as_idxs + +def _lexsort_consensus_ctc_labels(labels, reverse = False, columns = [0, 1], sep = "-") -> tuple: + r""" + Sort contact-labels in ascending order of resSeq using both columns + + Wraps around :obj:`_sort_all_consensus_labels` with some string handling. + + It will also work with contact-labels consisting of only one residue, + e.g. in the cases where the "anchor" has been deleted or the frequencies + have been aggregated to per-residue frequencies + + >>> labels = ['3.50-G.H5.23', + >>> '3.50-7.53', + >>> '3.50-2.39', + >>> '4.50-6.60', + >>> '3.50-5.58'] + >>> sorted_labels, order = _lexsort_consensus_ctc_labels(labels) + >>> sorted_labels + >>> labels = ['3.50-2.39', + >>> '3.50-5.58', + >>> '3.50-7.53', + >>> '3.50-G.H5.23', + >>> '4.50-6.60'] + + + Parameters + ---------- + labels : list or np.ndarray + Strings describing the contact + residues using consensus labels only. + Labels can be just one residue "3.50" or + both "3.50-2.50", but not 'mixed', as in + >>> labels = ["3.50", "3.50-2.50"] + Full labels, e.g. "GLU30@3.50", or non-consensus + labels, e.g. "frag1", will be sorted last. + reverse : bool, default is False + If True, sort in descending + order, instead of ascending + columns : list + The order of the columns, + e.g. 
[0,1] means sort first + by first column (idx 0), + then by second column (idx 1). + sep : char, default is "-" + The character to use + when separating the + contact label into both residues + + Returns + ------- + sorted_labels : list + The sorted contact labels + order : 1D np.ndarray + The indices of `ctc_labels` that + sort it into `sorted_labels` + """ + split_labels = [_mdcu.str_and_dict.splitlabel(lab, sep=sep) for lab in labels] + + if not any([all([len(lab) == 1 for lab in split_labels]), + all([len(lab) == 2 for lab in split_labels])]): + raise ValueError(f"Labels have to be all single ('3.50') or double ('3.50-2.50'), but not mixed {labels}") + split_labels = _np.vstack(split_labels) + if split_labels.ndim == 1: + order = _sort_all_consensus_labels(split_labels())[1] + elif split_labels.ndim ==2: + #There's other ways but this was easy to set up + lexsort = {key : val[columns[1]].to_dict() for key, val in _DataFrame(split_labels).groupby(by=columns[0])} + order = [] + for key1 in _sort_all_consensus_labels(list(lexsort.keys()))[0]: + for key2 in _sort_all_consensus_labels(list(lexsort[key1].values()))[0]: + order.extend(_np.flatnonzero(_np.array(labels) == sep.join(_np.array([key1, key2])[columns]))) + else: + raise ValueError + + if reverse: + order = order[::-1] + return [labels[ii] for ii in order], order diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index 548f2945..f77c7a11 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -861,7 +861,7 @@ def test_sort_all_consensus_labels(self): _np.testing.assert_array_equal( [ "G.H1.1", "G.H1.10", "H.HA.10", "H.HA.20", - 'αC.25', 'αD.55', 'a.l.85', + 'αC.25', 'αD.55', 'a.l.85', "2.50", "3.50", "H8.1", "H8.10"], sorted) @@ -888,6 +888,74 @@ def test_sort_all_consensus_labels_append(self): 3, 5, 6, 7, 8], sorted_indices) +class Test_lexsort_consensus_ctc_labels(unittest.TestCase): + + """ + (, + [2, 4, 1, 0, 3]) + + """ + + def test_lexsort_works(self): + labels = 
['3.50-G.H5.23', + '3.50-7.53', + "3.50-frag1", + '3.50-2.39', + '4.50-6.60', + '3.50-5.58'] + sorted_labels, order = nomenclature._lexsort_consensus_ctc_labels(labels) + self.assertListEqual(sorted_labels, + ['3.50-2.39', + '3.50-5.58', + '3.50-7.53', + '3.50-G.H5.23', + "3.50-frag1", + '4.50-6.60']) + self.assertListEqual(order, + [3, 5, 1, 0, 2, 4]) + + def test_lexsort_works_reverse(self): + labels = ['3.50-G.H5.23', + '3.50-7.53', + "3.50-frag1", + '3.50-2.39', + '4.50-6.60', + '3.50-5.58'] + sorted_labels, order = nomenclature._lexsort_consensus_ctc_labels(labels, reverse=True) + self.assertListEqual(sorted_labels, + ['4.50-6.60', + '3.50-frag1', + '3.50-G.H5.23', + '3.50-7.53', + '3.50-5.58', + '3.50-2.39']) + self.assertListEqual(order, + [4, 2, 0, 1, 5, 3]) + + def test_lexsort_works_raises(self): + with self.assertRaises(ValueError): + nomenclature._lexsort_consensus_ctc_labels(['3.50-G.H5.23', '3.50']) + + def test_lexsort_works_second_column(self): + labels = ['3.50-G.H5.23', + '3.50-7.53', + "3.50-frag1", + '3.50-2.39', + '4.50-6.60', + '3.50-5.58'] + sorted_labels, order = nomenclature._lexsort_consensus_ctc_labels(labels, columns=[1,0]) + self.assertListEqual(sorted_labels, + ['3.50-2.39', + '3.50-5.58', + '4.50-6.60', + '3.50-7.53', + '3.50-G.H5.23', + "3.50-frag1", + ]) + self.assertListEqual(order, + [3, 5, 4, 1, 0, 2]) + + class Test_compatible_consensus_fragments(TestClassSetUpTearDown_CGN_local): def setUp(self): From f6fd8965b4e67b6da7f6af0c1ea3f2d717669a1d Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 10:27:16 +0200 Subject: [PATCH 06/83] [ContactPair.label_flex] refactor split_label -> pad_label --- mdciao/contacts/contacts.py | 16 ++++++++-------- mdciao/plots/plots.py | 4 ++-- tests/test_contacts.py | 26 +++++++++++++------------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 42c80f71..2a6f86df 100644 --- a/mdciao/contacts/contacts.py +++ 
b/mdciao/contacts/contacts.py @@ -1946,7 +1946,7 @@ def frequency_overall_trajs(self, ctc_cutoff_Ang,switch_off_Ang=None): """ return _np.mean(_np.hstack(self.binarize_trajs(ctc_cutoff_Ang, switch_off_Ang=switch_off_Ang))) - def label_flex(self, AA_format="short", split_label=True, defrag=None, fmt1="%-15s", fmt2="%-15s"): + def label_flex(self, AA_format="short", pad_label=True, defrag=None, fmt1="%-15s", fmt2="%-15s"): r""" A more flexible method to produce the label of this :obj:`ContactPair` @@ -1960,8 +1960,8 @@ def label_flex(self, AA_format="short", split_label=True, defrag=None, fmt1="%-1 * "try_consensus": 4.50 if consensus labeling is present, else default to "short" - split_label : bool, default is True - Split the labels so that stacked contact labels + pad_label : bool, default is True + Pad the labels with whitespace so that stacked contact labels become easier-to-read in plain ascii formats - "E25@3.50____- A35@4.50" - "A30@longfrag- A35@4.50 @@ -1971,10 +1971,10 @@ def label_flex(self, AA_format="short", split_label=True, defrag=None, fmt1="%-1 them as is, e.g. would be "@" fmt1 : str, default is "%-15s" Specify how the labels of res1 should be formatted. - Only has effect if `split_label` is True + Only has effect if `pad_label` is True fmt2 : str, default is "%-15s" Specify how the labels of res2 should be formatted. 
- Only has effect if `split_label` is True + Only has effect if `pad_label` is True Returns ------- label : str @@ -2007,7 +2007,7 @@ def label_flex(self, AA_format="short", split_label=True, defrag=None, fmt1="%-1 raise ValueError(AA_format) if defrag is not None: label = _mdcu.str_and_dict.defrag_key(label,defrag=defrag, sep="-") - if split_label: + if pad_label: fmt = f"{fmt1} - {fmt2}" label = fmt % tuple(_mdcu.str_and_dict.splitlabel(label, '-')) @@ -3945,7 +3945,7 @@ def frequency_table(self, ctc_cutoff_Ang, """ if _path.splitext(str(fname))[1] in [".xlsx"]: - freq_dataframe_kwargs["split_label"] = False + freq_dataframe_kwargs["pad_label"] = False main_DF = self.frequency_dataframe(ctc_cutoff_Ang, switch_off_Ang=switch_off_Ang, **freq_dataframe_kwargs) @@ -3956,7 +3956,7 @@ def frequency_table(self, ctc_cutoff_Ang, return_as_dataframe=True) self.frequency_spreadsheet(main_DF,idfs,ctc_cutoff_Ang,fname) else: - freq_dataframe_kwargs["split_label"] = True + freq_dataframe_kwargs["pad_label"] = True main_DF = self.frequency_dataframe(ctc_cutoff_Ang, switch_off_Ang=switch_off_Ang, **freq_dataframe_kwargs) diff --git a/mdciao/plots/plots.py b/mdciao/plots/plots.py index 99427548..3436c23c 100644 --- a/mdciao/plots/plots.py +++ b/mdciao/plots/plots.py @@ -901,14 +901,14 @@ def compare_groups_of_contacts(groups, elif all([istr in str(type(ifile)) for istr in ["mdciao", "contacts", "ContactGroup"]]): if distro: idict = ifile.distribution_dicts(AA_format=AA_format, - split_label=False, + pad_label=False, bins="auto") else: assert ctc_cutoff_Ang is not None, "Cannot provide a ContatGroup object without a `ctc_cutoff_Ang` parameter" if not interface: idict = ifile.frequency_dicts(ctc_cutoff_Ang=ctc_cutoff_Ang, AA_format=AA_format, - split_label=False) + pad_label=False) else: idict = ifile.frequency_sum_per_residue_names(ctc_cutoff_Ang=ctc_cutoff_Ang, shorten_AAs=[True if AA_format=="short" else False][0], diff --git a/tests/test_contacts.py b/tests/test_contacts.py 
index 3d563259..9e3e2325 100644 --- a/tests/test_contacts.py +++ b/tests/test_contacts.py @@ -859,7 +859,7 @@ def test_frequency_dict_no_labels(self): idict = cpt.frequency_dict(21, AA_format="long") assert idict["label"] == ('%-15s - %-15s' % (0, 1)), idict["label"] - idict = cpt.frequency_dict(21, split_label=False) + idict = cpt.frequency_dict(21, pad_label=False) assert idict["label"] == "0-1" def test_frequency_dict_w_labels(self): @@ -874,7 +874,7 @@ def test_frequency_dict_w_labels(self): idict = cpt.frequency_dict(21, AA_format="long") assert idict["label"] == ('%-15s - %-15s' % ("0@fragA", "1@fragB")) - idict = cpt.frequency_dict(21, split_label=False) + idict = cpt.frequency_dict(21, pad_label=False) assert idict["label"] == '0@fragA-1@fragB' def test_frequency_dict_w_labels_just_consensus(self): @@ -884,7 +884,7 @@ def test_frequency_dict_w_labels_just_consensus(self): consensus_labels=["3.50","4.50"] ) idict = cpt.frequency_dict(21, - split_label=False, + pad_label=False, AA_format="just_consensus") self.assertEqual(idict["label"], "3.50-4.50") @@ -896,7 +896,7 @@ def test_frequency_dict_w_labels_just_consensus_raises(self): ) with self.assertRaises(ValueError): idict = cpt.frequency_dict(21, - split_label=False, + pad_label=False, AA_format="just_consensus") @@ -1727,7 +1727,7 @@ def test_distance_distributions(self): def test_distirbution_dicts(self): CG = self.CG_cp1_cp2 - dicts = CG.distribution_dicts(bins=10,split_label=False) + dicts = CG.distribution_dicts(bins=10,pad_label=False) _np.testing.assert_array_equal(list(dicts.keys()), ["0-1", "0-2"]) for a, b in zip(dicts.values(), CG._distributions_of_distances(bins=10)): _np.testing.assert_array_equal(a[0],b[0]) @@ -2219,7 +2219,7 @@ def setUpClass(cls): assert cls.total_intf_freq_at_3 > 0 def test_frequency_dicts(self): CG = self.CG - freqdcit = CG.frequency_dicts(2, split_label=False) + freqdcit = CG.frequency_dicts(2, pad_label=False) self.assertDictEqual(freqdcit, {"E30@fragA-V31@fragB" : 2 / 
5, "E30@fragA-W32@fragC" : 1 / 5}) @@ -2229,7 +2229,7 @@ def test_frequency_dicts_sort(self): self.cp1_w_anchor_and_frags_and_top], neighbors_excluded=0 ) - self.assertDictEqual(CG.frequency_dicts(2, split_label=False, sort_by_freq=True), + self.assertDictEqual(CG.frequency_dicts(2, pad_label=False, sort_by_freq=True), {"E30@fragA-W32@fragC": 1 / 5, "E30@fragA-V31@fragB": 2 / 5}) @@ -2445,7 +2445,7 @@ def test_frequency_table(self): CG = contacts.ContactGroup([contacts.ContactPair([0, 1], [[.4, .3, .25]], [[0, 1, 2]]), contacts.ContactPair([0, 2], [[.1, .2, .3]], [[0, 1, 2]])]) - table = CG.frequency_dataframe(2.5, split_label=False) + table = CG.frequency_dataframe(2.5, pad_label=False) _np.testing.assert_array_equal(table["freq"].array, [1 / 3, 2 / 3]) _np.testing.assert_array_equal(table["label"].array, ["0-1", "0-2"]) _np.testing.assert_array_equal(table["sum"].array, [1 / 3, 1 / 3 + 2 / 3]) @@ -2453,7 +2453,7 @@ def test_frequency_table(self): def test_frequency_table_w_atom_types_and_names(self): CG = contacts.ContactGroup([self.cp1_w_atom_types, self.cp2_w_atom_types]) - table = CG.frequency_dataframe(3.5, split_label=False, atom_types=True) + table = CG.frequency_dataframe(3.5, pad_label=False, atom_types=True) _np.testing.assert_array_equal(table["freq"].array, [3 / 4, 3 / 4]) _np.testing.assert_array_equal(table["label"].array, ["E30-V31", "E30-W32"]) _np.testing.assert_array_equal(table["sum"].array, [3 / 4, 3 / 4 + 3 / 4]) @@ -2510,7 +2510,7 @@ def test_cutoff(self): def test_frequency_dicts(self): with self.assertRaises(ValueError) as cm: - self.CG.frequency_dicts(6, split_label=False) + self.CG.frequency_dicts(6, pad_label=False) def test_frequency_per_contact(self): with self.assertRaises(ValueError) as cm: @@ -2573,7 +2573,7 @@ def test_frequency_table(self): contacts.ContactPair([0, 2], [[.1, .2, .3]], [[0, 1, 2]])], max_cutoff_Ang=3) with self.assertRaises(ValueError): - CG.frequency_dataframe(6, split_label=False) + CG.frequency_dataframe(6, 
pad_label=False) class TestContactGroupPlots(TestBaseClassContactGroup): @@ -2851,7 +2851,7 @@ def test_plot_timedep_ctcs_matrix(self): fig, plotted_freqs, plotted_trajs = r.plot_timedep_ctcs_matrix(3, dt=1e-3, t_unit="ns", ) - freqs_by_freq = r.frequency_dicts(3.0, sort_by_freq=True, split_label=False) + freqs_by_freq = r.frequency_dicts(3.0, sort_by_freq=True, pad_label=False) self.assertDictEqual(plotted_freqs, freqs_by_freq) self.assertEqual(len(plotted_trajs),r.n_trajs) for ii, (traj, iax) in enumerate(zip(plotted_trajs, fig.axes)): @@ -2876,7 +2876,7 @@ def test_plot_timedep_ctcs_matrix_anchor_1_traj_ctc_control_2(self): defrag=None, ctc_control=2, ) - freqs_by_freq = r.frequency_dicts(3.0, sort_by_freq=True, split_label=False, defrag=None, AA_format="long") + freqs_by_freq = r.frequency_dicts(3.0, sort_by_freq=True, pad_label=False, defrag=None, AA_format="long") freqs_by_freq = _mdcu.str_and_dict.delete_exp_in_keys(freqs_by_freq, "LEU394")[0] freqs_by_freq = {key : val for ii, (key,val) in enumerate(freqs_by_freq.items()) if ii<2} self.assertDictEqual(plotted_freqs, freqs_by_freq) From 45d4f0ece8fcdb98f12ca08b343225053842f95c Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 10:37:21 +0200 Subject: [PATCH 07/83] [ContactPair.label_flex] better docs, better ValueError message --- mdciao/contacts/contacts.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 2a6f86df..e8a97e45 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -1948,17 +1948,17 @@ def frequency_overall_trajs(self, ctc_cutoff_Ang,switch_off_Ang=None): def label_flex(self, AA_format="short", pad_label=True, defrag=None, fmt1="%-15s", fmt2="%-15s"): r""" - A more flexible method to produce the label of this :obj:`ContactPair` + A more flexible method to produce the label of this `ContactPair` Parameters ---------- AA_format : str, default is "short" Amino-acid 
format for the label, can be - * short: A35@4.55 + * "short": A35@4.55 * "long": ALA35@4.50 - * "just_consensus": 4.50 - * "try_consensus": 4.50 if consensus labeling is present, - else default to "short" + * "just_consensus": 4.50 if consensus labels are present, else fail + * "try_consensus": 4.50 if consensus labels are present, else + fallback to "short" pad_label : bool, default is True Pad the labels with whitespace so that stacked contact labels @@ -1979,7 +1979,7 @@ def label_flex(self, AA_format="short", pad_label=True, defrag=None, fmt1="%-15s ------- label : str """ - + _allowed_AAformats = ["short", "long", "try_consensus", "just_consensus"] if AA_format== 'short': label = self.labels.w_fragments_short_AA elif AA_format== 'long': @@ -1988,7 +1988,8 @@ def label_flex(self, AA_format="short", pad_label=True, defrag=None, fmt1="%-15s #TODO where do we put this assertion? if None in self._attribute_residues.consensus_labels: if AA_format.startswith("just_"): - raise ValueError("Residues %s don't have both consensus labels:%s. \n Try setting `AA_format='try_consensus'`" % ( + raise ValueError("Residues %s don't have both consensus labels:%s. 
" + "\n Try setting `AA_format='try_consensus'`" % ( self._attribute_residues.names_short, self._attribute_residues.consensus_labels)) elif AA_format.startswith("try_"): @@ -2002,9 +2003,9 @@ def label_flex(self, AA_format="short", pad_label=True, defrag=None, fmt1="%-15s label="-".join(label) else: label = self.labels.just_consensus - else: - raise ValueError(AA_format) + raise ValueError(f"The method got AA_format='{AA_format}', " + f"but the only allowed values for 'AA_format' are {_allowed_AAformats}.") if defrag is not None: label = _mdcu.str_and_dict.defrag_key(label,defrag=defrag, sep="-") if pad_label: From 2d46d71c8716d7ac01debaa701a40243fa041d7a Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 10:58:01 +0200 Subject: [PATCH 08/83] [ContactPair.gen_label] AA_short can handle consensus options, tests --- mdciao/contacts/contacts.py | 26 ++++++++++++++++++++++---- tests/test_contacts.py | 3 +++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index e8a97e45..3903d01d 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -2025,18 +2025,34 @@ def gen_label(self, Parameters ---------- AA_format : str, default is "short" - Alternative is "long" ("E30" vs "GLU30") + Options are: + * "short": "E30@3.50" + * "long": GLU30@3.50 + * "just_consensus": 3.50, fail if none is found + * "try_consensus": 3.50, fallback to "short" if none is found fragments : bool, default is False Include fragment information Will get the "best" information available, ie consensus>fragname>fragindex + When trying to get consensus labels, + this option is ignored, s.t. the full + "E30@3.50" is returned regardless. delete_anchor : bool, default is False Delete the anchor from the label Returns ------- + label : str + The contact label, containing + both or only one residue, depending on + the value of `delete_anchor`. 
""" + _allowed_AAformats = ["short", "long", "try_consensus", "just_consensus"] + if AA_format not in _allowed_AAformats: + raise ValueError(f"The method got AA_format='{AA_format}', " + f"but the only allowed values for 'AA_format' are {_allowed_AAformats}.") + if self.neighborhood is None and delete_anchor: delete_anchor = False print("ContactPair.gen_label() can't use `delete_anchor=True`, this is not a neighborhood.\n" @@ -2053,8 +2069,8 @@ def gen_label(self, label = self.labels.w_fragments else: label = self.labels.no_fragments - else: - raise ValueError(AA_format) + elif AA_format in ["try_consensus", "just_consensus"]: + label = self.label_flex(AA_format=AA_format, pad_label=False) else: if AA_format == "short": if fragments: @@ -2066,7 +2082,9 @@ def gen_label(self, label = self.neighborhood.partner_res_and_fragment_str else: label = self.neighborhood.partner_residue_name - + elif AA_format in ["try_consensus", "just_consensus"]: + label = self.label_flex(AA_format=AA_format, pad_label=False) + label = _mdcu.str_and_dict.splitlabel(label)[self.residues.anchor_index] return label @_kwargs_subs(label_flex) diff --git a/tests/test_contacts.py b/tests/test_contacts.py index 9e3e2325..6f32d8c8 100644 --- a/tests/test_contacts.py +++ b/tests/test_contacts.py @@ -1205,11 +1205,14 @@ def test_gen_labels(self): self.assertEqual(CP.gen_label("long") ,"LEU394-LEU388") self.assertEqual(CP.gen_label("short",fragments=True), "L394@G.H5.26-L388@G.H5.20") self.assertEqual(CP.gen_label("long",fragments=True) ,"LEU394@G.H5.26-LEU388@G.H5.20") + self.assertEqual(CP.gen_label("just_consensus") ,"G.H5.26-G.H5.20") self.assertEqual(CP.gen_label("short", delete_anchor=True), "L388") self.assertEqual(CP.gen_label("long", delete_anchor=True), "LEU388") self.assertEqual(CP.gen_label("short", fragments=True, delete_anchor=True), "L388@G.H5.20") self.assertEqual(CP.gen_label("long", fragments=True, delete_anchor=True), "LEU388@G.H5.20") + 
self.assertEqual(CP.gen_label("just_consensus", delete_anchor=True) ,"G.H5.26") + with self.assertRaises(ValueError): CP.gen_label("wrong") From d9e9cb079632f22ebe169a8c9550e54d03e42dff Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 11:32:15 +0200 Subject: [PATCH 09/83] [contacts.ContactGroup.frequency_sum_per_residue_names] API BREAK refactor shorten_AAs -> AA_format to allow consensus labels, tests --- mdciao/contacts/contacts.py | 19 +++++++++++++++---- tests/test_contacts.py | 26 ++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 3903d01d..3e36e401 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -3710,7 +3710,7 @@ def frequency_sum_per_residue_idx_dict(self, ctc_cutoff_Ang, def frequency_sum_per_residue_names(self, ctc_cutoff_Ang, switch_off_Ang=None, sort_by_freq=True, - shorten_AAs=True, + AA_format="short", list_by_interface=False, return_as_dataframe=False, ): @@ -3733,8 +3733,12 @@ def frequency_sum_per_residue_names(self, ctc_cutoff_Ang, :obj:`self.interface_residxs` for more info. If False, residues are in ascending order of residue indices - shorten_AAs : bool, default is True - Use E30 instead of GLU30 + AA_format : str, default is 'short' + Use E30@3.50 instead of GLU30@3.50. + Alternatives are: + * "long": GLU30@3.50 + * "just_consensus": 3.50, fail if none is found + * "try_consensus": 3.50, fallback to "short" if none is found list_by_interface : bool, default is False group the freq_dict by interface residues. 
Only has an effect if self.is_interface @@ -3759,7 +3763,14 @@ def frequency_sum_per_residue_names(self, ctc_cutoff_Ang, # Use the residue@frag representation but avoid empty fragments list_out = [] - residx2resnamefragnamebest = self.residx2resnamefragnamebest(shorten_AAs=shorten_AAs) + if "consensus" not in AA_format: + residx2resnamefragnamebest = self.residx2resnamefragnamebest(shorten_AAs=[True if AA_format=="short" else False][0]) + else: + residx2resnamefragnamebest = {} + for lab, pair in zip(self.gen_ctc_labels(AA_format=AA_format), + self.res_idxs_pairs): + residx2resnamefragnamebest.update({key : val for key, val in zip(pair, + _mdcu.str_and_dict.splitlabel(lab))}) for ifreq in freqs: idict = {} for idx, val in ifreq.items(): diff --git a/tests/test_contacts.py b/tests/test_contacts.py index 6f32d8c8..a46d9bbf 100644 --- a/tests/test_contacts.py +++ b/tests/test_contacts.py @@ -2219,6 +2219,7 @@ def setUpClass(cls): cls.intf = examples.examples.Interface_B2AR_Gas(GPCR_UniProt = cls.GPCR, CGN_UniProt = cls.CGN) cls.total_intf_freq_at_3 = cls.intf.frequency_per_contact(3.0).sum() + cls.L394 = examples.ContactGroupL394(GPCR_UniProt=None) assert cls.total_intf_freq_at_3 > 0 def test_frequency_dicts(self): CG = self.CG @@ -2278,6 +2279,26 @@ def test_frequency_per_residue_name(self): _np.testing.assert_equal(freq_dict["V31@fragB"], 2 / 5) _np.testing.assert_equal(freq_dict["W32@fragC"], 1 / 5) + def test_frequency_per_residue_name_consensus(self): + CG = self.L394 + """ + ['L394@G.H5.26, + 'L388@G.H5.20', + 'R389@G.H5.21', + 'L230@frag3', + 'R385@G.H5.17', + 'K270@frag3'] + """ + freq_dict = CG.frequency_sum_per_residue_names(4, AA_format="try_consensus")[0] + assert len(freq_dict) == 6 + _np.testing.assert_equal(freq_dict["G.H5.26"], CG.select_by_residues("L394").frequency_per_contact(4).sum()) + _np.testing.assert_equal(freq_dict["G.H5.20"], CG.select_by_residues("L388").frequency_per_contact(4).sum()) + _np.testing.assert_equal(freq_dict["L230@frag3"], 
CG.select_by_residues("L230").frequency_per_contact(4).sum()) + _np.testing.assert_equal(freq_dict["G.H5.17"], CG.select_by_residues("R385").frequency_per_contact(4).sum()) + _np.testing.assert_equal(freq_dict["K270@frag3"], CG.select_by_residues("K270").frequency_per_contact(4).sum()) + + + def test_frequency_per_residue_name_no_sort(self): CG = self.CG freq_dict = CG.frequency_sum_per_residue_names(2, sort_by_freq=False)[0] @@ -2297,6 +2318,11 @@ def test_frequency_per_residue_name_dataframe(self): 2 / 5, 1 / 5]) + def test_frequency_per_residue_name_consensus(self): + CG = self.CG + freq_dict = CG.frequency_sum_per_residue_names(2, + return_as_dataframe=True)[0] + assert len(freq_dict) == 3 def test_frequency_dict_by_consensus_labels_fails(self): CG = self.CG From 00ebbd15f27990f2e5c7e84a4dac33e3b10176f7 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 11:39:18 +0200 Subject: [PATCH 10/83] [contacts.ContactGroup.select_by_residues] Allow selection via consensus labels, tests --- mdciao/contacts/contacts.py | 21 +++++++++++---------- tests/test_contacts.py | 9 ++++++++- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 3e36e401..60c259c7 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -6530,8 +6530,8 @@ def select_by_residues(self, keep_interface=True, n_residues=1): r""" - Return a copy this :obj:`ContactGroup`, but with a sub-selection of :obj:`ContactGroup.contact_pairs` based on residues. - The returned :obj:`ContactGroup` has the same trajectories and frames as the original. + Return a copy this `ContactGroup`, but with a sub-selection of `ContactGroup.contact_pairs` based on residues. + The returned `ContactGroup` has the same trajectories and frames as the original. 
The filtering of ContactPairs is done using `CSVexpression`, `residue_indices`, or `residue_pairs` so that: @@ -6545,9 +6545,9 @@ def select_by_residues(self, Parameters ---------- CSVexpression : str or None, default is None - CSV expression like "GLU30,K*" to select + CSV expression like "GLU30,K*,3.50" to select the residue-pairs of :obj:`self` for the - new :obj:`ContactGroup`. See + new `ContactGroup`. See :obj:`mdciao.utils.residue_and_atom.find_AA` for the syntax of the expression. residue_indices : list, default is None, @@ -6564,9 +6564,9 @@ def select_by_residues(self, used, since `residue_indices` matches are unique merge : bool, default is True Merge the selected residue-pairs into - one single :obj:`ContactGroup`. If False + one single `ContactGroup`. If False every sub-string of :obj:`CSVexpression` - returns its own :obj:`ContactGroup` + returns its own `ContactGroup` keep_interface : bool, default is True If self.is_interface and `merge` are both True, then returned ContactGroup @@ -6584,10 +6584,10 @@ def select_by_residues(self, Returns ------- - newCG : :obj:`ContactGroup` or dict + newCG : `ContactGroup` or dict If dict, it's keyed with substrings of - :obj:`CSVexpression` and valued with - :obj:`ContactGroups` + `CSVexpression` and valued with + `ContactGroups` """ assert n_residues in [1,2] if CSVexpression is not None: @@ -6595,7 +6595,8 @@ def select_by_residues(self, keys = [exp.strip(" ") for exp in CSVexpression.split(",")] matches = [] for exp in keys: - matches.append(_mdcu.residue_and_atom.find_AA(exp.strip(" "), self.top)) + matches.append(_mdcu.residue_and_atom.find_AA(exp.strip(" "), self.top, + extra_columns={"consensus" : self.residx2consensuslabel})) if not allow_multiple_matches and len(matches[-1])>1: print("The expression '%s' finds multiple matches, but only one is allowed" % exp) _mdcu.residue_and_atom.parse_and_list_AAs_input(exp, self.top) diff --git a/tests/test_contacts.py b/tests/test_contacts.py index 
a46d9bbf..1df07846 100644 --- a/tests/test_contacts.py +++ b/tests/test_contacts.py @@ -1966,10 +1966,17 @@ def test_to_select_by_residues_residue_pairs(self): new_CG : contacts.ContactGroup = CG.select_by_residues(residue_pairs=residue_pairs) assert isinstance(new_CG, contacts.ContactGroup) assert new_CG.n_ctcs == 2 - print(new_CG.res_idxs_pairs,"AAAA") assert new_CG._contacts[0] is CG._contacts[2] assert new_CG._contacts[1] is CG._contacts[0] + def test_to_select_by_residues_consensus(self): + CG = examples.ContactGroupL394() + + new_CG : contacts.ContactGroup = CG.select_by_residues("G.H.21") + assert isinstance(new_CG, contacts.ContactGroup) + assert new_CG.n_ctcs == 1 + assert new_CG._contacts[0] is CG._contacts[1] + def test_to_ContactGroups_per_traj(self): traj = md.load(test_filenames.traj_xtc_stride_20, top=test_filenames.top_pdb) From d4cc4477c1dbcafebd05e5a29900982daff9ff0b Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 11:40:09 +0200 Subject: [PATCH 11/83] [test_contacts] refactor ._contacts -> contact_pairs (don't test using private attributes) --- tests/test_contacts.py | 74 +++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/tests/test_contacts.py b/tests/test_contacts.py index 1df07846..c72f179f 100644 --- a/tests/test_contacts.py +++ b/tests/test_contacts.py @@ -1110,7 +1110,7 @@ def test_plot_distance_distribution_smoothing(self): def test_retop(self): CG = examples.ContactGroupL394() - CP : contacts.ContactPair = CG._contacts[0] + CP : contacts.ContactPair = CG.contact_pairs[0] top = md.load(test_filenames.pdb_3SN6).top #print(CP.top, CP.residues.idxs_pair) @@ -1153,7 +1153,7 @@ def test_retop(self): def test_retop_deepcopy(self): CG = examples.ContactGroupL394() - CP: contacts.ContactPair = CG._contacts[0] + CP: contacts.ContactPair = CG.contact_pairs[0] top = md.load(test_filenames.pdb_3SN6).top imap = {347: 342, @@ -1182,7 +1182,7 @@ def test_retop_deepcopy(self): def 
test_serialize_as_dict(self): CG = examples.ContactGroupL394() - CP: contacts.ContactPair = CG._contacts[0] + CP: contacts.ContactPair = CG.contact_pairs[0] sCP = CP._serialized_as_dict() assert sCP["residues.idxs_pair"] is CP.residues.idxs_pair @@ -1199,7 +1199,7 @@ def test_serialize_as_dict(self): def test_gen_labels(self): CG = examples.ContactGroupL394() - CP: contacts.ContactPair = CG._contacts[0] + CP: contacts.ContactPair = CG.contact_pairs[0] self.assertEqual(CP.gen_label("short"),"L394-L388") self.assertEqual(CP.gen_label("long") ,"LEU394-LEU388") @@ -1850,10 +1850,10 @@ def test_select_by_residues_CSV(self): new_CG : contacts.ContactGroup = CG.select_by_residues(CSVexpression=CSV) assert new_CG.n_ctcs == 4 - assert new_CG._contacts[0] is CG._contacts[0] - assert new_CG._contacts[1] is CG._contacts[2] - assert new_CG._contacts[2] is CG._contacts[3] - assert new_CG._contacts[3] is CG._contacts[5] + assert new_CG.contact_pairs[0] is CG.contact_pairs[0] + assert new_CG.contact_pairs[1] is CG.contact_pairs[2] + assert new_CG.contact_pairs[2] is CG.contact_pairs[3] + assert new_CG.contact_pairs[3] is CG.contact_pairs[5] assert isinstance(new_CG,contacts.ContactGroup) new_CG_dict = CG.select_by_residues(CSVexpression=CSV, merge=False) @@ -1861,12 +1861,12 @@ def test_select_by_residues_CSV(self): self.assertSequenceEqual(list(new_CG_dict.keys()),CSV.split(",")) assert new_CG_dict[keys[0]].n_ctcs == 2 - assert new_CG_dict[keys[0]]._contacts[0] is CG._contacts[0] - assert new_CG_dict[keys[0]]._contacts[1] is CG._contacts[3] + assert new_CG_dict[keys[0]].contact_pairs[0] is CG.contact_pairs[0] + assert new_CG_dict[keys[0]].contact_pairs[1] is CG.contact_pairs[3] assert new_CG_dict[keys[1]].n_ctcs == 2 - assert new_CG_dict[keys[1]]._contacts[0] is CG._contacts[2] - assert new_CG_dict[keys[1]]._contacts[1] is CG._contacts[5] + assert new_CG_dict[keys[1]].contact_pairs[0] is CG.contact_pairs[2] + assert new_CG_dict[keys[1]].contact_pairs[1] is CG.contact_pairs[5] 
assert new_CG_dict[keys[2]] is None @@ -1888,20 +1888,20 @@ def test_select_by_residues_residue_indices(self): residue_indices = [10, 40, 1] new_CG : contacts.ContactGroup = CG.select_by_residues(residue_indices=residue_indices) assert new_CG.n_ctcs == 3 - assert new_CG._contacts[0] is CG._contacts[2] - assert new_CG._contacts[1] is CG._contacts[4] - assert new_CG._contacts[2] is CG._contacts[5] + assert new_CG.contact_pairs[0] is CG.contact_pairs[2] + assert new_CG.contact_pairs[1] is CG.contact_pairs[4] + assert new_CG.contact_pairs[2] is CG.contact_pairs[5] assert isinstance(new_CG,contacts.ContactGroup) new_CG_dict = CG.select_by_residues(residue_indices=residue_indices, merge=False) self.assertSequenceEqual(list(new_CG_dict.keys()),residue_indices) assert new_CG_dict[residue_indices[0]].n_ctcs == 2 - assert new_CG_dict[residue_indices[0]]._contacts[0] is CG._contacts[2] - assert new_CG_dict[residue_indices[0]]._contacts[1] is CG._contacts[5] + assert new_CG_dict[residue_indices[0]].contact_pairs[0] is CG.contact_pairs[2] + assert new_CG_dict[residue_indices[0]].contact_pairs[1] is CG.contact_pairs[5] assert new_CG_dict[residue_indices[1]].n_ctcs == 2 - assert new_CG_dict[residue_indices[1]]._contacts[0] is CG._contacts[2] - assert new_CG_dict[residue_indices[1]]._contacts[1] is CG._contacts[4] + assert new_CG_dict[residue_indices[1]].contact_pairs[0] is CG.contact_pairs[2] + assert new_CG_dict[residue_indices[1]].contact_pairs[1] is CG.contact_pairs[4] assert new_CG_dict[residue_indices[2]] is None @@ -1925,26 +1925,26 @@ def test_to_select_by_residues_residue_indices_n_residues_is_2(self): assert isinstance(new_CG, contacts.ContactGroup) assert new_CG.n_ctcs == 3 - assert new_CG._contacts[0] is CG._contacts[2] - assert new_CG._contacts[1] is CG._contacts[4] - assert new_CG._contacts[2] is CG._contacts[5] + assert new_CG.contact_pairs[0] is CG.contact_pairs[2] + assert new_CG.contact_pairs[1] is CG.contact_pairs[4] + assert new_CG.contact_pairs[2] is 
CG.contact_pairs[5] new_CG_dict = CG.select_by_residues(residue_indices=residue_indices, merge=False, n_residues=2) self.assertSequenceEqual(list(new_CG_dict.keys()),residue_indices) assert new_CG_dict[residue_indices[0]].n_ctcs == 1 - assert new_CG_dict[residue_indices[0]]._contacts[0] is CG._contacts[2] + assert new_CG_dict[residue_indices[0]].contact_pairs[0] is CG.contact_pairs[2] assert new_CG_dict[residue_indices[1]].n_ctcs == 2 - assert new_CG_dict[residue_indices[1]]._contacts[0] is CG._contacts[2] - assert new_CG_dict[residue_indices[1]]._contacts[1] is CG._contacts[4] + assert new_CG_dict[residue_indices[1]].contact_pairs[0] is CG.contact_pairs[2] + assert new_CG_dict[residue_indices[1]].contact_pairs[1] is CG.contact_pairs[4] assert new_CG_dict[residue_indices[2]].n_ctcs == 2 - assert new_CG_dict[residue_indices[2]]._contacts[0] is CG._contacts[4] - assert new_CG_dict[residue_indices[2]]._contacts[1] is CG._contacts[5] + assert new_CG_dict[residue_indices[2]].contact_pairs[0] is CG.contact_pairs[4] + assert new_CG_dict[residue_indices[2]].contact_pairs[1] is CG.contact_pairs[5] assert new_CG_dict[residue_indices[3]].n_ctcs == 1 - assert new_CG_dict[residue_indices[3]]._contacts[0] is CG._contacts[5] + assert new_CG_dict[residue_indices[3]].contact_pairs[0] is CG.contact_pairs[5] def test_to_select_by_residues_residue_pairs(self): CG = _mdcsites([{"name": "test_random", @@ -1966,8 +1966,8 @@ def test_to_select_by_residues_residue_pairs(self): new_CG : contacts.ContactGroup = CG.select_by_residues(residue_pairs=residue_pairs) assert isinstance(new_CG, contacts.ContactGroup) assert new_CG.n_ctcs == 2 - assert new_CG._contacts[0] is CG._contacts[2] - assert new_CG._contacts[1] is CG._contacts[0] + assert new_CG.contact_pairs[0] is CG.contact_pairs[2] + assert new_CG.contact_pairs[1] is CG.contact_pairs[0] def test_to_select_by_residues_consensus(self): CG = examples.ContactGroupL394() @@ -1975,7 +1975,7 @@ def test_to_select_by_residues_consensus(self): 
new_CG : contacts.ContactGroup = CG.select_by_residues("G.H.21") assert isinstance(new_CG, contacts.ContactGroup) assert new_CG.n_ctcs == 1 - assert new_CG._contacts[0] is CG._contacts[1] + assert new_CG.contact_pairs[0] is CG.contact_pairs[1] def test_to_ContactGroups_per_traj(self): @@ -2005,11 +2005,11 @@ def test_to_ContactGroups_per_traj(self): _np.testing.assert_array_equal(iCG.trajlabels[0], 'mdtraj.00') for jj in range(CG.n_ctcs): - _np.testing.assert_array_equal( CG._contacts[jj].time_traces.ctc_trajs[ii], - iCG._contacts[jj].time_traces.ctc_trajs[0]) - _np.testing.assert_array_equal( CG._contacts[jj].time_traces.atom_pair_trajs[ii], - iCG._contacts[jj].time_traces.atom_pair_trajs[0]) - assert CG._contacts[jj].time_traces.trajs[ii] is iCG._contacts[jj].time_traces.trajs[0] + _np.testing.assert_array_equal( CG.contact_pairs[jj].time_traces.ctc_trajs[ii], + iCG.contact_pairs[jj].time_traces.ctc_trajs[0]) + _np.testing.assert_array_equal( CG.contact_pairs[jj].time_traces.atom_pair_trajs[ii], + iCG.contact_pairs[jj].time_traces.atom_pair_trajs[0]) + assert CG.contact_pairs[jj].time_traces.trajs[ii] is iCG.contact_pairs[jj].time_traces.trajs[0] class TestContactGroup_select_by_frames(TestBaseClassContacts): @@ -2816,7 +2816,7 @@ def test_plot_timedep_ctcs_sort_by_freq(self): # are uniquelly associated with the ContactPair # reverse the ContactPairs s.t. 
the frequences are in ascending (unusual) order - CG = contacts.ContactGroup(self.CG_cp1_cp2_both_w_anchor_and_frags._contacts[::-1], + CG = contacts.ContactGroup(self.CG_cp1_cp2_both_w_anchor_and_frags.contact_pairs[::-1], neighbors_excluded=0) figs = CG.plot_timedep_ctcs(ctc_cutoff_Ang=2) From aa1e9698286b4ced78b84c94690383cf612b188e Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 11:45:03 +0200 Subject: [PATCH 12/83] [nomeclature._lexsort_consensus_ctc_labels] fix --- mdciao/nomenclature/nomenclature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index a70079d6..7bffb2bc 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -3868,9 +3868,9 @@ def _lexsort_consensus_ctc_labels(labels, reverse = False, columns = [0, 1], sep if not any([all([len(lab) == 1 for lab in split_labels]), all([len(lab) == 2 for lab in split_labels])]): raise ValueError(f"Labels have to be all single ('3.50') or double ('3.50-2.50'), but not mixed {labels}") - split_labels = _np.vstack(split_labels) + split_labels = _np.vstack(split_labels).squeeze() if split_labels.ndim == 1: - order = _sort_all_consensus_labels(split_labels())[1] + order = _sort_all_consensus_labels(split_labels)[1] elif split_labels.ndim ==2: #There's other ways but this was easy to set up lexsort = {key : val[columns[1]].to_dict() for key, val in _DataFrame(split_labels).groupby(by=columns[0])} From c3c451bd4a3660b82c6a8b5e466b3e90a654695c Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:08:50 +0200 Subject: [PATCH 13/83] [siteIO.sites_to_res_pairs] don't fail if conlab is missing (return none), assert consensus_maps are len=top.n_residues, kwargs documented, tests --- mdciao/sites/siteIO.py | 28 +++++++++++++++++++++------- tests/test_siteIO.py | 18 ++++++++++-------- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/mdciao/sites/siteIO.py 
b/mdciao/sites/siteIO.py index 1d875029..e41d0aea 100644 --- a/mdciao/sites/siteIO.py +++ b/mdciao/sites/siteIO.py @@ -27,6 +27,8 @@ import mdciao.fragments as _mdcfrg import mdciao.utils as _mdcu +from mdciao.utils.str_and_dict import _kwargs_subs + _allowed_site_schemes = ("AAresSeq","residx", "consensus") def x2site(site, fmt="AAresSeq"): """ @@ -103,6 +105,7 @@ def x2site(site, fmt="AAresSeq"): return idict +@_kwargs_subs(_mdcfrg.get_fragments) def sites_to_res_pairs(site_dicts, top, fragments=None, default_fragment_index=None, @@ -116,8 +119,8 @@ def sites_to_res_pairs(site_dicts, top, Note ---- - Any residue not found in :obj:`top` is assigned - a 'None' in the returned :obj:`res_idx_pairs`. + Any residue not found in `top` is assigned + a 'None' in the returned `res_idx_pairs`. Parameters ---------- @@ -135,8 +138,17 @@ def sites_to_res_pairs(site_dicts, top, a dimer, pass which fragment/monomer should be chosen by default. The default behaviour (None) will prompt the user when necessary - get_fragments_kwargs : - see :obj:`fragments.get_fragments` + consensus_maps : dict, default is None + Dictionary of consensus maps, i.e. + keyed with nomenclature type (GPCR,CGN,KLIFS) + and valued with lists of len top.n_residues + get_fragments_kwargs : dict + Optional arguments for :obj:`~mdciao.fragments.get_fragments`. 
+ The optional parameters of are: + + Other Parameters + ---------------- + %(substitute_kwargs)s Returns ------- @@ -167,6 +179,7 @@ def sites_to_res_pairs(site_dicts, top, raise ValueError("Can't use consensus labels in the site definitions if no consensus maps are passed.\n" "Please provide GPCR, CGN, or KLIFS consensus labeling to use " "the 'consensus' way of defining a site.") + assert all([len(val)==top.n_residues for val in consensus_maps.values()]) key2res = {key: {label: ii for ii, label in enumerate(val) if str(label).lower()!="none"} for key, val in consensus_maps.items()} def get_pair_lambda(bond): res_out = [] @@ -174,10 +187,11 @@ def get_pair_lambda(bond): res = [cm.get(desc,None) for cm in key2res.values()] res = [rr for rr in res if rr is not None] if len(res)==0: - raise ValueError(f"The consensus descriptor {desc} didn't " - f"yield any matches on {list(consensus_maps.keys())} labels.") - else: + res_out.append(None) + elif len(res)==1: res_out.append(res[0]) + else: + raise ValueError(res) return res_out for bond in bonds: diff --git a/tests/test_siteIO.py b/tests/test_siteIO.py index 1ad2c763..9c2b21d9 100644 --- a/tests/test_siteIO.py +++ b/tests/test_siteIO.py @@ -208,7 +208,8 @@ def setUp(self): self.GDP_json = test_filenames.GDP_json self.geom = _md.load(test_filenames.actor_pdb) self.fragments = mdciao.fragments.get_fragments(self.geom.top) - + self.GPCR_map = mdciao.examples.GPCRLabeler_ardb2_human().top2labels(self.geom.top) + self.CGN_map = mdciao.examples.CGNLabeler_gnas2_human().top2labels(self.geom.top) def test_the_idxs_work_no_frags(self): site = mdciao.sites.x2site(self.GDP_json) ctc_idxs, __ = mdciao.sites.sites_to_res_pairs([site], self.geom.top) @@ -240,9 +241,9 @@ def test_consensus(self): ctc_idxs, site_maps = mdciao.sites.sites_to_res_pairs([{"name": "interesting contacts", "pairs": {"consensus": [ "G.H5.26-6.32x32"]}}], self.geom.top, - consensus_maps={"GPCR": [None, "6.32x32"], - "CGN": ["G.H5.26"]}) - 
_np.testing.assert_array_equal(ctc_idxs, [[0,1]]) + consensus_maps={"GPCR": self.GPCR_map, + "CGN": self.CGN_map}) + _np.testing.assert_array_equal(ctc_idxs, [[659, 209]]) _np.testing.assert_array_equal(site_maps, [[0]]) def test_consensus_fails_no_consensus_passed(self): with self.assertRaises(ValueError): @@ -250,12 +251,13 @@ def test_consensus_fails_no_consensus_passed(self): "pairs": {"consensus": [ "G.H5.26-6.32x32"]}}], self.geom.top) - def test_consensus_fails_no_label_present(self): - with self.assertRaises(ValueError): - mdciao.sites.sites_to_res_pairs([{"name":"interesting contacts", + def test_consensus_no_label_present(self): + ctc_idxs, site_maps = mdciao.sites.sites_to_res_pairs([{"name":"interesting contacts", "pairs": {"consensus": [ "G.H5.26-6.32x32"]}}], self.geom.top, - consensus_maps={"GPCR":[None, "6.32x32"]}) + consensus_maps={"GPCR":self.GPCR_map}) + _np.testing.assert_array_equal(ctc_idxs, [[None, 209]]) + _np.testing.assert_array_equal(site_maps, [[0]]) class Test_discard_empty_sites(unittest.TestCase): From 400f58a49390e7d65a07585b01dd004e2fe0295e Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:09:13 +0200 Subject: [PATCH 14/83] [residue_and_atom] typos --- mdciao/utils/residue_and_atom.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mdciao/utils/residue_and_atom.py b/mdciao/utils/residue_and_atom.py index 4982e15a..96c62af5 100644 --- a/mdciao/utils/residue_and_atom.py +++ b/mdciao/utils/residue_and_atom.py @@ -627,15 +627,22 @@ def top2lsd(top, substitute_fail="X", ---------- top : :obj:`~mdtraj.Topology` substitute_fail : str, None, int, default is "X" - If there is no .code attribute, different options are there + If there is no .code attribute, there are different options depending on the value of this parameter * None : throw an exception when no short code is found (default) * 'long' : keep the residue's long name, i.e. 
do nothing * 'c': any alphabetic character, as long as it is of len=1 * 0 : the first alphabetic character in the residue's name extra_columns : dictionary of indexables - Any other column you want to - include in the :obj:`~pandas.DataFrame` + Any other columns you want to + include in the :obj:`~pandas.DataFrame`, e.g. + {"GPCR" : [None, None,...,3.50, 3.51...], + "CGN" : [G.H5.25, None, None, ...]} + If the values are lists, they sould be + len=top.n_residues, if dicts, the dicts + don't need to cover all residues of `top`, e.g. + {"GPCR" : {200 : "3.50", 201 : "3.51"}, + "CGN" : {0 : "G.H5.25"}} Returns ------- From 5d6fb6b0af816fbecdb19be57ffce6b38c670ae4 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:14:16 +0200 Subject: [PATCH 15/83] [plots._sorter_by_key_or_val] new option 'consensus' for 'sort_by' argument, using _lexsort_consensus_ctc_labels --- mdciao/plots/plots.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mdciao/plots/plots.py b/mdciao/plots/plots.py index 3436c23c..7b125fb0 100644 --- a/mdciao/plots/plots.py +++ b/mdciao/plots/plots.py @@ -38,9 +38,9 @@ import mdciao.utils as _mdcu -from mdciao.nomenclature.nomenclature import _sort_all_consensus_labels +from mdciao.nomenclature.nomenclature import _lexsort_consensus_ctc_labels # The above line introduces a dependency of 'plots' on 'nomenclature', which were -# uncoupled so far. The alternative would be to put '_sort_all_consensus_labels' +# uncoupled so far. The alternative would be to put '_lexsort_consensus_ctc_labels' # into 'utils.str_and_dict' (since it's essentially string operations). # However, as plotting methods become increasing nomenclature-aware, such a # plots -> nomenclature dependency will likely come in the future @@ -1710,7 +1710,7 @@ def _sorter_by_key_or_val(sort_by, indict): The rules might use just the keys or just the values of the dict - The indict is left unaltered. + The `indict` is left unaltered. 
Parameters ---------- @@ -1727,6 +1727,8 @@ def _sorter_by_key_or_val(sort_by, indict): by the values of the `indict` * "keep" keep the order of the keys + * "consensus" + Sort following consensus nomenclature * a list of contact labels. The returned `ordered_keys` will be the intersection of `sort_by` and `indict.keys()`, @@ -1766,7 +1768,7 @@ def _sorter_by_key_or_val(sort_by, indict): # Out[5]: ['0-10', '0-20', 'ALA', 'ALA30-GLU40', 'ALA30-GLU50', 'GLU5-ALA20'] # -> we would want ['0-10', '0-20', 'GLU5-ALA20', 'ALA30-GLU40', 'ALA30-GLU50', 'ALA'] elif sort_by == "consensus": - ordered_keys = _sort_all_consensus_labels(all_ctc_keys) + ordered_keys = _lexsort_consensus_ctc_labels(all_ctc_keys)[0] elif sort_by in ["mean", "std"]: ordered_keys = list(_mdcu.str_and_dict.sort_dict_by_asc_values(indict).keys()) elif sort_by == "keep": From 00113c73f07dd2bb45330345d22bb7d6785cdab5 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:17:11 +0200 Subject: [PATCH 16/83] [plots.compare_groups_of_contacts] adapt to refactor shorten_AAs -> AA_format in frequency_sum_per_residue_names --- mdciao/plots/plots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdciao/plots/plots.py b/mdciao/plots/plots.py index 7b125fb0..2b85e19c 100644 --- a/mdciao/plots/plots.py +++ b/mdciao/plots/plots.py @@ -911,7 +911,7 @@ def compare_groups_of_contacts(groups, pad_label=False) else: idict = ifile.frequency_sum_per_residue_names(ctc_cutoff_Ang=ctc_cutoff_Ang, - shorten_AAs=[True if AA_format=="short" else False][0], + AA_format=AA_format, list_by_interface=True) else: From da1c6a31023ee882f10580c90961b7427f9ec8ab Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:20:46 +0200 Subject: [PATCH 17/83] [nomenclature] make tuples instead of lists for the consensus fragments sets --- mdciao/nomenclature/nomenclature.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py 
b/mdciao/nomenclature/nomenclature.py index 7bffb2bc..83ad4618 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -2514,7 +2514,7 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", "8": "H8", } -_GPCR_fragments = ["NT", +_GPCR_fragments = ("NT", "1", "TM1 ", "12", "ICL1", "2", "TM2", @@ -2530,9 +2530,9 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", "7", "TM7", "78", "8", "H8", - "CT"] + "CT") -_CGN_fragments = ['G.HN', +_CGN_fragments = ('G.HN', 'G.hns1', 'G.S1', 'G.s1h1', @@ -2568,9 +2568,9 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", 'G.h4s6', 'G.S6', 'G.s6h5', - 'G.H5'] + 'G.H5') -_KLIFS_fragments = ['I', +_KLIFS_fragments = ('I', 'g.l', 'II', 'III', @@ -2588,7 +2588,7 @@ def _sort_all_consensus_labels(labels, append_diffset=True, order=["GPCR","CGN", 'VII', 'VIII', 'xDFG', - 'a.l'] + 'a.l') _GPCR_mandatory_fields = ["protein_segment", From a7f1caede74f5e48ad6ed3ae0d6f5b505883e149 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:21:44 +0200 Subject: [PATCH 18/83] [fragments.get_fragments] minor docs --- mdciao/fragments/fragments.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mdciao/fragments/fragments.py b/mdciao/fragments/fragments.py index 0aa3b50f..08732ed9 100644 --- a/mdciao/fragments/fragments.py +++ b/mdciao/fragments/fragments.py @@ -174,7 +174,6 @@ def get_fragments(top, ---------- top : :obj:`~mdtraj.Topology` or str When str, path to filename - method : str, default is 'lig_resSeq+' The method passed will be the basis for creating fragments. 
Check the following options with the example sequence @@ -244,7 +243,7 @@ def get_fragments(top, additional arguments, see :obj:`~mdciao.residue_and_atom.residues_from_descriptors` Other Parameters - --------------- + ---------------- %(substitute_kwargs)s Returns From ff27991f18a5cd31b8716a2c34081cc8aff455e4 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:22:41 +0200 Subject: [PATCH 19/83] [fragments.check_if_fragment_clashes] correct typo in input prompt --- mdciao/fragments/fragments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdciao/fragments/fragments.py b/mdciao/fragments/fragments.py index 08732ed9..4f3229ed 100644 --- a/mdciao/fragments/fragments.py +++ b/mdciao/fragments/fragments.py @@ -837,7 +837,7 @@ def check_if_fragment_clashes(sub_frag, fragname, fragments, top, if n_in_fragment < len(fragments[jj]): istr += "%u residues outside %s" % (len(fragments[jj]) - n_in_fragment, fragname) print(istr) - answr = input("Input the idxs of the fragments where ECL2 can be found %s (fmt = 1 or 1-4, or 1,3):" % fragname) + answr = input(f"Input the idxs of the fragments where {fragname} can be found %s (fmt = 1 or 1-4, or 1,3):") answr = _mdcu.lists.rangeexpand(answr) if not all([idx in frag_cands for idx in answr]): raise ValueError(f"Cannot keep fragment {set(answr).difference(frag_cands)}, " From ffd31a3216b84f369a882d4ed0ae58534ceacc37 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:23:42 +0200 Subject: [PATCH 20/83] [fragments._fragments_strings_to_fragments] pass "method" directly to get_fragments --- mdciao/fragments/fragments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdciao/fragments/fragments.py b/mdciao/fragments/fragments.py index 4f3229ed..42e2f08a 100644 --- a/mdciao/fragments/fragments.py +++ b/mdciao/fragments/fragments.py @@ -918,7 +918,7 @@ def _fragments_strings_to_fragments(fragment_input, top, verbose=False): if fragment_input.lower()=="consensus": 
user_wants_consensus = True method = 'resSeq+' - fragments_as_residue_idxs = get_fragments(top, method='resSeq+', + fragments_as_residue_idxs = get_fragments(top, method=method, verbose=False) elif fragment_input in _allowed_fragment_methods: method = fragment_input From 8a27a7f8bbfc23ed6c5ef6c90430949f44c12c85 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:45:47 +0200 Subject: [PATCH 21/83] [fragments.frag_list_2_frag_groups] can expand ranges, prompting expressions are now more informative, tests --- mdciao/fragments/fragments.py | 61 ++++++++++++++++++++++++----------- tests/test_fragments.py | 41 +++++++++++++++++++++-- 2 files changed, 81 insertions(+), 21 deletions(-) diff --git a/mdciao/fragments/fragments.py b/mdciao/fragments/fragments.py index 42e2f08a..615aaa93 100644 --- a/mdciao/fragments/fragments.py +++ b/mdciao/fragments/fragments.py @@ -1019,12 +1019,12 @@ def frag_dict_2_frag_groups(frag_defs_dict, ng=2, verbose=False, answers=None): r""" - Input a dictionary of fragment definitions, keyed by - whatever and valued with residue idxs and prompt - the user how to re-group them + Re-group fragment definitions into new fragment groups. + By default it prompts the user unless the `answers` are provided directly. - It wraps around :obj:`_match_dict_by_patterns` - under the hood + It can expand numerical ranges, e.g. "1-3" to [1,2,3] and + use string pattern matching as done by + :obj:`~mdciao.str_and_dict.match_dict_by_patterns` TODO: refactor into str_and_dict_utils TODO: It will be mostly used with fragments so it's better here for the API? IDK @@ -1032,25 +1032,30 @@ def frag_dict_2_frag_groups(frag_defs_dict, ng=2, Parameters ---------- frag_defs_dict : dict - Fragment definitions in residue idxs + Fragment definitions as iterable + of residue idxs. Fragments can overlap, + e.g. one can have fragments corresponding + to chains in the pdb-file mixed + with consensus fragments (TM1, TM2, etc). 
ng : int, default is 2 - wanted number of groups + wanted number of groups of fragments answers : list, default is None - List of strings. If provided, - the items of this list will - be passed as answers to the prompt - asking for fragment choice. None and + List of len `ng`. Each item + can contain either string + expressions like "TM*,-H8", + or "1-6" or an explicit list + of integers. These items + will be passed as answers to the prompt + asking for `ng` fragment choices. Returns ------- - groups_as_residue_idxs, groups_as_keys - groups_as_residue_idxs : list of len ng Contains ng arrays with the concatenated and sorted residues in each group groups_as_keys : list of len ng Contains ng lists with the keys - of :obj:`frag_defs_dict` in each of groups + of `frag_defs_dict` in each of groups """ groups_as_keys = [] @@ -1058,22 +1063,40 @@ def frag_dict_2_frag_groups(frag_defs_dict, ng=2, _answers = [None]*ng if answers is not None: for ii, ians in enumerate(answers): - if isinstance(ians,str): - _answers[ii]=ians + if ians is not None: + if not isinstance(ians,str): + _answers[ii]=",".join([str(ii) for ii in ians]) + else: + _answers[ii]=ians answers = _answers if verbose: for key, val in frag_defs_dict.items(): print("%s: %u-%u"%(key, val[0],val[-1])) for ii in range(1, ng + 1): - print("group %u: " % ii, end='') + print("Select group %u: " % ii, end='') if answers[ii-1] is None: + print("\nInput a list of comma-separated expressions. Please note:") answer = input( - "Input a list of comma-separated posix-expressions.\n" - "Prepend with '-' to exclude, e.g. 'TM*,-TM2,H8' to grab all TMs and H8, but exclude TM2)\n").replace( + " - to select topology fragments, use integer indices, e.g. '0,1,4'. " + "Ranges are also allowed, e.g. '0-3' instead of '0,1,2,3'.\n" + " - to select consensus fragments, use string descriptors like 'TM1'. " + "Posix expressions like '*' or '?' are allowed, e.g. 'TM*' for all TMs.\n" + " - to exclude some fragments, prepend with '-', e.g. 
'TM*,-TM2,H8' to select all TMs and H8, excluding TM2.\n").replace( " ", "").strip("'").strip('"') + print("group %u: " % ii, end='') else: answer = answers[ii-1] + + # Expand ranges if necessary + expanded_answer = [] + for ii in answer.split(","): + if "-" in ii and ii[0]!="-" and ii.replace("-","").isnumeric(): + expanded_answer.extend([str(ii) for ii in _mdcu.lists.rangeexpand(ii)]) + else: + expanded_answer.append(str(ii)) + answer = ",".join(expanded_answer) + igroup, res_idxs_in_group = _mdcu.str_and_dict.match_dict_by_patterns(answer, frag_defs_dict) groups_as_keys.append([ilab for ilab in frag_defs_dict.keys() if ilab in igroup]) groups_as_residue_idxs.append(sorted(res_idxs_in_group)) diff --git a/tests/test_fragments.py b/tests/test_fragments.py index dae8e190..6785fb36 100644 --- a/tests/test_fragments.py +++ b/tests/test_fragments.py @@ -352,7 +352,7 @@ def test_works(self): self.assertSequenceEqual(groups_as_residue_idxs[1],[6,7]) def test_works_with_answers(self): - input_vaules = ["TM*,-TM2", "H8"] + input_values = ["TM*,-TM2", "H8"] groups_as_residue_idxs, \ groups_as_keys, \ = mdcfragments.frag_dict_2_frag_groups({"TM1": [0, 1], @@ -360,7 +360,7 @@ def test_works_with_answers(self): "TM3": [4, 5], "H8": [6, 7]}, verbose=True, - answers=input_vaules + answers=input_values ) self.assertSequenceEqual(groups_as_keys[0],["TM1","TM3"]) self.assertSequenceEqual(groups_as_keys[1],["H8"]) @@ -368,6 +368,43 @@ def test_works_with_answers(self): self.assertSequenceEqual(groups_as_residue_idxs[0],[0,1,4,5]) self.assertSequenceEqual(groups_as_residue_idxs[1],[6,7]) + def test_works_with_non_string_answers(self): + input_values = ["TM*,-TM2", [2]] + groups_as_residue_idxs, \ + groups_as_keys, \ + = mdcfragments.frag_dict_2_frag_groups({"TM1": [0, 1], + "TM2": [2, 3], + "TM3": [4, 5], + "H8": [6, 7], + "2":[8, 9]}, + verbose=True, + answers=input_values + ) + self.assertSequenceEqual(groups_as_keys[0], ["TM1", "TM3"]) + 
self.assertSequenceEqual(groups_as_keys[1], ["2"]) + + self.assertSequenceEqual(groups_as_residue_idxs[0], [0, 1, 4, 5]) + self.assertSequenceEqual(groups_as_residue_idxs[1], [8,9]) + def test_works_with_string_ranges(self): + input_values = ["TM*,-TM2", "2-3"] + groups_as_residue_idxs, \ + groups_as_keys, \ + = mdcfragments.frag_dict_2_frag_groups({"TM1": [0, 1], + "TM2": [2, 3], + "TM3": [4, 5], + "H8": [6, 7], + "2":[8, 9], + "3": [10, 11], + "4": [12,13]}, + verbose=True, + answers=input_values + ) + self.assertSequenceEqual(groups_as_keys[0], ["TM1", "TM3"]) + self.assertSequenceEqual(groups_as_keys[1], ["2","3"]) + + self.assertSequenceEqual(groups_as_residue_idxs[0], [0, 1, 4, 5]) + self.assertSequenceEqual(groups_as_residue_idxs[1], [8,9, 10, 11]) + class Test_frag_list_2_frag_groups(unittest.TestCase): From c9d107f47639af345d9b9bd9d0ab3a9989767ece Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:46:35 +0200 Subject: [PATCH 22/83] [cli._parse_consensus_option] minor --- mdciao/cli/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index b55b29d3..53104d0f 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -155,7 +155,8 @@ def _parse_consensus_option(option, consensus_type, LC_out = option if isinstance(LC_out, _mdcnomenc.LabelerConsensus): - answer = _mdcnomenc.guess_by_nomenclature(LC_out, top, fragments, consensus_type, + answer = _mdcnomenc.guess_by_nomenclature(LC_out, top, + fragments=fragments, nomenclature_name=consensus_type, accept_guess=accept_guess, # verbose=True ) From 1a5bf25fb28866c3d272d59887837ad4becac227 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 12:50:31 +0200 Subject: [PATCH 23/83] [cli.residue_neighborhoods] don't store `user_wants_consensus` --- mdciao/cli/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 53104d0f..41f64507 100644 --- a/mdciao/cli/cli.py +++ 
b/mdciao/cli/cli.py @@ -874,7 +874,7 @@ def residue_neighborhoods(residues, print("Will compute contact frequencies for (%u items):\n%s" "\n with a stride of %u frames" % (len(xtcs),_mdcu.str_and_dict.inform_about_trajectories(xtcs, only_show_first_and_last=15), stride)) - fragments_as_residue_idxs, user_wants_consensus = _mdcfrg.fragments._fragments_strings_to_fragments(fragments, refgeom.top, verbose=True) + fragments_as_residue_idxs = _mdcfrg.fragments._fragments_strings_to_fragments(fragments, refgeom.top, verbose=True)[0] fragment_names = _parse_fragment_naming_options(fragment_names, fragments_as_residue_idxs) fragment_colors = _parse_coloring_options(fragment_colors,len(fragment_names)) From 092a5bfc6dae0b2e744bcef21fdbbffec71813d2 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 13:55:26 +0200 Subject: [PATCH 24/83] [cli.residue_selection] minor --- mdciao/cli/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 41f64507..80480b14 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -2340,8 +2340,8 @@ def residue_selection(expression, if fragments is None: fragments = [_signature(_mdcfrg.get_fragments).parameters["method"].default] - _frags, __ = _mdcfrg.fragments._fragments_strings_to_fragments(_mdcu.lists.force_iterable(fragments), - _top, verbose=True) + _frags = _mdcfrg.fragments._fragments_strings_to_fragments(_mdcu.lists.force_iterable(fragments), + _top, verbose=True)[0] res_idxs_list, consensus_maps, __ = _res_resolver(expression, _top, _frags, midstring="Your selection '%s' yields:" % expression, GPCR_UniProt=GPCR_UniProt, CGN_UniProt=CGN_UniProt, From 63abc34ffc86fe3436012828a53cdfcbf59d4896 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 14:12:48 +0200 Subject: [PATCH 25/83] [fragments.frag_dict_2_frag_groups] new optarg fail_on_empty, tests --- mdciao/fragments/fragments.py | 9 ++++++++- tests/test_fragments.py | 13 ++++++++++++- 2 files changed, 20 
insertions(+), 2 deletions(-) diff --git a/mdciao/fragments/fragments.py b/mdciao/fragments/fragments.py index 615aaa93..9c320c5d 100644 --- a/mdciao/fragments/fragments.py +++ b/mdciao/fragments/fragments.py @@ -1017,7 +1017,8 @@ def frag_list_2_frag_groups(frag_list, def frag_dict_2_frag_groups(frag_defs_dict, ng=2, verbose=False, - answers=None): + answers=None, + fail_on_empty=True): r""" Re-group fragment definitions into new fragment groups. By default it prompts the user unless the `answers` are provided directly. @@ -1026,6 +1027,9 @@ def frag_dict_2_frag_groups(frag_defs_dict, ng=2, use string pattern matching as done by :obj:`~mdciao.str_and_dict.match_dict_by_patterns` + Unless specifically turned off, the method will fail + if any one of the `answers` yields no fragment groups. + TODO: refactor into str_and_dict_utils TODO: It will be mostly used with fragments so it's better here for the API? IDK @@ -1098,6 +1102,9 @@ def frag_dict_2_frag_groups(frag_defs_dict, ng=2, answer = ",".join(expanded_answer) igroup, res_idxs_in_group = _mdcu.str_and_dict.match_dict_by_patterns(answer, frag_defs_dict) + if len(igroup)==0 and fail_on_empty: + raise ValueError(f"The expression '{answer}' doesn't yield any fragments. 
" + "Set 'fail_on_empty=False' if you know what you're doing.") groups_as_keys.append([ilab for ilab in frag_defs_dict.keys() if ilab in igroup]) groups_as_residue_idxs.append(sorted(res_idxs_in_group)) print(', '.join(groups_as_keys[-1])) diff --git a/tests/test_fragments.py b/tests/test_fragments.py index 6785fb36..ca38fd9c 100644 --- a/tests/test_fragments.py +++ b/tests/test_fragments.py @@ -405,7 +405,18 @@ def test_works_with_string_ranges(self): self.assertSequenceEqual(groups_as_residue_idxs[0], [0, 1, 4, 5]) self.assertSequenceEqual(groups_as_residue_idxs[1], [8,9, 10, 11]) - + def test_fails_on_empty(self): + with self.assertRaises(ValueError): + mdcfragments.frag_dict_2_frag_groups({"TM1": [0, 1], + "TM2": [2, 3], + "TM3": [4, 5], + "H8": [6, 7], + "2": [8, 9], + "3": [10, 11], + "4": [12, 13]}, + verbose=True, + answers=["TM10",[4]] + ) class Test_frag_list_2_frag_groups(unittest.TestCase): def test_works_automatically(self): From 07a668c8bf0f9614efd1b114dd5603c837ca1b77 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 13 Aug 2024 14:15:23 +0200 Subject: [PATCH 26/83] [ContactGroup.plot_frequency_sums_as_bars] use AA_format when calling frequency_sum_per_residue_names --- mdciao/contacts/contacts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 60c259c7..5ce4647b 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -5706,7 +5706,7 @@ def plot_frequency_sums_as_bars(self, frq_dict_list = self.frequency_sum_per_residue_names(ctc_cutoff_Ang, switch_off_Ang=switch_off_Ang, sort_by_freq=sort_by_freq, - shorten_AAs=shorten_AAs, + AA_format=["short" if shorten_AAs else "long"][0], list_by_interface=list_by_interface) # TODO the method plot_freqs_as_bars is very similar but From 40e9c6a62f6942b4f55e4d57b95413d187c89cbc Mon Sep 17 00:00:00 2001 From: gph82 Date: Wed, 14 Aug 2024 11:22:52 +0200 Subject: [PATCH 27/83] [nomenclature.conlabs2confrags] refactor 
from _map2defs, simplify using DataFrame.groupby, expose in the API --- mdciao/nomenclature/__init__.py | 1 + mdciao/nomenclature/nomenclature.py | 41 ++++++++++++----------------- tests/test_nomenclature.py | 10 +++---- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/mdciao/nomenclature/__init__.py b/mdciao/nomenclature/__init__.py index 2b26a6fe..7fb381d7 100644 --- a/mdciao/nomenclature/__init__.py +++ b/mdciao/nomenclature/__init__.py @@ -38,6 +38,7 @@ guess_by_nomenclature matching_fragments references + conlabs2confrags References ========== diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index 83ad4618..b19fabad 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -1961,7 +1961,7 @@ def _fill_consensus_gaps(consensus_list, top, verbose=False): The same as the input :obj:`consensus_list` with guessed missing entries """ - defs = _map2defs(consensus_list) + defs = conlabs2confrags(consensus_list) # todo decrease verbosity # Iterate over fragments for frag_key, conlabs in defs.items(): @@ -2259,47 +2259,40 @@ def guess_by_nomenclature(CLin, top, fragments=None, nomenclature_name=None, return answer -def _map2defs(cons_list, splitchar="."): +def conlabs2confrags(conlabs, splitchar="."): r""" Subdomain definitions form a list of consensus labels. The indices of the list are interpreted as residue indices - in the topology used to generate :obj:`cons_list` - in the first place, e.g. by using :obj:`nomenclature_utils._top2consensus_map` + in the topology used to generate `cons_list` + in the first place, e.g. by using :obj:`mdciao.nomenclature.LabelerConsensus.top2labels` - Note: - ----- - The method will guess automagically whether this is a CGN or GPCR label by - checking the type of the first character (numeric is GPCR, 3.50, alpha is CGN, G.H5.1) Parameters ---------- - cons_list: list - Contains consensus labels for a given topology, s.t.
indices of + conlabs: list + Consensus labels for a given topology, s.t. indices of the list map to residue indices of a given topology, s.t. cons_list[10] has the consensus label of top.residue(10) splitchar : str, default is "." The character to use to get the subdomain labels from the consensus labels, e.g. "3" from "3.50" or "G.H5" from "G.H5.1" + If a label of `cons_list` doesn't have a `splitchar` in + it, an Exception is thrown (this is a suspicious case) Returns ------- defs : dictionary dictionary keyed with subdomain-names and valued with arrays of residue indices """ - defs = _defdict(list) - for ii, key in enumerate(cons_list): - if str(key).lower() != "none": - assert splitchar in _mdcu.lists.force_iterable(key), "Consensus keys have to have a '%s'-character" \ - " in them, but '%s' (type %s) hasn't" % ( - splitchar, str(key), type(key)) - if key[0].isnumeric(): # it means it is GPCR - new_key = key.split(splitchar)[0] - elif key[0].isalpha(): # it means it CGN - new_key = '.'.join(key.split(splitchar)[:-1]) - else: - raise Exception([ii, splitchar]) - defs[new_key].append(ii) - return {key: _np.array(val) for key, val in defs.items()} + bad_labels = [val for val in conlabs if splitchar not in str(val) and str(val).lower() != "none"] + if len(bad_labels)>0: + raise ValueError(f"Some labels of 'cons_list' don't have '{splitchar}' in them. " + f"Are you sure these are valid consensus labels(e.g. 
'3.50' or 'G.H5.26'?:\n{bad_labels}") + df = _DataFrame(conlabs, columns=["conlab"]) + conlab2confrag = lambda x: str(x)[::-1].split(splitchar, 1)[-1][::-1] + df["frag"] = df.conlab.map(conlab2confrag) + consensus_frags = {key: val.index.values for key, val in df.groupby("frag") if str(key).lower() != "none"} + return {key: _np.array(val) for key, val in consensus_frags.items()} def _sort_consensus_labels(subset, sorted_superset, diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index f77c7a11..724f017f 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -643,29 +643,29 @@ def test_raises(self): ) -class Test_map2defs(unittest.TestCase): +class Test_conlabs2confrags(unittest.TestCase): def setUp(self): self.cons_list = ['3.67', 'G.H5.1', 'G.H5.6', '5.69'] self.cons_list_w_Nones = ['3.67', None, None, 'G.H5.1', 'G.H5.6', '5.69'] self.cons_list_wo_dots = ['367', None, None, 'G.H5.1', 'G.H5.6', '5.69'] def test_works(self): - map2defs = nomenclature._map2defs(self.cons_list) + map2defs = nomenclature.conlabs2confrags(self.cons_list) assert _np.array_equal(map2defs['3'], [0]) assert _np.array_equal(map2defs['G.H5'], [1, 2]) assert _np.array_equal(map2defs['5'], [3]) _np.testing.assert_equal(len(map2defs), 3) def test_works_w_Nones(self): - map2defs = nomenclature._map2defs(self.cons_list_w_Nones) + map2defs = nomenclature.conlabs2confrags(self.cons_list_w_Nones) assert _np.array_equal(map2defs['3'], [0]) assert _np.array_equal(map2defs['G.H5'], [3, 4]) assert _np.array_equal(map2defs['5'], [5]) _np.testing.assert_equal(len(map2defs), 3) def test_works_wo_dot_raises(self): - with self.assertRaises(AssertionError): - nomenclature._map2defs(self.cons_list_wo_dots) + with self.assertRaises(ValueError): + nomenclature.conlabs2confrags(self.cons_list_wo_dots) class Test_fill_CGN_gaps(unittest.TestCase): From a0bdd91b3e8373b67bb37d1c5723a99d6e72bd72 Mon Sep 17 00:00:00 2001 From: gph82 Date: Wed, 14 Aug 2024 12:13:50 +0200 Subject: 
[PATCH 28/83] [nomenclature.conlabs2confrags] new optarg replace_GPCR_frags --- mdciao/nomenclature/nomenclature.py | 8 +++++++- tests/test_nomenclature.py | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index b19fabad..55d9670f 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -2259,7 +2259,7 @@ def guess_by_nomenclature(CLin, top, fragments=None, nomenclature_name=None, return answer -def conlabs2confrags(conlabs, splitchar="."): +def conlabs2confrags(conlabs, splitchar=".", replace_GPCR_frags=False): r""" Subdomain definitions form a list of consensus labels. @@ -2279,6 +2279,9 @@ def conlabs2confrags(conlabs, splitchar="."): consensus labels, e.g. "3" from "3.50" or "G.H5" from "G.H5.1" If a label of `cons_list` doesn't have a `splitchar` in it, an Exception is thrown (this is a suspicious case) + replace_GPCR_frags : bool, default is False + If True, will replace the fragment labels coming from GPCR + conlabs like "34.50" or "7.49" to "ICL2" or "TM7", respectively. 
Returns ------- defs : dictionary @@ -2292,6 +2295,9 @@ def conlabs2confrags(conlabs, splitchar="."): conlab2confrag = lambda x: str(x)[::-1].split(splitchar, 1)[-1][::-1] df["frag"] = df.conlab.map(conlab2confrag) consensus_frags = {key: val.index.values for key, val in df.groupby("frag") if str(key).lower() != "none"} + if replace_GPCR_frags: + consensus_frags = {_GPCR_num2lett.get(key, key): val for key, val in consensus_frags.items()} + return {key: _np.array(val) for key, val in consensus_frags.items()} diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index 724f017f..7d57b88f 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -656,6 +656,13 @@ def test_works(self): assert _np.array_equal(map2defs['5'], [3]) _np.testing.assert_equal(len(map2defs), 3) + def test_works_replaces(self): + map2defs = nomenclature.conlabs2confrags(self.cons_list, replace_GPCR_frags=True) + assert _np.array_equal(map2defs['TM3'], [0]) + assert _np.array_equal(map2defs['G.H5'], [1, 2]) + assert _np.array_equal(map2defs['TM5'], [3]) + _np.testing.assert_equal(len(map2defs), 3) + def test_works_w_Nones(self): map2defs = nomenclature.conlabs2confrags(self.cons_list_w_Nones) assert _np.array_equal(map2defs['3'], [0]) From 55f9e3fa12bbb1abeec7f0beca116ec93038ebaf Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 10:49:25 +0200 Subject: [PATCH 29/83] [nomenclature.LabelerConsensus.top2labels] top arg can be a path to a filename, tests added --- mdciao/nomenclature/nomenclature.py | 19 +++++++++++-------- tests/test_nomenclature.py | 4 ++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index 55d9670f..43aba28f 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -458,12 +458,12 @@ class LabelerConsensus(object): At the moment child classes are * :obj:`LabelerGPCR` for GPCR-notation, this can be: - * structure 
based schemes (Gloriam et al) - * sequence based schemes - * Class-A: Ballesteros-Weinstein - * Class-B: Wootten - * Class-C: Pin - * Class-F: Wang + * structure based schemes, by Gloriam et al + * sequence based schemes + * Class-A: Ballesteros-Weinstein + * Class-B: Wootten + * Class-C: Pin + * Class-F: Wang * :obj:`LabelerCGN` for Common-Gprotein-nomenclature (CGN) * :obj:`LabelerKLIFS` for Kinase-Ligand Interaction notation of the 85 pocket-residues of kinases @@ -884,8 +884,9 @@ def top2labels(self, top, Parameters ---------- - top : - :obj:`~mdtraj.Topology` object + top : :obj:`~mdtraj.Topology` object or str + The topology as an object or a path + to a filename, e.g. a pdb file. allow_nonmatch : bool, default is True Use consensus labels for non-matching positions in case the non-matches have equal lengths @@ -917,6 +918,8 @@ def top2labels(self, top, map : list List of len = top.n_residues with the consensus labels """ + if isinstance(top, str): + top = _md.load(top).top self.aligntop(top, min_seqID_rate=min_seqID_rate, **aligntop_kwargs) out_list = _alignment_df2_conslist(self.most_recent_alignment, allow_nonmatch=allow_nonmatch) out_list = out_list + [None for __ in range(top.n_residues - len(out_list))] diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index 7d57b88f..36938c6c 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -379,6 +379,10 @@ def test_most_recent_labels_works(self): labels = self.cgn_local.top2labels(self.top) self.assertListEqual(labels, self.cgn_local.most_recent_top2labels) + def test_reads_pdb_for_top(self): + labels = self.cgn_local.top2labels(test_filenames.pdb_3SN6) + self.assertListEqual(labels, self.cgn_local.most_recent_top2labels) + def test_hole_in_subdomain(self): frags = self.cgn_local.top2frags(self.top) a5_w_hole_idxs = frags["G.H5"][:5]+frags["G.H5"][-5:] From bb0b795dd55002cb22781704ba2456a89bb07229 Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 10:56:08 +0200 
Subject: [PATCH 30/83] [contacts.ContactGroup.plot_freqs_as_flareplot] minor docs --- mdciao/contacts/contacts.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 5ce4647b..80189589 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -5798,16 +5798,17 @@ def plot_freqs_as_flareplot(self, ctc_cutoff_Ang, labels (strings) themselves. They need to be "gettable" by residue index, i.e. dict, list or array. Typically, one - generates these maps by using the top2labels - method of the LabelerConsensus object - * :obj:`LabelerConsensus`-objects + generates these maps by using + :obj:`mdciao.nomenclature.LabelerConsensus.top2labels`. + * :obj:`mdciao.nomenclature.LabelerConsensus`-objects When these objects are passed, their - top2labels and top2fragments methods are + :obj:`mdciao.nomenclature.LabelerConsensus.top2labels` and + :obj:`mdciao.nomenclature.LabelerConsensus.top2fragments` are called on-the-fly, generating not only the consensus labels but also the consensus fragments (i.e. subdomains) to further fragment the topology into sub-domains, like TM6 or G.H5. - If :obj:`fragments` are parsed, they will be + If `fragments` are parsed, they will be made compatible with the consensus fragments. If you want the consensus labels but not the sub-fragmentation, simply use the first option. 
From 7b7cf25358699dca7fd9b67bfd9e57d3c00b659a Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 10:58:39 +0200 Subject: [PATCH 31/83] [cli._parse_consensus_option] minor --- mdciao/cli/cli.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 80480b14..061d22d5 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -102,16 +102,13 @@ def _parse_consensus_option(option, consensus_type, full of Nones is returned * str The needed identifier to instantiate an - `LabelerGPCR`, :obj:`LabelerCGN` or :obj:`LabelerKLIFS` object. - Examples would be a `UniProt_name`, a `ref_PDB`, or - an `UniProt_AC` - respectively - * :obj:`LabelerConsensus` - An already instantiated :obj:`LabelerGPCR`, - :obj:`LabelerCGN` or :obj:`LabelerKLIFS` object. + `LabelerGPCR`, `LabelerCGN` or `LabelerKLIFS` object. + * a `LabelerConsensus` object + An already instantiated `LabelerGPCR`, + `LabelerCGN` or `LabelerKLIFS` object. The method then does nothing. Usecase are repeated - calls to any of the methods in :obj:`command_line_tools` - without each call instantiating its own :obj:`LabelerConsensus` + calls to any of the methods in `mdciao.cli` + without each call instantiating its own `LabelerConsensus` * iterable An iterable (list, dict, array) mapping residue indices of 'top` to consensus labels. 
From ddc15f9066a4d2de340f8b36ca6eb9fa5c152d5c Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 11:04:41 +0200 Subject: [PATCH 32/83] [cli._parse_consensus_options_and_return_fragment_defs] Return consensus fragments even if no ConsensusLabelers were passed but label lists were passed --- mdciao/cli/cli.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 061d22d5..48bc35b3 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -189,6 +189,24 @@ def _parse_consensus_options_and_return_fragment_defs(option_dict, top, accept_guess=False, save_nomenclature_files=False, verbose=True): + r""" + + The consensus frags will be inferred + from true ConsensusLabelers objects or from lists of consensus labels + + Parameters + ---------- + option_dict + top + fragments_as_residue_idxs + accept_guess + save_nomenclature_files + verbose + + Returns + ------- + + """ consensus_frags, consensus_maps, consensus_labelers = {}, {}, {} for key, option in option_dict.items(): map_CL, CL = _parse_consensus_option(option, key, top, fragments_as_residue_idxs, @@ -208,6 +226,8 @@ def _parse_consensus_options_and_return_fragment_defs(option_dict, top, verbose=verbose or not accept_guess)) if not accept_guess: input("Hit enter to continue!\n") + elif not all([str(val).lower()=="none" for val in map_CL]): + consensus_frags.update(_mdcnomenc.conlabs2confrags(map_CL, replace_GPCR_frags=[key=="GPCR"])) _mdcu.lists.assert_no_intersection(list(consensus_frags.values()),"consensus fragment") return consensus_frags, consensus_maps, consensus_labelers From 8352cafcdef37116982032286225786e70ecd5b3 Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 11:06:29 +0200 Subject: [PATCH 33/83] [cli.interface] When calling plot_freqs_as_flareplot include consensus_maps (lists) This can label residues in the flareplot even if no ConsensusLabelers were passed or instantiated --- mdciao/cli/cli.py | 8 +++++++- 1 file changed, 7 
insertions(+), 1 deletion(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 48bc35b3..804301cc 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1632,8 +1632,14 @@ def interface( print(fn.fullpath_matrix) if flareplot: + consensus_maps_ = [] + for key in ["GPCR", "CGN", "KLIFS"]: + if key in consensus_labelers.keys(): + consensus_maps_.append(consensus_labelers[key]) + elif key in consensus_maps.keys(): + consensus_maps_.append(consensus_maps[key]) ifig, iax = ctc_grp_intf.plot_freqs_as_flareplot(ctc_cutoff_Ang, - consensus_maps=consensus_labelers.values(), + consensus_maps=consensus_maps_, SS=refgeom, fragment_names=fragment_names, fragments=fragments_as_residue_idxs, From 0701310bfd3ec178bdf48ac642f372db15ef2c24 Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 12:14:46 +0200 Subject: [PATCH 34/83] [cli.interface] new optarg AA_selection, tests --- mdciao/cli/cli.py | 66 ++++++++++++++++++++++++++++++++++++++----- tests/test_cli.py | 72 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 7 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 804301cc..546825aa 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1103,6 +1103,7 @@ def interface( topology=None, frag_idxs_group_1=None, frag_idxs_group_2=None, + AA_selection=None, GPCR_UniProt="None", CGN_UniProt="None", KLIFS_string=None, @@ -1161,12 +1162,12 @@ def interface( in a receptor--G-protein complex, one partner is the receptor and the other partner is the G-protein. - By default, mdciao.cli.interface doesn't allow interface - members to share residues. However, sometimes it's + This is why mdciao.cli.interface doesn't allow interface + members to share residues by default. However, sometimes it's useful to allow it because the contacts of one fragment - with itself (the self-contacts) are also important. - E.g. the C-terminus of a receptor interfacing with - the entire receptor, **including the C-terminus**. 
+ with itself are also important. E.g. the + C-terminus of a receptor interfacing with + the entire receptor, **including the C-terminus itself**. To allow for this behaviour, use `self_interface` = True, and possibly increase `n_nearest`, since otherwise neighboring residues of the shared set (e.g. C-terminus) @@ -1174,13 +1175,14 @@ def interface( Finally, the interface strength, defined as the per-residue sum of contacts participating in - the interface, is written as the `bfactor `_ + the interface, is written as the + `bfactor `_ in a .pdb file called (for the default `ctc_cutoff_Ang`=4) 'interface.overall@4.0_Ang.as_bfactors.pdb'. You can see an example of how to use this file (e.g. with VMD) in the online documentation. The structures, i.e. frames, in that .pdb-file are chosen using the - method :obj:`mdciao.contacts.ContactGroup.n_repframes`. + method :obj:`mdciao.contacts.ContactGroup.repframes` . See below the parameter `n_repframes` for more info. Parameters @@ -1216,6 +1218,37 @@ def interface( Defaults to None which will prompt the user of information, except when only two fragments are present. Then it defaults to [1] + AA_selection : str or list, default is None + Whatever the fragment definition and fragment selection + has been, one can further refine the list of + potential residue pairs by making a per aminoacid (AA) + selection here. E.g., if one has selected the interface + to be "TM3" vs "TM2", but wants to select only some + regions of those helices, one can pass here an `AA_selection`. + This can be a string or a list of len two: + + * A string leads to a boolean "or" selection, i.e. keep + residue pair [ii,jj] if either ii **or** jj + match `AA_selection`. E.g. + + >>> AA_selection = "3.45-3.55" + + is equivalent of "3.45-3.55" vs "TM2" contacts + * A list of len two leads to a boolean "and" selection, i.e. keep + residue pair [ii,jj] if ii **and** jj + match `AA_selection`. E.g. 
+ + >>> AA_selection = ["3.45-3.55","2.45-2.55"] + + is equivalent of "3.45-3.55" vs "2.45-2.55" contacts + + In principle, one could use + + >>> fragments = ["3.45-3.55","2.45-2.55"] + + and get the same contacts, but this would then exclude all other + residues of the topology from being tagged with fragment + and or consensus labels. GPCR_UniProt : str or :obj:`mdciao.nomenclature.LabelerGPCR`, default is None For GPCR nomenclature. If str, e.g. "adrb2_human". will try to locate a local filename or do a web lookup in the GPCRdb. @@ -1481,6 +1514,25 @@ def interface( print("\nWill look for contacts in the interface between fragments\n%s\nand\n%s. "% ('\n'.join(_twrap(', '.join(['%s' % gg for gg in intf_frags_as_str_or_keys[0]]))), '\n'.join(_twrap(', '.join(['%s' % gg for gg in intf_frags_as_str_or_keys[1]]))))) + + # Sub-select at the AA-level #TODO consider making method out of this + if AA_selection is not None: + if isinstance(AA_selection, str): + lambda_sel = lambda pair, sel: _np.in1d(pair, sel).any() + elif isinstance(AA_selection, list) and len(AA_selection)==2: + lambda_sel = lambda pair, sel: _np.in1d(pair, sel).all() + AA_selection = ",".join(AA_selection) + else: + raise ValueError(f"'AA_selection' as to be a sting or a list of len 2, " + f"but your input is a {type(AA_selection).__name__} of len {len(AA_selection)}.") + sel = _mdcu.residue_and_atom.rangeexpand_residues2residxs(AA_selection, + fragments_as_residue_idxs, + refgeom.top, + fragment_names=fragment_names, + additional_resnaming_dicts=consensus_maps) + print(f"Excluding residue pairs not involving residues '{AA_selection}' ({len(sel)} AAs).") + ctc_idxs = [pair for pair in ctc_idxs if lambda_sel(pair, sel)] + print(f"Performing a first pass on the {len(ctc_idxs)} group_1-group_2 residue pairs to compute lower bounds " f"on residue-residue distances via residue-COM distances.") lb_cutoff_buffer_Ang = 2.5 diff --git a/tests/test_cli.py b/tests/test_cli.py index 89294a83..82c616ca 100644 --- 
a/tests/test_cli.py +++ b/tests/test_cli.py @@ -612,6 +612,78 @@ def test_w_nomenclature_CGN_GPCR_fragments_are_consensus_and_flareplot_and_self_ self_interface=True, ) + def test_w_nomenclature_CGN_GPCR_fragments_are_consensus_and_flareplot_and_AA_selection_OR(self): + with TemporaryDirectory(suffix='_test_mdciao') as tmpdir: + shutil.copy(test_filenames.gnas2_human_xlsx, tmpdir) + shutil.copy(test_filenames.adrb2_human_xlsx, tmpdir) + with remember_cwd(): + os.chdir(tmpdir) + intf = cli.interface([self.traj, self.traj_reverse], + self.geom, + ctc_cutoff_Ang=5, + n_nearest=4, + output_dir=tmpdir, + fragments=["consensus"], + CGN_UniProt="gnas2_human", + GPCR_UniProt="adrb2_human", + accept_guess=True, + frag_idxs_group_1='TM6', + frag_idxs_group_2='TM5', + self_interface=True, + AA_selection="5.50x50-5.55x55" + ) + TM5 = ["5.50x50", "5.51x51", "5.52x52", "5.53x53", "5.54x54", "5.55x55"] + assert all ([lab[1] in TM5 for lab in intf.consensus_labels]), intf.consensus_labels + _plt.close("all") + + def test_w_nomenclature_CGN_GPCR_fragments_are_consensus_and_flareplot_and_AA_selection_AND(self): + with TemporaryDirectory(suffix='_test_mdciao') as tmpdir: + shutil.copy(test_filenames.gnas2_human_xlsx, tmpdir) + shutil.copy(test_filenames.adrb2_human_xlsx, tmpdir) + with remember_cwd(): + intf = cli.interface([self.traj, self.traj_reverse], + self.geom, + ctc_cutoff_Ang=5, + n_nearest=4, + output_dir=tmpdir, + fragments=["consensus"], + CGN_UniProt="gnas2_human", + GPCR_UniProt="adrb2_human", + accept_guess=True, + frag_idxs_group_1='TM6', + frag_idxs_group_2='TM5', + self_interface=True, + AA_selection=["5.50x50-5.55x55", "6.45x45,6.49x49"] + ) + TM5 = ["5.50x50", "5.51x51", "5.52x52", "5.53x53", "5.54x54", "5.55x55"] + assert all ([lab[1] in TM5 for lab in intf.consensus_labels]), intf.consensus_labels + assert all ([lab[0] in ["6.45x45","6.49x49"] for lab in intf.consensus_labels]), intf.consensus_labels + _plt.close("all") + + def 
test_w_nomenclature_CGN_GPCR_fragments_are_consensus_and_flareplot_and_AA_selection_raises(self): + with TemporaryDirectory(suffix='_test_mdciao') as tmpdir: + shutil.copy(test_filenames.gnas2_human_xlsx, tmpdir) + shutil.copy(test_filenames.adrb2_human_xlsx, tmpdir) + with remember_cwd(): + with self.assertRaises(ValueError): + intf = cli.interface([self.traj, self.traj_reverse], + self.geom, + ctc_cutoff_Ang=5, + n_nearest=4, + output_dir=tmpdir, + fragments=["consensus"], + CGN_UniProt="gnas2_human", + GPCR_UniProt="adrb2_human", + accept_guess=True, + frag_idxs_group_1='TM6', + frag_idxs_group_2='TM5', + self_interface=True, + AA_selection=["5.50x50-5.55x55,6.45x45,6.49x49"] + ) + + + + class Test_pdb(TestCLTBaseClass): def test_works(self): From f57dd6b1f7791400a62c4cbf6928ddc3a19661d2 Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 12:29:15 +0200 Subject: [PATCH 35/83] [fragments._fragments_strings_to_fragments] sort the output of unsorted ranges, e.g. "30-40,20-25" -> [20,...,25,30,...,40] --- mdciao/fragments/fragments.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mdciao/fragments/fragments.py b/mdciao/fragments/fragments.py index 9c320c5d..9737b71f 100644 --- a/mdciao/fragments/fragments.py +++ b/mdciao/fragments/fragments.py @@ -856,7 +856,8 @@ def _fragments_strings_to_fragments(fragment_input, top, verbose=False): Try to understand how the user wants to fragment the topology Pretty flexible - Check also :obj:`rangeexpand` to understand the expressions + Check also :obj:`mdciao.utils.residue_and_atom.rangeexpand_residues2residxs` + to understand the expressions. 
Parameters @@ -926,7 +927,7 @@ def _fragments_strings_to_fragments(fragment_input, top, verbose=False): verbose=False) else: raise ValueError(f"The 'fragments' string has to be one of {['consensus']+_allowed_fragment_methods}, but " - f"the provided string was {fragment_input}") + f"the provided string was {fragment_input}.") else: method = "user input by residue array or range" fragments_as_residue_idxs = [] @@ -935,13 +936,13 @@ def _fragments_strings_to_fragments(fragment_input, top, verbose=False): if not isinstance(fri,str): fragments_as_residue_idxs.append(fri) else: - fragments_as_residue_idxs.append( - _mdcu.residue_and_atom.rangeexpand_residues2residxs(fri, + ifrag = _mdcu.residue_and_atom.rangeexpand_residues2residxs(fri, temp_fragments, top, interpret_as_res_idxs=fri.replace("-","").replace(",","").isnumeric(), extra_string_info="\nThis fragmentation is only for disambiguation purposes:" - )) + ) + fragments_as_residue_idxs.append(sorted(ifrag)) if len(fragment_input)==1: assert isinstance(fragment_input[0],str) method += " with only one fragment provided (all other residues are fragment 2)" From 0f6a84849a0b3bc8d8295599bb98a14925a59a0f Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 12:33:12 +0200 Subject: [PATCH 36/83] [cli.interface] better fragment handling Larg(ish) refactor to better separate fragmentation (for residue tagging and plots) and fragment selection. Biggest changes is that consensus selection can be done regardless of what fragmentation heuristic is used. I.e. one can fragment in chaisn and still select e.g. 
TM6 vs TM6 --- mdciao/cli/cli.py | 114 ++++++++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 40 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 546825aa..cf298dc7 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1101,6 +1101,7 @@ def residue_neighborhoods(residues, def interface( trajectories, topology=None, + fragments='lig_resSeq+', frag_idxs_group_1=None, frag_idxs_group_2=None, AA_selection=None, @@ -1110,7 +1111,6 @@ def interface( chunksize_in_frames=2000, ctc_cutoff_Ang=4, curve_color="auto", - fragments='lig_resSeq+', fragment_names="", graphic_dpi=150, graphic_ext=".pdf", @@ -1148,12 +1148,25 @@ def interface( ): r"""Contact-frequencies between two groups of residues - The groups of residues can be defined directly - by using residue indices or by defining molecular fragments - and using these definitions as a shorthand to address - large sub-domains of the molecular topology. See in particular - the documentation for `fragments`, `frag_idxs_group_1` - `frag_idxs_group_2`. + The two groups of residues can be defined directly: + + * by using specific residue indices or ranges + * by using defined molecular fragments, + chains defined in the topology or pdb-file. + * by guessing molecular fragments, using some + fragmentation heuristic. + The fragment definition and the fragment selection + are separate, i.e. there might be six chains but + one can specify to compute the interface between + chains [0,1] vs [2,3]. Read more in the + documentation for `fragments`, `frag_idxs_group_1`, + and `frag_idxs_group_2`. + + One can further refine the fragment selection + with an aminoacid (AA) selection using + `AA_selection`, to further specify the residues + of interest if the fragment definitions are too broad. + See the docstring for more info. Typically, the two groups of residues conforming both sides of the interface, also called interface members, @@ -1204,6 +1217,48 @@ def interface( be used, i.e. 
when no :obj:`topology` is passed, the first :obj:`trajectory` has to be either a .gro or .pdb file, or an :obj:`~mdtraj.Trajectory` object + fragments : str, list, None, default is "lig_resSeq+" + How to fragment the topology. Will be used for: + + * tagging of residues, e.g. "GLU30@frag1" + * disambiguation of residues, e.g. more than one + "GLU30" exists. + * grouping of residues in graphical + representations, e.g. flareplots + * defining the interface fragments + There exist several input modes: + + * A single string with the name of a + fragmentation heuristic, e.g. + "lig_resSeq+", which is the default + and usually yields good results. See + :obj:`mdciao.fragments.get_fragments` + for more info on defaults and other heuristics. + * A list of definitions. Each entry of this list can be: + + * an iterable of integers (lists or np.arrays, e.g. np.arange(20,30) + * a range expressed as an integer string, "20-30" + * a ranges expressed as residue descriptors "GLU30-LEU40" + * A special string, "consensus", to use consensus + subdomains, like "TM1" or "G.H5", as fragment definitions. + + Numeric expressions are interpreted as zero-indexed and unique + residue serial indices, i.e. 30-40 does not necessarily equate + "GLU30-LEU40" unless serial and sequence index coincide. + If there's more than one "GLU30", the user gets asked to + disambiguate. + + Please note, since fragment definiton and fragment selection are + separate, one can use consensus definitions to define the interface + regardless of having passed "consensus" here. I.e., you can + use `fragments='chains'` to divide the topology for representation + and residue-tagging purposes but then define the interface as: + >>> frag_idxs_group_1="TM3" + >>> frag_idxs_group_2="TM2" + to compute the interface of TM3 vs TM2 in a GPCR. For + this mode of selection to work, the only condition is that the consensus + labels have been provided via `GPCR_Uniprot`, + `CGN_UniProt` or `KLIFS_string` (see below). 
frag_idxs_group_1 : NoneType, default is None Indices of the fragments that belong to the group_1. Strings can be CSVs and include ranges, e.g. '1,3-4', @@ -1286,7 +1341,7 @@ def interface( See :obj:`mdciao.nomenclature` for more info and references. Alos, please note the difference between UniProt Accession Code and UniProt entry name as explained - `here `_ . + `here `_ . chunksize_in_frames : int, default is 2000 Stream through the trajectories in chunks of this size. @@ -1296,29 +1351,6 @@ def interface( curve_color : str, default is 'auto' Type of color used for the curves. Alternatives are "P" or "H" - fragments : str, list, None, default is "lig_resSeq+" - Topology fragments. There exist several input modes: - - * Name of a fragmentation heuristic, e.g. - "lig_resSeq+", which is the default of - and usually yields good results. See - :obj:`mdciao.fragments.get_fragments` - for more info on defaults and other heuristics. - * List of len N that can mix different possibilities: - - * iterable of integers (lists or np.arrays, e.g. np.arange(20,30) - * ranges expressed as integer strings, "20-30" - * ranges expressed as residue descriptors ["GLU30-LEU40"] - - * "consensus" : use things like "TM*" or "G.H*", i.e. - GPCR or CGN-sub-subunit labels. - - Numeric expressions are interpreted as zero-indexed and unique - residue serial indices, i.e. 30-40 does not necessarily equate - "GLU30-LEU40" unless serial and sequence index coincide. - If there's more than one "GLU30", the user gets asked to - disambiguate. The resulting fragments need not cover - all of the topology, they only need to not overlap. fragment_names : str or list, default is '' If string, it has to be a list of comma-separated values. 
If you want unnamed fragments, use None, @@ -1480,17 +1512,19 @@ def interface( print("Will compute contact frequencies for trajectories:\n%s" "\n with a stride of %u frames" % (_mdcu.str_and_dict.inform_about_trajectories(xtcs, only_show_first_and_last=15), stride)) - fragments_as_residue_idxs, fragment_names, user_wants_consensus, consensus_labelers, consensus_maps, consensus_frags, top2confrag = _parse_fragdefs_fragnames_consensus( + fragments_as_residue_idxs, fragment_names, _, consensus_labelers, consensus_maps, consensus_frags, top2confrag = _parse_fragdefs_fragnames_consensus( refgeom.top, fragments, fragment_names, GPCR_UniProt, CGN_UniProt, KLIFS_string, accept_guess, save_nomenclature_files) - if user_wants_consensus: - intf_frags_as_residxs, \ - intf_frags_as_str_or_keys = _mdcfrg.frag_dict_2_frag_groups(consensus_frags, ng=2, answers=[frag_idxs_group_1, frag_idxs_group_2]) - + fragments_as_residue_idxs_d = {str(ii) : val for ii, val in enumerate(fragments_as_residue_idxs)} + if len(fragments_as_residue_idxs)==2 and frag_idxs_group_1 is None and frag_idxs_group_2 is None: + frag_idxs_group_1, frag_idxs_group_2 =[0], [1] else: - intf_frags_as_residxs, \ - intf_frags_as_str_or_keys = _mdcfrg.frag_list_2_frag_groups(fragments_as_residue_idxs, - frag_idxs_group_1, frag_idxs_group_2, - ) + fragments_as_residue_idxs_d.update(consensus_frags) + + intf_frags_as_residxs, \ + intf_frags_as_str_or_keys = _mdcfrg.frag_dict_2_frag_groups(fragments_as_residue_idxs_d, ng=2, + answers=[frag_idxs_group_1, frag_idxs_group_2], + ) + intersect = list(set(intf_frags_as_residxs[0]).intersection(intf_frags_as_residxs[1])) if len(intersect) > 0: if self_interface: From 91a6f09b66aaeeb22a0fef0472372eb5c44c948c Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 12:43:58 +0200 Subject: [PATCH 37/83] [cli.interface] API BREAK frag_idxs_group_1, frag_idxs_group_2 becomes interface_selection_1, interface_selection_2 --- mdciao/cli/cli.py | 64 
++++++++++++++++++++++++++++++----------------- tests/test_cli.py | 52 +++++++++++++++++++------------------- 2 files changed, 67 insertions(+), 49 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index cf298dc7..67e1e437 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1102,8 +1102,8 @@ def interface( trajectories, topology=None, fragments='lig_resSeq+', - frag_idxs_group_1=None, - frag_idxs_group_2=None, + interface_selection_1=None, + interface_selection_2=None, AA_selection=None, GPCR_UniProt="None", CGN_UniProt="None", @@ -1159,8 +1159,8 @@ def interface( are separate, i.e. there might be six chains but one can specify to compute the interface between chains [0,1] vs [2,3]. Read more in the - documentation for `fragments`, `frag_idxs_group_1`, - and `frag_idxs_group_2`. + documentation for `fragments`, `interface_selection_1`, + and `interface_selection_2`. One can further refine the fragment selection with an aminoacid (AA) selection using @@ -1253,26 +1253,44 @@ def interface( regardless of having passed "consensus" here. I.e., you can use `fragments='chains'` to divide the topology for representation and residue-tagging purposes but then define the interface as: - >>> frag_idxs_group_1="TM3" - >>> frag_idxs_group_2="TM2" + >>> interface_selection_1="TM3" + >>> interface_selection_2="TM2" to compute the interface of TM3 vs TM2 in a GPCR. For this mode of selection to work, the only condition is that the consensus labels have been provided via `GPCR_Uniprot`, `CGN_UniProt` or `KLIFS_string` (see below). - frag_idxs_group_1 : NoneType, default is None - Indices of the fragments that belong to the group_1. - Strings can be CSVs and include ranges, e.g. '1,3-4', - or be consensus labels "TM*,-TM6". - Defaults to None which will prompt the user of - information, except when only two fragments are - present. Then it defaults to [0] - frag_idxs_group_2 : NoneType, default is None - Indices of the fragments that belong to the group_2. 
- Strings can be CSVs and include ranges, e.g. '1,3-4', - or be consensus labels "TM*,-TM6". - Defaults to None which will prompt the user of - information, except when only two fragments are - present. Then it defaults to [1] + interface_selection_1 : str or list, default is None + Selection of the `fragments` that belong to one + side of the interface. Strings can be CSVs + and include: + * ranges, e.g. '1,3-4' + * wildcards, e.g. "TM*" or "G.H.??" + * exclusions, e.g. "TM*,-TM6" (all TMs except TM6) + The default is to prompt the user for + information, except when: + * `fragments` yielded only one fragment that + **doesn't** cover the whole topology. Then + all othe residues are put into a second + fragment and then the interface is computed + between these two fragments. + * `fragments` yielded just two fragments. Then + the interface is computed between these two fragments. + interface_selection_2 : str or list, default is None + Selection of the `fragments` that belong to the other + side of the interface. Strings can be CSVs + and include: + * ranges, e.g. '1,3-4' + * wildcards, e.g. "TM*" or "G.H.??" + * exclusions, e.g. "TM*,-TM6" (all TMs except TM6) + The default is to prompt the user for + information, except when: + * `fragments` yielded only one fragment that + **doesn't** cover the whole topology. Then + all othe residues are put into a second + fragment and then the interface is computed + between these two fragments. + * `fragments` yielded just two fragments. Then + the interface is computed between these two fragments. 
AA_selection : str or list, default is None Whatever the fragment definition and fragment selection has been, one can further refine the list of @@ -1515,14 +1533,14 @@ def interface( fragments_as_residue_idxs, fragment_names, _, consensus_labelers, consensus_maps, consensus_frags, top2confrag = _parse_fragdefs_fragnames_consensus( refgeom.top, fragments, fragment_names, GPCR_UniProt, CGN_UniProt, KLIFS_string, accept_guess, save_nomenclature_files) fragments_as_residue_idxs_d = {str(ii) : val for ii, val in enumerate(fragments_as_residue_idxs)} - if len(fragments_as_residue_idxs)==2 and frag_idxs_group_1 is None and frag_idxs_group_2 is None: - frag_idxs_group_1, frag_idxs_group_2 =[0], [1] + if len(fragments_as_residue_idxs)==2 and interface_selection_1 is None and interface_selection_2 is None: + interface_selection_1, interface_selection_2 =[0], [1] else: fragments_as_residue_idxs_d.update(consensus_frags) intf_frags_as_residxs, \ intf_frags_as_str_or_keys = _mdcfrg.frag_dict_2_frag_groups(fragments_as_residue_idxs_d, ng=2, - answers=[frag_idxs_group_1, frag_idxs_group_2], + answers=[interface_selection_1, interface_selection_2], ) intersect = list(set(intf_frags_as_residxs[0]).intersection(intf_frags_as_residxs[1])) diff --git a/tests/test_cli.py b/tests/test_cli.py index 82c616ca..bef712f9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -473,8 +473,8 @@ def test_interface(self): with TemporaryDirectory(suffix='_test_mdciao') as tmpdir: cli.interface([self.traj, self.traj_reverse], self.geom, - frag_idxs_group_1=[0], - frag_idxs_group_2=[1], + interface_selection_1=[0], + interface_selection_2=[1], output_dir=tmpdir, flareplot=True, no_disk=self.no_disk @@ -483,13 +483,13 @@ def test_interface(self): def test_no_top(self): with TemporaryDirectory(suffix='_test_mdciao') as tmpdir: cli.interface([self.traj, self.traj_reverse], - frag_idxs_group_1=[0], - frag_idxs_group_2=[1], + interface_selection_1=[0], + interface_selection_2=[1], output_dir=tmpdir, 
flareplot=False, plot_timedep=False, no_disk = self.no_disk - ) + ) def test_interface_wo_frag_idxs_groups(self): with TemporaryDirectory(suffix='_test_mdciao') as tmpdir: @@ -527,8 +527,8 @@ def test_w_just_two_fragments_by_user(self): with TemporaryDirectory(suffix='_test_mdciao') as tmpdir: cli.interface([self.traj, self.traj_reverse], self.geom, - frag_idxs_group_1=[0], - frag_idxs_group_2=[1], + interface_selection_1=[0], + interface_selection_2=[1], output_dir=tmpdir, fragments=["0-5", "6-10"], @@ -607,8 +607,8 @@ def test_w_nomenclature_CGN_GPCR_fragments_are_consensus_and_flareplot_and_self_ CGN_UniProt="gnas2_human", GPCR_UniProt="adrb2_human", accept_guess=True, - frag_idxs_group_1='TM6', - frag_idxs_group_2='TM5,TM6', + interface_selection_1='TM6', + interface_selection_2='TM5,TM6', self_interface=True, ) @@ -619,19 +619,19 @@ def test_w_nomenclature_CGN_GPCR_fragments_are_consensus_and_flareplot_and_AA_se with remember_cwd(): os.chdir(tmpdir) intf = cli.interface([self.traj, self.traj_reverse], - self.geom, - ctc_cutoff_Ang=5, - n_nearest=4, - output_dir=tmpdir, - fragments=["consensus"], - CGN_UniProt="gnas2_human", - GPCR_UniProt="adrb2_human", - accept_guess=True, - frag_idxs_group_1='TM6', - frag_idxs_group_2='TM5', - self_interface=True, - AA_selection="5.50x50-5.55x55" - ) + self.geom, + ctc_cutoff_Ang=5, + n_nearest=4, + output_dir=tmpdir, + fragments=["consensus"], + CGN_UniProt="gnas2_human", + GPCR_UniProt="adrb2_human", + accept_guess=True, + interface_selection_1='TM6', + interface_selection_2='TM5', + self_interface=True, + AA_selection="5.50x50-5.55x55" + ) TM5 = ["5.50x50", "5.51x51", "5.52x52", "5.53x53", "5.54x54", "5.55x55"] assert all ([lab[1] in TM5 for lab in intf.consensus_labels]), intf.consensus_labels _plt.close("all") @@ -650,8 +650,8 @@ def test_w_nomenclature_CGN_GPCR_fragments_are_consensus_and_flareplot_and_AA_se CGN_UniProt="gnas2_human", GPCR_UniProt="adrb2_human", accept_guess=True, - frag_idxs_group_1='TM6', - 
frag_idxs_group_2='TM5', + interface_selection_1='TM6', + interface_selection_2='TM5', self_interface=True, AA_selection=["5.50x50-5.55x55", "6.45x45,6.49x49"] ) @@ -675,8 +675,8 @@ def test_w_nomenclature_CGN_GPCR_fragments_are_consensus_and_flareplot_and_AA_se CGN_UniProt="gnas2_human", GPCR_UniProt="adrb2_human", accept_guess=True, - frag_idxs_group_1='TM6', - frag_idxs_group_2='TM5', + interface_selection_1='TM6', + interface_selection_2='TM5', self_interface=True, AA_selection=["5.50x50-5.55x55,6.45x45,6.49x49"] ) From 338fbe283ac37f3faf778011794e00f07df722ca Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 14:52:50 +0200 Subject: [PATCH 38/83] [examples and parsers] refactor frag_idxs_group_1 -> interface_selection_1 --- mdciao/examples/examples.py | 8 ++++---- mdciao/parsers.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index 00fa553d..dd1582d1 100644 --- a/mdciao/examples/examples.py +++ b/mdciao/examples/examples.py @@ -124,8 +124,8 @@ def mdc_sites(self): def mdc_interface(self): return ["mdc_interface.py ", "%s %s" % (self.pdb, self.xtc), - " --frag_idxs_group_1 0-2", - " --frag_idxs_group_2 3", + " --interface_selection_1 0-2", + " --interface_selection_2 3", " --ctc_control 20", " --GPCR_UniProt %s" % self.GPCRlabs_file, " --CGN_UniProt %s" % self.CGN_file, @@ -609,8 +609,8 @@ def Interface_B2AR_Gas(**kwargs): "GPCR_UniProt": GPCRLabeler_ardb2_human(), "CGN_UniProt": CGNLabeler_gnas2_human(), "no_disk": True, - "frag_idxs_group_1":[0], - "frag_idxs_group_2":[3], + "interface_selection_1":[0], + "interface_selection_2":[3], "ctc_control":1.0, "accept_guess": True} for key, val in kwargs.items(): diff --git a/mdciao/parsers.py b/mdciao/parsers.py index a567bf5c..b3be3be1 100644 --- a/mdciao/parsers.py +++ b/mdciao/parsers.py @@ -620,12 +620,12 @@ def parser_for_interface(): '"min_freq". 
') _parser_add_fragments(parser) - parser.add_argument("-fg1","--frag_idxs_group_1", type=str, - help="Indices of the fragments that belong to the group_1, as CSVs or range, e.g. '1,3-4'. " + parser.add_argument("-isel1","--interface_selection_1", type=str, + help="Indices of the fragments that belong to the one side of the interface, as CSVs or range, e.g. '1,3-4'. " "Defaults to None which will prompt the user of information, except when " "only two fragments are present. Then it defaults to [0]", default=None) - parser.add_argument("-fg2","--frag_idxs_group_2", type=str, - help="Indices of the fragments that belong to the group_2, as CSVs or range, e.g. '1,3-4'. " + parser.add_argument("-isel2","--interface_selection_2", type=str, + help="Indices of the fragments that belong to the other side of the interface, as CSVs or range, e.g. '1,3-4'. " "Defaults to None which will prompt the user of information, except when " "only two fragments are present. Then it defaults to [1]", default=None) _parser_add_cutoff(parser) From 767d56b259b2aa0434fab84488780b0446a9485a Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 15:21:14 +0200 Subject: [PATCH 39/83] [examples.notebook] update to the refactor frag_idxs_group_1 -> interface_selection_1 --- mdciao/examples/Comparing_CGs_Bars.ipynb | 16 +-------- mdciao/examples/Comparing_CGs_Flares.ipynb | 2 +- mdciao/examples/EGFR_Kinase_Inhibitors.ipynb | 6 ++-- mdciao/examples/Flareplot_Schemes.ipynb | 16 ++++----- .../examples/MSA_via_Consensus_Labels.ipynb | 10 +----- mdciao/examples/Manuscript.ipynb | 6 ++-- mdciao/examples/Missing_Contacts.ipynb | 34 +++++++++---------- mdciao/examples/Tutorial.ipynb | 6 ++-- 8 files changed, 36 insertions(+), 60 deletions(-) diff --git a/mdciao/examples/Comparing_CGs_Bars.ipynb b/mdciao/examples/Comparing_CGs_Bars.ipynb index 0f4df6df..da04997e 100644 --- a/mdciao/examples/Comparing_CGs_Bars.ipynb +++ b/mdciao/examples/Comparing_CGs_Bars.ipynb @@ -697,20 +697,6 @@ " 
colors=\"Set2\",\n", " );" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -729,7 +715,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/mdciao/examples/Comparing_CGs_Flares.ipynb b/mdciao/examples/Comparing_CGs_Flares.ipynb index b1929f34..e86ffad5 100644 --- a/mdciao/examples/Comparing_CGs_Flares.ipynb +++ b/mdciao/examples/Comparing_CGs_Flares.ipynb @@ -715,7 +715,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.9" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb b/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb index ffb656ee..d165e176 100644 --- a/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb +++ b/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb @@ -165,8 +165,8 @@ " fragment_names=[\"EGFR\", \"ligand\"],\n", " KLIFS_string=KLIFS, \n", " ctc_control=1.0, \n", - " frag_idxs_group_1=[0],\n", - " frag_idxs_group_2=[1],\n", + " interface_selection_1=[0],\n", + " interface_selection_2=[1],\n", " accept_guess=True, figures=False, no_disk=True)\n", " " ] @@ -372,7 +372,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/mdciao/examples/Flareplot_Schemes.ipynb b/mdciao/examples/Flareplot_Schemes.ipynb index c8c86998..e5721031 100644 --- a/mdciao/examples/Flareplot_Schemes.ipynb +++ b/mdciao/examples/Flareplot_Schemes.ipynb @@ -62,9 +62,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we choose which fragments to use when computing the G-protein-Receptor interface, via the ``frag_idxs_group`` variables, 
namely: \n", - "* ```frag_idxs_group_1=[0,1,2]``` : the G$\\alpha$, G$\\beta$, and G$\\gamma$ sub-units \n", - "* ```frag_idxs_group_2=[4]```   : the B2AR receptor\n", + "Now, we choose which fragments to use when computing the G-protein-Receptor interface, via the ``interface_selection`` variables, namely: \n", + "* ```interface_selection_1=[0,1,2]``` : the G$\\alpha$, G$\\beta$, and G$\\gamma$ sub-units \n", + "* ```interface_selection_2=[4]```   : the B2AR receptor\n", "\n", "And we [compute the interface](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.interface.html#mdciao.cli.interface) without producing or saving any figures or files using the [options](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.interface.html#mdciao.cli.interface) ```figures=False``` and ```no_disk=True```." ] @@ -79,8 +79,8 @@ "CGN = mdciao.nomenclature.LabelerCGN(\"gnas2_human\")\n", "intf = mdciao.cli.interface(geom,\n", " ctc_cutoff_Ang=3.5,\n", - " frag_idxs_group_1=[0, 1, 2],\n", - " frag_idxs_group_2=[4], \n", + " interface_selection_1=[0, 1, 2],\n", + " interface_selection_2=[4], \n", " no_disk=True, \n", " figures=False,\n", " fragment_names=fragment_names,\n", @@ -171,8 +171,8 @@ "### Unused Space : ```scheme='interface'```\n", "While it might be usefull to plot **all** residues and fragments of the topology, most of the flareplot is *unused*. 
E.g., we know for sure that the 4TL and the NB won't get any contacts, because they simply were not considered when defining the interface, as we did above: \n", " \n", - "* ```frag_idxs_group_1=[0,1,2]``` : the G$\\alpha$, G$\\beta$, and G$\\gamma$ sub-units \n", - "* ```frag_idxs_group_2=[4]```   : the B2AR receptor\n", + "* ```interface_selection_1=[0,1,2]``` : the G$\\alpha$, G$\\beta$, and G$\\gamma$ sub-units \n", + "* ```interface_selection_2=[4]```   : the B2AR receptor\n", "\n", "So, we can hide 4TL and the NB by using the ```scheme=interface``` option, which will automatically hide fragments that weren't even considered in the interface definition. This is possible because the residues defining these fragments are stored internally in the [intf.interface_fragments]() object, and get re-used here.\n", "\n", @@ -503,7 +503,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/mdciao/examples/MSA_via_Consensus_Labels.ipynb b/mdciao/examples/MSA_via_Consensus_Labels.ipynb index ccb9fddc..12022f88 100644 --- a/mdciao/examples/MSA_via_Consensus_Labels.ipynb +++ b/mdciao/examples/MSA_via_Consensus_Labels.ipynb @@ -222,14 +222,6 @@ " iwd.add_cartoon(color=matplotlib.colors.to_hex(colors[key]), component=ii)\n", "iwd" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3a7de60-93cd-4610-8dae-44e0d159e1c0", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -248,7 +240,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/mdciao/examples/Manuscript.ipynb b/mdciao/examples/Manuscript.ipynb index f11c617e..cf0326fa 100644 --- a/mdciao/examples/Manuscript.ipynb +++ b/mdciao/examples/Manuscript.ipynb @@ -89,8 +89,8 @@ "intf = mdciao.cli.interface(traj,\n", " title=\"3SN6 beta2AR-Galpha interface\",\n", " 
fragments=fragments, fragment_names = fragment_names, \n", - " frag_idxs_group_1=[0], \n", - " frag_idxs_group_2=[3],\n", + " interface_selection_1=[0], \n", + " interface_selection_2=[3],\n", " GPCR_UniProt=GPCR, CGN_UniProt=CGN, \n", " accept_guess=True, no_disk=True, figures=False)" ] @@ -258,7 +258,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/mdciao/examples/Missing_Contacts.ipynb b/mdciao/examples/Missing_Contacts.ipynb index b96862b1..36e5016c 100644 --- a/mdciao/examples/Missing_Contacts.ipynb +++ b/mdciao/examples/Missing_Contacts.ipynb @@ -340,7 +340,7 @@ "outputs": [], "source": [ "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\", \n", - " no_disk=True, frag_idxs_group_1=[0], frag_idxs_group_2=[1], \n", + " no_disk=True, interface_selection_1=[0], interface_selection_2=[1], \n", " ctc_control=1.0, \n", " min_freq=0,\n", " figures=False);\n", @@ -375,7 +375,7 @@ "outputs": [], "source": [ "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\", \n", - " no_disk=True, frag_idxs_group_1=[0], frag_idxs_group_2=[1], \n", + " no_disk=True, interface_selection_1=[0], interface_selection_2=[1], \n", " ctc_control=1.0, \n", " figures=False);\n", "intf.plot_freqs_as_bars(4, shorten_AAs=True, defrag=\"@\", cumsum=True, sort_by_freq=True);" @@ -403,30 +403,28 @@ "\n", "From the docs\n", "```\n", - " By default, mdciao.cli.interface doesn't allow interface\n", - " members to share residues. However, sometimes it's\n", - " useful to allow it because the contacts of one fragment\n", - " with itself (the self-contacts) are also important.\n", - " E.g. 
the C-terminus of a receptor interfacing with\n", - " the entire receptor, **including the C-terminus**.\n", - " To allow for this behaviour, use `self_interface` = True,\n", - " and possibly increase `n_nearest`, since otherwise\n", - " neighboring residues of the shared set (e.g. C-terminus)\n", - " will always appear as formed\n", + " This is why mdciao.cli.interface doesn't allow interface\n", + " members to share residues by default. However, sometimes it's\n", + " useful to allow it because the contacts of one fragment\n", + " with itself are also important. E.g. the\n", + " C-terminus of a receptor interfacing with\n", + " the entire receptor, **including the C-terminus itself**.\n", + " To allow for this behaviour, use `self_interface` = True,\n", + " and possibly increase `n_nearest`, since otherwise\n", + " neighboring residues of the shared set (e.g. C-terminus)\n", + " will always appear as formed.\n", "```\n", "\n", - "We can compute the contacts of the $\\alpha$5-helix of the G-protein. Whereas most of the helix is straig, the C-terminal bends a bit backwards and interacts with itself:" + "We can compute the contacts of the $\\alpha$5-helix of the G-protein. 
Whereas most of the helix is straight, the C-terminal bends a bit backwards and interacts with itself:" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "raw", "metadata": {}, - "outputs": [], "source": [ "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\",\n", " fragments=\"consensus\",accept_guess=True,\n", - " no_disk=True, frag_idxs_group_1=\"G.H5\", frag_idxs_group_2=\"G.H5\", \n", + " no_disk=True, interface_selection_1=\"G.H5\", interface_selection_2=\"G.H5\", \n", " ctc_control=1.0, \n", " GPCR_UniProt=\"adrb2_human\",CGN_UniProt=\"gnas2_human\",\n", " self_interface=True,\n", @@ -486,7 +484,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.9" }, "toc-autonumbering": true }, diff --git a/mdciao/examples/Tutorial.ipynb b/mdciao/examples/Tutorial.ipynb index 4c2e84b7..c2352c75 100644 --- a/mdciao/examples/Tutorial.ipynb +++ b/mdciao/examples/Tutorial.ipynb @@ -429,8 +429,8 @@ "outputs": [], "source": [ "mdciao.cli.interface(traj,\n", - " frag_idxs_group_1=[0],\n", - " frag_idxs_group_2=[3],\n", + " interface_selection_1=[0],\n", + " interface_selection_2=[3],\n", " GPCR_UniProt=GPCR,\n", " CGN_UniProt=CGN,\n", " title=\"3SN6 beta2AR-Galpha interface\",\n", @@ -751,7 +751,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.9" } }, "nbformat": 4, From 7336ba2c98e3e2d1d973d3d3a2432cef74fa5b47 Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 15:46:01 +0200 Subject: [PATCH 40/83] [examples.notebook.Missing_Contacts] add a cell showcasing AA_selection --- mdciao/examples/Missing_Contacts.ipynb | 49 +++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/mdciao/examples/Missing_Contacts.ipynb b/mdciao/examples/Missing_Contacts.ipynb index 36e5016c..82707b1e 100644 --- a/mdciao/examples/Missing_Contacts.ipynb +++ 
b/mdciao/examples/Missing_Contacts.ipynb @@ -419,8 +419,10 @@ ] }, { - "cell_type": "raw", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\",\n", " fragments=\"consensus\",accept_guess=True,\n", @@ -448,6 +450,51 @@ "We have also ploted them also as bars, including the atom-types. " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Interfaces: `AA_selection`\n", + "If the `fragments` themselves are still too broad a definition, one can select a sub-set of aminoacids of those fragments via `AA_selection`:\n", + "\n", + "From the docs\n", + "```\n", + " AA_selection : str or list, default is None\n", + " Whatever the fragment definition and fragment selection\n", + " has been, one can further refine the list of\n", + " potential residue pairs by making a per aminoacid (AA)\n", + " selection here. E.g., if one has selected the interface\n", + " to be \"TM3\" vs \"TM2\", but wants to select only some\n", + " regions of those helices, one can pass here an `AA_selection`.\n", + " [...]\n", + "```\n", + "Please read the rest of the docs, since the parameter has more options than the ones we're about to use.\n", + "\n", + "Here, we define the interface as contacts of the $\\alpha$5-helix of the G-protein with the TM-bundle, using \n", + "```no_disk=True, interface_selection_1=\"G.H5\", interface_selection_2=\"TM*\"```\n", + "\n", + "and then use ``` AA_selection=\"390-394\"``` to select the C-terminal *tip* of $\\alpha$5. 
You can look up the $\\alpha$5 definition in the output a couple of cells above \n", + "```G.H5 with 26 AAs THR369@G.H5.01 ( 328) - LEU394@G.H5.26 (353 ) (G.H5) ```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\",\n", + " fragments=\"consensus\",accept_guess=True,\n", + " no_disk=True, interface_selection_1=\"G.H5\", interface_selection_2=\"TM*\", \n", + " ctc_control=1.0, \n", + " GPCR_UniProt=\"adrb2_human\",CGN_UniProt=\"gnas2_human\",\n", + " min_freq=0,\n", + " n_nearest=4,\n", + " AA_selection=\"390-394\",\n", + " figures=False)\n", + "intf.plot_freqs_as_bars(4, shorten_AAs=True, plot_atomtypes=True);" + ] + }, { "cell_type": "markdown", "metadata": { From dcb3bc0e950fbcbe35c02bf524107d9564ee84dc Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 15:51:00 +0200 Subject: [PATCH 41/83] [cli._parse_fragdefs_fragnames_consensus] don't use consensus_labelers to get potential keys for the consensus maps This way if GPCR_Uniprot was a list of labels, they don't get lost here and can be used e.g. 
in the flareplot --- mdciao/cli/cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 67e1e437..7c6437a4 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -2580,9 +2580,8 @@ def _parse_fragdefs_fragnames_consensus(top, fragments, fragment_names, GPCR_Uni accept_guess=accept_guess, save_nomenclature_files=save_nomenclature_files) # pop out the Nones of the maps - consensus_maps = {key : consensus_maps[key] for key in consensus_labelers.keys()} + consensus_maps = {key : val for key, val in consensus_maps.items() if not all(_np.array(val)==None)} top2confrag = _np.full(top.n_residues, None) for key, val in consensus_frags.items(): top2confrag[val] = key - return fragments_as_residue_idxs, fragment_names, user_wants_consensus, consensus_labelers, consensus_maps, consensus_frags, top2confrag \ No newline at end of file From 602238cfb6090352a007ebd2c3dd30285a2503bc Mon Sep 17 00:00:00 2001 From: gph82 Date: Thu, 15 Aug 2024 17:30:31 +0200 Subject: [PATCH 42/83] [examples.fetch_example_data] new dataset ghrelin@ghsr --- mdciao/examples/examples.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index dd1582d1..cee228b0 100644 --- a/mdciao/examples/examples.py +++ b/mdciao/examples/examples.py @@ -453,6 +453,16 @@ def fetch_example_data(alias_or_url="b2ar@Gs", made available via `molSSI `_. (1 npy file with interfaces for 4 setups and one sample trajectory file, ca 35 MB) + * ghrelin@ghsr : https://proteinformatics.uni-leipzig.de/mdciao/ghrelin_receptor.zip + Growth hormone secretagogue receptor type 1, ghrelin receptor for short, bound + to ghrelin. Provided kindly by Dr. Alexander Vogel (1 traj, ca. 10 MB, 411 frames, dt = 100ns) + For the associated publication see here: + + * Analysis of the Dynamics of the Human Growth Hormone Secretagogue Receptor Reveals Insights into the Energy Landscape of the Molecule + A. A. Smith, E. 
M. Pacull, S. Stecher, P. W. Hildebrand, A. Vogel, D. Huster, + Angew. Chem. Int. Ed. 2023, 62, e202302003. + + unzip : bool, default is True Try unzipping the file after downloading From 2aabdd59ad3a45067aeff44bfca8956069e639cb Mon Sep 17 00:00:00 2001 From: gph82 Date: Fri, 16 Aug 2024 15:15:58 +0200 Subject: [PATCH 43/83] [contacts.ContactGroup.plot_freqs_as_bars] refactor truncate_at -> lower_cutoff_val --- mdciao/cli/cli.py | 2 +- mdciao/contacts/contacts.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 7c6437a4..1362d763 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1702,7 +1702,7 @@ def interface( xlim=_np.min((n_ctcs, ctc_grp_intf.n_ctcs)), label_fontsize_factor=panelsize2font / panelsize, shorten_AAs=short_AA_names, - truncate_at=min_freq, + lower_cutoff_val=min_freq, total_freq=df.freq.sum() ) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 80189589..4e63ee05 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -4457,7 +4457,7 @@ def plot_freqs_as_bars(self, color="tab:blue", shorten_AAs=False, label_fontsize_factor=1, - truncate_at=None, + lower_cutoff_val=None, plot_atomtypes=False, sort_by_freq=False, sum_freqs=True, @@ -4500,7 +4500,7 @@ def plot_freqs_as_bars(self, label_fontsize_factor : float, default is 1 Labels will be written in a fontsize rcParams["font.size"] * label_fontsize_factor - truncate_at : float, default is None + lower_cutoff_val : float, default is None Only plot frequencies above this value. 
Default is to plot all plot_atomtypes : bool, default is False @@ -4562,7 +4562,7 @@ def plot_freqs_as_bars(self, ax = _mdcplots.plots._plot_freqbars_baseplot(freqs[order], ax=ax, color=color, - lower_cutoff_val=truncate_at) + lower_cutoff_val=lower_cutoff_val) label_bars = [ictc.labels.w_fragments for ictc in self.contact_pairs] if shorten_AAs: @@ -4950,7 +4950,7 @@ def plot_neighborhood_freqs(self, ctc_cutoff_Ang, ax=ax, xlim=xmax, shorten_AAs=shorten_AAs, - truncate_at=None, + lower_cutoff_val=None, plot_atomtypes=plot_atomtypes, sort_by_freq=sort_by_freq, switch_off_Ang=switch_off_Ang, From 9c7facdeb213307b47d2429fdd1cd9c1bce4f9de Mon Sep 17 00:00:00 2001 From: gph82 Date: Fri, 16 Aug 2024 15:16:58 +0200 Subject: [PATCH 44/83] [contacts.ContactGroup.plot_frequency_sums_as_bars] refactor truncate_at -> lower_cutoff_val --- mdciao/cli/cli.py | 2 +- mdciao/contacts/contacts.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 1362d763..2d7c3e0c 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1711,7 +1711,7 @@ def interface( ax=histoax[1], list_by_interface=True, label_fontsize_factor=panelsize2font / panelsize, - truncate_at=.05, + lower_cutoff_val=.05, shorten_AAs=short_AA_names, sort_by_freq=sort_by_av_ctcs, ) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 4e63ee05..bd8be451 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -5655,7 +5655,7 @@ def plot_frequency_sums_as_bars(self, ax=None, shorten_AAs=False, label_fontsize_factor=1, - truncate_at=0, + lower_cutoff_val=0, bar_width_in_inches=.75, list_by_interface=False, sort_by_freq=True, @@ -5680,7 +5680,7 @@ def plot_frequency_sums_as_bars(self, label_fontsize_factor : float, default is 1 Some control over fontsizes when plotting a high number of bars - truncate_at : float, default is 0 + lower_cutoff_val : float, default is 0 Do not show sums of freqs lower than this 
value bar_width_in_inches : float, default is .75 If no :obj:`ax` is parsed, this controls that the @@ -5717,8 +5717,8 @@ def plot_frequency_sums_as_bars(self, freqs = _np.array([j for idict in frq_dict_list for j in idict.values()]) # Truncate - label_bars = [label_bars[ii] for ii in _np.flatnonzero(freqs>truncate_at)] - freqs = freqs[freqs>truncate_at] + label_bars = [label_bars[ii] for ii in _np.flatnonzero(freqs > lower_cutoff_val)] + freqs = freqs[freqs > lower_cutoff_val] xvec = _np.arange(len(freqs)) if ax is None: @@ -5748,7 +5748,7 @@ def plot_frequency_sums_as_bars(self, ax.set_xlim([-.5, xmax + 1 - .5]) if list_by_interface and interface_vline: - xpos = len([ifreq for ifreq in frq_dict_list[0].values() if ifreq >truncate_at]) + xpos = len([ifreq for ifreq in frq_dict_list[0].values() if ifreq > lower_cutoff_val]) ax.axvline(xpos - .5, color="lightgray", linestyle="--", zorder=-1) return ax From 55635ccbf4abfffbca2970707560d19778e42024 Mon Sep 17 00:00:00 2001 From: gph82 Date: Sat, 17 Aug 2024 15:04:41 +0200 Subject: [PATCH 45/83] [nomenclature.LabelerConsensus.top2frags] docs: also mention that the fragments parameter helps with the alignment, not only the clashes --- mdciao/nomenclature/nomenclature.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index 43aba28f..ec15bf82 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -1029,17 +1029,22 @@ def top2frags(self, top, top: :obj:`~mdtraj.Topology` or path to topology file (e.g. a pdb) fragments: iterable of integers, default is None - The user can parse an existing list of fragment-definitions - (via residue idxs) to check if the newly found, consensus - definitions (`defs`) clash with the input in `fragments`. - *Clash* means that the `defs` would span over more - than one of the fragments in defined in `fragments`. 
+ Any useful fragment definition as lists of residue indices. + Useful means: + + * Help with the alignment needed for consensus fragment definition. + Look at :obj:`LabelerConsensus.aligntop` and its `fragments` + and `min_seqID_rate` parameters. + * Check if the newly found, consensus fragment definitions (`defs`) + clash with the input in `fragments`. *Clash* means that + the `defs` would span over more than + one of the fragments defined in `fragments`. An interactive prompt will ask the user which fragments to keep in case of clashes. Check the method :obj:`~mdciao.fragments.check_if_fragment_clashes` - for more info + for more info. min_seqID_rate : float, default is .5 With big topologies, like a receptor-Gprotein system, the "brute-force" alignment method From a1d36c65b419bc7b36260c29cfc44b615a87a199 Mon Sep 17 00:00:00 2001 From: gph82 Date: Sun, 25 Aug 2024 18:54:01 +0200 Subject: [PATCH 46/83] [nomenclature] minor docs --- mdciao/nomenclature/nomenclature.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index ec15bf82..eb0c8181 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -632,6 +632,7 @@ def aligntop(self, top, Note ---- `fragments` only has an effect if both + * the `top` is an actual :obj:`~mdtraj.Topology` carrying the sequence indices, since if `top` is a sequence string, then there's no fragmentation heuristic possible. @@ -648,8 +649,8 @@ def aligntop(self, top, heuristic (this might change in the future). **To explicitly circumvent this forced fragmentation and subsequent check, use `fragments=False`. 
- This will simply use the first alignment that comes out of - :obj:`mdciao.utils.sequence.my_bioalign`, regardless + This will simply use the first alignment that comes out of** + :obj:`mdciao.utils.sequence.my_bioalign` **, regardless of there being other, equally scored, alignments and potential clashes with sensitive fragmentations.** verbose: boolean, default is False From 05522e7ff051d094c2c36c224fb8058c44c3ce7b Mon Sep 17 00:00:00 2001 From: gph82 Date: Sun, 25 Aug 2024 19:41:53 +0200 Subject: [PATCH 47/83] [plots.compare_violins] return the representative frames, tests --- mdciao/plots/plots.py | 164 ++++++++++++++++++++++++++---------------- tests/test_plots.py | 54 +++++++------- 2 files changed, 130 insertions(+), 88 deletions(-) diff --git a/mdciao/plots/plots.py b/mdciao/plots/plots.py index 2b85e19c..97f5d8c8 100644 --- a/mdciao/plots/plots.py +++ b/mdciao/plots/plots.py @@ -1366,7 +1366,7 @@ def compare_violins(groups, matching their contact labels, since the residue indices might differ across :obj:`groups`. To achieve this: * "K30-D40" is considered equivalent to "D40-D30", - use :obj:`key_separator` to change this. + use `key_separator` to change this. * "K30-D40" is considered equivalent to "K30-E40" if a :obj:`mutations_dict={"E40":"D40"}` is passed * "K30@3.50-D40" is considered equivalent to "K30-D40" @@ -1448,46 +1448,51 @@ def compare_violins(groups, to least "formed" on the right of the plot. However, for each residue pair, this mean is an average over the distance in all - the different :obj:`groups`, so some + the different `groups`, so some heterogeneity is expected. Alternatively, you can sort using the contact labels, regardless of the distance values. Note that for this, string comparisons between contact-labels will take place. 
and that - contact-labels are altered by :obj:`key_separator` - to unify across different :obj:`groups` - Try setting :obj:`key_separator` to None + contact-labels are altered by `key_separator` + to unify across different `groups` + Try setting `key_separator` to None if you see unexpected behavior, although though this might have other side effects, - (see obj:~`mdciao.utils.str_and_dict.unify_freq_dicts`) - :obj:`sort_by` can be a: - * str : 'residue' - Sort by ascending residue sequence index (resSeq), - which will be inferred from each contact label, - e.g. 30 for "GLU30@3.50". See :obj:`~mdciao.contacts.ContactGroup.gen_ctc_labels` - for more info on how they are generated. - Internally, the order is generated via - :obj:`~mdciao.utils.str_and_dict.lexsort_ctc_labels`. - If you want to reverse or alter this - ascending default order, we recommend using - :obj:`~mdciao.utils.str_and_dict.lexsort_ctc_labels` - **before** calling :obj:`compare_violins` and use - its output (sorted_ctc_labels) as a list - argument for :obj:`sort_by`. Also note that - residue indices as contained in - :obj:`~mdciao.contacts.ContactGroup.res_idx_pairs` + (see :obj:`~mdciao.utils.str_and_dict.unify_freq_dicts`) + `sort_by` can be a: + * str : 'residue' or 'numeric' + Sort by ascending residue sequence index (resSeq), + which will be inferred from each contact label, + e.g. 30 for "GLU30@3.50". See :obj:`~mdciao.contacts.ContactGroup.gen_ctc_labels` + for more info on how they are generated. + Internally, the order is generated via + :obj:`~mdciao.utils.str_and_dict.lexsort_ctc_labels`. + If you want to reverse or alter this + ascending default order, we recommend using + :obj:`~mdciao.utils.str_and_dict.lexsort_ctc_labels` + **before** calling :obj:`compare_violins` and use + its output (`labels`) as a list + argument for `sort_by`. 
Also note that + residue indices as contained in + :obj:`~mdciao.contacts.ContactGroup.res_idx_pairs` + * str : 'keep' + Sort using the same order of the labels as in + the first contact group + * str : 'consensus' + Sort following consensus nomenclature (GPCR, CGN or KLIFS) * list : a list of contact labels, - eg. ["GLU30-ALA30", "ARG131@3.50-TYR20"]. - Only these residue pairs (in this order) - will be shown, regardless of what other - pairs are contained in the :obj:`groups`. It - assumes the user knows what contacts - are present and can come up with a meaningful - list. Not all labels need to be in all - :obj:`groups` nor do all :obj:`groups` - have to contain all labels, but at least - one label needs to match, otherwise the - method will fail + eg. ["GLU30-ALA30", "ARG131@3.50-TYR20"]. + Only these residue pairs (in this order) + will be shown, regardless of what other + pairs are contained in the `groups`. It + assumes the user knows what contacts + are present and can come up with a meaningful + list. Not all labels need to be in all + `groups` nor do all `groups` + have to contain all labels, but at least + one label needs to match, otherwise the + method will fail zero_freq : float, default is 1e-2 Frequencies below this number will be considered zero and not shown it they are @@ -1506,37 +1511,41 @@ def compare_violins(groups, can also be removed. Only has an effect if `ctc_cutoff_Ang` is not None. representatives : anything (bool, int, dict, list) default is None - Plot, with a small dot on top of the violins, - the values of the residue-residue distances of representative - geometries. The representative geometries can be parsed - directly as a dict of :obj:`~mdtraj.Trajectory` objects, - or extracted on-the-fly by calling the :obj:`mdciao.contacts.ContactGroup.repframes` - method of each of the `groups`. Check the docs of - :obj:`mdciao.contacts.ContactGroup.repframes` to find out what is meant - with "representative". 
+ Include information about representative values in the + plot. This can be done in several ways. Easiest + is to let this method call :obj:`mdciao.contacts.ContactGroup.repframes` + internally. This will locate representative frames, extract + their residue-residue distance values and plot them as small dots + on top of the violins. When possible, also the geometries corresponding + to these frames will be returned. Alternatively, the user + can directly input a dictionary of :obj:`~mdtraj.Trajectory` objects + (representative or not) for which the residue-residue distance values + will be computed and plotted. Check the docs of + :obj:`mdciao.contacts.ContactGroup.repframes` to find out + what is meant with "representative". This is what each type of input does: * boolean True: - Calls :obj:`mdciao.ContactGroup.repframes` with the - method's default parameters and plots the result + Calls :obj:`mdciao.ContactGroup.repframes` with the + method's default parameters. * int > 0: - Calls :obj:`mdciao.ContactGroup.repframes` with the - parameter `n_frames` set to this integer. This parameter - controls how many representatives are extracted and - subsequently plotted. + Calls :obj:`mdciao.ContactGroup.repframes` with the + parameter `n_frames` set to this integer. This parameter + controls how many representatives are extracted and + subsequently plotted. * dict of parameters: - A dictionary with explict values for the optional - parameters of :obj:`mdciao.contacts.ContactGroup.repframes`, - usually `n_frames` (an int) and `scheme`, ("mean" or "mode"), - depending what you mean with "representative". Check the method's - documentation for more info. + A dictionary with explicit values for the optional + parameters of :obj:`mdciao.contacts.ContactGroup.repframes`, + usually `n_frames` (an int) and `scheme`, ("mean" or "mode"), + depending on what you mean with "representative". Check the method's + documentation for more info. 
* dict of :obj:`~mdtraj.Trajectory` objects: - Has to have the same keys as `groups`. No checks are done - whether these objects match the actual molecular topologies - of `groups`, so beware of potential mismatches here. - Typically, these frames come from having used - :obj:`mdciao.contacts.ContactGroup.repframes` with - `return_traj`=True. + Has to have the same keys as `groups`. No checks are done + whether these objects match the actual molecular topologies + of `groups`, so beware of potential mismatches here. + Typically, these frames come from having used + :obj:`mdciao.contacts.ContactGroup.repframes` with + `return_traj`=True. * dict of dicts containing values #TODO not implemented yet @@ -1547,6 +1556,12 @@ def compare_violins(groups, labels : list The list of plotted labels, in the order they are plotted + repframes : dict + Will only be returned if + `representatives` was not None. + The representative frames for + each `group` according to the + parameters of `representatives` """ _fontsize=_rcParams["font.size"] _rcParams["font.size"] = fontsize @@ -1559,6 +1574,7 @@ def compare_violins(groups, else: _groups = groups repframes_per_sys_per_ctc = {} + reptraj_per_sys_per_ctc = {} for syskey, group in _groups.items(): labels = group.gen_ctc_labels(AA_format=AA_format, fragments=[True if defrag is None else False][0], @@ -1570,22 +1586,41 @@ def compare_violins(groups, freqs_per_sys_per_ctc[syskey] = {key:freq for key, freq in zip(labels, group.frequency_per_contact(ctc_cutoff_Ang))} if bool(representatives): + #Tune the kwargs on a per-case basis then call repframes only once, + # wrapped in the try block for when there's no files + repframes_kwargs = {"ctc_cutoff_Ang": ctc_cutoff_Ang, + "return_traj": True} # Do we have representatives? 
if isinstance(representatives, bool): - d = group.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang)[2] + pass if isinstance(representatives, int) and representatives>0: - d = group.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang, n_frames=representatives)[2].T + repframes_kwargs.update({"n_frames" : representatives, + "verbose" : False}) if isinstance(representatives, dict) and len(representatives)>0: if syskey not in representatives.keys() : + representatives.update(repframes_kwargs) representatives.pop("ctc_cutoff_ang", None) representatives.pop("show_violins", None) - d = group.repframes(**representatives)[2].T + representatives["return_traj"] = True else: assert isinstance(representatives[syskey], _md.Trajectory) d = _md.compute_contacts(representatives[syskey], contacts=group.res_idxs_pairs)[0].T + traj = representatives[syskey] + repframes_kwargs = None + + if repframes_kwargs is not None: + try: + __, __, d, traj = group.repframes(**repframes_kwargs) + except FileNotFoundError as e: + print(e) + repframes_kwargs["return_traj"] = False + __, __, d = group.repframes(**repframes_kwargs) + traj = None + d = d.T.squeeze() + repframes_per_sys_per_ctc[syskey] = {key: val * 10 for key, val in zip(labels, d)} - + reptraj_per_sys_per_ctc[syskey]=traj representatives = bool(representatives) # Unify data data4violins_per_sys_per_ctc = _mdcu.str_and_dict.unify_freq_dicts(data4violins_per_sys_per_ctc, @@ -1701,7 +1736,10 @@ def compare_violins(groups, myfig.tight_layout() _rcParams["font.size"] = _fontsize - return myfig, iax, list(key2ii.keys()) + if repframes_per_sys_per_ctc != {}: + return myfig, iax, list(key2ii.keys()), reptraj_per_sys_per_ctc + else: + return myfig, iax, list(key2ii.keys()) def _sorter_by_key_or_val(sort_by, indict): diff --git a/tests/test_plots.py b/tests/test_plots.py index b56fb10c..268160de 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -741,11 +741,10 @@ def setUpClass(cls): cls.CGL394_larger = ContactGroupL394(ctc_control=.99, ctc_cutoff_Ang=5) 
def test_works(self): - fig, ax, sorted_keys = plots.compare_violins({"small":self.CGL394, "big":self.CGL394_larger}, - anchor="L394", - ymax=10, ctc_cutoff_Ang=4) - - #fig.savefig("test.pdf") + fig, ax, sorted_keys = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, + anchor="L394", + ymax=10, ctc_cutoff_Ang=4) + # fig.savefig("test.pdf") _plt.close("all") def test_works_no_defrag_and_list_zero_freq_remove_identities(self): @@ -772,36 +771,39 @@ def test_works_no_defrag_and_list_zero_freq_remove_identities(self): def test_works_no_defrag_and_list(self): fig, ax, sorted_keys = plots.compare_violins([self.CGL394, self.CGL394_larger], - anchor="L394", - ymax=10, ctc_cutoff_Ang=4, - defrag=None) + anchor="L394", + ymax=10, ctc_cutoff_Ang=4, + defrag=None) #fig.savefig("test.pdf") _plt.close("all") def test_repframes_True(self): - fig, ax, sorted_keys = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, - anchor="L394", - ymax=10, ctc_cutoff_Ang=4, - representatives=True) - + fig, ax, sorted_keys, repframes = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, + anchor="L394", + ymax=10, ctc_cutoff_Ang=4, + representatives=True) + assert repframes["small"] is None + assert repframes["big"] is None #fig.savefig("test.pdf") _plt.close("all") def test_repframes_int(self): - fig, ax, sorted_keys = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, - anchor="L394", - ymax=10, ctc_cutoff_Ang=4, - representatives=2) + fig, ax, sorted_keys, repframes = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, + anchor="L394", + ymax=10, ctc_cutoff_Ang=4, + representatives=2) + assert repframes["small"] is None + assert repframes["big"] is None # fig.savefig("test.pdf") _plt.close("all") def test_repframes_dict_kwargs(self): - fig, ax, sorted_keys = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, - anchor="L394", - ymax=10, ctc_cutoff_Ang=4, - 
representatives={"n_frames":3, "scheme":"mean"}) + fig, ax, sorted_keys, repframes_out = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, + anchor="L394", + ymax=10, ctc_cutoff_Ang=4, + representatives={"n_frames": 3, "scheme": "mean"}) #fig.savefig("test.pdf") _plt.close("all") @@ -809,9 +811,11 @@ def test_repframes_dict_geoms(self): traj = _md.load(test_filenames.traj_xtc_stride_20, top=test_filenames.top_pdb) repframes = {"small" : traj[:3], "big" : traj[:5]} - fig, ax, sorted_keys = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, - anchor="L394", - ymax=10, ctc_cutoff_Ang=4, - representatives=repframes) + fig, ax, sorted_keys, repframes_out = plots.compare_violins({"small": self.CGL394, "big": self.CGL394_larger}, + anchor="L394", + ymax=10, ctc_cutoff_Ang=4, + representatives=repframes) + for key, val in repframes_out.items(): + assert val is repframes[key] #fig.savefig("test.pdf") _plt.close("all") \ No newline at end of file From 7556f8d06efa6a1c84e8789fef6715937ccfa300 Mon Sep 17 00:00:00 2001 From: gph82 Date: Sun, 25 Aug 2024 19:49:16 +0200 Subject: [PATCH 48/83] [contacts.ContactGroup.repframes] use fstring --- mdciao/contacts/contacts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index bd8be451..0f4ce57f 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -6378,7 +6378,7 @@ def repframes(self, scheme="mode", if _path.exists(reptraj): geoms.append(_md.load(reptraj, top=self.top,frame=frame_idx)) else: - raise FileNotFoundError("The file %s can't be found anymore. Is this an `mdciao.examples` object?"%reptraj) + raise FileNotFoundError(f"The file '{reptraj}' can't be found anymore. 
Is this an `mdciao.examples` object?") else: geoms.append(reptraj[frame_idx]) return_tuple += tuple([geoms]) From f6c188fe1dfe187c3a820329473ac89ef44c7dfd Mon Sep 17 00:00:00 2001 From: gph82 Date: Sun, 25 Aug 2024 20:38:38 +0200 Subject: [PATCH 49/83] [contacts.ConatactGroup.frequency_as_contact_matrix_CG] use 'fragments' when calling _consensus_maps2consensus_frags --- mdciao/contacts/contacts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 0f4ce57f..41840e8b 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -4254,6 +4254,7 @@ def frequency_as_contact_matrix_CG(self, if consensus_labelers is not None: consensus_maps, consensus_frags = _consensus_maps2consensus_frags(self.top, consensus_labelers, + fragments=fragments, verbose=verbose) if len(consensus_frags) > 0: fragments, fragment_names = _mdcfr.mix_fragments(self.top.n_residues - 1, From 6dc08976742bdabc6123ea52c3a6bd1f6e795f1c Mon Sep 17 00:00:00 2001 From: gph82 Date: Sun, 25 Aug 2024 22:10:41 +0200 Subject: [PATCH 50/83] [utils.residue_and_atom._top2AAmap] new method, tests --- mdciao/utils/residue_and_atom.py | 26 ++++++++++++++++ tests/test_residue_and_atom_utils.py | 44 ++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/mdciao/utils/residue_and_atom.py b/mdciao/utils/residue_and_atom.py index 96c62af5..ab002407 100644 --- a/mdciao/utils/residue_and_atom.py +++ b/mdciao/utils/residue_and_atom.py @@ -19,6 +19,7 @@ from mdciao.utils.str_and_dict import _kwargs_subs from collections import Counter as _Counter from pandas import DataFrame as _DF +from collections import defaultdict as _defdict def residues_from_descriptors(residue_descriptors, fragments, top, @@ -268,6 +269,31 @@ def rangeexpand_residues2residxs(range_as_str, fragments, top, residxs_out = _pandas_unique(residxs_out) return residxs_out +def _top2AAmap(top): + r""" + + Return a dictionary mapping AA expressions (GLU30, E30) 
to topology indices, for easier grabbing + + Maybe use with find_AA at some point + + Parameters + ---------- + top : :obj:`mdtraj.Topology` + + Returns + ------- + AA_dict : dict + Keys are residue short and long codes (GLU30, E30) or + long codes for nonstandard AAs (GTP365). Values + are lists, since one topology might have more + than one residue labeled E30 + """ + AA_dict = _defdict(list) + [AA_dict[str(rr)].append(rr.index) for rr in top.residues] + [AA_dict[shorten_AA(rr, keep_index=True, substitute_fail='long')].append(rr.index) for rr + in top.residues] + return {key : _np.unique(val).tolist() for key, val in AA_dict.items()} + def int_from_AA_code(key): """ Returns the integer part from a residue name, None if there isn't diff --git a/tests/test_residue_and_atom_utils.py b/tests/test_residue_and_atom_utils.py index f323128e..0fb30b77 100644 --- a/tests/test_residue_and_atom_utils.py +++ b/tests/test_residue_and_atom_utils.py @@ -59,6 +59,50 @@ def test_just_numbers(self): np.testing.assert_array_equal(residue_and_atom.find_AA("28", self.geom2frags.top), [5, 13]) +class Test_top2AAmap(unittest.TestCase): + + def setUp(self): + self.geom = md.load(test_filenames.small_monomer) + self.geom2frags = md.load(test_filenames.small_dimer) + + def test_works(self): + AA = residue_and_atom._top2AAmap(self.geom.top) + test_dict = {"GLU30": [0], + "VAL31": [1], + "TRP32": [2], + "ILE26": [3], + "GLU27": [4], + "LYS29": [5], + "P0G381": [6], + "GDP382": [7]} + test_dict.update({"E30": [0], + "V31": [1], + "W32": [2], + "I26": [3], + "E27": [4], + "K29": [5]}) + self.assertDictEqual(AA, test_dict + ) + + def test_works_dimer(self): + AA = residue_and_atom._top2AAmap(self.geom2frags.top) + test_dict = {"GLU30": [0, 8], + "VAL31": [1, 9], + "TRP32": [2, 10], + "ILE26": [3, 11], + "GLU27": [4, 12], + "LYS28": [5, 13], + "P0G381": [6, 14], + "GDP382": [7, 15]} + test_dict.update({"E30": [0, 8], + "V31": [1, 9], + "W32": [2, 10], + "I26": [3, 11], + "E27": [4, 12], + "K28": 
[5, 13]}) + self.assertDictEqual(AA, test_dict) + + class Test_int_from_AA_code(unittest.TestCase): def test_int_from_AA_code(self): assert (residue_and_atom.int_from_AA_code("GLU30") == 30) From 9c223afab231d8ca294fb6e3bf160b7321508796 Mon Sep 17 00:00:00 2001 From: gph82 Date: Sun, 25 Aug 2024 22:46:53 +0200 Subject: [PATCH 51/83] [utils.residue_and_atom.rangeexpand_residues2residxs] accept expressions for exclusion, e.g. "-GLU30", tests --- mdciao/utils/residue_and_atom.py | 36 ++++++++++++++++++---------- tests/test_residue_and_atom_utils.py | 6 +++++ 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/mdciao/utils/residue_and_atom.py b/mdciao/utils/residue_and_atom.py index ab002407..cd1af0f7 100644 --- a/mdciao/utils/residue_and_atom.py +++ b/mdciao/utils/residue_and_atom.py @@ -16,7 +16,7 @@ import numpy as _np from pandas import unique as _pandas_unique from mdciao.utils.lists import in_what_N_fragments as _in_what_N_fragments, force_iterable as _force_iterable -from mdciao.utils.str_and_dict import _kwargs_subs +from mdciao.utils.str_and_dict import _kwargs_subs, match_dict_by_patterns as _match_dict_by_patterns from collections import Counter as _Counter from pandas import DataFrame as _DF from collections import defaultdict as _defdict @@ -169,7 +169,7 @@ def rangeexpand_residues2residxs(range_as_str, fragments, top, Generalized range-expander from residue descriptors. Residue descriptors can be anything that :obj:`find_AA` understands. - Expanding a range means getting "2-5,7" as input and returning "2,3,4,5,7" + Expanding a range means getting "2-5,7" as input and returning "2,3,4,5,7". To dis-ambiguate descriptors, a fragment definition and a topology are needed @@ -184,12 +184,17 @@ def rangeexpand_residues2residxs(range_as_str, fragments, top, The input (= compressed range) is very flexible and accepts mixed descriptors and wildcards, eg: GLU*,ARG*,GDP*,LEU394,380-385 is a valid range. - Wildcards use the full resnames, i.e. 
E* is NOT equivalent to GLU* + Expressions starting with "-", e.g. are exclusions, s.t. "GLU*,-GLU30" will + select all GLUs except GLU30. + + Wildcards use the full resnames, i.e. "E*" is NOT equivalent to "GLU*" + + Expressions leading to empty ranges raise ValueError. Be aware, though, that wildcards are very powerful and easily "grab" a lot of residues, leading to long calculations and large outputs. - See :obj:`find_AA` for more on residue descriptors + See :obj:`find_AA` for more on residue descriptors. Parameters ---------- @@ -214,20 +219,21 @@ def rangeexpand_residues2residxs(range_as_str, fragments, top, residxs_out = list of unique residue indices """ residxs_out = [] - #print("For the range", range_as_str) + AA_dict_for_exclusion, exclude = _top2AAmap(top), [] if not isinstance(range_as_str,str): range_as_str = _force_iterable(range_as_str) assert all([isinstance(ii,(int,_np.int64)) for ii in range_as_str]),(range_as_str,[type(ii) for ii in range_as_str]) range_as_str= ','.join([str(ii) for ii in range_as_str]) for r in [r for r in range_as_str.split(',') if r!=""]: - assert not r.startswith("-") - if "*" in r or "?" in r: - assert "-" not in r - filtered = find_AA(r, top, extra_columns= residues_from_descriptors_kwargs.get("additional_resnaming_dicts")) - if len(filtered)==0: - raise ValueError("The input range contains '%s' which " - "returns no residues!"%r) - residxs_out.extend(filtered) + if "*" in r or "?" 
in r or r.startswith("-"): + if r.startswith("-"): + exclude.extend(_match_dict_by_patterns(r[1:], AA_dict_for_exclusion)[1]) + else: + filtered = find_AA(r, top, extra_columns= residues_from_descriptors_kwargs.get("additional_resnaming_dicts")) + if len(filtered)==0: + raise ValueError("The input range contains '%s' which " + "returns no residues!"%r) + residxs_out.extend(filtered) else: resnames = r.split('-') is_range = "-" in r @@ -263,6 +269,10 @@ def rangeexpand_residues2residxs(range_as_str, fragments, top, residxs_out.extend(for_extending) + # Exclude the exclusions + exclude = _np.unique(exclude) + residxs_out=[rr for rr in residxs_out if rr not in exclude] + if sort: residxs_out = sorted(residxs_out) diff --git a/tests/test_residue_and_atom_utils.py b/tests/test_residue_and_atom_utils.py index 0fb30b77..de6d83ed 100644 --- a/tests/test_residue_and_atom_utils.py +++ b/tests/test_residue_and_atom_utils.py @@ -378,6 +378,12 @@ def test_wildcards(self): self.top) np.testing.assert_array_equal(expanded_range, [0, 4]) + def test_exlusions(self): + expanded_range = residue_and_atom.rangeexpand_residues2residxs("GLU*,-GLU30", + self.fragments, + self.top) + np.testing.assert_array_equal(expanded_range, [4]) + def test_rangeexpand_res_idxs(self): expanded_range = residue_and_atom.rangeexpand_residues2residxs("2-4,6", self.fragments, From 7f5163c5af87f154896aa36d46cea5fc347f5aa4 Mon Sep 17 00:00:00 2001 From: gph82 Date: Sun, 25 Aug 2024 22:59:26 +0200 Subject: [PATCH 52/83] [cli.sites] return None instead of "NA" for missing consensus labels when using choose_between_consensus_dicts At this point it might be wise to remove the no_key optarg alltogether, since no method uses "NA" anymore --- mdciao/cli/cli.py | 2 +- mdciao/nomenclature/nomenclature.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 2d7c3e0c..761c5655 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -2111,7 
+2111,7 @@ def sites(site_inputs, site_as_gc[key] = [] for idx in imap: pair = ctc_idxs_small[idx] - consensus_labels = [_mdcnomenc.choose_between_consensus_dicts(idx, list(consensus_maps.values())) for idx in pair] + consensus_labels = [_mdcnomenc.choose_between_consensus_dicts(idx, list(consensus_maps.values()), no_key=None) for idx in pair] fragment_idxs = [_mdcu.lists.in_what_fragment(idx, fragments_as_residue_idxs) for idx in pair] site_as_gc[key].append(_mdcctcs.ContactPair(pair, [itraj[:, idx] for itraj in ctcs], diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index eb0c8181..d75278c4 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -2020,10 +2020,10 @@ def _fill_consensus_gaps(consensus_list, top, verbose=False): def choose_between_consensus_dicts(idx, consensus_maps, no_key="NA"): """ - Choose the best consensus label for a given :obj:`idx` in case - there are more than one consensus(es) at play (e.g. GPCR and CGN). + Choose the best consensus label for a given `idx` in case + there are more than one consensus(es) at play (e.g. GPCR, CGN, KLIFS). - Wil raise error if both dictionaries have a consensus label for + Will raise error if more than one dictionary has a consensus label for the same index (unusual case) Parameters @@ -2031,15 +2031,18 @@ def choose_between_consensus_dicts(idx, consensus_maps, no_key="NA"): idx : int index for which the relabeling is needed consensus_maps : list - The items in the list should be "gettable" by using :obj:`idx`, + The items in the list should be "gettable" by using `idx`, either by being lists, arrays, or dicts, s.t., the corresponding value should be the label. no_key : str - output message if there is no label for the residue idx in any of the dictionaries. - + Output string if there is no label for the + residue `idx` in any of the dictionaries. 
+ Might be removed in the future, since currently + all calls to this method use no_key=None, + since no method uses "NA" anymore Returns ------- - string + string: str label of the residue idx if present else :obj:`no_key` """ From 0934a1fd7465824e3c6d5c601ffe09e2c1e48976 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 26 Aug 2024 14:12:35 +0200 Subject: [PATCH 53/83] [contacts.ContactGroup._args2df] new colum "self interface residx" to account for residue in both interface members --- mdciao/contacts/contacts.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 41840e8b..a35d2094 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -6003,6 +6003,10 @@ def _args2df(self, ctc_cutoff_Ang, fragments, fragment_names, consensus_maps, ve for ii in [0, 1]: [list_of_dicts[res].update({"interface fragment":ii}) for res in self.interface_fragments[ii]] [list_of_dicts[res].update({"interface residx": ii}) for res in self.interface_residxs[ii]] + # Account for residues in both interface members + self_interface = _np.intersect1d(self.interface_fragments[0], self.interface_fragments[1]) + for ii in range(self.top.n_residues): + list_of_dicts[ii].update({"self interface residx": [True if ii in self_interface else False][0]}) if consensus_maps is not None: consensus_maps, consensus_frags = _consensus_maps2consensus_frags(self.top, consensus_maps, verbose=verbose, fragments=fragments) From 347e62df3032369f322c8156c9e5d44c463cb922 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 26 Aug 2024 14:17:35 +0200 Subject: [PATCH 54/83] [contacts.ContactGroup] docs --- mdciao/contacts/contacts.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index a35d2094..f774d5fb 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -2535,18 +2535,19 @@ def __init__(self, Parameters 
---------- list_of_contact_objects : list - list of :obj:`ContactPair` objects + List of :obj:`ContactPair` objects. + Will be accessible at :obj:`ContactGroup.contact_pairs`. interface_fragments : list of two iterables of indexes, default is None An interface is defined by two groups of residue indices. This input doesn't need to have all or any of the residue indices in - :obj:`res_idxs_pairs`. + `res_idxs_pairs`. - This input will be will be used to group + This input will be used to group the object's own residue idxs present in - :obj:`residxs_pairs` into the two groups + `residxs_pairs` into the two groups of the interface. These two groups will be accessible through the attribute self.interface_residxs @@ -2557,15 +2558,15 @@ def __init__(self, The molecular topology associated with this object. Normally, the default behaviour is enough. It checks whether all - ContactPairs of :obj:`list_of_contact_objects` + ContactPairs of `list_of_contact_objects` share the same self.top and use that one. If they have different topologies, the method fails, since you can't instantiate - a ContactGroup with ContactPairs from different. + a ContactGroup with ContactPairs from different topologies. In case the ContactPairs don't have any topology at all (self.top is None for all ContactPairs) - you can pass one here. Or, if the have one, and you - pass one here, it will be checked that :obj:`top` provided + you can pass one here. 
Or, if they have one, and you + pass one here, it will be checked that `top` provided here coincides with the ContactPairs' shared topology name : string, default is None Optional name you want to give this object, @@ -2575,7 +2576,7 @@ def __init__(self, neighbors_excluded : int, default is None The neighbors excluded when creating the underlying ContactPairs passed in - :obj:`list_of_contact_objects` + `list_of_contact_objects` max_cutoff_Ang : float, default is None Operations involving cutoffs higher than this will be forbidden and will From 3e46c64bf8f794f9ca76da11b78abc39b4e3d972 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 26 Aug 2024 14:21:04 +0200 Subject: [PATCH 55/83] [contacts.ContactGroup._full_color_list] Handle color in specific situation: one single fragment, self-interface=True, where two colors are needed --- mdciao/contacts/contacts.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index f774d5fb..97ce67bf 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -7894,9 +7894,6 @@ def _full_color_list(top, df, colors=None) -> _DF: The main idea is to incorporate per-residue color values - Main ideas: - * Create - Parameters ---------- top : :obj:`~mdtraj.Topology` @@ -7925,13 +7922,21 @@ def _full_color_list(top, df, colors=None) -> _DF: if colors is None: _colors = list(_mdcplots.color_dict_guesser("tab10", _np.arange(len(frags_from_df))).values()) jdf["frag_color"] = list(_mdcflare._utils.col_list_from_input_and_fragments(_colors, frags_from_df)) + one_intf_frag = _np.unique(jdf["interface fragment"]) + if len(one_intf_frag)==1: + assert jdf["self interface residx"].any(), ValueError("If there's only one interface fragment, " + "then there should be some shared residues between interface members") + #For the purposes of the flareplot we'll split the shared residxs + jdf.loc[jdf["self interface residx"], "interface fragment"] = 
{1 : 0, + 0 : 1}[one_intf_frag[0]] + else: jdf["frag_color"] = list(_mdcflare._utils.col_list_from_input_and_fragments(colors, frags_from_df)) if "interface fragment" in df.keys(): - # TODO do this from self.interface_indices - intf_from_df = [_np.flatnonzero(df["interface fragment"] == ii) for ii in - df[~df["interface fragment"].isnull()]["interface fragment"].unique()] + # TODO do this from self.interface_indices or with groupby + intf_from_df = [_np.flatnonzero(jdf["interface fragment"] == ii) for ii in + jdf[~jdf["interface fragment"].isnull()]["interface fragment"].unique()] intf_colors = [None] * top.n_residues if colors is None: if len(frags_from_df)==1: #means no fragments, TODO think about other way of infering this From 6f2f1b4d61b62e8b175bd052c2a22f33ea4ac0a7 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 27 Aug 2024 19:28:52 +0200 Subject: [PATCH 56/83] [plots.compare_groups_of_contacts] Remove unused optarg exclude --- mdciao/plots/plots.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mdciao/plots/plots.py b/mdciao/plots/plots.py index 97f5d8c8..339dc9cb 100644 --- a/mdciao/plots/plots.py +++ b/mdciao/plots/plots.py @@ -725,7 +725,6 @@ def compare_groups_of_contacts(groups, fontsize=16, anchor=None, plot_singles=False, - exclude=None, ctc_cutoff_Ang=None, AA_format='short', defrag='@', @@ -810,10 +809,6 @@ def compare_groups_of_contacts(groups, plotted separately. The labels used will have been already "mutated" using :obj:`mutations_dict` and "anchored" using :obj:`anchor`. This plot is temporary and cannot be saved - exclude : list, default is None - keys containing these strings will be excluded. 
- NOTE: This is not implemented yet, will raise an error - ctc_cutoff_Ang : float, default is None Needed value to compute frequencies on-the-fly if the input was using :obj:`ContactGroup` objects AA_format : str, default is "short" @@ -937,7 +932,7 @@ def compare_groups_of_contacts(groups, freqs[key] = idict if distro: - freqs = _mdcu.str_and_dict.unify_freq_dicts(freqs, exclude, defrag=defrag, is_freq=False) + freqs = _mdcu.str_and_dict.unify_freq_dicts(freqs, defrag=defrag, is_freq=False) myfig, __ = plot_unified_distro_dicts(freqs, colors=colors, ctc_cutoff_Ang=ctc_cutoff_Ang, fontsize=fontsize, @@ -972,7 +967,7 @@ def compare_groups_of_contacts(groups, myfig.tight_layout() if interface: - freqs = [_mdcu.str_and_dict.unify_freq_dicts({key : val[ii] for key, val in freqs.items()}, exclude, + freqs = [_mdcu.str_and_dict.unify_freq_dicts({key : val[ii] for key, val in freqs.items()}, per_residue=False, defrag=defrag) for ii in [0,1]] by_interface_sorted_keys = [_sorting_schemes(idict, sort_by=kwargs_plot_unified_freq_dicts.get("sort_by", "mean"), @@ -984,7 +979,7 @@ def compare_groups_of_contacts(groups, freqs[0][key].update(freqs[1][key]) freqs = freqs[0] else: - freqs = _mdcu.str_and_dict.unify_freq_dicts(freqs, exclude, + freqs = _mdcu.str_and_dict.unify_freq_dicts(freqs, per_residue=per_residue, defrag=defrag) if per_residue or interface: From 3669e4691f905cb2f52444ad761669e498318ea1 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 27 Aug 2024 19:45:02 +0200 Subject: [PATCH 57/83] [contacts.ContactGroup._args2df] "self interface residx" only if interface --- mdciao/contacts/contacts.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 97ce67bf..3a6d761f 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -7922,13 +7922,14 @@ def _full_color_list(top, df, colors=None) -> _DF: if colors is None: _colors = 
list(_mdcplots.color_dict_guesser("tab10", _np.arange(len(frags_from_df))).values()) jdf["frag_color"] = list(_mdcflare._utils.col_list_from_input_and_fragments(_colors, frags_from_df)) - one_intf_frag = _np.unique(jdf["interface fragment"]) - if len(one_intf_frag)==1: - assert jdf["self interface residx"].any(), ValueError("If there's only one interface fragment, " - "then there should be some shared residues between interface members") - #For the purposes of the flareplot we'll split the shared residxs - jdf.loc[jdf["self interface residx"], "interface fragment"] = {1 : 0, - 0 : 1}[one_intf_frag[0]] + if "interface fragment" in df.keys(): + one_intf_frag = _np.unique(jdf["interface fragment"]) + if len(one_intf_frag)==1: + assert jdf["self interface residx"].any(), ValueError("If there's only one interface fragment, " + "then there should be some shared residues between interface members") + #For the purposes of the flareplot we'll split the shared residxs + jdf.loc[jdf["self interface residx"], "interface fragment"] = {1 : 0, + 0 : 1}[one_intf_frag[0]] else: jdf["frag_color"] = list(_mdcflare._utils.col_list_from_input_and_fragments(colors, frags_from_df)) From d61fef5ff951fd9f29de6860faf72702631b2dac Mon Sep 17 00:00:00 2001 From: gph82 Date: Wed, 28 Aug 2024 13:41:52 +0200 Subject: [PATCH 58/83] [contacts.ContactGroup.repframes] docs --- mdciao/contacts/contacts.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 3a6d761f..072b7a27 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -6261,13 +6261,13 @@ def repframes(self, scheme="mode", Please note that "representative" can have other meanings in other contexts. Here, it's just a way to pick a frames/geometries that will most likely resemble most of what - is also seen in the distributions, barplots and flareplots. 
+ is also seen in the distributions, barplots, violinplots, and flareplots. Please also note that minimizing **averages** has its own limitations and might not always yield the best result, However, it is the easiest and quickest to implement. Feel free to use any of Sklearn's great regression tools - under constraints to get a better "representative" + under constraints to get a better "representative". Parameters ---------- @@ -6277,8 +6277,8 @@ def repframes(self, scheme="mode", to the most likely distance, i.e. to the mode, i.e. to the distance values at which - the distributions (:obj:`plot_distance_distributions`) - peak. You can check the modes in + the distributions (:obj:`plot_distance_distributions` or :obj:`plot_violins`) + peak. You can check the mode values in :obj:`~mdciao.contacts.ContactGroup.modes` * "mean" : minimize average distance to the mean values of the distances @@ -6317,21 +6317,23 @@ def repframes(self, scheme="mode", Returns ------- frames : list - A list of :obj:`n_frames` tuples, - each tuple containing the traj_idx - and the frame_idx that minimize - RMSDd + A list of `n_frames` tuples, + each tuple containing the trajectory + and frame index that minimize + RMSDd. RMSDd : np.ndarray A 1D array containing the root-mean-square-deviation (in Angstrom) over distances (not positions) - of the returned :obj:`frames` to the - computed :obj:`reference`. This mean - is weighted by the contact frequencies - in case a :obj:`ctc_cutoff_Ang` was given. - Should always be in ascending order + of the returned `frames` to the + computed reference as specified by the `scheme`. + This mean is weighted by the contact frequencies + in case a `ctc_cutoff_Ang` was given. + Should always be in ascending order, i.e. + the `frames` are sorted from closest to furthest + to the reference. 
values : np.ndarray A 2D array of shape(n_frames, n_ctcs) containing - the distance values of the :obj:`frames` in + the distance values of the `frames` in Angstrom trajs : list A list of :obj:`~mdtraj.Trajectory` objects From 3701747536971a365de19e4df6da1d5954802cae Mon Sep 17 00:00:00 2001 From: gph82 Date: Wed, 28 Aug 2024 13:49:55 +0200 Subject: [PATCH 59/83] [cli.interface] use the repframe instead of refgeom for SS display in the flareplot --- mdciao/cli/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 761c5655..6f8ece46 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1671,7 +1671,7 @@ def interface( print(fn.fullpath_overall_dat) if n_repframes>0: n_repframes = _np.min((n_repframes,50)) - repframes_geom = ctc_grp_intf.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang, return_traj=True, n_frames=n_repframes, verbose=False)[-1] + repframes_geom = ctc_grp_intf.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang, return_traj=True, n_frames=n_repframes, verbose=False)[-1][0] repframes_geom = _md.Trajectory([geom.xyz[0] for geom in repframes_geom], topology=repframes_geom[0].top, unitcell_angles=[geom.unitcell_angles[0] for geom in repframes_geom], unitcell_lengths=[geom.unitcell_lengths[0] for geom in repframes_geom], @@ -1744,7 +1744,7 @@ def interface( consensus_maps_.append(consensus_maps[key]) ifig, iax = ctc_grp_intf.plot_freqs_as_flareplot(ctc_cutoff_Ang, consensus_maps=consensus_maps_, - SS=refgeom, + SS= ctc_grp_intf.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang, return_traj=True, n_frames=1, verbose=False)[-1], fragment_names=fragment_names, fragments=fragments_as_residue_idxs, ) From 2ea269ac2d65de890af3f93e27eb87275ecf75e2 Mon Sep 17 00:00:00 2001 From: gph82 Date: Wed, 28 Aug 2024 17:25:50 +0200 Subject: [PATCH 60/83] [flare.freqs2flare] docs --- mdciao/flare/flare.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mdciao/flare/flare.py 
b/mdciao/flare/flare.py index 01a45780..173a7fc0 100644 --- a/mdciao/flare/flare.py +++ b/mdciao/flare/flare.py @@ -183,13 +183,14 @@ def freqs2flare(freqs, res_idxs_pairs, simultaneously. SS : any, default is None Can be several things: - * Array containing secondary structure (ss) information to - be included in the flareplot. Indexed by residue index, - i.e. it can also be a dictionary as long as - SS[idx] returns the SS for residue with that residue idx - * Path to filename, will be passed to - :obj:`mdciao.utils.residue_and_atom.get_SS`, check - the docs there + + * Array containing secondary structure (ss) information to + be included in the flareplot. Indexed by residue index, + i.e. it can also be a dictionary as long as + SS[idx] returns the SS for residue with that residue idx + * Path to filename, will be passed to + :obj:`mdciao.utils.residue_and_atom.get_SS`, check + the docs there panelsize: float, default is 10 Size in inches of the panel (=figsize in matplotlib). Will be ignored if a pre-existing axis object is parsed @@ -293,9 +294,9 @@ def freqs2flare(freqs, res_idxs_pairs, You can do ax.figure.savefig("figure.png") to save the figure. Checkout :obj:`~matplotlib.figure.Figure.savefig` for more options - - plotted_pairs : 2D np.ndarray - + plotted_pairs : 1D np.ndarray + The indices of `res_idxs_pairs` + that have been plotted. 
plot_attribs : dict Objects of the plot if the user wants to manipulate them further or re-use From c8e7824610625a86b1db52ee05d005e95e374298 Mon Sep 17 00:00:00 2001 From: gph82 Date: Wed, 28 Aug 2024 17:28:25 +0200 Subject: [PATCH 61/83] [contacts.ContactGroup.plot_freqs_as_flareplot] API CHANGE: optarg fragments can be string, also flareplot_attrs --- mdciao/contacts/contacts.py | 36 ++++++++++++++++++++++++------------ tests/test_contacts.py | 12 ++++++------ 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index 072b7a27..f517c704 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -4157,7 +4157,7 @@ def frequency_as_contact_matrix_CG(self, Frequencies of :obj:`self.frequency_per_contact` get coarse-grained into fragments. Fragment - definitions come from :obj:`fragments` and/or + definitions come from `fragments` and/or from the :obj:`consensus_labelers`. These definitions need to contain all residues in self.res_idxs_pairs @@ -4166,7 +4166,7 @@ def frequency_as_contact_matrix_CG(self, definitions get spliced together using :obj:`~mdciao.fragments.splice_orphan_fragments`. This might lead to sub-sets of the input - :obj:`fragments` getting re-labeled as "subfrags" + `fragments` getting re-labeled as "subfrags" and residues not defined anywhere being labelled "orphans". This leads to cumbersome fragment names (and can change in the future), @@ -5782,12 +5782,14 @@ def plot_freqs_as_flareplot(self, ctc_cutoff_Ang, ---------- ctc_cutoff_Ang : float The cutoff to use - fragments : list of iterables, default is None + fragments : string or list of iterables, default is None The way the topology is fragmented. Default is to put all residues in one fragment. This optarg can modify the behaviour of scheme='all', - since residues absent from :obj:`fragments` - will not be plotted, see below. + since residues absent from `fragments` + will not be plotted, see below. 
If string, + it will be passed as `method` to :obj:`mdciao.fragments.get_fragments`, + to get the fragments on the fly. fragment_names : list of strings, default is None The fragment names, at least len(fragments) fragment_colors : None or list of color-likes @@ -5864,15 +5866,15 @@ def plot_freqs_as_flareplot(self, ctc_cutoff_Ang, only work if self.is_interface is True * 'auto' Uses :obj:`self.is_interface` to decide. If True, - :obj:`scheme` is set to 'interface'. + `scheme` is set to 'interface'. If False, e.g. a residue neighborhood or - a site, then :obj:`scheme` is set to 'all' + a site, then `scheme` is set to 'all' * 'interface_sparse': - like 'interface', but using the input :obj:`fragments` + like 'interface', but using the input `fragments` to break self.interface_fragments (which are only two, by definition) further down into other fragments. Of these, show only the ones where at least one residue - participates in the interface. If :obj:`fragments` is + participates in the interface. If `fragments` is None, `scheme='interface'` and `scheme='interface_sparse'` are the same thing. * 'residues': @@ -5884,8 +5886,8 @@ def plot_freqs_as_flareplot(self, ctc_cutoff_Ang, like 'interface_sparse', but leaving out sub-domains not participating in the interface with any contacts.For this, - the :obj:`consensus_maps` need to - be actual :obj:`LabelerConsensus`-objects + the `consensus_maps` need to + be actual `LabelerConsensus`-objects kwargs_freqs2flare: dict Optional keyword arguments for :obj:`mdciao.flare.freqs2flare`. Note that many of these kwargs will be overwritten internally @@ -5907,8 +5909,18 @@ def plot_freqs_as_flareplot(self, ctc_cutoff_Ang, ------- ifig : :obj:`~matplotlib.figure.Figure` ax : :obj:`~matplotlib.axes.Axes` + flareplot_attrs : dict + Flareplot attributes as dictionary containing + matplotlib objects (texts, dots, curves etc) + for further manipulation and fine tuning + of the plot if necessary. 
See the returned + values of :obj:`mdciao.flare.freqs2flare` + for more information. """ + if isinstance(fragments, str): + fragments = _mdcfr.get_fragments(self.top, fragments, verbose=True) + # We need three (!) methods to guess around the fragments/names/colors...this is bad but "easier" to debug df = self._args2df(ctc_cutoff_Ang, fragments, fragment_names, consensus_maps, verbose=False) fcdf = _full_color_list(self.top, df, colors=fragment_colors) @@ -5940,7 +5952,7 @@ def plot_freqs_as_flareplot(self, ctc_cutoff_Ang, _mdcflare._utils.change_axlims_and_resize_Texts(iax, outer_r_in_data_units) ifig = iax.figure #ifig.tight_layout() - return ifig, iax + return ifig, iax, flareplot_attrs def _args2df(self, ctc_cutoff_Ang, fragments, fragment_names, consensus_maps, verbose) -> _DF: r""" diff --git a/tests/test_contacts.py b/tests/test_contacts.py index c72f179f..0ed57b0b 100644 --- a/tests/test_contacts.py +++ b/tests/test_contacts.py @@ -2740,7 +2740,7 @@ def test_plot_freqs_as_flareplot_just_runs(self): # the minimal examples here cannot test the full flareplot # TODO add full-fledged example here? CG = contacts.ContactGroup([self.cp1_wtop_and_conslabs,self.cp2_wtop_and_conslabs, self.cp3_wtop_and_conslabs]) - ifig, iax = CG.plot_freqs_as_flareplot(10,) + ifig, iax, flareplot_attrs = CG.plot_freqs_as_flareplot(10,) @unittest.skipIf(_sys.version.startswith("3.7") and _platform.system().lower()=="darwin", "Random segfaults when using md.compute_dssp on Python 3.7 on MacOs. See https://github.com/mdtraj/mdtraj/issues/1574") def test_plot_freqs_as_flareplot_just_runs_w_options(self): @@ -2749,7 +2749,7 @@ def test_plot_freqs_as_flareplot_just_runs_w_options(self): # TODO add full-fledged example here? 
CG = contacts.ContactGroup([self.cp1_wtop_and_conslabs,self.cp2_wtop_and_conslabs, self.cp3_wtop_and_conslabs], top=self.top) - ifig, iax = CG.plot_freqs_as_flareplot(10,SS=self.geom) + ifig, iax, flareplot_attrs = CG.plot_freqs_as_flareplot(10, SS=self.geom) ifig.tight_layout() _plt.close("all") #ifig.savefig("test.pdf") @@ -2757,16 +2757,16 @@ def test_plot_freqs_as_flareplot_just_runs_w_options(self): def test_plot_freqs_as_flareplot_just_runs_w_consensus_maps(self): CG = contacts.ContactGroup([self.cp1_wtop_and_conslabs,self.cp2_wtop_and_conslabs, self.cp3_wtop_and_conslabs], top=self.top) - ifig, iax = CG.plot_freqs_as_flareplot(10,SS=self.geom, - consensus_maps=[["GPH"]*self.top.n_residues]) + ifig, iax, flareplot_attrs = CG.plot_freqs_as_flareplot(10, SS=self.geom, + consensus_maps=[["GPH"] * self.top.n_residues]) ifig.tight_layout() _plt.close("all") def test_plot_freqs_as_flareplot_just_runs_w_SS_array(self): CG = contacts.ContactGroup([self.cp1_wtop_and_conslabs,self.cp2_wtop_and_conslabs, self.cp3_wtop_and_conslabs], top=self.top) - ifig, iax = CG.plot_freqs_as_flareplot(10, - SS=_np.array(["H"]*self.top.n_residues)) + ifig, iax, flareplot_attrs = CG.plot_freqs_as_flareplot(10, + SS=_np.array(["H"] * self.top.n_residues)) ifig.tight_layout() _plt.close("all") From 45a84cb30548e28b5a859807af2969a4edb2971f Mon Sep 17 00:00:00 2001 From: gph82 Date: Wed, 28 Aug 2024 17:29:50 +0200 Subject: [PATCH 62/83] [cli.interface] adapt to new return behavior of plot_freqs_as_flareplot --- mdciao/cli/cli.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 6f8ece46..3ee6a371 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1742,12 +1742,14 @@ def interface( consensus_maps_.append(consensus_labelers[key]) elif key in consensus_maps.keys(): consensus_maps_.append(consensus_maps[key]) - ifig, iax = ctc_grp_intf.plot_freqs_as_flareplot(ctc_cutoff_Ang, - consensus_maps=consensus_maps_, - 
SS= ctc_grp_intf.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang, return_traj=True, n_frames=1, verbose=False)[-1], - fragment_names=fragment_names, - fragments=fragments_as_residue_idxs, - ) + ifig, iax, _ = ctc_grp_intf.plot_freqs_as_flareplot(ctc_cutoff_Ang, + consensus_maps=consensus_maps_, + SS=ctc_grp_intf.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang, + return_traj=True, n_frames=1, + verbose=False)[-1][0], + fragment_names=fragment_names, + fragments=fragments_as_residue_idxs, + ) ifig.tight_layout() if savefigs: ifig.savefig(fn.fullpath_flare_vec, bbox_inches="tight") From 15a74191d41213e644f096b13875c440b70353b2 Mon Sep 17 00:00:00 2001 From: gph82 Date: Fri, 30 Aug 2024 21:23:29 +0200 Subject: [PATCH 63/83] [contacts.ContactGroup.repframes] API CHANGE: returned value 'trajs' is not list of trajs but joined trajs, cli.interface adapted --- mdciao/cli/cli.py | 4 ++-- mdciao/contacts/contacts.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 3ee6a371..ac962ebc 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1496,7 +1496,7 @@ def interface( in the interface are, on average over all pairs, at a distance close to the most-likely the residue-residue distances over all data. This has some caveats, - expressed in the documentation of :obj:`mdciao.contacts.ContactGroup.n_repframes`. + expressed in the documentation of :obj:`mdciao.contacts.ContactGroup.repframes`. 
To check what frames have been chosen as representative, it is better to run mdciao in API mode and call :obj:`mdciao.contacts.ContactGroup.n_repframes` @@ -1671,7 +1671,7 @@ def interface( print(fn.fullpath_overall_dat) if n_repframes>0: n_repframes = _np.min((n_repframes,50)) - repframes_geom = ctc_grp_intf.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang, return_traj=True, n_frames=n_repframes, verbose=False)[-1][0] + repframes_geom = ctc_grp_intf.repframes(ctc_cutoff_Ang=ctc_cutoff_Ang, return_traj=True, n_frames=n_repframes, verbose=False)[-1] repframes_geom = _md.Trajectory([geom.xyz[0] for geom in repframes_geom], topology=repframes_geom[0].top, unitcell_angles=[geom.unitcell_angles[0] for geom in repframes_geom], unitcell_lengths=[geom.unitcell_lengths[0] for geom in repframes_geom], diff --git a/mdciao/contacts/contacts.py b/mdciao/contacts/contacts.py index f517c704..dd0d5d15 100644 --- a/mdciao/contacts/contacts.py +++ b/mdciao/contacts/contacts.py @@ -6347,9 +6347,9 @@ def repframes(self, scheme="mode", A 2D array of shape(n_frames, n_ctcs) containing the distance values of the `frames` in Angstrom - trajs : list - A list of :obj:`~mdtraj.Trajectory` objects - Only if return_traj=True + trajs : :obj:`~mdtraj.Trajectory` + An :obj:`~mdtraj.Trajectory` with `n_frames` + frames. Only if `return_traj`=True """ all_ds = self.stacked_time_traces @@ -6396,7 +6396,10 @@ def repframes(self, scheme="mode", print("Returning frame %u of traj nr. %u: %s"%(frame_idx, traj_idx, reptraj)) if isinstance(reptraj, str): if _path.exists(reptraj): - geoms.append(_md.load(reptraj, top=self.top,frame=frame_idx)) + if len(geoms) == 0: + geoms = _md.load(reptraj, top=self.top,frame=frame_idx) + else: + geoms = geoms.join(_md.load(reptraj, top=self.top, frame=frame_idx)) else: raise FileNotFoundError(f"The file '{reptraj}' can't be found anymore. 
Is this an `mdciao.examples` object?") else: From 8101d697e52436172162ab2c0bb5100da23d21e6 Mon Sep 17 00:00:00 2001 From: gph82 Date: Fri, 30 Aug 2024 21:39:22 +0200 Subject: [PATCH 64/83] [examples/notebooks] Adapt to new API behavior (repframes and flareplot_attrs) --- mdciao/examples/Comparing_CGs_Flares.ipynb | 154 ++++-- mdciao/examples/EGFR_Kinase_Inhibitors.ipynb | 48 +- mdciao/examples/Flareplot_Schemes.ipynb | 85 ++-- .../examples/MSA_via_Consensus_Labels.ipynb | 449 +++++++++++++++++- mdciao/examples/Manuscript.ipynb | 24 +- mdciao/examples/Missing_Contacts.ipynb | 54 ++- 6 files changed, 648 insertions(+), 166 deletions(-) diff --git a/mdciao/examples/Comparing_CGs_Flares.ipynb b/mdciao/examples/Comparing_CGs_Flares.ipynb index e86ffad5..ded53180 100644 --- a/mdciao/examples/Comparing_CGs_Flares.ipynb +++ b/mdciao/examples/Comparing_CGs_Flares.ipynb @@ -105,7 +105,7 @@ "outputs": [], "source": [ "intfWT = interfaces[\"WT\"]\n", - "ifig, iax = intfWT.plot_freqs_as_flareplot(4, scheme=\"all\")" + "ifig, iax, flareplot_attrs = intfWT.plot_freqs_as_flareplot(4, scheme=\"all\")" ] }, { @@ -157,10 +157,10 @@ "source": [ "fragments = mdciao.fragments.get_fragments(intfWT.top,method=\"chains\");\n", "fragment_names = [\"RBD\",\"GLC^RBD\", \"ACE\",\"GLC^ACE\", \"Zn\\nand Cl\",\"NaCl\"]\n", - "ifig, iax = intfWT.plot_freqs_as_flareplot(4, \n", - " scheme=\"all\",\n", - " fragments=fragments, \n", - " fragment_names=fragment_names)" + "ifig, iax, flareplot_attrs= intfWT.plot_freqs_as_flareplot(4, \n", + " scheme=\"all\",\n", + " fragments=fragments, \n", + " fragment_names=fragment_names)" ] }, { @@ -176,8 +176,8 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intfWT.plot_freqs_as_flareplot(4, fragments=fragments,\n", - " fragment_names=fragment_names)" + "ifig, iax, flareplot_attrs = intfWT.plot_freqs_as_flareplot(4, fragments=fragments,\n", + " fragment_names=fragment_names)" ] }, { @@ -260,9 +260,9 @@ "metadata": {}, "outputs": [], "source": [ - 
"ifig, iax = intfWT.plot_freqs_as_flareplot(4, fragments=fragments, \n", - " fragment_names=fragment_names,\n", - " SS=\"example_cov19/run3-clone0.stride.050.h5\")" + "ifig, iax, flareplot_attrs = intfWT.plot_freqs_as_flareplot(4, fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " SS=\"example_cov19/run3-clone0.stride.050.h5\")" ] }, { @@ -311,11 +311,11 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intfWT.plot_freqs_as_flareplot(4, fragments=fragments, \n", - " fragment_names=fragment_names,\n", - " SS=\"example_cov19/run3-clone0.stride.050.h5\",\n", - " highlight_residxs=[85,107], \n", - " )" + "ifig, iax, flareplot_attrs = intfWT.plot_freqs_as_flareplot(4, fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " SS=\"example_cov19/run3-clone0.stride.050.h5\",\n", + " highlight_residxs=[85,107], \n", + " )" ] }, { @@ -346,12 +346,12 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intfWT.plot_freqs_as_flareplot(4, fragments=fragments, \n", - " fragment_names=fragment_names,\n", - " SS=\"example_cov19/run3-clone0.stride.050.h5\",\n", - " highlight_residxs=[85,107],\n", - " scheme=\"residues_sparse\",\n", - " )\n", + "ifig, iax, flareplot_attrs = intfWT.plot_freqs_as_flareplot(4, fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " SS=\"example_cov19/run3-clone0.stride.050.h5\",\n", + " highlight_residxs=[85,107],\n", + " scheme=\"residues_sparse\",\n", + " )\n", "iax.set_title(\"WT\\n$\\\\Sigma$ = %2.1f\"%intfWT.frequency_per_contact(4).sum(), fontsize=20)" ] }, @@ -368,6 +368,63 @@ "In the title, we have also included $\\Sigma$, the sum of all plotted contact-frequencies, to provide an indicator of the average number of contacts present(ed) in this interface." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tweaking the flareplot after plotting: flareplot attributes\n", + "\n", + "It is challenging to cover all use cases (and tastes), with respect to how the figure should look, by using named arguments only. This is why [plot_freqs_as_flareplot](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.plot_freqs_as_flareplot) provides access to all elements of the plot (after they've been plotted) via the returned dictionary `flareplot_attrs`. From the [docs](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.plot_freqs_as_flareplot):\n", + " \n", + " Returns\n", + " -------\n", + " ifig : :obj:`~matplotlib.figure.Figure`\n", + " ax : :obj:`~matplotlib.axes.Axes`\n", + " flareplot_attrs : dict\n", + " Flareplot attributes as dictionary containing\n", + " matplotlib objects (texts, dots, curves etc)\n", + " for further manipulation and fine tuning\n", + " of the plot if necessary. 
See the returned\n", + " values of :obj:`mdciao.flare.freqs2flare`\n", + " for more information.\n", + "\n", + "The keys of this dictionary are `'fragment_labels', 'dot_labels', 'dots', 'SS_labels', 'r', 'bezier_lw', 'bezier_curves'`.\n", + "\n", + "For example, we're going to change the color of some labels by manipulating `fragment_labels` after the plot, using\n", + "\n", + "```python\n", + "for text in flareplot_attrs[\"fragment_labels\"]:\n", + " if text.get_text().startswith(\"RBD\"):\n", + " text.set_color(\"tab:blue\")\n", + " elif text.get_text().startswith(\"ACE\"):\n", + " text.set_color(\"tab:green\")\n", + " else:\n", + " text.set_color(\"tab:red\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ifig, iax, flareplot_attrs = intfWT.plot_freqs_as_flareplot(4, fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " SS=\"example_cov19/run3-clone0.stride.050.h5\",\n", + " highlight_residxs=[85,107],\n", + " scheme=\"residues_sparse\",\n", + " )\n", + "iax.set_title(\"WT\\n$\\\\Sigma$ = %2.1f\"%intfWT.frequency_per_contact(4).sum(), fontsize=20)\n", + "for text in flareplot_attrs[\"fragment_labels\"]:\n", + " if text.get_text().startswith(\"RBD\"):\n", + " text.set_color(\"tab:blue\")\n", + " elif text.get_text().startswith(\"ACE\"):\n", + " text.set_color(\"tab:green\")\n", + " else:\n", + " text.set_color(\"tab:red\") " + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -502,13 +559,13 @@ "\n", "* Second, we call [plot_freqs_as_flareplot](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.plot_freqs_as_flareplot) again, using ``plot_curves_only=True`` and passing the ``ax=myax`` of the previous call. 
From the docs:\n", "```\n", - "ax : :obj:`~matplotlib.axes.Axes`, default is None\n", + "ax : :obj:`~matplotlib.axes.Axes`\n", " Parse an axis to draw on, otherwise one will be created\n", - " using :obj:`panelsize`. In case you want to\n", - " re-use the same cirlce of residues as a\n", + " using `panelsize`. In case you want to\n", + " re-use the same circle of residues as a\n", " background to plot different sets\n", - " of :obj:`freqs`, **YOU HAVE TO USE THE SAME**\n", - " :obj:`fragments` and :obj:`sparse` values \n", + " of `freqs`, **YOU HAVE TO USE THE SAME**\n", + " `fragments` and `sparse` values\n", " **on all calls**, else the\n", " bezier lines will be placed erroneously.\n", "[...]\n", @@ -543,26 +600,26 @@ "outputs": [], "source": [ "key1 = \"WT\"\n", - "myfig, myax = interfaces[key1].plot_freqs_as_flareplot(4, \n", - " fragments=mdciao.fragments.get_fragments(interfaces[key1].top, \"chains\", verbose=False), \n", - " fragment_names=fragment_names,\n", - " SS=\"example_cov19/run3-clone0.stride.050.h5\",\n", - " highlight_residxs=[85,107],\n", - " sparse_residues=union,\n", - " bezier_linecolor=colors[key1],\n", - " )\n", + "ifig, iax, flareplot_attrs = interfaces[key1].plot_freqs_as_flareplot(4, \n", + " fragments=mdciao.fragments.get_fragments(interfaces[key1].top, \"chains\", verbose=False), \n", + " fragment_names=fragment_names,\n", + " SS=\"example_cov19/run3-clone0.stride.050.h5\",\n", + " highlight_residxs=[85,107],\n", + " sparse_residues=union,\n", + " bezier_linecolor=colors[key1],\n", + " )\n", "key2=\"K417V\"\n", - "myfig, myax = interfaces[key2].plot_freqs_as_flareplot(4, \n", - " fragments=mdciao.fragments.get_fragments(interfaces[key2].top, \"chains\", verbose=False), \n", - " #fragment_names=fragment_names[:-2],\n", - " #SS=\"run3-clone0.stride.050.h5\",\n", - " #highlight_residxs=[85,107],\n", - " sparse_residues=union,\n", - " bezier_linecolor=colors[key2],\n", - " ax=myax,\n", - " plot_curves_only=True\n", - " )\n", - 
"[myax.plot(np.nan, np.nan, label=key,color=colors[key], lw=5) for key in [key1,key2]]\n", + "ifig, iax, flareplot_attrs = interfaces[key2].plot_freqs_as_flareplot(4, \n", + " fragments=mdciao.fragments.get_fragments(interfaces[key2].top, \"chains\", verbose=False), \n", + " #fragment_names=fragment_names[:-2],\n", + " #SS=\"run3-clone0.stride.050.h5\",\n", + " #highlight_residxs=[85,107],\n", + " sparse_residues=union,\n", + " bezier_linecolor=colors[key2],\n", + " ax=iax,\n", + " plot_curves_only=True\n", + " )\n", + "[iax.plot(np.nan, np.nan, label=key,color=colors[key], lw=5) for key in [key1,key2]]\n", "myfig.suptitle(\"WT vs K417\", y=1.01, fontsize=16)\n", "plt.legend(fontsize=16);" ] @@ -697,6 +754,13 @@ " * Swapping of interaction partners between ``RBD`` and the ``ACE`` glycans\n", " \n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb b/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb index d165e176..45c5f3bc 100644 --- a/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb +++ b/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb @@ -188,22 +188,22 @@ "outputs": [], "source": [ "colors = mdciao.plots.color_dict_guesser(\"tab10\", binding_pocket.keys())\n", - "myfig, myax, keys = mdciao.plots.compare_violins(binding_pocket,\n", - " colors=colors, \n", - " anchor=\"ligand\", \n", - " ctc_cutoff_Ang=4,\n", - " mutations_dict={\n", - " \"EUX1\": \"ligand\",\n", - " \"7VH1\": \"ligand\",\n", - " \"W321\": \"ligand\",\n", - " \"03P1\": \"ligand\"\n", - " },\n", - " defrag=None,\n", - " sort_by=\"residue\",\n", - " inch_per_contacts=.80,\n", - " legend_rows=2, \n", - " representatives=True,\n", - " )\n", + "myfig, myax, keys, representatives = mdciao.plots.compare_violins(binding_pocket,\n", + " colors=colors, \n", + " anchor=\"ligand\", \n", + " ctc_cutoff_Ang=4,\n", + " mutations_dict={\n", + " \"EUX1\": \"ligand\",\n", + " \"7VH1\": 
\"ligand\",\n", + " \"W321\": \"ligand\",\n", + " \"03P1\": \"ligand\"\n", + " },\n", + " defrag=None,\n", + " sort_by=\"residue\",\n", + " inch_per_contacts=.80,\n", + " legend_rows=2, \n", + " representatives=True,\n", + " )\n", "myax.set_title(\"binding pocket interactions\"\n", " \"\\nfor 4 different EGFR inhibitors\")\n", "myfig.tight_layout()\n", @@ -216,21 +216,7 @@ "metadata": {}, "source": [ "# Show the representative geometries\n", - "These are the same geometries being shown as small dots inside the violins of the previous figure, using the [repframes](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.repframes) method:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4016ab27", - "metadata": {}, - "outputs": [], - "source": [ - "representatives = {}\n", - "ref = None\n", - "for key, bp in binding_pocket.items():\n", - " repframe = bp.repframes(return_traj=True)[-1][0]\n", - " representatives[key] = repframe" + "The object `representatives` is a dictionary containing the geometries behind the small dots inside the violins of the previous figure, using the [repframes](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.repframes) method. In the next cells we will first align them and then overlap them using the KLIFS nomenclature." ] }, { diff --git a/mdciao/examples/Flareplot_Schemes.ipynb b/mdciao/examples/Flareplot_Schemes.ipynb index e5721031..af8ef12f 100644 --- a/mdciao/examples/Flareplot_Schemes.ipynb +++ b/mdciao/examples/Flareplot_Schemes.ipynb @@ -25,7 +25,15 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking https://files.rcsb.org/download/3SN6.pdb ..." 
+ ] + } + ], "source": [ "import mdciao\n", "geom = mdciao.cli.pdb(\"3SN6\")\n", @@ -125,7 +133,7 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(3.5, scheme=\"all\");\n", + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(3.5, scheme=\"all\");\n", "ifig.savefig(\"flare.all.pdf\")" ] }, @@ -146,10 +154,10 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(3.5, \n", - " scheme=\"all\",\n", - " fragments=fragments,\n", - " fragment_names=fragment_names)\n", + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(3.5, \n", + " scheme=\"all\",\n", + " fragments=fragments,\n", + " fragment_names=fragment_names)\n", "ifig.savefig(\"flare.all.w_fragments.pdf\")" ] }, @@ -225,11 +233,11 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(3.5,\n", - " scheme=\"interface_sparse\", \n", - " fragments=fragments, \n", - " fragment_names=fragment_names,\n", - " SS=True,\n", + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(3.5,\n", + " scheme=\"interface_sparse\", \n", + " fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " SS=True,\n", " );\n", "ifig.savefig(\"flare.interface_sparse.w_fragments.pdf\")" ] @@ -300,12 +308,12 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(3.5, \n", - " scheme=\"interface_sparse\", \n", - " fragments=fragments, \n", - " fragment_names=fragment_names,\n", - " consensus_maps=[GPCR, CGN],\n", - " SS=True)\n", + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(3.5, \n", + " scheme=\"interface_sparse\", \n", + " fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " consensus_maps=[GPCR, CGN],\n", + " SS=True)\n", "ifig.savefig(\"flare.interface_sparse.w_fragments.w_consensus.pdf\")" ] }, @@ -339,13 +347,12 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(3.5, \n", - " 
scheme=\"consensus_sparse\", \n", - " fragments=fragments, \n", - " fragment_names=fragment_names,\n", - " consensus_maps=[GPCR, CGN],\n", - " SS=True,\n", - " );\n", + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(3.5, \n", + " scheme=\"consensus_sparse\", \n", + " fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " consensus_maps=[GPCR, CGN],\n", + " SS=True);\n", "ifig.savefig(\"flare.consensus.w_fragments.w_consensus.pdf\")" ] }, @@ -408,13 +415,13 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(3.5,\n", - " scheme=\"residues\", \n", - " fragments=fragments, \n", - " fragment_names=fragment_names,\n", - " consensus_maps=[GPCR, CGN],\n", - " SS=True,\n", - " );\n", + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(3.5,\n", + " scheme=\"residues\", \n", + " fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " consensus_maps=[GPCR, CGN],\n", + " SS=True,\n", + " );\n", "\n", "\n", "ifig.savefig(\"flare.residues.w_fragments.w_consensus.pdf\")" @@ -434,14 +441,14 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(3.5,\n", - " scheme=\"all\", \n", - " fragments=fragments, \n", - " fragment_names=fragment_names,\n", - " consensus_maps=[GPCR, CGN],\n", - " SS=True,\n", - " coarse_grain=True, \n", - " );" + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(3.5,\n", + " scheme=\"all\", \n", + " fragments=fragments, \n", + " fragment_names=fragment_names,\n", + " consensus_maps=[GPCR, CGN],\n", + " SS=True,\n", + " coarse_grain=True, \n", + " );" ] }, { diff --git a/mdciao/examples/MSA_via_Consensus_Labels.ipynb b/mdciao/examples/MSA_via_Consensus_Labels.ipynb index 12022f88..08212f71 100644 --- a/mdciao/examples/MSA_via_Consensus_Labels.ipynb +++ b/mdciao/examples/MSA_via_Consensus_Labels.ipynb @@ -19,10 +19,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "efd17285", "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6cb6f97fd6ac40869a65c21b1df76b46", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import mdciao\n", "import nglview\n", @@ -39,10 +52,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "b4f92862", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking https://files.rcsb.org/download/3CAP.pdb ...Please cite the following 3rd party publication:\n", + " * Crystal structure of the ligand-free G-protein-coupled receptor opsin\n", + " Park, J.H. et al., Nature 2008\n", + " https://doi.org/10.1038/nature07063\n", + "Checking https://files.rcsb.org/download/3SN6.pdb ...Please cite the following 3rd party publication:\n", + " * Crystal structure of the beta2 adrenergic receptor-Gs protein complex\n", + " Rasmussen, S.G. et al., Nature 2011\n", + " https://doi.org/10.1038/nature10361\n", + "Checking https://files.rcsb.org/download/6DDF.pdb ..." + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/guille/miniconda3/lib/python3.11/site-packages/mdtraj/formats/pdb/pdbfile.py:206: UserWarning: Unlikely unit cell vectors detected in PDB file likely resulting from a dummy CRYST1 record. Discarding unit cell vectors.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please cite the following 3rd party publication:\n", + " * Structure of the mu-opioid receptor-Giprotein complex.\n", + " Koehl, A. et al., Nature 2018\n", + " https://doi.org/10.1038/s41586-018-0219-7\n", + "Checking https://files.rcsb.org/download/7CKW.pdb ...Please cite the following 3rd party publication:\n", + " * Ligand recognition and allosteric regulation of DRD1-Gs signaling complexes.\n", + " Xiao, P. 
et al., Cell 2021\n", + " https://doi.org/10.1016/j.cell.2021.01.028\n" + ] + } + ], "source": [ "pdbs = {\"OPS\" : mdciao.cli.pdb(\"3CAP\"), \n", " \"B2AR\" : mdciao.cli.pdb(\"3SN6\"), \n", @@ -60,10 +111,98 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "5b2f4abe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No local file ./opsd_bovin.xlsx found, checking online in\n", + "https://gpcrdb.org/services/residues/extended/opsd_bovin ...done!\n", + "Please cite the following reference to the GPCRdb:\n", + " * Kooistra et al, (2021) GPCRdb in 2021: Integrating GPCR sequence, structure and function\n", + " Nucleic Acids Research 49, D335--D343\n", + " https://doi.org/10.1093/nar/gkaa1080\n", + "For more information, call mdciao.nomenclature.references()\n", + "No local file ./adrb2_human.xlsx found, checking online in\n", + "https://gpcrdb.org/services/residues/extended/adrb2_human ..." + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:386: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.\n", + " df = _read_json(a.text)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "done!\n", + "Please cite the following reference to the GPCRdb:\n", + " * Kooistra et al, (2021) GPCRdb in 2021: Integrating GPCR sequence, structure and function\n", + " Nucleic Acids Research 49, D335--D343\n", + " https://doi.org/10.1093/nar/gkaa1080\n", + "For more information, call mdciao.nomenclature.references()\n", + "No local file ./oprm_mouse.xlsx found, checking online in\n", + "https://gpcrdb.org/services/residues/extended/oprm_mouse ..." 
+ ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:386: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.\n", + " df = _read_json(a.text)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "done!\n", + "Please cite the following reference to the GPCRdb:\n", + " * Kooistra et al, (2021) GPCRdb in 2021: Integrating GPCR sequence, structure and function\n", + " Nucleic Acids Research 49, D335--D343\n", + " https://doi.org/10.1093/nar/gkaa1080\n", + "For more information, call mdciao.nomenclature.references()\n", + "No local file ./DRD1_HUMAN.xlsx found, checking online in\n", + "https://gpcrdb.org/services/residues/extended/drd1_human ..." + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:386: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.\n", + " df = _read_json(a.text)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "done!\n", + "Please cite the following reference to the GPCRdb:\n", + " * Kooistra et al, (2021) GPCRdb in 2021: Integrating GPCR sequence, structure and function\n", + " Nucleic Acids Research 49, D335--D343\n", + " https://doi.org/10.1093/nar/gkaa1080\n", + "For more information, call mdciao.nomenclature.references()\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:386: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. 
To read from a literal string, wrap it in a 'StringIO' object.\n", + " df = _read_json(a.text)\n" + ] + } + ], "source": [ "maps = { \"OPS\": mdciao.nomenclature.LabelerGPCR(\"opsd_bovin\"),\n", " \"B2AR\": mdciao.nomenclature.LabelerGPCR(\"adrb2_human\"),\n", @@ -83,10 +222,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "4d15ec10", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OPS\n", + "Auto-detected fragments with method 'resSeq'\n", + "fragment 0 with 326 AAs MET1 ( 0) - ASN326 (325 ) (0) \n", + "fragment 1 with 326 AAs MET1 ( 326) - ASN326 (651 ) (1) \n", + "fragment 2 with 4 AAs NAG1 ( 652) - BMA4 (655 ) (2) \n", + "fragment 3 with 6 AAs NAG1 ( 656) - BMA4 (661 ) (3) resSeq jumps\n", + "fragment 4 with 2 AAs NAG1 ( 662) - NAG2 (663 ) (4) \n", + "fragment 5 with 4 AAs BGL801 ( 664) - BGL804 (667 ) (5) \n", + "fragment 6 with 1 AAs PLM901 ( 668) - PLM901 (668 ) (6) \n", + "fragment 7 with 2 AAs BGL805 ( 669) - BGL806 (670 ) (7) \n", + "fragment 8 with 1 AAs PLM902 ( 671) - PLM902 (671 ) (8) \n", + "fragment 9 with 10 AAs HOH902 ( 672) - HOH907 (681 ) (9) resSeq jumps\n", + "The GPCR-labels align best with fragments: [0] (first-last: MET1-ASN326).\n", + "\n", + "B2AR\n", + "Auto-detected fragments with method 'resSeq'\n", + "fragment 0 with 51 AAs THR9 ( 0) - GLN59 (50 ) (0) \n", + "fragment 1 with 115 AAs LYS88 ( 51) - VAL202 (165 ) (1) \n", + "fragment 2 with 51 AAs SER205 ( 166) - MET255 (216 ) (2) \n", + "fragment 3 with 132 AAs THR263 ( 217) - LEU394 (348 ) (3) \n", + "fragment 4 with 340 AAs GLN1 ( 349) - ASN340 (688 ) (4) \n", + "fragment 5 with 58 AAs ASN5 ( 689) - ARG62 (746 ) (5) \n", + "fragment 6 with 159 AAs ASN1002 ( 747) - ALA1160 (905 ) (6) \n", + "fragment 7 with 146 AAs GLU30 ( 906) - ARG175 (1051) (7) \n", + "fragment 8 with 61 AAs GLN179 (1052) - ARG239 (1112) (8) \n", + "fragment 9 with 77 AAs CYS265 (1113) - CYS341 (1189) (9) \n", + 
"fragment 10 with 128 AAs GLN1 (1190) - SER128 (1317) (10) \n", + "fragment 11 with 1 AAs P0G1601 (1318) - P0G1601 (1318) (11) \n", + "The GPCR-labels align best with fragments: [7, 8, 9] (first-last: GLU30-CYS341).\n", + "\n", + "MUOR\n", + "Auto-detected fragments with method 'resSeq'\n", + "fragment 0 with 51 AAs LEU5 ( 0) - ILE55 (50 ) (0) \n", + "fragment 1 with 52 AAs THR182 ( 51) - VAL233 (102 ) (1) \n", + "fragment 2 with 114 AAs ASN241 ( 103) - PHE354 (216 ) (2) \n", + "fragment 3 with 336 AAs ASP5 ( 217) - ASN340 (552 ) (3) \n", + "fragment 4 with 53 AAs ILE9 ( 553) - PHE61 (605 ) (4) \n", + "fragment 5 with 281 AAs MET65 ( 606) - ARG345 (886 ) (5) \n", + "fragment 6 with 5 AAs TYR1 ( 887) - ETA5 (891 ) (6) \n", + "The GPCR-labels align best with fragments: [5] (first-last: MET65-ARG345).\n", + "\n", + "DOP\n", + "Auto-detected fragments with method 'resSeq'\n", + "fragment 0 with 53 AAs ASP11 ( 0) - LEU63 (52 ) (0) \n", + "fragment 1 with 51 AAs THR205 ( 53) - MET255 (103 ) (1) \n", + "fragment 2 with 132 AAs THR263 ( 104) - LEU394 (235 ) (2) \n", + "fragment 3 with 340 AAs GLY1 ( 236) - ASN340 (575 ) (3) \n", + "fragment 4 with 58 AAs ASN5 ( 576) - ARG62 (633 ) (4) \n", + "fragment 5 with 128 AAs GLN1 ( 634) - SER128 (761 ) (5) \n", + "fragment 6 with 146 AAs PHE20 ( 762) - LYS165 (907 ) (6) \n", + "fragment 7 with 53 AAs ASN185 ( 908) - HIS237 (960 ) (7) \n", + "fragment 8 with 36 AAs SER263 ( 961) - CYS298 (996 ) (8) \n", + "fragment 9 with 38 AAs CYS307 ( 997) - LEU344 (1034) (9) \n", + "fragment 10 with 2 AAs G3C501 (1035) - CLR502 (1036) (10) \n", + "The GPCR-labels align best with fragments: [6, 7, 8, 9] (first-last: PHE20-LEU344).\n", + "\n" + ] + } + ], "source": [ "pdb_just_receptor = {}\n", "for key, pdb in pdbs.items():\n", @@ -112,10 +314,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "89bbf443", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "a17ea646d3de4bc6a312011a0ab3f5fa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "NGLWidget()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "colors = {\"MUOR\":\"tab:red\", \n", " \"OPS\":\"tab:blue\", \n", @@ -139,10 +356,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "77dce09d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:2369: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " labs_out = [subset[oo] for oo in order]\n" + ] + } + ], "source": [ "AC = mdciao.nomenclature.AlignerConsensus(maps,\n", " tops={key : geom.top for key, geom in pdb_just_receptor.items()})" @@ -158,10 +384,91 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "68a28310", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:1829: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n", + " df = df.applymap(lambda x: \"%u%%\" % x)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OPSB2ARMUORDOP
OPS100%22%24%21%
B2AR22%100%28%42%
MUOR24%28%100%28%
DOP21%42%28%100%
\n", + "
" + ], + "text/plain": [ + " OPS B2AR MUOR DOP\n", + "OPS 100% 22% 24% 21%\n", + "B2AR 22% 100% 28% 42%\n", + "MUOR 24% 28% 100% 28%\n", + "DOP 21% 42% 28% 100%" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "AC.sequence_match()" ] @@ -177,17 +484,106 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "1de3a82f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
consensusOPSB2AR
1153.50x501065746
1163.51x511076757
1173.52x521088769
1183.53x531095780
1193.54x541102785
1203.55x551109793
1213.56x561115800
\n", + "
" + ], + "text/plain": [ + " consensus OPS B2AR\n", + "115 3.50x50 1065 746\n", + "116 3.51x51 1076 757\n", + "117 3.52x52 1088 769\n", + "118 3.53x53 1095 780\n", + "119 3.54x54 1102 785\n", + "120 3.55x55 1109 793\n", + "121 3.56x56 1115 800" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "AC.CAidxs_match(\"3.5*\", keys=[\"OPS\",\"B2AR\"])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "76638fb1", "metadata": {}, "outputs": [], @@ -210,10 +606,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "da01597f-8b39-4eac-bc37-18414e708983", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e726cbd49fc8416898c9bdb902042d3c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "NGLWidget()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "iwd = nglview.NGLWidget()\n", "for ii, (key, geom) in enumerate(pdb_just_receptor.items()):\n", diff --git a/mdciao/examples/Manuscript.ipynb b/mdciao/examples/Manuscript.ipynb index cf0326fa..0ddd806f 100644 --- a/mdciao/examples/Manuscript.ipynb +++ b/mdciao/examples/Manuscript.ipynb @@ -153,11 +153,11 @@ "metadata": {}, "outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(4, \n", - " fragments=fragments, fragment_names = fragment_names, \n", - " scheme=\"consensus_sparse\", consensus_maps=[GPCR, CGN], \n", - " aura=intf.frequency_sum_per_residue_idx_dict(4,return_array=True),\n", - " SS=True)\n", + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(4, \n", + " fragments=fragments, fragment_names = fragment_names, \n", + " scheme=\"consensus_sparse\", consensus_maps=[GPCR, CGN], \n", + " aura=intf.frequency_sum_per_residue_idx_dict(4,return_array=True),\n", + " SS=True)\n", "ifig.figure.savefig(\"flare.svg\")" ] }, @@ -174,14 +174,14 @@ "metadata": {}, 
"outputs": [], "source": [ - "ifig, iax = intf.plot_freqs_as_flareplot(4, \n", - " fragments=fragments, fragment_names = fragment_names, \n", - " consensus_maps=[GPCR, CGN], \n", - " coarse_grain=True,\n", - " )\n", + "ifig, iax, flareplot_attrs = intf.plot_freqs_as_flareplot(4, \n", + " fragments=fragments, fragment_names = fragment_names, \n", + " consensus_maps=[GPCR, CGN], \n", + " coarse_grain=True,\n", + " )\n", "freqs = intf.frequency_as_contact_matrix_CG(4, fragments=fragments, fragment_names = fragment_names,\n", - " consensus_labelers=[GPCR, CGN],\n", - " interface=True).round(1).replace(0,\"\") \n", + " consensus_labelers=[GPCR, CGN],\n", + " interface=True).round(1).replace(0,\"\") \n", "freqs" ] }, diff --git a/mdciao/examples/Missing_Contacts.ipynb b/mdciao/examples/Missing_Contacts.ipynb index 82707b1e..c1a02934 100644 --- a/mdciao/examples/Missing_Contacts.ipynb +++ b/mdciao/examples/Missing_Contacts.ipynb @@ -399,21 +399,30 @@ "metadata": {}, "source": [ "## Interfaces: `self_interface`\n", - "When computing interfaces, one typically computes contacts between two different groups of residues, and these two groups usually don't share any residues, i.e. one group is the GPCR, the other is the G-protein. [mdciao.cli.interface](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.interface.html#mdciao.cli.interface), will actually complain if the do. However, that restriction can be lifted with `self_interface=True`.\n", "\n", - "From the docs\n", - "```\n", - " This is why mdciao.cli.interface doesn't allow interface\n", - " members to share residues by default. However, sometimes it's\n", - " useful to allow it because the contacts of one fragment\n", - " with itself are also important. E.g. 
the\n", - " C-terminus of a receptor interfacing with\n", - " the entire receptor, **including the C-terminus itself**.\n", - " To allow for this behaviour, use `self_interface` = True,\n", - " and possibly increase `n_nearest`, since otherwise\n", - " neighboring residues of the shared set (e.g. C-terminus)\n", - " will always appear as formed.\n", - "```\n", + "From the docs of [mdciao.cli.interface](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.interface.html#mdciao.cli.interface):\n", + " \n", + " Note\n", + " ----\n", + " If your definitions of `interface_selection_1` and\n", + " `interface_selection_2` lead to some overlap between\n", + " the interface members (see below), mdciao's default\n", + " is to ignore contact pairs within the same fragment.\n", + " E.g., in the context of a GPCR, computing\n", + " \"TM3\" vs \"TM*\" (\"TM3\" vs \"all TMs\") won't include\n", + " TM3-TM3 contacts by default. To include these\n", + " (or equivalent) contacts set `self_interface` = True.\n", + "\n", + " Another example could be computing the interface of\n", + " C-terminus of a receptor with the entire receptor,\n", + " where it might be useful to include the contacts of\n", + " the C-terminus with itself.\n", + "\n", + " When using `self_interface` = True, it's advisable to\n", + " increase `n_nearest`, since otherwise neighboring\n", + " residues of the shared set (the TM3-TM3 or the Cterm-Cterm)\n", + " will always appear as formed.\n", + "\n", "\n", "We can compute the contacts of the $\\alpha$5-helix of the G-protein. Whereas most of the helix is straight, the C-terminal bends a bit backwards and interacts with itself:" ] @@ -462,13 +471,18 @@ " AA_selection : str or list, default is None\n", " Whatever the fragment definition and fragment selection\n", " has been, one can further refine the list of\n", - " potential residue pairs by making a per aminoacid (AA)\n", - " selection here.
E.g., if one has selected the interface\n", - " to be \"TM3\" vs \"TM2\", but wants to select only some\n", - " regions of those helices, one can pass here an `AA_selection`.\n", - " [...]\n", + " potential residue pairs by making a selection at\n", + " the level of single aminoacids (AAs).\n", + " E.g., if (like above) one has selected the interface\n", + " to be \"TM3\" vs \"TM2\",\n", + "\n", + " >>> interface_selection_1=\"TM3\"\n", + " >>> interface_selection_2=\"TM2\"\n", + "\n", + " but wants to select only some regions of those helices,\n", + " one can pass here an `AA_selection`.\n", "```\n", - "Please read the rest of the docs, since the paramter has more options than the ones we're about to use.\n", + "Please read the rest of the docs, since the parameter has more options than the ones we're about to use.\n", "\n", "Here, we define the interface as contacts of the $\\alpha$5-helix of the G-protein with the TM-bundle, using \n", "```no_disk=True, interface_selection_1=\"G.H5\", interface_selection_2=\"TM*\"```\n", From 3cf8278f5950f35b705e460fb4bd626466ec9614 Mon Sep 17 00:00:00 2001 From: gph82 Date: Fri, 30 Aug 2024 21:43:41 +0200 Subject: [PATCH 65/83] [examples/notebooks] add numbering to notebooks --- .../{Tutorial.ipynb => 01.Tutorial.ipynb} | 0 ...ntacts.ipynb => 02.Missing_Contacts.ipynb} | 0 ...Bars.ipynb => 03.Comparing_CGs_Bars.ipynb} | 0 ...es.ipynb => 04.Comparing_CGs_Flares.ipynb} | 0 ...hemes.ipynb => 05.Flareplot_Schemes.ipynb} | 0 ...pynb => 06.MSA_via_Consensus_Labels.ipynb} | 0 ....ipynb => 07.EGFR_Kinase_Inhibitors.ipynb} | 0 .../{Manuscript.ipynb => 08.Manuscript.ipynb} | 0 mdciao/examples/examples.py | 26 +++++++++++++------ 9 files changed, 18 insertions(+), 8 deletions(-) rename mdciao/examples/{Tutorial.ipynb => 01.Tutorial.ipynb} (100%) rename mdciao/examples/{Missing_Contacts.ipynb => 02.Missing_Contacts.ipynb} (100%) rename mdciao/examples/{Comparing_CGs_Bars.ipynb => 03.Comparing_CGs_Bars.ipynb} (100%) rename 
mdciao/examples/{Comparing_CGs_Flares.ipynb => 04.Comparing_CGs_Flares.ipynb} (100%) rename mdciao/examples/{Flareplot_Schemes.ipynb => 05.Flareplot_Schemes.ipynb} (100%) rename mdciao/examples/{MSA_via_Consensus_Labels.ipynb => 06.MSA_via_Consensus_Labels.ipynb} (100%) rename mdciao/examples/{EGFR_Kinase_Inhibitors.ipynb => 07.EGFR_Kinase_Inhibitors.ipynb} (100%) rename mdciao/examples/{Manuscript.ipynb => 08.Manuscript.ipynb} (100%) diff --git a/mdciao/examples/Tutorial.ipynb b/mdciao/examples/01.Tutorial.ipynb similarity index 100% rename from mdciao/examples/Tutorial.ipynb rename to mdciao/examples/01.Tutorial.ipynb diff --git a/mdciao/examples/Missing_Contacts.ipynb b/mdciao/examples/02.Missing_Contacts.ipynb similarity index 100% rename from mdciao/examples/Missing_Contacts.ipynb rename to mdciao/examples/02.Missing_Contacts.ipynb diff --git a/mdciao/examples/Comparing_CGs_Bars.ipynb b/mdciao/examples/03.Comparing_CGs_Bars.ipynb similarity index 100% rename from mdciao/examples/Comparing_CGs_Bars.ipynb rename to mdciao/examples/03.Comparing_CGs_Bars.ipynb diff --git a/mdciao/examples/Comparing_CGs_Flares.ipynb b/mdciao/examples/04.Comparing_CGs_Flares.ipynb similarity index 100% rename from mdciao/examples/Comparing_CGs_Flares.ipynb rename to mdciao/examples/04.Comparing_CGs_Flares.ipynb diff --git a/mdciao/examples/Flareplot_Schemes.ipynb b/mdciao/examples/05.Flareplot_Schemes.ipynb similarity index 100% rename from mdciao/examples/Flareplot_Schemes.ipynb rename to mdciao/examples/05.Flareplot_Schemes.ipynb diff --git a/mdciao/examples/MSA_via_Consensus_Labels.ipynb b/mdciao/examples/06.MSA_via_Consensus_Labels.ipynb similarity index 100% rename from mdciao/examples/MSA_via_Consensus_Labels.ipynb rename to mdciao/examples/06.MSA_via_Consensus_Labels.ipynb diff --git a/mdciao/examples/EGFR_Kinase_Inhibitors.ipynb b/mdciao/examples/07.EGFR_Kinase_Inhibitors.ipynb similarity index 100% rename from mdciao/examples/EGFR_Kinase_Inhibitors.ipynb rename to 
mdciao/examples/07.EGFR_Kinase_Inhibitors.ipynb diff --git a/mdciao/examples/Manuscript.ipynb b/mdciao/examples/08.Manuscript.ipynb similarity index 100% rename from mdciao/examples/Manuscript.ipynb rename to mdciao/examples/08.Manuscript.ipynb diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index cee228b0..c782eee3 100644 --- a/mdciao/examples/examples.py +++ b/mdciao/examples/examples.py @@ -407,12 +407,12 @@ def fetch_example_data(alias_or_url="b2ar@Gs", r""" Download the example data as zipfile and unzip it to the working directory. This data is used in the notebooks: - * Manuscript.ipynb (b2ar@Gs) - * Tutorial.ipynb (b2ar@Gs) - * Missing_Contacts.ipynb (b2ar@Gs) - * EGFR Kinase Inhibitors.ipynb (EGFR) - * Comparing_CGs_Bars.ipyn (cov19) - * Comparing_CGs_Flares.ipynb (cov19) + * 08.Manuscript.ipynb (b2ar@Gs) + * 01.Tutorial.ipynb (b2ar@Gs) + * 02.Missing_Contacts.ipynb (b2ar@Gs) + * 07.EGFR Kinase Inhibitors.ipynb (EGFR) + * 03.Comparing_CGs_Bars.ipyn (cov19) + * 04.Comparing_CGs_Flares.ipynb (cov19) which can all be run locally issuing, from the CLI: @@ -458,10 +458,18 @@ def fetch_example_data(alias_or_url="b2ar@Gs", to ghrelin. Provided kindly by Dr. Alexander Vogel (1 traj, ca. 10 MB, 411 frames, dt = 100ns) For the associated publication see here: - * Analysis of the Dynamics of the Human Growth Hormone Secretagogue Receptor Reveals Insights into the Energy Landscape of the Molecule + * Analysis of the Dynamics of the Human Growth Hormone Secretagogue Receptor Reveals Insights + into the Energy Landscape of the Molecule A. A. Smith, E. M. Pacull, S. Stecher, P. W. Hildebrand, A. Vogel, D. Huster, Angew. Chem. Int. Ed. 2023, 62, e202302003. + * Y1 : https://proteinformatics.uni-leipzig.de/mdciao/y1_apo.zip + Neuropeptide Y receptor type 1, Y1 receptor for short, in apo form. + Provided kindly by Dr. Alexander Vogel (1 traj, ca. 
11 MB, 528 frames, dt = 50ns) + For the associated publication see here: + * Towards Probing Conformational States of Y2 Receptor Using Hyperpolarized 129Xe NMR. + Schmidt, P.; Vogel, A.; Schwarze, B.; Seufert, F.; Licha, K.; Wycisk, V.; Kilian, W.; Hildebrand, P.W.; Mitschang, L. + Molecules 2023, 28, 1424. [https://doi.org/10.3390/molecules28031424]() unzip : bool, default is True Try unzipping the file after downloading @@ -476,7 +484,9 @@ def fetch_example_data(alias_or_url="b2ar@Gs", alias2url = {"b2ar@Gs": "https://proteinformatics.uni-leipzig.de//mdciao/mdciao_example.zip", "EGFR": "https://proteinformatics.uni-leipzig.de/mdciao/example_kinases.zip", "cov19" : "https://proteinformatics.uni-leipzig.de/mdciao/example_cov19.zip", - "test": "https://proteinformatics.uni-leipzig.de/mdciao/mdciao_test_small.zip"} + "test": "https://proteinformatics.uni-leipzig.de/mdciao/mdciao_test_small.zip", + "ghrelin@ghsr" : "https://proteinformatics.uni-leipzig.de/mdciao/ghrelin_receptor.zip", + "y1_apo" : "https://proteinformatics.uni-leipzig.de/mdciao/y1_apo.zip"} if alias_or_url in alias2url.keys(): url = alias2url[alias_or_url] From 89546979cbb2c7490208d481aa16499f421bc7d2 Mon Sep 17 00:00:00 2001 From: gph82 Date: Sat, 31 Aug 2024 11:36:20 +0200 Subject: [PATCH 66/83] [examples._recursive_prompt] new optarg skip_on_existing, tests --- mdciao/examples/examples.py | 21 ++++++++++++++++----- tests/test_examples.py | 7 +++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index c782eee3..ab8b5ee5 100644 --- a/mdciao/examples/examples.py +++ b/mdciao/examples/examples.py @@ -340,11 +340,10 @@ def notebooks(folder ="mdciao_notebooks"): return dest -def _recursive_prompt(input_path, pattern, count=1, verbose=False, is_file=False): +def _recursive_prompt(input_path, pattern, count=1, verbose=False, is_file=False, skip_on_existing=False): r""" Ensure input_path doesn't exist and keep generating/prompting 
for alternative filenames/dirnames - Poorman's attempt at actually a good recursive function, but does its job. A maximum recursion depth of 50 is hard-coded @@ -358,16 +357,26 @@ def _recursive_prompt(input_path, pattern, count=1, verbose=False, is_file=False count : int, default is 0 Where in the recursion we are verbose : bool, default is False - is_file : book, default is False + is_file : bool, default is False Name-generating is different for folders than from files: * mdciao_notebook -> mdciao_notebook_00 * mdciao_example.zip -> mdciao_example_00.zip - + skip_on_existing : bool, default is False + If the `input_path` is found, instead of + prompting for a new path, simply skip this + method and return the `input_path` without + doing anything. Returns ------- nox_path : str - A newly created, previosuly non-existent path + By default, `new_path` is a newly created, + previously non-existent path. If the + `input_path` existed, the default is to create + a `new_path`, unless `skip_on_existing` was set + to True, in which case nothing happens and the + existing `input_path` is returned without + doing anything. 
""" @@ -377,6 +386,8 @@ def _recursive_prompt(input_path, pattern, count=1, verbose=False, is_file=False while _path.exists(input_path): if verbose: print("%s exists" % input_path) + if skip_on_existing: + break input_path = _path.join(cwd, pattern) + "_%02u" % count if is_file: input_path += ext diff --git a/tests/test_examples.py b/tests/test_examples.py index e48ebbd7..0d9bf154 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -120,6 +120,13 @@ def test_escapes_recursion(self): with self.assertRaises(RecursionError): examples._recursive_prompt("test_00.dat", "test", is_file=True) + def test_skip_on_existing(self): + with TemporaryDirectory(suffix="_mdciao_test_recursive") as td: + with remember_cwd(): + os.chdir(td) + open("test.00.dat", "w").close() + examples._recursive_prompt("test_00.dat", "test", is_file=True, skip_on_existing=True) + class Test_down_safely(unittest.TestCase): def test_just_works(self): From ec8204667d5738cc5e1422f2711411cd44a3c812 Mon Sep 17 00:00:00 2001 From: gph82 Date: Sat, 31 Aug 2024 11:40:56 +0200 Subject: [PATCH 67/83] [examples._down_url_safely] new optarg skip_on_existing, tests --- mdciao/examples/examples.py | 37 ++++++++++++++++++++----------------- tests/test_examples.py | 12 ++++++++++++ 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index ab8b5ee5..857bb43b 100644 --- a/mdciao/examples/examples.py +++ b/mdciao/examples/examples.py @@ -558,7 +558,7 @@ def _unzip2dir(full_path_zipfile): return full_dir -def _down_url_safely(url, chunk_size = 128, verbose=False, rename_to=None): +def _down_url_safely(url, chunk_size = 128, verbose=False, rename_to=None, skip_on_existing=False): r""" Downloads a file from a URL to a tmpfile and copies it to the current directory @@ -587,22 +587,25 @@ def _down_url_safely(url, chunk_size = 128, verbose=False, rename_to=None): target_file = rename_to filename_nonx = _recursive_prompt(target_file, 
_path.splitext(target_file)[0], - is_file=True, verbose=True) - r = _rget(url, stream=True) - total_size_in_bytes = int(r.headers.get('content-length', 0)) - pb = _tqdma(total=total_size_in_bytes, - desc="Downloading %s to %s" % (filename_orig, _path.basename(filename_nonx))) - with _TDir(suffix="_mdciao_download") as t: - _filename = _path.join(t,_path.basename(filename_nonx)) - with open(_filename, 'wb') as fd: - r.iter_content() - for chunk in r.iter_content(chunk_size=chunk_size): - fd.write(chunk) - pb.update(128) - pb.close() - if verbose: - print("Dowloaded file %s"%filename_nonx) - _shcopy(_filename,filename_nonx) + is_file=True, verbose=True, skip_on_existing=skip_on_existing) + if _path.exists(filename_nonx) and skip_on_existing: + pass + else: + r = _rget(url, stream=True) + total_size_in_bytes = int(r.headers.get('content-length', 0)) + pb = _tqdma(total=total_size_in_bytes, + desc="Downloading %s to %s" % (filename_orig, _path.basename(filename_nonx))) + with _TDir(suffix="_mdciao_download") as t: + _filename = _path.join(t,_path.basename(filename_nonx)) + with open(_filename, 'wb') as fd: + r.iter_content() + for chunk in r.iter_content(chunk_size=chunk_size): + fd.write(chunk) + pb.update(128) + pb.close() + if verbose: + print("Dowloaded file %s"%filename_nonx) + _shcopy(_filename,filename_nonx) return filename_nonx diff --git a/tests/test_examples.py b/tests/test_examples.py index 0d9bf154..19997c56 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -144,6 +144,18 @@ def test_different_name(self): assert local_path.endswith("myfile.zip") assert os.path.exists(local_path) + def test_skip_on_existing(self): + with TemporaryDirectory(suffix="_mdciao_test_down_safely") as td: + with remember_cwd(): + os.chdir(td) + with open("mdciao_test_small.zip","w") as f: + f.write("Won't be overwrriten") + local_path = examples._down_url_safely("https://proteinformatics.uni-leipzig.de/mdciao/mdciao_test_small.zip", + verbose=True, 
skip_on_existing=True) + assert open("mdciao_test_small.zip").read() == "Won't be overwrriten" + assert local_path.endswith("mdciao_test_small.zip") + assert os.path.exists(local_path) + class Test_fetch_example_data(unittest.TestCase): def test_just_works(self): From e5c53c74eef2edabd71e874abbdd5fb9ce9ae199 Mon Sep 17 00:00:00 2001 From: gph82 Date: Sat, 31 Aug 2024 11:54:12 +0200 Subject: [PATCH 68/83] [examples.fetch_example_data] new optarg skip_on_existing, new unzip behaviour, tests --- mdciao/examples/examples.py | 69 ++++++++++++++++++++++++++++--------- tests/test_examples.py | 37 ++++++++++++++++++++ 2 files changed, 89 insertions(+), 17 deletions(-) diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index 857bb43b..a22f787d 100644 --- a/mdciao/examples/examples.py +++ b/mdciao/examples/examples.py @@ -414,16 +414,26 @@ def _recursive_prompt(input_path, pattern, count=1, verbose=False, is_file=False def fetch_example_data(alias_or_url="b2ar@Gs", - unzip=True): - r""" Download the example data as zipfile and unzip it to the working directory. + unzip=True, + skip_on_existing=False): + r""" Download example data as zipfile and unzip it to the working directory. + + The available datasets are (see below for full info) + * 'b2ar@Gs' + * 'EGFR' + * 'cov19' + * 'ghrelin@ghsr' + * 'mor@muor' This data is used in the notebooks: - * 08.Manuscript.ipynb (b2ar@Gs) * 01.Tutorial.ipynb (b2ar@Gs) * 02.Missing_Contacts.ipynb (b2ar@Gs) - * 07.EGFR Kinase Inhibitors.ipynb (EGFR) * 03.Comparing_CGs_Bars.ipyn (cov19) * 04.Comparing_CGs_Flares.ipynb (cov19) + * 07.EGFR Kinase Inhibitors.ipynb (EGFR) + * 08.Manuscript.ipynb (b2ar@Gs) + * 09.Consensus_labels.ipynb (ghrelin@ghsr) + which can all be run locally issuing, from the CLI: @@ -435,9 +445,12 @@ def fetch_example_data(alias_or_url="b2ar@Gs", Note ---- - New filenames for the downloaded file, and the resulting folder - will be generated to avoid overwriting. 
No files will - be overwritten when extracting. + By default, a zipfile is downloaded and then extracted + to a directory with its own name. If these files already + exist, the user will be prompted for new filenames + s.t. no files are ever overwritten when extracting. Use + `unzip` to change filenames a priori and `skip_on_existing` + to simply skip either the downloading or the unzipping. Parameters ---------- @@ -466,24 +479,43 @@ def fetch_example_data(alias_or_url="b2ar@Gs", * ghrelin@ghsr : https://proteinformatics.uni-leipzig.de/mdciao/ghrelin_receptor.zip Growth hormone secretagogue receptor type 1, ghrelin receptor for short, bound - to ghrelin. Provided kindly by Dr. Alexander Vogel (1 traj, ca. 10 MB, 411 frames, dt = 100ns) For the associated publication see here: - * Analysis of the Dynamics of the Human Growth Hormone Secretagogue Receptor Reveals Insights - into the Energy Landscape of the Molecule - A. A. Smith, E. M. Pacull, S. Stecher, P. W. Hildebrand, A. Vogel, D. Huster, - Angew. Chem. Int. Ed. 2023, 62, e202302003. + to ghrelin. Provided kindly by Dr. Alexander Vogel (1 traj, ca. 10 MB, 411 frames, dt = 100ns). For the associated publication see here: + * Analysis of the Dynamics of the Human Growth Hormone Secretagogue Receptor Reveals Insights into the Energy Landscape of the Molecule + A. A. Smith, E. M. Pacull, S. Stecher, P. W. Hildebrand, A. Vogel, D. Huster + Angew. Chem. Int. Ed. 2023, 62, e202302003 * Y1 : https://proteinformatics.uni-leipzig.de/mdciao/y1_apo.zip Neuropeptide Y receptor type 1, Y1 receptor for short, in apo form. Provided kindly by Dr. Alexander Vogel (1 traj, ca. 11 MB, 528 frames, dt = 50ns) For the associated publication see here: + * Towards Probing Conformational States of Y2 Receptor Using Hyperpolarized 129Xe NMR. Schmidt, P.; Vogel, A.; Schwarze, B.; Seufert, F.; Licha, K.; Wycisk, V.; Kilian, W.; Hildebrand, P.W.; Mitschang, L. - Molecules 2023, 28, 1424.
[https://doi.org/10.3390/molecules28031424]() - - unzip : bool, default is True - Try unzipping the file after downloading + Molecules 2023, 28, 1424. [](https://doi.org/10.3390/molecules28031424) + + * mor@muor : https://proteinformatics.uni-leipzig.de/mdciao/muor_199.zip + Active mu-opioid receptor bound to the agonist morphine (1 traj, ca. 7 MB, 400 frames, dt = 100ns). + Kindly made available for this purpose by the GPCRmd. The GPCRmd's simulation + report can be found [here](https://www.gpcrmd.org/dynadb/dynamics/id/199/), + the original publication is: + + * Dynamic and Kinetic Elements of µ-Opioid Receptor Functional Selectivity. + Kapoor, A., Martinez-Rosell, G., Provasi, D. et al. + Sci Rep 7, 11255 (2017). [](https://doi.org/10.1038/s41598-017-11483-8) + + unzip : bool, or str, default is True + Try unzipping the file after downloading. If string, + the url will be downloaded to this filename and then + unzipped to a folder with this name + in the working directory, regardless of the + zipfile's name and internal structure. + skip_on_existing : bool, default is False + In case of finding existing files and directories + skip the downloading and unzipping steps. + This allows for re-runs w/o being prompted + for new filenames to avoid overwriting.
Returns ------- @@ -497,6 +529,7 @@ def fetch_example_data(alias_or_url="b2ar@Gs", "cov19" : "https://proteinformatics.uni-leipzig.de/mdciao/example_cov19.zip", "test": "https://proteinformatics.uni-leipzig.de/mdciao/mdciao_test_small.zip", "ghrelin@ghsr" : "https://proteinformatics.uni-leipzig.de/mdciao/ghrelin_receptor.zip", + "mor@muor" : "https://proteinformatics.uni-leipzig.de/mdciao/muor_199.zip", "y1_apo" : "https://proteinformatics.uni-leipzig.de/mdciao/y1_apo.zip"} if alias_or_url in alias2url.keys(): @@ -505,7 +538,9 @@ def fetch_example_data(alias_or_url="b2ar@Gs", url = alias_or_url else: raise ValueError("Cannot find %s in the known aliases or in the known urls:\n%s" % (alias_or_url, alias2url)) - downed_file_full = _down_url_safely(url) + downed_file_full = _down_url_safely(url, + rename_to=[None if isinstance(unzip,bool) else f"{_path.splitext(unzip)[0]}.zip"][0], + skip_on_existing=skip_on_existing) if unzip: return _unzip2dir(downed_file_full) else: diff --git a/tests/test_examples.py b/tests/test_examples.py index 19997c56..89c76c8d 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -197,6 +197,43 @@ def test_alias(self): assert len(files) == 1 assert files[0] == "mdciao_test_small.zip" + + def test_alias_unzip_to_otherfile(self): + with TemporaryDirectory(suffix="_mdciao_test_fetch") as td: + with remember_cwd(): + os.chdir(td) + local_path = examples.fetch_example_data("test", + unzip="unzip_here") + assert os.path.exists(local_path) + # assert os.path.exists((os.path.splitext(local_path))[0]) + files = os.listdir(td) + assert len(files) == 2 + assert files[0] == "unzip_here.zip" + assert files[1] == "unzip_here" + extracted = os.listdir(files[1]) + assert extracted[0] == "A.dat" + assert extracted[1] == "B.dat" + + + def test_skip_on_existing(self): + with TemporaryDirectory(suffix="_mdciao_test_fetch") as td: + with remember_cwd(): + os.chdir(td) + local_path = examples.fetch_example_data("test", + unzip=False) + assert 
os.path.exists(local_path) + # assert os.path.exists((os.path.splitext(local_path))[0]) + files = os.listdir(td) + assert len(files) == 1 + assert files[0] == "mdciao_test_small.zip" + # Create a fake file to test it doesn't ovewrite + with open("mdciao_test_small.zip", "w") as f: + f.write("Won't be overwrriten") + local_path = examples.fetch_example_data("test", + unzip=False, skip_on_existing=True) + assert open("mdciao_test_small.zip").read() == "Won't be overwrriten" + + class Test_notebooks(unittest.TestCase): def test_just_works(self): From 319e06e80816c084a05d5bed0a447b6b4f19047c Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 01:43:20 +0200 Subject: [PATCH 69/83] [nomenclature._GPCRdb_web_lookup] avoid usage of pandas _read_json --- mdciao/nomenclature/nomenclature.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index d75278c4..0708183a 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -31,7 +31,6 @@ from tempfile import NamedTemporaryFile as _NamedTemporaryFile from pandas import \ - read_json as _read_json, \ read_excel as _read_excel, \ read_csv as _read_csv, \ DataFrame as _DataFrame, \ @@ -383,7 +382,7 @@ def _GPCRdb_web_lookup(url, verbose=True, DFout = ValueError('Contacted %s url successfully (no 404),\n' 'but Uniprot name %s yields nothing' % (url, UniProt_name)) else: - df = _read_json(a.text) + df = _DataFrame(a.json()) mydict = df.T.to_dict() for key, val in mydict.items(): try: From 86b65f62e3a8e1d8171ed73366239b31016d0a52 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 01:44:03 +0200 Subject: [PATCH 70/83] [examples.fetch_example_data] unify names --- mdciao/examples/examples.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index a22f787d..2b26ed74 100644 --- a/mdciao/examples/examples.py +++ 
b/mdciao/examples/examples.py @@ -459,7 +459,7 @@ def fetch_example_data(alias_or_url="b2ar@Gs", Currently, these are the available aliases and their urls * b2ar@Gs : https://proteinformatics.uni-leipzig.de/mdciao/mdciao_example.zip Beta 2 adrenergic receptor in complex with Gs-protein. Provided - kindly by H. Batebi (1 traj, ca. 10 MB, 280 frames, dt = 10 ps) + kindly by Dr. H. Batebi (1 traj, ca. 10 MB, 280 frames, dt = 10 ps) * EGFR : http://proteinformatics.uni-leipzig.de/mdciao/example_kinases.zip Epidermal Growth Factor Receptor (EGFR) in complex with @@ -479,7 +479,7 @@ def fetch_example_data(alias_or_url="b2ar@Gs", * ghrelin@ghsr : https://proteinformatics.uni-leipzig.de/mdciao/ghrelin_receptor.zip Growth hormone secretagogue receptor type 1, ghrelin receptor for short, bound - to ghrelin. Provided kindly by Dr. Alexander Vogel (1 traj, ca. 10 MB, 411 frames, dt = 100ns). + to ghrelin. Provided kindly by Dr. A. Vogel (1 traj, ca. 10 MB, 411 frames, dt = 100ns). For the associated publication see here: * Analysis of the Dynamics of the Human Growth Hormone Secretagogue Receptor Reveals Insights into the Energy Landscape of the Molecule @@ -488,7 +488,7 @@ def fetch_example_data(alias_or_url="b2ar@Gs", * Y1 : https://proteinformatics.uni-leipzig.de/mdciao/y1_apo.zip Neuropeptide Y receptor type 1, Y1 receptor for short, in apo form. - Provided kindly by Dr. Alexander Vogel (1 traj, ca. 11 MB, 528 frames, dt = 50ns) + Provided kindly by Dr. A. Vogel (1 traj, ca. 11 MB, 528 frames, dt = 50ns) For the associated publication see here: * Towards Probing Conformational States of Y2 Receptor Using Hyperpolarized 129Xe NMR. 
From 773c605d8ee28a1516f7bcd4778ee607a824a104 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 01:46:07 +0200 Subject: [PATCH 71/83] [examples/notebooks] new notebook 09.Consensus_Labels.ipynb --- mdciao/examples/09.Consensus_Labels.ipynb | 783 ++++++++++++++++++++++ 1 file changed, 783 insertions(+) create mode 100644 mdciao/examples/09.Consensus_Labels.ipynb diff --git a/mdciao/examples/09.Consensus_Labels.ipynb b/mdciao/examples/09.Consensus_Labels.ipynb new file mode 100644 index 00000000..893d1873 --- /dev/null +++ b/mdciao/examples/09.Consensus_Labels.ipynb @@ -0,0 +1,783 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5d498cf1-8a17-40c5-84ae-f29de0abc286", + "metadata": {}, + "source": [ + "# Comparing Frequencies: Consensus Nomenclature\n", + "In this notebook, we exploit the GPCR [consensus nomenclature](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/mdciao.nomenclature.html) to compute and compare contact frequencies across four GPCRs that have very little sequence identity. \n", + "\n", + "Nevertheless, the consensus nomenclature will allow us to:\n", + "* Use the same function calls for all systems, regardless of the underlying primary sequence\n", + "* Compare the frequencies across systems by using consensus labels\n", + "\n", + "The four systems we will be comparing are: \n", + "* Beta 2 adrenergic receptor in complex with Gs-protein. \n", + " Provided kindly by Dr. H. Batebi \n", + "* Growth hormone secretagogue receptor type 1, ghrelin receptor for short. \n", + " Provided kindly by Dr. A. Vogel\n", + "* Neuropeptide Y receptor type 1, Y1 receptor for short, in apo form. \n", + " Provided kindly by Dr. A. Vogel.\n", + "* Active mu-opioid receptor bound to the agonist morphine. \n", + " Kindly made available for this purpose by the GPCRmd. 
\n", + "\n", + "All these example datasets will be downloaded on the fly using [mdciao.examples.fetch_example_data](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.examples.fetch_example_data.html#mdciao-examples-fetch-example-data), please note the individual references for each dataset there.\n", + "\n", + "Also note that `mdciao` ships with all references regarding the used nomenclature schemes, you can issue [mdciao.nomenclature.references](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.nomenclature.references.html#mdciao.nomenclature.references) to print them out." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58fc7e71-bf6c-40aa-8395-d03c17fd1960", + "metadata": {}, + "outputs": [], + "source": [ + "import mdciao" + ] + }, + { + "cell_type": "markdown", + "id": "8aca7103-0550-46c2-b0b4-b44b97ddad59", + "metadata": {}, + "source": [ + "## Download Trajectory Data\n", + "Throughout the notebook, we will use the same aliases used by [mdciao.examples.fetch_example_data](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.examples.fetch_example_data.html#mdciao-examples-fetch-example-data) to address the different systems/datasets, `\"b2ar@Gs\", \"ghrelin@ghsr\" , \"mor@muor\", \"y1_apo\"`, but one could create an alias dictionary for nicer tagging of plots etc (see note at the bottom of the notebook)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffd1218f-e8c0-469c-84e3-04d7c3eae77c", + "metadata": {}, + "outputs": [], + "source": [ + "systems = [\"b2ar@Gs\", \n", + " \"ghrelin@ghsr\" , \n", + " \"mor@muor\", \n", + " \"y1_apo\"]\n", + "for system in systems:\n", + " d = mdciao.examples.fetch_example_data(system, \n", + " unzip=system,\n", + " skip_on_existing=True)" + ] + }, + { + "cell_type": "markdown", + "id": "4ba782f2-0525-49be-a815-f85097c7ae2d", + "metadata": {}, + "source": [ + "## Nomenclature Data\n", + "We will get the nomenclature data, i.e. the per-residue consensus labels mapped to the canonical primary sequence of the receptor, from the [GPCRdb](https://gpcrdb.org/) (in this case). The lookup happens via [UniProt Entry Names](https://www.uniprot.org/help/difference%5Faccession%5Fentryname) and uses mdciao's [nomenclature classes](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/mdciao.nomenclature.html). \n", + "\n", + "These objects contain all nomenclature information, and map the consensus labels and fragments (\"3.50\", or \"TM3\", respectively) not only to the canonical sequence, but to that of the topologies at hand, using class methods like [top2labels](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.nomenclature.LabelerGPCR.html#mdciao.nomenclature.LabelerGPCR.top2labels) and [top2frags](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.nomenclature.LabelerGPCR.html#mdciao.nomenclature.LabelerGPCR.top2frags). \n", + "\n", + "So, as a user, you need to know these [UniProt Entry Names](https://www.uniprot.org/help/difference%5Faccession%5Fentryname) for each one of your systems." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "368b63d9-8f95-474e-9a4b-daad25d920a6", + "metadata": {}, + "outputs": [], + "source": [ + "key2GPCR_UniProt = {\"b2ar@Gs\" : \"adrb2_human\", \n", + " \"ghrelin@ghsr\" : \"ghsr_human\", \n", + " \"mor@muor\" : \"oprm_mouse\", \n", + " \"y1_apo\" : \"npy1r_human\"\n", + " }\n", + "GPCR = {key : mdciao.nomenclature.LabelerGPCR(val, scheme=\"BW\", \n", + " write_to_disk=True, local_path=key) for key, val in key2GPCR_UniProt.items()}" + ] + }, + { + "cell_type": "markdown", + "id": "c67b7891-0d58-4150-a864-480aa4d68af5", + "metadata": {}, + "source": [ + "For the `\"b2ar@Gs` system, we also get the [CGN labels](https://www.mrc-lmb.cam.ac.uk/CGN/faq.html), i.e. those for the G-protein." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd8f67e-190d-4d23-91b7-134a0d8e2fe4", + "metadata": {}, + "outputs": [], + "source": [ + "CGN = {\"b2ar@Gs\" : mdciao.nomenclature.LabelerCGN(\"gnas2_human\", write_to_disk=True, local_path=\"b2ar@Gs\")}" + ] + }, + { + "cell_type": "markdown", + "id": "c4dafc80-cd1e-4571-9e1b-bd812cbf8ea8", + "metadata": {}, + "source": [ + "Note that in the above cells we're storing the retrieved data as `.xlsx`-files in the individual directories." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a6ebd1eb-2b32-4ab4-8457-2604fd45e7a5", + "metadata": {}, + "source": [ + "## Residue Neighborhood of 3.50\n", + "We start by computing the residue neighborhood of the notorious residue `3.50` of TM3 in all systems, without even knowing what residue precisely is `3.50` in all systems.\n", + "\n", + "Since we will store the results in the `rn` dictionary and compare them across systems later, we're suppressing all outputs in the cell below, but feel free to comment out the `%%capture` and the ` #figures=False` statements" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "332b7cda-ab57-4a81-a61f-ce7d55398125", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "rn = {}\n", + "for system in systems:\n", + " rn[system] = list(mdciao.cli.residue_neighborhoods(\"3.50\", f\"{system}/traj.xtc\", topology=f\"{system}/top.pdb\", \n", + " output_dir=system, GPCR_UniProt=GPCR[system], accept_guess=True, \n", + " ctc_control=1.0,\n", + " no_disk=True, \n", + " figures=False,\n", + " CGN_UniProt=CGN.get(system,None),\n", + " fragments=\"chains\").values())[0]" + ] + }, + { + "cell_type": "markdown", + "id": "fd216a3c-74f5-4bc7-a72e-74094be26a9f", + "metadata": {}, + "source": [ + "## Compare Frequency Bars\n", + "We show all contact frequencies of all four systems together using [mdciao.plots.compare_groups_of_contacts](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.plots.compare_groups_of_contacts.html#mdciao-plots-compare-groups-of-contacts). 
\n", + "\n", + "To show what the plot would look like without the consensus labels, initially we won't make use of them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cf81857-45f4-4980-a694-cb74971f0eb7", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax ,plotted_freqs = mdciao.plots.compare_groups_of_contacts(rn, ctc_cutoff_Ang=4, \n", + " defrag=None);" + ] + }, + { + "cell_type": "markdown", + "id": "1ef9393e-7235-42b2-b51e-d3dac56c455c", + "metadata": {}, + "source": [ + "This plot is a mess, since all `3.50` residues are different residues in all systems:\n", + "* R131 in b2ar@Gs\n", + "* R141 in ghrelin@ghsr\n", + "* R165 in mor@muor\n", + "* R138 in y1_apo\n", + "\n", + "Of course, the same goes for all other residues. This means there are no shared contact pairs to be compared against each other when using residue names. \n", + "\n", + "However, if we specify ``AA_format=\"try_consensus\"``, the method will try to use those labels (which are unified across systems) when possible:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cad6883-26d2-4a6b-8a35-3c8b55a631c3", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax ,plotted_freqs = mdciao.plots.compare_groups_of_contacts(rn, ctc_cutoff_Ang=4, defrag=None, \n", + " AA_format=\"try_consensus\", \n", + " anchor=\"3.50\",\n", + " sort_by=\"consensus\",\n", + " );" + ] + }, + { + "cell_type": "markdown", + "id": "e3444463-5227-471b-89c8-256d00cca44b", + "metadata": {}, + "source": [ + "Much nicer. We also make the plot even more compact by using:\n", + "* `anchor=\"3.50\"` this eliminates \"3.50\" from all labels and only uses the label of the other residue in the residue pair.\n", + "* `sort_by=\"consensus\"` sorts the contacts, instead of by descending frequency (like the first plot), by their consensus label.\n", + "\n", + "Also note, in `b2ar@Gs`, one residue corresponds to the Gs-unit, the 23rd residue of the α5 helix." 
+ ] + }, + { + "cell_type": "markdown", + "id": "b6e23cb8-9eed-418d-add2-ead96e060757", + "metadata": {}, + "source": [ + "## Interface: TM3 vs all other TMs\n", + "We now compute whole interfaces between fragments following the same approach, i.e. using consensus labels. In this case, we're computing the contacts of the `TM3` with all other elements of the system. We're doing so by using the keyword arguments:\n", + "\n", + "```python\n", + "interface_selection_1=\"TM3\", \n", + "interface_selection_2=\"*\", \n", + "```\n", + "\n", + "For the purposes of this notebook, we focus on the usage of consensus descriptors, but please read the full documentation in [mdciao.cli.interface](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.interface.html) for how interfaces can be defined.\n", + "\n", + "Also note that we're suppressing the output since we will be comparing (like above) the different systems later. Just comment out the `%%capture` if you want to see the output and the figures.\n", + "### Compute" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d06e7e4e-131d-4e92-9489-13ca6c9fad72", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "intf = {}\n", + "for system in systems:\n", + " intf[system] = mdciao.cli.interface(f\"{system}/traj.xtc\", topology=f\"{system}/top.pdb\", output_dir=system, \n", + " GPCR_UniProt=GPCR[system], \n", + " accept_guess=True, \n", + " CGN_UniProt=CGN.get(system, None),\n", + " ctc_control=1.0, \n", + " interface_selection_1=\"TM3\", \n", + " interface_selection_2=\"*\", \n", + " no_disk=True,\n", + " fragments=\"chains\",\n", + " plot_timedep=False,\n", + " figures=True,\n", + " self_interface=True,\n", + " title=f\"interface {system}\",\n", + " n_nearest=4\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "2044f97d-1516-425c-9684-bca717d1ca4a", + "metadata": {}, + "source": [ + "### Compare Frequency Bars\n", + "Using the same method as above, we 
compare now frequencies, but instead of resolving to each individual pair (there's about 150 TM3 vs all contacts), we aggregate the frequencies to each residue using `per_residue=True`. This loses the per-pair information but makes the plot more compact to begin with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "354c63a8-4646-4bb3-bb76-680e25cdb422", + "metadata": {}, + "outputs": [], + "source": [ + "mdciao.plots.plots.compare_groups_of_contacts(intf, ctc_cutoff_Ang=4, defrag=None, \n", + " per_residue=True, \n", + " AA_format=\"try_consensus\", \n", + " figsize=None,\n", + " sort_by=\"consensus\", \n", + " lower_cutoff_val=1,\n", + " interface=True,\n", + " );" + ] + }, + { + "cell_type": "markdown", + "id": "f053c219-f480-4d4b-b9fe-7cd03b400ff6", + "metadata": {}, + "source": [ + "There's a lot to unpack here, but we we can immediately see that e.g. `3.28` and `3.37` behave differently across systems. We'll check them later, but now we pick a representation that tries to be compact without losing pair information." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e87c8dce-14b8-4ddb-b8a8-f992376b542c", + "metadata": {}, + "source": [ + "### Compare Flareplots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53bb6cd0-d5e8-4be5-9d74-fc8d95a9715d", + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib import pyplot as plt\n", + "myfig, myax = plt.subplots(2,2, sharex=True, sharey=True, figsize=(70,70), tight_layout=True)\n", + "for (system, iintf) , iax in zip(intf.items(), myax.flatten()):\n", + " consensus_maps=[GPCR[system]]\n", + " iCGN = CGN.get(system, None)\n", + " if iCGN is not None:\n", + " consensus_maps.append(iCGN)\n", + " iintf.plot_freqs_as_flareplot(4, fragments=\"chains\", \n", + " scheme=\"consensus_sparse\", \n", + " aura=iintf.frequency_sum_per_residue_idx_dict(4, return_array=True),\n", + " panelsize=15,\n", + " ax=iax, \n", + " consensus_maps=consensus_maps)\n", + " iax.set_title(f\"{system}: TM3 interface at 4 Angstrom\", fontsize=60)\n", + "print()\n", + "myfig.tight_layout(h_pad=5)" + ] + }, + { + "cell_type": "markdown", + "id": "7f6dac21-e2c0-43fe-b92a-43ef17313a33", + "metadata": {}, + "source": [ + "This representation tries to capture the system's topology, and the individual pairs as well as collective behaviours. We have an [entire notebook]() devoted on how these plots work and how one can fine-tune them. One of the easiest things to spot is the difference in TM3 vs TM6 contacts.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c23b16d9-9594-449e-88bb-eeee950a79bc", + "metadata": {}, + "source": [ + "### Coarse-Graining to Consensus Fragments\n", + "We can also coarse-grain the frequencies to the fragments, showing a bit the structure and scaffolding role of TM3 in the TM-bundle. 
All this, without having to directly define the fragments for each system:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33a398c0-801e-4f68-b6e6-ddccfb5ca6d4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas\n", + "CG_freqs = {}\n", + "for (system, iintf) , iax in zip(intf.items(), myax.flatten()):\n", + " consensus_maps=[GPCR[system]]\n", + " iCGN = CGN.get(system, None)\n", + " if iCGN is not None:\n", + " consensus_maps.append(iCGN)\n", + " CG_freqs[system] = iintf.frequency_as_contact_matrix_CG (4, fragments=mdciao.fragments.get_fragments(iintf.top, \"chains\"),\n", + " sparse=True,\n", + " consensus_labelers=consensus_maps)[\"TM3\"]\n", + "df = pandas.DataFrame(CG_freqs).T\n", + "df = df[[key for key in mdciao.nomenclature.nomenclature._GPCR_fragments+mdciao.nomenclature.nomenclature._CGN_fragments if key in df.keys()]]\n", + "df.round(1).fillna(\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "0dff614c-37fe-4ea0-a25b-08c160ba9697", + "metadata": {}, + "source": [ + "The table above summarizes TM3's behavior, contact-wise, with the other elements. Some things are immediately apparent, like `mor@muor` having fewer contacts with TM2 and more with TM6" + ] + }, + { + "cell_type": "markdown", + "id": "4e07981a-8340-4b0f-9863-02e306937309", + "metadata": {}, + "source": [ + "## Residue Neighborhood: Select 3.37 via consensus labels without recomputing anything" + ] + }, + { + "cell_type": "markdown", + "id": "35c737d1-6497-43c6-8c57-385863c58f6a", + "metadata": {}, + "source": [ + "We now go back to the residue-level analysis. \n", + "\n", + "We could re-compute `3.37`'s neighborhood the same way we did `3.50`'s initially. However, the individual contacts have already been computed when computing TM3's interface, s.t. 
we can simply filter the interface for any contacts containing `3.37` (or any other residue) using [ContactGroup.select_by_residues](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.select_by_residues)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0cbfce9-0971-4b0c-b22d-038469dce566", + "metadata": {}, + "outputs": [], + "source": [ + "rn337 = {system : iintf.select_by_residues(\"3.37\") for system, iintf in intf.items()}" + ] + }, + { + "cell_type": "markdown", + "id": "4dc0b721-5c2f-4347-938c-96c55e28ca7b", + "metadata": {}, + "source": [ + "### Compare Frequency Bars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b207830a-693a-473b-b745-12375bead38a", + "metadata": {}, + "outputs": [], + "source": [ + "mdciao.plots.compare_groups_of_contacts(rn337, ctc_cutoff_Ang=4, AA_format=\"try_consensus\", anchor=\"3.37\", sort_by=\"consensus\");" + ] + }, + { + "cell_type": "markdown", + "id": "71134a51-8aeb-47f3-a490-c9ccde945487", + "metadata": {}, + "source": [ + "As we saw in the interface plot, `3.37` in `ghrelin@ghsr` has many more contacts than the other systems. Now we see why: the Ghrelin-receptor's `3.37` residue is interacting with TM4 and TM5 more than the rest. 
" + ] + }, + { + "cell_type": "markdown", + "id": "fb195aed-b13f-43a0-a44a-481c5bc8243d", + "metadata": {}, + "source": [ + "### Compare Contact-Distances\n", + "We will inspect this further, first using [violin plots](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.plots.compare_violins.html#mdciao.plots.compare_violins) in the next cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07b2bd91-3477-41f2-a094-a1439d19474d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax, plotted_labels = mdciao.plots.compare_violins(rn337, \n", + " ctc_cutoff_Ang=4, defrag=None, \n", + " AA_format=\"try_consensus\", \n", + " sort_by=\"consensus\",\n", + " anchor=\"3.37\",\n", + " figsize=None);" + ] + }, + { + "cell_type": "markdown", + "id": "7dac24b3-8108-4b7d-8834-2a7a139a31ec", + "metadata": {}, + "source": [ + "Turns out, `4.57, 4.60, 5.42`, and `5.43` weren't even included in the other `3.37` neighborhoods (`mor@muor, b2ar@Gs, y1_apo`), because they never came in contact with `3.37` (or almost never, according to `min_freq=0.1`, the default value of [mdciao.cli.interface](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.interface.html)). \n", + "\n", + "Still, we can use mdciao's [mdciao.cli.sites](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.sites.html) to provide an explicit list of pairs of residues we want to look at, regardless of them forming a contact or not. \n", + "\n", + "### Sites\n", + "You guessed right, we can specify them simply using the consensus labels. 
We stored them in the cell above in the ``plotted_labels`` variable, so it's very easy to pass them on to the site definitions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0b584d8-f82d-4b39-99a3-410fd25fb769", + "metadata": {}, + "outputs": [], + "source": [ + "site_337_def = {\"name\": \"selected 3.77 distances\", \n", + " \"pairs\" : {\"consensus\" : [f\"3.37-{label}\" for label in plotted_labels]}}\n", + "site_337_def" + ] + }, + { + "cell_type": "markdown", + "id": "8ff44faf-3deb-4a0e-8c53-6596a601fd18", + "metadata": {}, + "source": [ + "Again, feel free to comment in `%%capture` to see all outputs, but we will be comparing the distances two cells below anyways." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da728237-5f7d-4187-927d-5a4cc594f05d", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "site_337 = {}\n", + "for system in systems:\n", + " site_337[system] = mdciao.cli.sites(site_337_def,\n", + " f\"{system}/traj.xtc\", topology=f\"{system}/top.pdb\", output_dir=system, \n", + " GPCR_UniProt=GPCR[system], accept_guess=True, \n", + " no_disk=True,allow_partial_sites=True,\n", + " CGN_UniProt=CGN.get(system, None),figures=False,\n", + " fragments=\"chains\")[\"selected 3.77 distances\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "546da2c8-f019-4ec5-b5e7-e7c20ada33bf", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax, plotted_labels, repframes = mdciao.plots.compare_violins(site_337, defrag=None, \n", + " AA_format=\"try_consensus\", \n", + " anchor=\"3.37\",\n", + " sort_by=\"consensus\",\n", + " representatives=True,\n", + " ctc_cutoff_Ang=4,\n", + " );" + ] + }, + { + "cell_type": "markdown", + "id": "6ff1835e-f09b-430c-9f1d-6ce3e7884b70", + "metadata": {}, + "source": [ + "We see that the distances for`4.60, 5.42`, and `5.43` for `mor@muor, b2ar@Gs, y1_apo` virtually never cross the 4 Å cutoff.\n", + "\n", + "Also note that by using 
`representatives=True` we have triggered some things in the violin plots. For each system, we have tried to locate a frame in the trajectory in which the shown residue-residue distances adopt values close to the most likely value (=where the violin is widest). You can read about these representative frames here: [ContactGroup.repframes](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.repframes).\n", + "\n", + "These frames are represented as dots inside the individual violins and also as returned geometries in form of `mdtraj` [trajectories](https://www.mdtraj.org/1.9.8.dev0/api/generated/mdtraj.Trajectory.html#mdtraj.Trajectory) (stored in the `repframes` returned value) which we can visualize in 3D. " + ] + }, + { + "cell_type": "markdown", + "id": "d33afa18-f164-4bc6-9286-a060703320ee", + "metadata": {}, + "source": [ + "## Visualizing Representative Frames" + ] + }, + { + "cell_type": "markdown", + "id": "a1d1a253-5c2b-43da-b550-ea12f9e670a9", + "metadata": {}, + "source": [ + "The first thing we note is that geometries won't be aligned, because they're all coming from different simulations. Also, `b2ar@Gs` has the whole G-protein as well." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "161e2a91-586c-420f-bc80-52b7f49bfd02", + "metadata": {}, + "outputs": [], + "source": [ + "import nglview\n", + "from matplotlib import colors as mplcolors\n", + "colors = mdciao.plots.color_dict_guesser(\"tab10\", repframes.keys())\n", + "iwd = nglview.NGLWidget()\n", + "for ii, (system, geom) in enumerate(repframes.items()):\n", + " iwd.add_trajectory(geom)#, [[gpcr_idxs]]), title=\"test\")\n", + " iwd.clear_representations(component=ii)\n", + " iwd.add_cartoon(component=ii, color=mplcolors.to_hex(colors[system]), name=system, radius=.1)\n", + "iwd" + ] + }, + { + "cell_type": "markdown", + "id": "88fbd034-1b0a-45ec-bf7c-685e34f4d85e", + "metadata": {}, + "source": [ + "To produce a high quality alignment of the receptor structures, even with low primary-sequence identity, we can arrive at a multiple-sequence-alignment (MSA) via the consensus labels, which act as a proxy for sequence identity. For this, we use `mdciao`'s [AlignerConsensus](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.nomenclature.AlignerConsensus.html#mdciao.nomenclature.AlignerConsensus) class. There's a whole notebook about them [here](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/MSA_via_Consensus_Labels.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9354bc23-4e1c-4a85-988b-d503330b292c", + "metadata": {}, + "outputs": [], + "source": [ + "AC = mdciao.nomenclature.AlignerConsensus(GPCR, tops={key : val.top for key, val in repframes.items()})" + ] + }, + { + "cell_type": "markdown", + "id": "58762679-a4aa-4f94-bf01-8b681854f3f5", + "metadata": {}, + "source": [ + "This is the consensus-MSA, for example for TM3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88ff85af-ca16-461d-a93c-bcdcaf56af19", + "metadata": {}, + "outputs": [], + "source": [ + "AC.AAresSeq_match(\"3.*\")" + ] + }, + { + "cell_type": "markdown", + "id": "3c7d702b-cac3-4daa-9d63-00a1f547a5b2", + "metadata": {}, + "source": [ + "## Superpose Geometries" + ] + }, + { + "cell_type": "markdown", + "id": "f8fe670f-b0de-4bc3-a5ba-7355b4178907", + "metadata": {}, + "source": [ + "For the alignment we use the selection \"?.*,-6.*,8.*\" which selects all TMs, except TM6 and H8:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95922eb0-f403-4653-9db2-a4542fa99f55", + "metadata": {}, + "outputs": [], + "source": [ + "CA_idxs = AC.CAidxs_match(\"?.*,-6.*,8.*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf79b85-a3af-4f9c-99b5-b957e89c3284", + "metadata": {}, + "outputs": [], + "source": [ + "ref_key = \"b2ar@Gs\"\n", + "for ii, (system, geom) in enumerate(repframes.items()):\n", + " if system!=ref_key:\n", + " geom.superpose(repframes[ref_key], atom_indices=CA_idxs[system], ref_atom_indices=CA_idxs[ref_key])" + ] + }, + { + "cell_type": "markdown", + "id": "27202b0e-bb76-45f4-bdfd-a5530223284b", + "metadata": {}, + "source": [ + "## Show Geometries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26c01618-af29-4101-b39b-02e283d82221", + "metadata": {}, + "outputs": [], + "source": [ + "iwd = nglview.NGLWidget()\n", + "for ii, (system, geom) in enumerate(repframes.items()):\n", + " 
iwd.add_trajectory(geom)\n", + " iwd.clear_representations(component=ii)\n", + " iwd.add_cartoon(component=ii, color=mplcolors.to_hex(colors[system]), name=system, radius=.1) \n", + "iwd" + ] + }, + { + "cell_type": "markdown", + "id": "fcce49a5-3973-41ae-873d-024b475fee2c", + "metadata": {}, + "source": [ + "## Show 3.37's neighborhood\n", + "Finally, we fine-tune the 3D representation to include the most varying contacts of `3.37`: `4.60, 5.42`, and `5.43`, which we show first as a table across the four systems:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e4f3930-3554-4919-a46b-8f385c724db5", + "metadata": {}, + "outputs": [], + "source": [ + "show=[\"3.37\",\"4.60\", \"5.42\", \"5.43\"]\n", + "AC.AAresSeq_match(\",\".join(show))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c0c91c9-5b7a-4baf-9aa5-45bcf0d521ae", + "metadata": {}, + "outputs": [], + "source": [ + "iwd = nglview.NGLWidget()\n", + "ref_key = \"b2ar@Gs\"\n", + "for ii, (system, geom) in enumerate(repframes.items()):\n", + " iwd.add_trajectory(geom, title=system)\n", + " iwd.clear_representations(component=ii)\n", + " iwd.add_cartoon(component=ii, color=mplcolors.to_hex(colors[system]), name=system, radius=.1)\n", + " AAs = \" \".join([GPCR[system].conlab2AA[jj][1:] for jj in show])\n", + " iwd.add_licorice(component=ii, selection=f\"({AAs}) and not Hydrogen\", radius=.5, color=mplcolors.to_hex(colors[system]))\n", + "iwd.gui_style = \"ngl\"\n", + "iwd" + ] + }, + { + "cell_type": "markdown", + "id": "ef91148b-d044-41ab-94cc-69af5ea49718", + "metadata": {}, + "source": [ + "From the 3D plot above we can make some observations, most clearly we note that ghrelin@ghsr has the bulkiest residue, `Y128@3.37`, which is pointing straight down to a region where TM5 appears to have a bulge, precisely towards the `3.37` position, something the other TM5s don't have. 
" + ] + }, + { + "cell_type": "markdown", + "id": "fa8f6b4a-95ee-4c3c-9f98-75bba7050e9e", + "metadata": {}, + "source": [ + "## Final Remarks\n", + " \n", + "Some final observations\n", + "\n", + "* The point of this notebook isn't to arrive at a particular finding but rather to showcase the utility of streamlining the contact-analysis using consensus nomenclature.\n", + "\n", + "* We have kept the system names as they are downloaded with [mdciao.examples.fetch_example_data](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.examples.fetch_example_data.html#mdciao-examples-fetch-example-data), because they all follow the convention of having `traj.xtc` and `top.pdb` files, but you can map any topology and trajectory files using aliases and dictionaries:\n", + " ```python\n", + " alias = {\"b2ar@Gs\" : \"adrb2\", \n", + " \"ghrelin@ghsr\" : \"ghsr\",\n", + " \"mor@muor\" : \"muor\", \n", + " \"y1_apo\" : \"y1\"\n", + " }\n", + " #these are just examples of possible other topology filenames.\n", + " top = {\"adrb2\" : \"adrb2/prot1.pdb\", \n", + " \"ghsr\" : \"ghrelin/system.pdb\", \n", + " \"muor\" : \"muor/muor.pdb\", \n", + " \"y1\" : \"y1/top.pdb\"\n", + " }\n", + " #these are just examples of possible other trajectory filenames.\n", + " trajs = {\"adrb2\" : \"adrb2/traj.xtc\", \n", + " \"ghsr\" : [\"ghrelin/traj1.xtc\", \"ghrelin/traj2.xtc\"],\n", + " \"muor\" : \"muor/run*.xtc\", \n", + " \"y1\" : \"y1/run1.xtc\"\n", + " }\n", + "\n", + " ```\n", + "* Although the trajectories we have been using are similar in number of frames, they are wildly different in simulated physical length, so there isn't really much physical or biological sense in comparing them other than for this demo:\n", + "```\n", + " * b2ar@Gs: 280 frames, dt = 10ps, 2.8ns in total\n", + " * ghrelin@ghsr: 411 frames, dt = 100ns, 41μs in total\n", + " * mor@muor: 400 frames, dt = 10ns, 40μs in total\n", + " * y1_apo: 528 frames, dt = 50ns, 26.4μs in total\n", + "```\n" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "id": "e769df06-e6c7-49f0-8acf-ff17860dcf15", + "metadata": {}, + "outputs": [], 
+ "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 4d5f7dcb7c69f18f04aae7f4d9331a12f83e6185 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 01:47:24 +0200 Subject: [PATCH 72/83] [examples.fetch_example_data] complete data usages --- mdciao/examples/examples.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index 2b26ed74..ae6d5618 100644 --- a/mdciao/examples/examples.py +++ b/mdciao/examples/examples.py @@ -424,6 +424,7 @@ def fetch_example_data(alias_or_url="b2ar@Gs", * 'cov19' * 'ghrelin@ghsr' * 'mor@muor' + * 'Y1' This data is used in the notebooks: * 01.Tutorial.ipynb (b2ar@Gs) @@ -432,7 +433,7 @@ def fetch_example_data(alias_or_url="b2ar@Gs", * 04.Comparing_CGs_Flares.ipynb (cov19) * 07.EGFR Kinase Inhibitors.ipynb (EGFR) * 08.Manuscript.ipynb (b2ar@Gs) - * 09.Consensus_labels.ipynb (ghrelin@ghsr) + * 09.Consensus_labels.ipynb (ghrelin@ghsr,b2ar@Gs,mor@muor,Y1) which can all be run locally issuing, from the CLI: From dbcc09f96b81f2d3f1b2fead0f4a23f2cec80ff3 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 01:58:18 +0200 Subject: [PATCH 73/83] [general] rename prot.pdb -> top.pdb --- doc/basic_usage.rst | 2 +- doc/highlights.rst | 10 +- doc/index.rst | 4 +- mdciao/examples/01.Tutorial.ipynb | 14 +- mdciao/examples/02.Missing_Contacts.ipynb | 10 +- mdciao/examples/05.Flareplot_Schemes.ipynb | 10 +- .../06.MSA_via_Consensus_Labels.ipynb | 449 +----------------- mdciao/examples/08.Manuscript.ipynb | 2 +- mdciao/examples/examples.py | 2 +- 9 files changed, 42 
insertions(+), 461 deletions(-) diff --git a/doc/basic_usage.rst b/doc/basic_usage.rst index 2da73536..07fd4582 100644 --- a/doc/basic_usage.rst +++ b/doc/basic_usage.rst @@ -19,7 +19,7 @@ Below you will find a very simple example of how to use ``mdciao`` from the comm This basic command:: - mdc_neighborhoods.py prot.pdb traj.xtc --residues L394 -nf #nf: don't use fragments + mdc_neighborhoods.py top.pdb traj.xtc --residues L394 -nf #nf: don't use fragments will print the following to the terminal (some headers have been left out):: diff --git a/doc/highlights.rst b/doc/highlights.rst index bb0cd1f8..3da46fdd 100644 --- a/doc/highlights.rst +++ b/doc/highlights.rst @@ -8,7 +8,7 @@ Highlights .. _`initial example`: * paper-ready tables and figures from the command line:: - mdc_neighborhoods.py prot.pdb traj.xtc -r L394 --GPCR adrb2_human --CGN gnas2_human -ni -at #ni: not interactive, at: show atom-types + mdc_neighborhoods.py top.pdb traj.xtc -r L394 --GPCR adrb2_human --CGN gnas2_human -ni -at #ni: not interactive, at: show atom-types .. figure:: imgs/bars_and_PDF.png :scale: 40% @@ -78,7 +78,7 @@ Highlights - *G.HN.** : CGN-nomenclature for the :math:`G\alpha_N`-subunit You can check your selection **before** running a computation by using ``mdc_residues.py``:: - >>> mdc_residues.py GLU*,P0G,380-394,G.HN.* prot.pdb --GPCR adrb2_human --CGN GNAS2_HUMAN -ni + >>> mdc_residues.py GLU*,P0G,380-394,G.HN.* top.pdb --GPCR adrb2_human --CGN GNAS2_HUMAN -ni Your selection 'GLU*,P0G,380-394,G.HN.*' yields: residue residx fragment resSeq GPCR CGN GLU10 6 0 10 None G.HN.27 @@ -148,7 +148,7 @@ Highlights * use fragment definitions --like the ones above, ``0`` for the :math:`G\alpha`-unit and ``3`` for the receptor-- to compute interfaces in an automated way, i.e. 
without having to specifying individual residues:: - >>> mdc_interface.py prot.pdb traj.xtc -fg1 0 -fg2 3 --GPCR adrb2_human --CGN GNAS2_HUMAN -t "3SN6 beta2AR-Galpha interface" -ni + >>> mdc_interface.py top.pdb traj.xtc -fg1 0 -fg2 3 --GPCR adrb2_human --CGN GNAS2_HUMAN -t "3SN6 beta2AR-Galpha interface" -ni ... The following 50 contacts capture 35.56 (~96%) of the total frequency 36.88 (over 75 contacts with nonzero frequency). As orientation value, the first 44 ctcs already capture 90.0% of 36.88. @@ -239,7 +239,7 @@ Highlights The command:: - >>> mdc_sites.py prot.pdb traj.xtc --site tip.json -at -nf -sa #sa: short AA-names + >>> mdc_sites.py top.pdb traj.xtc --site tip.json -at -nf -sa #sa: short AA-names ... The following files have been created: ./sites.overall@4.0_Ang.pdf @@ -285,7 +285,7 @@ Highlights Now we use ``mdc_neighborhoods.py`` on our data:: - >>> mdc_neighborhoods.py prot.pdb traj.xtc -r R131 -nf -o 3SN6.MD + >>> mdc_neighborhoods.py top.pdb traj.xtc -r R131 -nf -o 3SN6.MD ... The following 4 contacts capture 2.12 (~100%) of the total frequency 2.12 (over 5 contacts with nonzero frequency). ... diff --git a/doc/index.rst b/doc/index.rst index 783a56d8..4daff7f8 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -37,12 +37,12 @@ Basic Principle ``mdciao`` takes the files typically generated by a molecular dynamics (MD) simulation, i.e. -* topology files, like *prot.gro* or *prot.pdb* +* topology files, like *prot.gro* or *top.pdb* * trajectory files, like *traj1.xtc*, *traj2.xtc* and calculates the time-traces of residue-residue distances, and from there, **contact frequencies** and **distance distributions**. The most simple command line call would look approximately like this:: - mdc_neighborhoods.py prot.pdb traj.xtc --residues L394 + mdc_neighborhoods.py top.pdb traj.xtc --residues L394 [...] The following 5 contacts capture 3.88 (~90%) of the total frequency 4.31 (over 7 contacts with nonzero frequency). 
As orientation value, the first 5 ctcs already capture 90.0% of 4.31. diff --git a/mdciao/examples/01.Tutorial.ipynb b/mdciao/examples/01.Tutorial.ipynb index c2352c75..b6854d9c 100644 --- a/mdciao/examples/01.Tutorial.ipynb +++ b/mdciao/examples/01.Tutorial.ipynb @@ -63,7 +63,7 @@ "Now we replicate the CLI command:\n", "\n", "```\n", - "mdc_neighborhoods.py prot.pdb traj.xtc --residues L394 -nf #nf: don't use fragments\n", + "mdc_neighborhoods.py top.pdb traj.xtc --residues L394 -nf #nf: don't use fragments\n", "```\n", "\n", "but in API mode. We use the method [cli.residue_neighborhoods](http://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.residue_neighborhoods.html#mdciao.cli.residue_neighborhoods):" @@ -77,7 +77,7 @@ "source": [ "neighborhoods = mdciao.cli.residue_neighborhoods(\"L394\",\n", " \"mdciao_example/traj.xtc\", \n", - " topology=\"mdciao_example/prot.pdb\", \n", + " topology=\"mdciao_example/top.pdb\", \n", " fragments=None)" ] }, @@ -114,7 +114,7 @@ "outputs": [], "source": [ "import mdtraj as md\n", - "traj = md.load(\"mdciao_example/traj.xtc\", top=\"mdciao_example/prot.pdb\")\n", + "traj = md.load(\"mdciao_example/traj.xtc\", top=\"mdciao_example/top.pdb\")\n", "traj" ] }, @@ -144,7 +144,7 @@ "Now, the more elaborated CLI-command:\n", "\n", "```\n", - "mdc_neighborhoods.py prot.pdb traj.xtc -r L394 --GPCR adrb2_human --CGN gnas2_human -ni -at #ni: not interactive, at: show atom-types\n", + "mdc_neighborhoods.py top.pdb traj.xtc -r L394 --GPCR adrb2_human --CGN gnas2_human -ni -at #ni: not interactive, at: show atom-types\n", "```\n", "\n", "We keep the ``no_disk`` option to avoid writing to disk, but you can change this if you want. **Please note** that some options **do not carry** exactly the same names as their CLI equivalents. E.g. ``ni`` in the CLI (= *don't be interactive*) is now ``accept_guess`` in the API. These differences are needed for compatiblity with other methods, but might get unified in the future. 
" @@ -220,7 +220,7 @@ "Now, we can play around with residue selection, replicating the CLI-command:\n", "\n", "```\n", - "mdc_residues.py GLU*,P0G,380-394,G.HN.* prot.pdb --GPCR adrb2_human --CGN gnas2_human -ni\n", + "mdc_residues.py GLU*,P0G,380-394,G.HN.* top.pdb --GPCR adrb2_human --CGN gnas2_human -ni\n", "```\n", "\n", "Check the docs [here](http://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.residue_selection.html) to check the output values `res_idxs_list`,` fragments`, and `consensus_maps`, although most of out useful output is written out.\n", @@ -411,7 +411,7 @@ "Now we move to a more elaborate command:\n", "\n", "```\n", - "mdc_interface.py prot.pdb traj.xtc -fg1 0 -fg2 3 --GPCR adrb2_human --CGN gnas2_human -t \"3SN6 beta2AR-Galpha interface\" -ni\n", + "mdc_interface.py top.pdb traj.xtc -fg1 0 -fg2 3 --GPCR adrb2_human --CGN gnas2_human -t \"3SN6 beta2AR-Galpha interface\" -ni\n", "```\n", "\n", "and replicate it using ``cli.interface``. Check the docs [here](http://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.interface.html#mdciao.cli.interface) or in the method's docstring. \n", @@ -448,7 +448,7 @@ "Now we use a different approach. Instead of letting ``mdciao`` discover contacts automatically, we list them beforehand as ``site`` dictionaries, and feed this dictionaries to directly to the [method](http://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.cli.sites.html) ``cli.sites``. 
The CLI command we're replicating is:\n", "\n", "```\n", - "mdc_sites.py prot.pdb traj.xtc --site tip.json -at -nf -sa #sa: short AA-names\n", + "mdc_sites.py top.pdb traj.xtc --site tip.json -at -nf -sa #sa: short AA-names\n", "```\n", "\n", "However, in the API-spirit, we're not even using a file on disk to define the ``site``, but create it on the fly as a Python dictionary:" diff --git a/mdciao/examples/02.Missing_Contacts.ipynb b/mdciao/examples/02.Missing_Contacts.ipynb index c1a02934..474194b0 100644 --- a/mdciao/examples/02.Missing_Contacts.ipynb +++ b/mdciao/examples/02.Missing_Contacts.ipynb @@ -54,7 +54,7 @@ "outputs": [], "source": [ "import mdtraj as md\n", - "traj = md.load(\"mdciao_example/traj.xtc\",top=\"mdciao_example/prot.pdb\")" + "traj = md.load(\"mdciao_example/traj.xtc\",top=\"mdciao_example/top.pdb\")" ] }, { @@ -339,7 +339,7 @@ "metadata": {}, "outputs": [], "source": [ - "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\", \n", + "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/top.pdb\", \n", " no_disk=True, interface_selection_1=[0], interface_selection_2=[1], \n", " ctc_control=1.0, \n", " min_freq=0,\n", @@ -374,7 +374,7 @@ "metadata": {}, "outputs": [], "source": [ - "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\", \n", + "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/top.pdb\", \n", " no_disk=True, interface_selection_1=[0], interface_selection_2=[1], \n", " ctc_control=1.0, \n", " figures=False);\n", @@ -433,7 +433,7 @@ "metadata": {}, "outputs": [], "source": [ - "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\",\n", + "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/top.pdb\",\n", " fragments=\"consensus\",accept_guess=True,\n", " no_disk=True, interface_selection_1=\"G.H5\", 
interface_selection_2=\"G.H5\", \n", " ctc_control=1.0, \n", @@ -497,7 +497,7 @@ "metadata": {}, "outputs": [], "source": [ - "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/prot.pdb\",\n", + "intf = mdciao.cli.interface(\"mdciao_example/traj.xtc\", topology=\"mdciao_example/top.pdb\",\n", " fragments=\"consensus\",accept_guess=True,\n", " no_disk=True, interface_selection_1=\"G.H5\", interface_selection_2=\"TM*\", \n", " ctc_control=1.0, \n", diff --git a/mdciao/examples/05.Flareplot_Schemes.ipynb b/mdciao/examples/05.Flareplot_Schemes.ipynb index af8ef12f..8a12fe52 100644 --- a/mdciao/examples/05.Flareplot_Schemes.ipynb +++ b/mdciao/examples/05.Flareplot_Schemes.ipynb @@ -25,15 +25,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checking https://files.rcsb.org/download/3SN6.pdb ..." - ] - } - ], + "outputs": [], "source": [ "import mdciao\n", "geom = mdciao.cli.pdb(\"3SN6\")\n", diff --git a/mdciao/examples/06.MSA_via_Consensus_Labels.ipynb b/mdciao/examples/06.MSA_via_Consensus_Labels.ipynb index 08212f71..12022f88 100644 --- a/mdciao/examples/06.MSA_via_Consensus_Labels.ipynb +++ b/mdciao/examples/06.MSA_via_Consensus_Labels.ipynb @@ -19,23 +19,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "efd17285", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6cb6f97fd6ac40869a65c21b1df76b46", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import mdciao\n", "import nglview\n", @@ -52,48 +39,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "b4f92862", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checking 
https://files.rcsb.org/download/3CAP.pdb ...Please cite the following 3rd party publication:\n", - " * Crystal structure of the ligand-free G-protein-coupled receptor opsin\n", - " Park, J.H. et al., Nature 2008\n", - " https://doi.org/10.1038/nature07063\n", - "Checking https://files.rcsb.org/download/3SN6.pdb ...Please cite the following 3rd party publication:\n", - " * Crystal structure of the beta2 adrenergic receptor-Gs protein complex\n", - " Rasmussen, S.G. et al., Nature 2011\n", - " https://doi.org/10.1038/nature10361\n", - "Checking https://files.rcsb.org/download/6DDF.pdb ..." - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/guille/miniconda3/lib/python3.11/site-packages/mdtraj/formats/pdb/pdbfile.py:206: UserWarning: Unlikely unit cell vectors detected in PDB file likely resulting from a dummy CRYST1 record. Discarding unit cell vectors.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Please cite the following 3rd party publication:\n", - " * Structure of the mu-opioid receptor-Giprotein complex.\n", - " Koehl, A. et al., Nature 2018\n", - " https://doi.org/10.1038/s41586-018-0219-7\n", - "Checking https://files.rcsb.org/download/7CKW.pdb ...Please cite the following 3rd party publication:\n", - " * Ligand recognition and allosteric regulation of DRD1-Gs signaling complexes.\n", - " Xiao, P. 
et al., Cell 2021\n", - " https://doi.org/10.1016/j.cell.2021.01.028\n" - ] - } - ], + "outputs": [], "source": [ "pdbs = {\"OPS\" : mdciao.cli.pdb(\"3CAP\"), \n", " \"B2AR\" : mdciao.cli.pdb(\"3SN6\"), \n", @@ -111,98 +60,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "5b2f4abe", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No local file ./opsd_bovin.xlsx found, checking online in\n", - "https://gpcrdb.org/services/residues/extended/opsd_bovin ...done!\n", - "Please cite the following reference to the GPCRdb:\n", - " * Kooistra et al, (2021) GPCRdb in 2021: Integrating GPCR sequence, structure and function\n", - " Nucleic Acids Research 49, D335--D343\n", - " https://doi.org/10.1093/nar/gkaa1080\n", - "For more information, call mdciao.nomenclature.references()\n", - "No local file ./adrb2_human.xlsx found, checking online in\n", - "https://gpcrdb.org/services/residues/extended/adrb2_human ..." - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:386: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.\n", - " df = _read_json(a.text)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "done!\n", - "Please cite the following reference to the GPCRdb:\n", - " * Kooistra et al, (2021) GPCRdb in 2021: Integrating GPCR sequence, structure and function\n", - " Nucleic Acids Research 49, D335--D343\n", - " https://doi.org/10.1093/nar/gkaa1080\n", - "For more information, call mdciao.nomenclature.references()\n", - "No local file ./oprm_mouse.xlsx found, checking online in\n", - "https://gpcrdb.org/services/residues/extended/oprm_mouse ..." 
- ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:386: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.\n", - " df = _read_json(a.text)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "done!\n", - "Please cite the following reference to the GPCRdb:\n", - " * Kooistra et al, (2021) GPCRdb in 2021: Integrating GPCR sequence, structure and function\n", - " Nucleic Acids Research 49, D335--D343\n", - " https://doi.org/10.1093/nar/gkaa1080\n", - "For more information, call mdciao.nomenclature.references()\n", - "No local file ./DRD1_HUMAN.xlsx found, checking online in\n", - "https://gpcrdb.org/services/residues/extended/drd1_human ..." - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:386: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.\n", - " df = _read_json(a.text)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "done!\n", - "Please cite the following reference to the GPCRdb:\n", - " * Kooistra et al, (2021) GPCRdb in 2021: Integrating GPCR sequence, structure and function\n", - " Nucleic Acids Research 49, D335--D343\n", - " https://doi.org/10.1093/nar/gkaa1080\n", - "For more information, call mdciao.nomenclature.references()\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:386: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. 
To read from a literal string, wrap it in a 'StringIO' object.\n", - " df = _read_json(a.text)\n" - ] - } - ], + "outputs": [], "source": [ "maps = { \"OPS\": mdciao.nomenclature.LabelerGPCR(\"opsd_bovin\"),\n", " \"B2AR\": mdciao.nomenclature.LabelerGPCR(\"adrb2_human\"),\n", @@ -222,73 +83,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "4d15ec10", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OPS\n", - "Auto-detected fragments with method 'resSeq'\n", - "fragment 0 with 326 AAs MET1 ( 0) - ASN326 (325 ) (0) \n", - "fragment 1 with 326 AAs MET1 ( 326) - ASN326 (651 ) (1) \n", - "fragment 2 with 4 AAs NAG1 ( 652) - BMA4 (655 ) (2) \n", - "fragment 3 with 6 AAs NAG1 ( 656) - BMA4 (661 ) (3) resSeq jumps\n", - "fragment 4 with 2 AAs NAG1 ( 662) - NAG2 (663 ) (4) \n", - "fragment 5 with 4 AAs BGL801 ( 664) - BGL804 (667 ) (5) \n", - "fragment 6 with 1 AAs PLM901 ( 668) - PLM901 (668 ) (6) \n", - "fragment 7 with 2 AAs BGL805 ( 669) - BGL806 (670 ) (7) \n", - "fragment 8 with 1 AAs PLM902 ( 671) - PLM902 (671 ) (8) \n", - "fragment 9 with 10 AAs HOH902 ( 672) - HOH907 (681 ) (9) resSeq jumps\n", - "The GPCR-labels align best with fragments: [0] (first-last: MET1-ASN326).\n", - "\n", - "B2AR\n", - "Auto-detected fragments with method 'resSeq'\n", - "fragment 0 with 51 AAs THR9 ( 0) - GLN59 (50 ) (0) \n", - "fragment 1 with 115 AAs LYS88 ( 51) - VAL202 (165 ) (1) \n", - "fragment 2 with 51 AAs SER205 ( 166) - MET255 (216 ) (2) \n", - "fragment 3 with 132 AAs THR263 ( 217) - LEU394 (348 ) (3) \n", - "fragment 4 with 340 AAs GLN1 ( 349) - ASN340 (688 ) (4) \n", - "fragment 5 with 58 AAs ASN5 ( 689) - ARG62 (746 ) (5) \n", - "fragment 6 with 159 AAs ASN1002 ( 747) - ALA1160 (905 ) (6) \n", - "fragment 7 with 146 AAs GLU30 ( 906) - ARG175 (1051) (7) \n", - "fragment 8 with 61 AAs GLN179 (1052) - ARG239 (1112) (8) \n", - "fragment 9 with 77 AAs CYS265 (1113) - CYS341 (1189) (9) \n", - 
"fragment 10 with 128 AAs GLN1 (1190) - SER128 (1317) (10) \n", - "fragment 11 with 1 AAs P0G1601 (1318) - P0G1601 (1318) (11) \n", - "The GPCR-labels align best with fragments: [7, 8, 9] (first-last: GLU30-CYS341).\n", - "\n", - "MUOR\n", - "Auto-detected fragments with method 'resSeq'\n", - "fragment 0 with 51 AAs LEU5 ( 0) - ILE55 (50 ) (0) \n", - "fragment 1 with 52 AAs THR182 ( 51) - VAL233 (102 ) (1) \n", - "fragment 2 with 114 AAs ASN241 ( 103) - PHE354 (216 ) (2) \n", - "fragment 3 with 336 AAs ASP5 ( 217) - ASN340 (552 ) (3) \n", - "fragment 4 with 53 AAs ILE9 ( 553) - PHE61 (605 ) (4) \n", - "fragment 5 with 281 AAs MET65 ( 606) - ARG345 (886 ) (5) \n", - "fragment 6 with 5 AAs TYR1 ( 887) - ETA5 (891 ) (6) \n", - "The GPCR-labels align best with fragments: [5] (first-last: MET65-ARG345).\n", - "\n", - "DOP\n", - "Auto-detected fragments with method 'resSeq'\n", - "fragment 0 with 53 AAs ASP11 ( 0) - LEU63 (52 ) (0) \n", - "fragment 1 with 51 AAs THR205 ( 53) - MET255 (103 ) (1) \n", - "fragment 2 with 132 AAs THR263 ( 104) - LEU394 (235 ) (2) \n", - "fragment 3 with 340 AAs GLY1 ( 236) - ASN340 (575 ) (3) \n", - "fragment 4 with 58 AAs ASN5 ( 576) - ARG62 (633 ) (4) \n", - "fragment 5 with 128 AAs GLN1 ( 634) - SER128 (761 ) (5) \n", - "fragment 6 with 146 AAs PHE20 ( 762) - LYS165 (907 ) (6) \n", - "fragment 7 with 53 AAs ASN185 ( 908) - HIS237 (960 ) (7) \n", - "fragment 8 with 36 AAs SER263 ( 961) - CYS298 (996 ) (8) \n", - "fragment 9 with 38 AAs CYS307 ( 997) - LEU344 (1034) (9) \n", - "fragment 10 with 2 AAs G3C501 (1035) - CLR502 (1036) (10) \n", - "The GPCR-labels align best with fragments: [6, 7, 8, 9] (first-last: PHE20-LEU344).\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "pdb_just_receptor = {}\n", "for key, pdb in pdbs.items():\n", @@ -314,25 +112,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "89bbf443", "metadata": {}, - "outputs": [ - { - "data": { - 
"application/vnd.jupyter.widget-view+json": { - "model_id": "a17ea646d3de4bc6a312011a0ab3f5fa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "NGLWidget()" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "colors = {\"MUOR\":\"tab:red\", \n", " \"OPS\":\"tab:blue\", \n", @@ -356,19 +139,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "77dce09d", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:2369: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", - " labs_out = [subset[oo] for oo in order]\n" - ] - } - ], + "outputs": [], "source": [ "AC = mdciao.nomenclature.AlignerConsensus(maps,\n", " tops={key : geom.top for key, geom in pdb_just_receptor.items()})" @@ -384,91 +158,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "68a28310", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/guille/Programs/mdciao/mdciao/nomenclature/nomenclature.py:1829: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n", - " df = df.applymap(lambda x: \"%u%%\" % x)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
OPSB2ARMUORDOP
OPS100%22%24%21%
B2AR22%100%28%42%
MUOR24%28%100%28%
DOP21%42%28%100%
\n", - "
" - ], - "text/plain": [ - " OPS B2AR MUOR DOP\n", - "OPS 100% 22% 24% 21%\n", - "B2AR 22% 100% 28% 42%\n", - "MUOR 24% 28% 100% 28%\n", - "DOP 21% 42% 28% 100%" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "AC.sequence_match()" ] @@ -484,106 +177,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "1de3a82f", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
consensusOPSB2AR
1153.50x501065746
1163.51x511076757
1173.52x521088769
1183.53x531095780
1193.54x541102785
1203.55x551109793
1213.56x561115800
\n", - "
" - ], - "text/plain": [ - " consensus OPS B2AR\n", - "115 3.50x50 1065 746\n", - "116 3.51x51 1076 757\n", - "117 3.52x52 1088 769\n", - "118 3.53x53 1095 780\n", - "119 3.54x54 1102 785\n", - "120 3.55x55 1109 793\n", - "121 3.56x56 1115 800" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "AC.CAidxs_match(\"3.5*\", keys=[\"OPS\",\"B2AR\"])" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "76638fb1", "metadata": {}, "outputs": [], @@ -606,25 +210,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "da01597f-8b39-4eac-bc37-18414e708983", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e726cbd49fc8416898c9bdb902042d3c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "NGLWidget()" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "iwd = nglview.NGLWidget()\n", "for ii, (key, geom) in enumerate(pdb_just_receptor.items()):\n", diff --git a/mdciao/examples/08.Manuscript.ipynb b/mdciao/examples/08.Manuscript.ipynb index 0ddd806f..39cd0941 100644 --- a/mdciao/examples/08.Manuscript.ipynb +++ b/mdciao/examples/08.Manuscript.ipynb @@ -34,7 +34,7 @@ "import os\n", "if not os.path.exists(\"mdciao_example\"):\n", " mdciao.examples.fetch_example_data(\"b2ar@Gs\")\n", - "traj = md.load(\"mdciao_example/traj.xtc\",top=\"mdciao_example/prot.pdb\")" + "traj = md.load(\"mdciao_example/traj.xtc\",top=\"mdciao_example/top.pdb\")" ] }, { diff --git a/mdciao/examples/examples.py b/mdciao/examples/examples.py index ae6d5618..46e76bca 100644 --- a/mdciao/examples/examples.py +++ b/mdciao/examples/examples.py @@ -554,7 +554,7 @@ def _unzip2dir(full_path_zipfile): The folder's full path is kept, including zipfile's name minus the .zip extension Background: "mdciao_example.zip" was zipped in origin with this 
structure: - * mdciao_example/prot.pdb + * mdciao_example/top.pdb * mdciao_example/traj.xtc However, it might have been renamed to "mdciao_example_05.zip" when auto-downloading. From c79d5651d5a863373e091a9f5277914bc61fb2b3 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 17:49:30 +0200 Subject: [PATCH 74/83] [doc/FAQ & doc/gallery] include new notebook, renaming of links to numbered notebooks --- doc/FAQ.rst | 8 +++---- doc/gallery.rst | 63 +++++++++++++++++++++++++++---------------------- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/doc/FAQ.rst b/doc/FAQ.rst index ae087dcc..24f8149c 100644 --- a/doc/FAQ.rst +++ b/doc/FAQ.rst @@ -3,7 +3,7 @@ FAQ Notebooks .. toctree:: - notebooks/Missing_Contacts.ipynb - notebooks/Flareplot_Schemes.ipynb - notebooks/Comparing_CGs_Bars.ipynb - notebooks/Comparing_CGs_Flares.ipynb + notebooks/02.Missing_Contacts.ipynb + notebooks/05.Flareplot_Schemes.ipynb + notebooks/03.Comparing_CGs_Bars.ipynb + notebooks/04.Comparing_CGs_Flares.ipynb diff --git a/doc/gallery.rst b/doc/gallery.rst index 428405d0..2bb1b279 100644 --- a/doc/gallery.rst +++ b/doc/gallery.rst @@ -21,13 +21,13 @@ Tutorials .. list-table:: * - .. figure:: _build/html/_images/interface.combined.png - :target: notebooks/Tutorial.html + :target: notebooks/01.Tutorial.html :height: 100px .. - |br| `One single notebook providing `_ - |br| `an overview of mdciao `_ + |br| `One single notebook providing `_ + |br| `an overview of mdciao `_ FAQs ---- @@ -36,40 +36,40 @@ how particular optional parameters affect the output of some methods. .. list-table:: - * - .. figure:: _build/doctrees/nbsphinx/notebooks_Missing_Contacts_15_1.png - :target: notebooks/Missing_Contacts.html + * - .. figure:: _build/doctrees/nbsphinx/notebooks_02.Missing_Contacts_15_1.png + :target: notebooks/02.Missing_Contacts.html :height: 100px .. - |br| `Missing Contacts `_ + |br| `Missing Contacts `_ - .. 
figure:: _build/html/_images/notebooks_Comparing_CGs_Bars_41_1.png - :target: notebooks/Comparing_CGs_Bars.html + :target: notebooks/03.Comparing_CGs_Bars.html :height: 100px .. - |br| `Comparing Frequencies: `_ - |br| `Bar Plots `_ + |br| `Comparing Frequencies: `_ + |br| `Bar Plots `_ * - .. figure:: _build/html/_images/notebooks_Comparing_CGs_Flares_41_1.png - :target: notebooks/Comparing_CGs_Flares.html + :target: notebooks/04.Comparing_CGs_Flares.html :height: 100px .. - |br| `Comparing Frequencies: `_ - |br| `Flareplots `_ + |br| `Comparing Frequencies: `_ + |br| `Flareplots `_ - .. figure:: _build/doctrees/nbsphinx/notebooks_Flareplot_Schemes_22_1.png - :target: notebooks/Flareplot_Schemes.html + :target: notebooks/05.Flareplot_Schemes.html :height: 100px .. - |br| `Controlling Flareplots: `_ - |br| `Schemes `_ + |br| `Controlling Flareplots: `_ + |br| `Schemes `_ Examples -------- @@ -79,23 +79,23 @@ They are the best starting point to copy and modify with your own data. .. list-table:: * - .. figure:: _build/html/_images/notebooks_Manuscript_17_0.png - :target: notebooks/Manuscript.html + :target: notebooks/08.Manuscript.html :height: 100px .. - |br| `Interfaces: `_ - |br| `β2 Adrenergic Receptor in Complex with `_ - |br| `Empty Gs-Protein `_ + |br| `Interfaces: `_ + |br| `β2 Adrenergic Receptor in Complex with `_ + |br| `Empty Gs-Protein `_ - .. figure:: _build/html/_images/notebooks_EGFR_Kinase_Inhibitors_14_0.png - :target: notebooks/EGFR_Kinase_Inhibitors.html + :target: notebooks/07.EGFR_Kinase_Inhibitors.html :height: 100px .. - |br| `Binding-Pockets: `_ - |br| `EGFR Kinase Inhibitors `_ + |br| `Binding-Pockets: `_ + |br| `EGFR Kinase Inhibitors `_ * - .. figure:: _build/html/_images/notebooks_Covid-19-Spike-Protein-Example_23_1.png :target: notebooks/Covid-19-Spike-Protein-Example.html @@ -116,15 +116,22 @@ They are the best starting point to copy and modify with your own data. |br| `Example 2: Molecular Interfaces `_ * - .. 
figure:: imgs/MSA_via_Consensus_Labels.png - :target: notebooks/MSA_via_Consensus_Labels.html + :target: notebooks/06.MSA_via_Consensus_Labels.html :height: 100px .. - |br| `3D Multiple Sequence Alignment via `_ - |br| `Consensus Labels on μ-Opioid Receptor, `_ - |br| `β2 Adregneric Receptor, Opsin, and `_ - |br| `Dopamine D1 Receptor `_ + |br| `3D Multiple Sequence Alignment via `_ + |br| `Consensus Labels on μ-Opioid Receptor, `_ + |br| `β2 Adrenergic Receptor, Opsin, and `_ + |br| `Dopamine D1 Receptor `_ + - .. figure:: imgs/MSA_via_Consensus_Labels.png + :target: notebooks/09.Consensus_Labels.html + :height: 100px + + .. - - .. \ No newline at end of file + |br| `Contact Frequencies `_ + |br| `for multiple systems `_ + |br| `via consensus labels `_ From 5855683e6376150537f2ed3777b0005d56fdaf21 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 17:53:46 +0200 Subject: [PATCH 75/83] [examples/notebooks] update hrefs to new numbered notebooks (plus some edits for the html render) --- mdciao/examples/01.Tutorial.ipynb | 4 ++-- mdciao/examples/04.Comparing_CGs_Flares.ipynb | 4 ++-- mdciao/examples/05.Flareplot_Schemes.ipynb | 4 ++-- mdciao/examples/09.Consensus_Labels.ipynb | 16 ++++++++++++---- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/mdciao/examples/01.Tutorial.ipynb b/mdciao/examples/01.Tutorial.ipynb index b6854d9c..90ba3d50 100644 --- a/mdciao/examples/01.Tutorial.ipynb +++ b/mdciao/examples/01.Tutorial.ipynb @@ -418,8 +418,8 @@ "\n", "Additionally, we now have two other notebooks explicitly devoted to the representation of interfaces:\n", "\n", - "* [Bar Plots](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/Comparing_CGs_Bars.html)\n", - "* [FlarePlots](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/Comparing_CGs_Flares.html)" + "* [Bar Plots](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/03.Comparing_CGs_Bars.html)\n", + "*
[FlarePlots](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/04.Comparing_CGs_Flares.html)" ] }, { diff --git a/mdciao/examples/04.Comparing_CGs_Flares.ipynb b/mdciao/examples/04.Comparing_CGs_Flares.ipynb index ded53180..5cc9d516 100644 --- a/mdciao/examples/04.Comparing_CGs_Flares.ipynb +++ b/mdciao/examples/04.Comparing_CGs_Flares.ipynb @@ -187,7 +187,7 @@ "This is better, but there's still a lot of information that's not really needed. Before we continue tweaking, just remember that there are **other** ways of looking at the interface frequencies in a much sparse way:\n", "\n", "* [ContactGroup.plot_freqs_as_bars](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.plot_freqs_as_bars): \n", - " This is the *sparsest* of all, since only non-zero contacts are plotted as bars. We've devoted an [entire notebook](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/Comparing_CGs_Bars.html) to these types of plots and comparisons.\n", + " This is the *sparsest* of all, since only non-zero contacts are plotted as bars. We've devoted an [entire notebook](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/03.Comparing_CGs_Bars.html) to these types of plots and comparisons.\n", "\n", "* [ContactGroup.plot_interface_frequency_matrix](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.frequency_as_contact_matrix): \n", " This is a contact matrix where the x-and y-axes contain only those residues of one side (of the interface) that have **at least one non-zero contact** with the other side. This means, no residue is shown unless it participates in the interface somehow. 
It's less sparse than [ContactGroup.plot_freqs_as_bars](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.plot_freqs_as_bars) because it does contain a lot of blank pixels, but it's sparser than the [flareplot] because it's limited to the residues that participate in the interface. Let's check it out:" @@ -675,7 +675,7 @@ "metadata": {}, "source": [ "## Plotting Frequency Differences\n", - "Finally, another possibility is to showcase contact differences directly, i.e., not the frequency values **themselves**, but the **change** in those values between pairs of datasets. This will make equally strong (or equally weak) contacts (=curves) vanish from the plot and instead direct the eye towards large changes. It's somewhat equivalent to the ``sort_by='std'`` option of the [mdciao.plots.compare_groups_of_contacs](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.plots.compare_groups_of_contacts.html), that we have dicussed in the [other notebook](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/Comparing_CGs_Bars.html#Sorting-by-Standard-Deviation).\n", + "Finally, another possibility is to showcase contact differences directly, i.e., not the frequency values **themselves**, but the **change** in those values between pairs of datasets. This will make equally strong (or equally weak) contacts (=curves) vanish from the plot and instead direct the eye towards large changes. It's somewhat equivalent to the ``sort_by='std'`` option of the [mdciao.plots.compare_groups_of_contacs](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.plots.compare_groups_of_contacts.html), that we have dicussed in the [other notebook](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/03.Comparing_CGs_Bars.html#Sorting-by-Standard-Deviation).\n", "\n", "
\n", "\n", diff --git a/mdciao/examples/05.Flareplot_Schemes.ipynb b/mdciao/examples/05.Flareplot_Schemes.ipynb index 8a12fe52..44678a29 100644 --- a/mdciao/examples/05.Flareplot_Schemes.ipynb +++ b/mdciao/examples/05.Flareplot_Schemes.ipynb @@ -7,7 +7,7 @@ "# Controlling Flareplots\n", "The purpose of this notebook is to show the different ways in which a user can select which residues get shown or hidden in a flareplot, and how they can be broken down into different types of fragments to inform about the molecular topology.\n", "\n", - "We will try different ways of calling the method [plot_freqs_as_flareplot](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.plot_freqs_as_flareplot), which is a class method of the object [ContactGroup](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html). Under the hood, the lower-level [mdciao.flare.freqs2flare](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.flare.freqs2flare.html#mdciao.flare.freqs2flare) is at work, which is explained in [this other notebook](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/Comparing_CGs_Flares.html#The-Lower-Level-Method-freqs2flare)\n", + "We will try different ways of calling the method [plot_freqs_as_flareplot](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html#mdciao.contacts.ContactGroup.plot_freqs_as_flareplot), which is a class method of the object [ContactGroup](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.contacts.ContactGroup.html). 
Under the hood, the lower-level [mdciao.flare.freqs2flare](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.flare.freqs2flare.html#mdciao.flare.freqs2flare) is at work, which is explained in [this other notebook](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/04.Comparing_CGs_Flares.html#The-Lower-Level-Method-freqs2flare)\n", "\n", "
\n", " \n", @@ -36,7 +36,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "These fragments do not exactly coincide with the ``chains`` (check [this](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/Tutorial.html#Fragmentation-Heuristics) for more info), but they are useful for this example. The fragments are: \n", + "These fragments do not exactly coincide with the ``chains`` (check [this](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/01.Tutorial.html#Fragmentation-Heuristics) for more info), but they are useful for this example. The fragments are: \n", "\n", "0. G-protein $\\alpha$ sub-unit \n", "1. G-protein $\\beta$ sub-unit \n", diff --git a/mdciao/examples/09.Consensus_Labels.ipynb b/mdciao/examples/09.Consensus_Labels.ipynb index 893d1873..0b6a97ba 100644 --- a/mdciao/examples/09.Consensus_Labels.ipynb +++ b/mdciao/examples/09.Consensus_Labels.ipynb @@ -9,16 +9,21 @@ "In this notebook, we exploit the GPCR [consensus nomenclature](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/mdciao.nomenclature.html) to compute and compare contact frequencies across four GPCRs that have very little sequence identity. \n", "\n", "Nevertheless, the consensus nomenclature will allow us to:\n", + "\n", "* Use the same function calls for all systems, regardless of the underlying primary sequence\n", + "\n", "* Compare the frequencies across systems by using consensus labels\n", "\n", "The four systems we will be comparing are: \n", "* Beta 2 adrenergic receptor in complex with Gs-protein. \n", - " Provided kindly by Dr. H. Batebi \n", + " Provided kindly by Dr. H. Batebi\n", + " \n", "* Growth hormone secretagogue receptor type 1, ghrelin receptor for short. \n", " Provided kindly by Dr. A. Vogel\n", + "\n", "* Neuropeptide Y receptor type 1, Y1 receptor for short, in apo form. \n", " Provided kindly by Dr. A. Vogel.\n", + "\n", "* Active mu-opioid receptor bound to the agonist morphine. 
\n", " Kindly made available for this purpose by the GPCRmd. \n", "\n", @@ -569,7 +574,7 @@ "id": "88fbd034-1b0a-45ec-bf7c-685e34f4d85e", "metadata": {}, "source": [ - "To produce a high quality alignment of the receptor structures, even with low primary-sequence identity, we can arrive at a multiple-sequence-alignment (MSA) via the consensus labels, which act as a proxy for sequence identity. For this, we use `mdciao`'s [AlignerConsensus](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.nomenclature.AlignerConsensus.html#mdciao.nomenclature.AlignerConsensus) class. There's a whole notebook about them [here](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/MSA_via_Consensus_Labels.html)." + "To produce a high quality alignment of the receptor structures, even with low primary-sequence identity, we can arrive at a multiple-sequence-alignment (MSA) via the consensus labels, which act as a proxy for sequence identity. For this, we use `mdciao`'s [AlignerConsensus](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.nomenclature.AlignerConsensus.html#mdciao.nomenclature.AlignerConsensus) class. There's a whole notebook about them [here](https://proteinformatics.uni-leipzig.de/mdciao/notebooks/06.MSA_via_Consensus_Labels.html)." 
] }, { @@ -718,9 +723,10 @@ " \n", "Some final observations\n", "\n", - "* The point of this notebook isn't to arrive at a particular finding but rather to showcase the utility of streamilining the contact-analysis using consensus nomenclature.\n", + "* The point of this notebook isn't to arrive at a particular finding but rather to showcase the utility of streamlining the contact-analysis across different systems using consensus nomenclature.\n", "\n", - "* We have kept the system names as they are downloaded with [mdciao.examples.fetch_example_data], because they all follow the convention of having a `traj.xtc` and `top.pdb` files, but you can map any topology and trajectory files using aliases and dictionaries:\n", + "* We have kept the system names as they are downloaded with [mdciao.examples.fetch_example_data](https://proteinformatics.uni-leipzig.de/mdciao/api/generated/generated/mdciao.examples.fetch_example_data.html#mdciao-examples-fetch-example-data), because they all follow the convention of having a `traj.xtc` and `top.pdb` files, but you can map any topology and trajectory files using aliases and dictionaries:\n", + " \n", " ```python\n", " alias = {\"b2ar@Gs\" : \"adrb2\", \n", " \"ghrelin@ghsr\" : \"ghsr\",\n", @@ -741,7 +747,9 @@ " }\n", "\n", " ```\n", + "\n", "* Althouth the trajectories we have been using are similar in number of frames, they are wildly different in simulated physical length, s.o there isn't really much physical or biological sense in comparing them other than for this demo:\n", + "\n", "```\n", " * b2ar@Gs: 280 frames, dt = 10ps, 2.8ns in total\n", " * ghrelin@ghsr: 411 frames, dt = 10ns, 41μs in total\n", From e4d16bdbb568ed110a73fe659cff4ac26f783940 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 17:58:18 +0200 Subject: [PATCH 76/83] minor --- doc/cli_stub.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/cli_stub.rst b/doc/cli_stub.rst index 33278ff9..510bd592 100644 --- a/doc/cli_stub.rst 
+++ b/doc/cli_stub.rst @@ -48,4 +48,4 @@ What these tools do is: * mdc_residues Find residues in an input topology using Unix filename pattern matching. Example :ref:`here `. -You can see their documentation by using the ``-h`` flag when invoking them from the command line, keep reading the ref:`Highlights` or the :ref:`CLI Reference`. \ No newline at end of file +You can see their documentation by using the ``-h`` flag when invoking them from the command line, keep reading the :ref:`Highlights` or the :ref:`CLI Reference`. \ No newline at end of file From 60ed1423c96c0fed6a7c3509633208fe95db830d Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 18:10:19 +0200 Subject: [PATCH 77/83] minor --- doc/gallery.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/gallery.rst b/doc/gallery.rst index 2bb1b279..62e8fde8 100644 --- a/doc/gallery.rst +++ b/doc/gallery.rst @@ -13,7 +13,7 @@ The notebooks can be accessed locally by issuing:: from the CLI. This will create a local "sandboxed" copy of the notebooks, which you can modify and play around with without breaking -the original notebooks. Note: all notebooks except the last two. +the original notebooks. Note: the Covid notebooks are not shipped with `mdciao`. 
Tutorials --------- From 6a0acd7cfacd5852fe2152e21a39ab1d2e366c79 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 20:41:57 +0200 Subject: [PATCH 78/83] [cli.interface] interface members 'intf_frags_as_residxs' get unique'd since now there can be redunandies in the definitions --- mdciao/cli/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index ac962ebc..883378f4 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1542,7 +1542,7 @@ def interface( intf_frags_as_str_or_keys = _mdcfrg.frag_dict_2_frag_groups(fragments_as_residue_idxs_d, ng=2, answers=[interface_selection_1, interface_selection_2], ) - + intf_frags_as_residxs = [_np.unique(ifrg) for ifrg in intf_frags_as_residxs] intersect = list(set(intf_frags_as_residxs[0]).intersection(intf_frags_as_residxs[1])) if len(intersect) > 0: if self_interface: From e5111bfe74c9262f0ea0257ca9c0c85e9e8d5a54 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 21:16:34 +0200 Subject: [PATCH 79/83] [cli.interface] API CHANGE wrt to self_interface, dont raise AssertionError but simply eliminate self-contacts. self_interface=True skips the elimination (instead of skipping the AssertionError) This follows 0f6a84849a0b3bc8d8295599bb98a14925a59a0f which now makes it easy for there to be self contacts (It was harder before to get such a case). The 02.Missing_Contacts.ipynb explains this in the self_interface section. --- mdciao/cli/cli.py | 110 ++++++++++++++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 39 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index 883378f4..cb43bd87 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1152,9 +1152,12 @@ def interface( * by using specific residue indices or ranges * by using defined molecular fragments, - chains defined in the topology or pdb-file. + e.g. chains defined in the topology or pdb-file. 
* by guessing molecular fragments, using some fragmentation heuristic. + * by guessing molecular fragments, using a consensus + nomenclature like GPCR, CGN or KLIFS generic residue + numbering. The fragment definition and the fragment selection are separate, i.e. there might be six chains but one can specify to compute the interface between @@ -1163,8 +1166,8 @@ def interface( and `interface_selection_2`. One can further refine the fragment selection - with an aminoacid (AA) selection using - `AA_selection`, to further specify the residues + at the level of single aminoacids (AAs) using + `AA_selection`. This can fine-tune the residues of interest if the fragment definitions are too broad. See the docstring for more info. @@ -1175,17 +1178,32 @@ def interface( in a receptor--G-protein complex, one partner is the receptor and the other partner is the G-protein. - This is why mdciao.cli.interface doesn't allow interface - members to share residues by default. However, sometimes it's - useful to allow it because the contacts of one fragment - with itself are also important. E.g. the - C-terminus of a receptor interfacing with - the entire receptor, **including the C-terminus itself**. - To allow for this behaviour, use `self_interface` = True, - and possibly increase `n_nearest`, since otherwise - neighboring residues of the shared set (e.g. C-terminus) + Note + ---- + If your definitions of `interface_selection_1` and + `interface_selection_2` lead to some overlap between + the interface members (see below), mdciao's default + is to ignore contact pairs within the same fragment. + E.g., in the context of a GPCR, computing + "TM3" vs "TM*" ("TM3" vs "all TMs") won't include + TM3-TM3 contacts by default. To include these + (or equivalent) contacts set `self_interface` = True. 
+ Another example could be computing the interface of the + C-terminus of a receptor with the entire receptor, + where it might be useful to include the contacts of + the C-terminus with itself. + + When using `self_interface` = True, it's advisable to + increase `n_nearest`, since otherwise neighboring + residues of the shared set (the TM3-TM3 or the Cterm-Cterm) will always appear as formed. + See the documentation on `fragments`, + `interface_selection_1`, `interface_selection_2`, + `AA_selection`, `n_nearest` and `self_interface`. + + Finally, the interface strength, defined as the per-residue sum of contacts participating in the interface, is written as the @@ -1242,7 +1260,7 @@ def interface( * A special string, "consensus", to use consensus subdomains, like "TM1" or "G.H5", as fragment definitions. - Numeric expressions are interpreted as zero-indexed and unique + Numeric expressions are interpreted as zero-indexed, unique residue serial indices, i.e. 30-40 does not necessarily equate "GLU30-LEU40" unless serial and sequence index coincide. If there's more than one "GLU30", the user gets asked to @@ -1253,8 +1271,10 @@ def interface( regardless of having passed "consensus" here. I.e., you can use `fragments='chains'` to divide the topology for representation and residue-tagging purposes but then define the interface as: + >>> interface_selection_1="TM3" >>> interface_selection_2="TM2" + to compute the interface of TM3 vs TM2 in a GPCR. For this mode of selection to work, the only condition is that the consensus labels have been provided via `GPCR_Uniprot`, @@ -1266,7 +1286,7 @@ def interface( * ranges, e.g. '1,3-4' * wildcards, e.g. "TM*" or "G.H.??" * exclusions, e.g. "TM*,-TM6" (all TMs except TM6) - The default is to prompt the user for + The default (None) is to prompt the user for information, except when: * `fragments` yielded only one fragment that **doesn't** cover the whole topology. Then @@ -1282,7 +1302,7 @@ def interface( * ranges, e.g. 
'1,3-4' * wildcards, e.g. "TM*" or "G.H.??" * exclusions, e.g. "TM*,-TM6" (all TMs except TM6) - The default is to prompt the user for + The default (None) is to prompt the user for information, except when: * `fragments` yielded only one fragment that **doesn't** cover the whole topology. Then @@ -1294,11 +1314,17 @@ def interface( AA_selection : str or list, default is None Whatever the fragment definition and fragment selection has been, one can further refine the list of - potential residue pairs by making a per aminoacid (AA) - selection here. E.g., if one has selected the interface - to be "TM3" vs "TM2", but wants to select only some - regions of those helices, one can pass here an `AA_selection`. - This can be a string or a list of len two: + potential residue pairs by making a selection at + the level of single aminoacids (AAs). + E.g., if (like above) one has selected the interface + to be "TM3" vs "TM2", + + >>> interface_selection_1="TM3" + >>> interface_selection_2="TM2" + + but wants to select only some regions of those helices, + one can pass here an `AA_selection`. + This can be a string or a list of two items: * A string leads to a boolean "or" selection, i.e. keep residue pair [ii,jj] if either ii **or** jj @@ -1307,21 +1333,30 @@ def interface( >>> AA_selection = "3.45-3.55" is equivalent of "3.45-3.55" vs "TM2" contacts - * A list of len two leads to a boolean "and" selection, i.e. keep + * A list with two items (each a string expression) + leads to a boolean "and" selection, i.e. keep residue pair [ii,jj] if ii **and** jj match `AA_selection`. E.g. >>> AA_selection = ["3.45-3.55","2.45-2.55"] - is equivalent of "3.45-3.55" vs "2.45-2.55" contacts + is equivalent of "3.45-3.55" vs "2.45-2.55" contacts. 
+ + The strings for the selection are interpreted by + :obj:`~mdciao.utils.residue_and_atom.rangeexpand_residues2residxs`, + so read there for more info on what expressions are allowed, + like mixed descriptors and wildcards. Expressions such as "GLU*,ARG*,GDP*,LEU394,GLU30-ARG50" + are valid. - In principle, one could use + Finally, CSVs are interpreted as boolean "or", i.e.: - >>> fragments = ["3.45-3.55","2.45-2.55"] + >>> AA_selection = "GLU30,TRP50" + + will select pairs that contain GLU30 **or** TRP50. If you + are sure about your residue pair selection, i.e. you + have a very specific list of residue-pairs you want + to compute, use :obj:`mdciao.cli.sites`. - and get the same contacts, but this would then exclude all other - residues of the topology from being tagged with fragment - and or consensus labels. GPCR_UniProt : str or :obj:`mdciao.nomenclature.LabelerGPCR`, default is None For GPCR nomenclature. If str, e.g. "adrb2_human". will try to locate a local filename or do a web lookup in the GPCRdb. 
@@ -1544,18 +1579,15 @@ def interface( ) intf_frags_as_residxs = [_np.unique(ifrg) for ifrg in intf_frags_as_residxs] intersect = list(set(intf_frags_as_residxs[0]).intersection(intf_frags_as_residxs[1])) - if len(intersect) > 0: - if self_interface: - ctc_idxs = _mdcu.lists.unique_product_w_intersection(intf_frags_as_residxs[0], intf_frags_as_residxs[1]) - else: - raise AssertionError("Some residues appear in both members of the interface, but this" - " behavior is blocked by default.\nIf you are sure this" - " is correct, unblock this option with 'self_interface=True'.\n" - "The residues are %s" % intersect) - else: - ctc_idxs = _np.vstack(list(_iterpd(intf_frags_as_residxs[0], intf_frags_as_residxs[1]))) - # Remove self-contacts - ctc_idxs = _np.vstack([pair for pair in ctc_idxs if pair[0]!=pair[1]]) + ctc_idxs = _mdcu.lists.unique_product_w_intersection(intf_frags_as_residxs[0], intf_frags_as_residxs[1]) + last_n_ctcs = len(ctc_idxs) + if len(intersect)>0 and not self_interface: + ctc_idxs = [pair for pair in ctc_idxs if not _np.in1d(pair, intersect).all()] + if len(ctc_idxs)!=last_n_ctcs: + print() + print(f"\nExcluding contacts within the same members of the interface reduces from {last_n_ctcs} to {len(ctc_idxs)} residue pairs. 
" + f"Use 'self_interface=True' to keep these {last_n_ctcs-len(ctc_idxs)} discarded pairs.") + last_n_ctcs = len(ctc_idxs) # Create a neighborlist if n_nearest>0: From cb6af803ee09e02ede4fc0954fc2ce7297d73ab8 Mon Sep 17 00:00:00 2001 From: gph82 Date: Mon, 2 Sep 2024 21:18:01 +0200 Subject: [PATCH 80/83] [cli.interface] Refactor how contact-list reduction is reported and book-kept --- mdciao/cli/cli.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mdciao/cli/cli.py b/mdciao/cli/cli.py index cb43bd87..e31a4829 100644 --- a/mdciao/cli/cli.py +++ b/mdciao/cli/cli.py @@ -1591,9 +1591,12 @@ def interface( # Create a neighborlist if n_nearest>0: - print("Excluding contacts between %u nearest neighbors"%n_nearest) nl = _mdcu.bonds.bonded_neighborlist_from_top(refgeom.top, n=n_nearest) ctc_idxs = _np.vstack([(ii,jj) for ii,jj in ctc_idxs if jj not in nl[ii]]) + if len(ctc_idxs)!=last_n_ctcs: + print(f"\nExcluding contacts between {n_nearest} nearest neighbors reduces from {last_n_ctcs} to {len(ctc_idxs)} residue pairs. " + f"Use 'n_nearest' to control this ({last_n_ctcs-len(ctc_idxs)} residue pairs discarded).") + last_n_ctcs=len(ctc_idxs) print("\nWill look for contacts in the interface between fragments\n%s\nand\n%s. 
"% ('\n'.join(_twrap(', '.join(['%s' % gg for gg in intf_frags_as_str_or_keys[0]]))), @@ -1614,10 +1617,12 @@ def interface( refgeom.top, fragment_names=fragment_names, additional_resnaming_dicts=consensus_maps) - print(f"Excluding residue pairs not involving residues '{AA_selection}' ({len(sel)} AAs).") ctc_idxs = [pair for pair in ctc_idxs if lambda_sel(pair, sel)] - - print(f"Performing a first pass on the {len(ctc_idxs)} group_1-group_2 residue pairs to compute lower bounds " + if len(ctc_idxs)!=last_n_ctcs: + print(f"\nExcluding residue pairs not involving residues '{AA_selection}' ({len(sel)} AAs) " + f"reduces from {last_n_ctcs} to {len(ctc_idxs)} residue pairs.") + last_n_ctcs = len(ctc_idxs) + print(f"\nPerforming a first pass on the {last_n_ctcs} group_1-group_2 residue pairs to compute lower bounds " f"on residue-residue distances via residue-COM distances.") lb_cutoff_buffer_Ang = 2.5 idx_of_lower_lower_bounds = _mdcctcs.trajs2lower_bounds(xtcs, refgeom.top, ctc_idxs, @@ -1632,7 +1637,7 @@ def interface( if len(ctc_idxs_intf)==0: print("No contacts found at %2.1f Ang. No output produced." 
% ctc_cutoff_Ang) return - print(f"Reduced to only {len(ctc_idxs_intf)} residue pairs for the computation of actual residue-residue distances:") + print(f"Reduced to only {len(ctc_idxs_intf)} (from {last_n_ctcs}) residue pairs for the computation of actual residue-residue distances:") ctcs, times, at_pair_trajs = _mdcctcs.trajs2ctcs(xtcs, refgeom.top, ctc_idxs_intf, stride=stride, return_times_and_atoms=True, consolidate=False, From 98d0c48d46d217baefa1b8197d05cda497f25c19 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 3 Sep 2024 16:01:28 +0200 Subject: [PATCH 81/83] [nomenclature] minor --- mdciao/nomenclature/nomenclature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index 0708183a..c8c30283 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -1494,7 +1494,7 @@ def __init__(self, maps, tops=None): * Type :obj:`~mdciao.nomenclature.LabelerGPCR`, :obj:`~mdciao.nomenclature.LabelerCGN`, or :obj:`~mdciao.nomenclature.LabelerKLIFS` - Recommended option, the most succint and versatile. + Recommended option, the most succinct and versatile. Pass this object and the maps will get created internally on-the-fly either by calling :obj:`~mdciao.nomenclature.LabelerGPCR.AA2conlab` @@ -1840,7 +1840,7 @@ def _only_matches(df: _DataFrame, patterns=None, keys=None, select_keys=False, d Parameters ---------- df : :obj:`~pandas.DataFrame` or None - The dataframe to be filter by matching `patterns` and `keys`. + The dataframe to be filtered by matching `patterns` and `keys`. If None, the method simply returns None. 
patterns : str, default is None A list in CSV-format of patterns to be matched From 2456f44b17e678d7eb07e2778ba101b6e5b10ce8 Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 3 Sep 2024 16:26:46 +0200 Subject: [PATCH 82/83] [everywhere] refactor typo GPCRmd -> GPRCdb --- mdciao/filenames/filenames.py | 2 +- mdciao/nomenclature/nomenclature.py | 4 ++-- ...lsx => GPCRdb_B2AR_nomenclature_test.xlsx} | Bin tests/test_nomenclature.py | 22 +++++++++--------- 4 files changed, 14 insertions(+), 14 deletions(-) rename tests/data/nomenclature/{GPCRmd_B2AR_nomenclature_test.xlsx => GPCRdb_B2AR_nomenclature_test.xlsx} (100%) diff --git a/mdciao/filenames/filenames.py b/mdciao/filenames/filenames.py index e4c3db74..b17fa131 100644 --- a/mdciao/filenames/filenames.py +++ b/mdciao/filenames/filenames.py @@ -89,7 +89,7 @@ def __init__(self): # nomenclature self.gnas2_human_xlsx = _path.join(self.nomenclature_path, "gnas2_human.xlsx") - self.GPCRmd_B2AR_nomenclature_test_xlsx = _path.join(self.nomenclature_path,"GPCRmd_B2AR_nomenclature_test.xlsx") + self.GPCRdb_B2AR_nomenclature_test_xlsx = _path.join(self.nomenclature_path, "GPCRdb_B2AR_nomenclature_test.xlsx") self.pdb_3SN6_mut = _path.join(self.nomenclature_path, "3SN6_GLU10GLX.pdb.gz") self.adrb2_human_xlsx = _path.join(self.nomenclature_path,"adrb2_human.xlsx") self.nomenclature_bib = _path.join(self.nomenclature_path,"nomenclature.bib") diff --git a/mdciao/nomenclature/nomenclature.py b/mdciao/nomenclature/nomenclature.py index c8c30283..300e3969 100644 --- a/mdciao/nomenclature/nomenclature.py +++ b/mdciao/nomenclature/nomenclature.py @@ -326,8 +326,8 @@ def _GPCRdb_finder(descriptor, else: xlsxname = format % descriptor fullpath = _path.join(local_path, xlsxname) - GPCRmd = "https://gpcrdb.org/services/residues/extended" - url = "%s/%s" % (GPCRmd, descriptor.lower()) + GPCRdb = "https://gpcrdb.org/services/residues/extended" + url = "%s/%s" % (GPCRdb, descriptor.lower()) if fullpath.endswith(".xlsx"): local_lookup_lambda = 
lambda fullpath: _read_excel(fullpath, diff --git a/tests/data/nomenclature/GPCRmd_B2AR_nomenclature_test.xlsx b/tests/data/nomenclature/GPCRdb_B2AR_nomenclature_test.xlsx similarity index 100% rename from tests/data/nomenclature/GPCRmd_B2AR_nomenclature_test.xlsx rename to tests/data/nomenclature/GPCRdb_B2AR_nomenclature_test.xlsx diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index 36938c6c..5861d529 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -74,7 +74,7 @@ def test_fails_bc_no_online_access(self): try_web_lookup=False) -class Test_GPCRmd_lookup_GPCR(unittest.TestCase): +class Test_GPCRdb_lookup_GPCR(unittest.TestCase): def test_works(self): DF = nomenclature._GPCRdb_web_lookup("https://gpcrdb.org/services/residues/extended/adrb2_human") @@ -88,7 +88,7 @@ def test_wrong_code(self): class Test_GPCRdb_finder(unittest.TestCase): def test_works_locally_xlsx(self): - df, filename = nomenclature._GPCRdb_finder(test_filenames.GPCRmd_B2AR_nomenclature_test_xlsx, + df, filename = nomenclature._GPCRdb_finder(test_filenames.GPCRdb_B2AR_nomenclature_test_xlsx, try_web_lookup=False, ) @@ -99,7 +99,7 @@ def test_works_locally_xlsx(self): def test_works_locally_pkl(self): with _NamedTemporaryFile(suffix=".pkl") as named_pickle: - read_excel(test_filenames.GPCRmd_B2AR_nomenclature_test_xlsx).to_pickle(named_pickle.name) + read_excel(test_filenames.GPCRdb_B2AR_nomenclature_test_xlsx).to_pickle(named_pickle.name) df, filename = nomenclature._GPCRdb_finder(named_pickle.name, try_web_lookup=False, ) @@ -146,7 +146,7 @@ def test_not_find_online_but_no_raise(self): assert isinstance(filename, str) def test_wont_fail_if_found_online_and_write_to_disk(self): - df, filename = nomenclature._GPCRdb_finder(test_filenames.GPCRmd_B2AR_nomenclature_test_xlsx, + df, filename = nomenclature._GPCRdb_finder(test_filenames.GPCRdb_B2AR_nomenclature_test_xlsx, try_web_lookup=False, write_to_disk=True, ) @@ -159,7 +159,7 @@ def 
test_wont_fail_if_found_online_and_write_to_disk(self): class Test_table2GPCR_by_AAcode(unittest.TestCase): def setUp(self): - self.file = test_filenames.GPCRmd_B2AR_nomenclature_test_xlsx + self.file = test_filenames.GPCRdb_B2AR_nomenclature_test_xlsx def test_just_works(self): table2GPCR = nomenclature._GPCRdbDataFrame2conlabs(tablefile=self.file) @@ -395,10 +395,10 @@ class TestLabelerGPCR_local(unittest.TestCase): # The setup is in itself a test def setUp(self): self.tmpdir = mkdtemp("_test_mdciao_GPCR_local") - self._GPCRmd_B2AR_nomenclature_test_xlsx = path.join(self.tmpdir, path.basename( - test_filenames.GPCRmd_B2AR_nomenclature_test_xlsx)) - shutil.copy(test_filenames.GPCRmd_B2AR_nomenclature_test_xlsx, self._GPCRmd_B2AR_nomenclature_test_xlsx) - self.GPCR_local = nomenclature.LabelerGPCR(self._GPCRmd_B2AR_nomenclature_test_xlsx, + self._GPCRdb_B2AR_nomenclature_test_xlsx = path.join(self.tmpdir, path.basename( + test_filenames.GPCRdb_B2AR_nomenclature_test_xlsx)) + shutil.copy(test_filenames.GPCRdb_B2AR_nomenclature_test_xlsx, self._GPCRdb_B2AR_nomenclature_test_xlsx) + self.GPCR_local = nomenclature.LabelerGPCR(self._GPCRdb_B2AR_nomenclature_test_xlsx, try_web_lookup=False, local_path=self.tmpdir, ) @@ -419,7 +419,7 @@ def tearDown(self): def test_correct_files(self): _np.testing.assert_equal(self.GPCR_local.tablefile, - self._GPCRmd_B2AR_nomenclature_test_xlsx) + self._GPCRdb_B2AR_nomenclature_test_xlsx) def test_dataframe(self): self.assertIsInstance(self.GPCR_local.dataframe, DataFrame) self.assertSequenceEqual(list(self.GPCR_local.dataframe.keys()), @@ -477,7 +477,7 @@ def test_aligntop_with_self_residxs(self): self.assertTrue(all([val in [2, 3] for val in top2self.values()])) def test_uniprot_name(self): - self.assertEqual(self.GPCR_local.UniProt_name, self._GPCRmd_B2AR_nomenclature_test_xlsx) + self.assertEqual(self.GPCR_local.UniProt_name, self._GPCRdb_B2AR_nomenclature_test_xlsx) class Test_aligntop_full(unittest.TestCase): # Has to be done with 
full GPCR nomencl, not with small one From 44c5a2d4ba7d7097cdd34fc6b48dabda45d4237b Mon Sep 17 00:00:00 2001 From: gph82 Date: Tue, 3 Sep 2024 16:47:05 +0200 Subject: [PATCH 83/83] [test_examples.Test_fetch_example_data.test_alias_unzip_to_otherfile] sort the output of os.listdir --- tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_examples.py b/tests/test_examples.py index 89c76c8d..6818e7a5 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -210,7 +210,7 @@ def test_alias_unzip_to_otherfile(self): assert len(files) == 2 assert files[0] == "unzip_here.zip" assert files[1] == "unzip_here" - extracted = os.listdir(files[1]) + extracted = sorted(os.listdir(files[1])) assert extracted[0] == "A.dat" assert extracted[1] == "B.dat"