#!/usr/bin/env python3
"""Search OLS for the children of a term whose text contains given keywords.

It is common that a new term has to be inserted between a general term and
more specific ones, grouping a subset of the specific terms but not all of
them. Given a parent CURIE, this script reports every child of that term (in
the same ontology) for which all keywords occur in its label, description or
synonyms. This is useful, for example, when looking for all the terms that are
specifically labelled as "dominant" in a long list of children terms.

Example:
    ${PYTHON_BIN} ${CODE_ROOT}/bin/trait_mapping/get_children_with_keywords.py --ontology MONDO --parent_curie MONDO:0100062 --keywords dominant
"""

import argparse
from collections import defaultdict


def append_embedded(results, json_response):
    """Merge the ``_embedded`` collections of one response page into *results*.

    Args:
        results: Mapping from collection name to a list. Missing keys must be
            created on access (e.g. ``defaultdict(list)``).
        json_response: Decoded JSON page. Falsy values and pages without an
            ``_embedded`` section are ignored.
    """
    embedded = json_response.get('_embedded') if json_response else None
    for key, items in (embedded or {}).items():
        results[key].extend(items)


def query_and_depaginate(url, fetch=None):
    """Fetch *url* and every page reachable through ``_links.next.href``.

    Args:
        url: Address of the first page.
        fetch: Optional callable taking a URL and returning the decoded JSON
            (or a falsy value). Defaults to ``cmat``'s ``json_request``.

    Returns:
        A ``defaultdict(list)`` holding, per ``_embedded`` collection name,
        the concatenated items of all pages.
    """
    if fetch is None:
        # Imported lazily so the pure helpers of this script stay usable
        # without the cmat package being importable.
        from cmat.trait_mapping.utils import json_request as fetch

    results = defaultdict(list)
    while url:
        json_response = fetch(url)
        append_embedded(results, json_response)
        # A falsy response or a page without a "next" link ends the walk
        # instead of raising on the missing "_links" entry.
        links = (json_response or {}).get('_links') or {}
        url = (links.get('next') or {}).get('href')
    return results


def _as_texts(value):
    """Normalise a term field (``None``, a string or a list) to a list of strings."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return [item for item in value if isinstance(item, str)]


def search_in(keywords, text):
    """Return the subset of *keywords* that occur in *text*.

    Matching is a case-sensitive substring test.

    Args:
        keywords: Iterable of keyword strings.
        text: A string, a list of strings (a keyword matches if it occurs in
            any entry) or ``None`` (nothing matches).

    Returns:
        The set of keywords found.
    """
    texts = _as_texts(text)
    return {
        keyword for keyword in keywords
        if any(keyword in entry for entry in texts)
    }


def _keywords_in_term(keywords, term):
    """Return the keywords found in the label, description or synonyms of *term*."""
    found = set()
    # Description and synonyms can be lists or null, which search_in handles.
    # NOTE(review): assumes OLS returns them in that shape; confirm against
    # the API version in use.
    for field in ('label', 'description', 'synonyms'):
        found |= search_in(keywords, term.get(field))
    return found


def main():
    """Parse the command line and print ``iri label`` for each matching child."""
    parser = argparse.ArgumentParser(
        # Must be passed by keyword: the first positional parameter of
        # ArgumentParser is ``prog``, not ``description``.
        description='Search OLS for children of a term that match certain keywords in their label, '
                    'description or synonyms'
    )
    parser.add_argument('--ontology', type=str, default='MONDO',
                        help='Name of the Ontology to find the parent and children')
    parser.add_argument('--parent_curie', type=str, help='Curie of the parent term', required=True)
    parser.add_argument('--keywords', type=str, nargs='+', required=True,
                        help="Words that must be present in the child's ontology label, description or "
                             "synonyms to be reported")
    args = parser.parse_args()

    # Imported after argument parsing so that --help and usage errors do not
    # depend on the cmat package.
    from cmat.clinvar_xml_io.ontology_uri import OntologyUri
    from cmat.trait_mapping.ols import build_ols_query

    keywords = set(args.keywords)
    db = args.ontology
    url = build_ols_query(OntologyUri(args.parent_curie, db).uri)

    for term in query_and_depaginate(url)['terms']:
        if term.get('ontology_prefix') != db:
            continue
        children_link = (term.get('_links') or {}).get('children')
        if not children_link:
            # No "children" link on this term: nothing to report.
            continue
        for child_term in query_and_depaginate(children_link['href'])['terms']:
            if child_term.get('ontology_prefix') != db:
                continue
            # Report the child only when every keyword was found somewhere.
            if _keywords_in_term(keywords, child_term) == keywords:
                print(child_term['iri'], child_term['label'])


if __name__ == '__main__':
    main()