Skip to content

Commit

Permalink
Merge pull request #683 from biolink/fix_isoforms
Browse files Browse the repository at this point in the history
force GAF 2.2 output, instead of taking version from input GAFs - fixing the chickens
  • Loading branch information
sierra-moxon authored Aug 2, 2024
2 parents d2b7967 + 38de691 commit cb22a25
Show file tree
Hide file tree
Showing 9 changed files with 4,872 additions and 4,731 deletions.
5 changes: 3 additions & 2 deletions bin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ See [command line docs](http://ontobio.readthedocs.io/en/latest/commandline.html
To test the validate.py "validate" command (the command that produces the final GPADs in the pipeline via the "mega make",
aka the "produces GAFs, GPADs, ttl" stage) on a particular source:

Note: `snapshot` in the URLs below can be changed to any pipeline branch name; it's listed here for ease of copy/paste.
```bash
poetry install
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset mgi MGI --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset goa_chicken goa --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/snapshot/" --only-dataset mgi MGI --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/snapshot/" --only-dataset goa_chicken goa --gpad-gpi-output-version 2.0
```


Expand Down
39 changes: 27 additions & 12 deletions bin/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False,
rule_contexts=rule_contexts,
rule_set=rule_set,
)
logger.info("Producing {}".format(source_gaf))
click.echo("Producing {}".format(source_gaf))
# logger.info("AssocParserConfig used: {}".format(config))
split_source = os.path.split(source_gaf)[0]
validated_gaf_path = os.path.join(split_source, "{}_valid.gaf".format(dataset))
Expand Down Expand Up @@ -711,10 +711,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
rule_set=rule_set
)[0]

click.echo("Producing GPI from GAF files...")
gpi = produce_gpi(dataset, absolute_target, valid_gaf, ontology_graph, gpad_gpi_output_version)

gpi_list = [gpi]
gpi_list = []

matching_gpi_path = None
click.echo("Try to find other GPIs in metadata and merge...")
Expand All @@ -739,6 +736,9 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
replace_existing_files=not skip_existing_files)
if paint_metadata else None)

click.echo("Producing GPI for use in creating GPADs...")
gpi = produce_gpi(dataset, absolute_target, valid_gaf, ontology_graph, gpad_gpi_output_version)
gpi_list.append(gpi)
click.echo("Executing 'make_gpads' in validate.produce with all the assembled GAF files...")
make_gpads(dataset, valid_gaf, products,
ontology_graph, noctua_gpad_src, paint_gaf_src,
Expand All @@ -750,18 +750,32 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
rule_metadata=rule_metadata, replace_existing_files=not skip_existing_files,
gaf_output_version=gaf_output_version)

click.echo("Pre-isoform fix gaf file...{}".format(end_gaf))
click.echo("Executing the isoform fixing step in validate.produce...")
# run the resulting gaf through one last parse and replace, to handle the isoforms
# see: https://github.com/geneontology/go-site/issues/2291
output_gaf_path = os.path.join(os.path.split(end_gaf)[0], "{}.gaf".format(dataset))
isoform_fixed_gaf = fix_pro_isoforms_in_gaf(end_gaf, matching_gpi_path, ontology_graph, output_gaf_path)
click.echo(isoform_fixed_gaf)
temp_output_gaf_path = os.path.join(os.path.split(end_gaf)[0], "{}_temp.gaf".format(dataset))
click.echo("temp_output_gaf_path: {}".format(temp_output_gaf_path))
isoform_fixed_gaf = fix_pro_isoforms_in_gaf(end_gaf, matching_gpi_path, ontology_graph, temp_output_gaf_path)
click.echo("isoform_fixed_gaf: ".format(isoform_fixed_gaf))

final_output_gaf_path = os.path.join(os.path.split(end_gaf)[0], "{}.gaf".format(dataset))

click.echo("Rename the temporary isoform fixed file to the final GAF...")
os.rename(temp_output_gaf_path, final_output_gaf_path)
click.echo("final_output_gaf_path: ".format(final_output_gaf_path))

click.echo("Producing final GPI after all GAF corrections...")
final_gpi = produce_gpi(dataset, absolute_target, final_output_gaf_path, ontology_graph, gpad_gpi_output_version)

click.echo("Creating ttl files...")
make_ttls(dataset, isoform_fixed_gaf, products, ontology_graph)
make_ttls(dataset, final_output_gaf_path, products, ontology_graph)


def fix_pro_isoforms_in_gaf(gaf_file_to_fix: str, gpi_file: str, ontology_graph, output_file_path: str) -> str:
def fix_pro_isoforms_in_gaf(gaf_file_to_fix: str,
gpi_file: str,
ontology_graph,
output_file_path: str) -> str:
"""
Given a GAF file and a GPI file, fix the GAF file by converting isoform annotations to gene annotations. Storing
the isoforms back in subject_extensions collection, changing the full_name, synonyms, label, and type back to the
Expand All @@ -787,7 +801,7 @@ def fix_pro_isoforms_in_gaf(gaf_file_to_fix: str, gpi_file: str, ontology_graph,
"id": gpi_entry.get('id')}

gafparser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology_graph))
gafwriter = GafWriter(file=open(output_file_path, "w"), source="test", version=gafparser.version)
gafwriter = GafWriter(file=open(output_file_path, "w"), version="2.2")

# these are statistic parameters that record when a substitution is made.
substitution_count = 0
Expand Down Expand Up @@ -820,7 +834,7 @@ def fix_pro_isoforms_in_gaf(gaf_file_to_fix: str, gpi_file: str, ontology_graph,

# we need to put the isoform currently being swapped, back into "Column 17" which is a
# subject_extension member.
isoform_term = Curie(namespace=old_identity, identity=old_namespace)
isoform_term = Curie(namespace=old_namespace, identity=old_identity)
isoform_relation = Curie(namespace="RO", identity="0002327")
new_subject_extension = ExtensionUnit(relation=isoform_relation, term=isoform_term)
source_assoc.subject_extensions.append(new_subject_extension)
Expand All @@ -837,6 +851,7 @@ def fix_pro_isoforms_in_gaf(gaf_file_to_fix: str, gpi_file: str, ontology_graph,
click.echo(f"Substituted {substitution_count} entries in {gaf_file_to_fix} "
f"and left {no_substitution_count} entries unchanged.")

return output_file_path

@cli.command()
@click.pass_context
Expand Down
Loading

0 comments on commit cb22a25

Please sign in to comment.